1 /*-------------------------------------------------------------------------
2  *
3  * gist.c
4  * interface routines for the postgres GiST index access method.
5  *
6  *
7  * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  * src/backend/access/gist/gist.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include "access/genam.h"
18 #include "access/gist_private.h"
19 #include "access/xloginsert.h"
20 #include "catalog/index.h"
21 #include "catalog/pg_collation.h"
22 #include "miscadmin.h"
23 #include "storage/bufmgr.h"
24 #include "storage/indexfsm.h"
25 #include "utils/memutils.h"
26 #include "utils/rel.h"
27 
28 /* non-export function prototypes */
29 static void gistfixsplit(GISTInsertState *state, GISTSTATE *giststate);
30 static bool gistinserttuple(GISTInsertState *state, GISTInsertStack *stack,
31  GISTSTATE *giststate, IndexTuple tuple, OffsetNumber oldoffnum);
32 static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
33  GISTSTATE *giststate,
34  IndexTuple *tuples, int ntup, OffsetNumber oldoffnum,
35  Buffer leftchild, Buffer rightchild,
36  bool unlockbuf, bool unlockleftchild);
37 static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack,
38  GISTSTATE *giststate, List *splitinfo, bool releasebuf);
39 static void gistvacuumpage(Relation rel, Page page, Buffer buffer);
40 
41 
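/*
 * ROTATEDIST(d) pushes a freshly palloc'd, zeroed SplitedPageLayout onto the
 * front of the list 'd'. gistSplit() below uses it to build up its list of
 * output pages, e.g. ROTATEDIST(res) followed by filling in res->block,
 * res->list and res->itup.
 */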
42 #define ROTATEDIST(d) do { \
43  SplitedPageLayout *tmp=(SplitedPageLayout*)palloc(sizeof(SplitedPageLayout)); \
44  memset(tmp,0,sizeof(SplitedPageLayout)); \
45  tmp->block.blkno = InvalidBlockNumber; \
46  tmp->buffer = InvalidBuffer; \
47  tmp->next = (d); \
48  (d)=tmp; \
49 } while(0)
50 
51 
52 /*
53  * Create and return a temporary memory context for use by GiST. We
54  * _always_ invoke user-provided methods in a temporary memory
55  * context, so that memory leaks in those functions cannot cause
56  * problems. Also, we use some additional temporary contexts in the
57  * GiST code itself, to avoid the need to do some awkward manual
58  * memory management.
59  */
60 MemoryContext
61 createTempGistContext(void)
62 {
63  return AllocSetContextCreate(CurrentMemoryContext,
64  "GiST temporary context",
65  ALLOCSET_DEFAULT_MINSIZE,
66  ALLOCSET_DEFAULT_INITSIZE,
67  ALLOCSET_DEFAULT_MAXSIZE);
68 }
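/*
 * The usual pattern for this context, here and elsewhere in the GiST code,
 * is roughly:
 *
 *		oldCxt = MemoryContextSwitchTo(giststate->tempCxt);
 *		... invoke user-provided support functions ...
 *		MemoryContextSwitchTo(oldCxt);
 *		MemoryContextReset(giststate->tempCxt);
 */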
69 
70 /*
71  * gistbuildempty() -- build an empty gist index in the initialization fork
72  */
73 Datum
74 gistbuildempty(PG_FUNCTION_ARGS)
75 {
76  Relation index = (Relation) PG_GETARG_POINTER(0);
77  Buffer buffer;
78 
79  /* Initialize the root page */
80  buffer = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_NORMAL, NULL);
81  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
82 
83  /* Initialize and xlog buffer */
84  START_CRIT_SECTION();
85  GISTInitBuffer(buffer, F_LEAF);
86  MarkBufferDirty(buffer);
87  log_newpage_buffer(buffer, true);
88  END_CRIT_SECTION();
89 
90  /* Unlock and release the buffer */
91  UnlockReleaseBuffer(buffer);
92 
93  PG_RETURN_VOID();
94 }
95 
96 /*
97  * gistinsert -- wrapper for GiST tuple insertion.
98  *
99  * This is the public interface routine for tuple insertion in GiSTs.
100  * It doesn't do any work; just locks the relation and passes the buck.
101  */
102 Datum
103 gistinsert(PG_FUNCTION_ARGS)
104 {
105  Relation r = (Relation) PG_GETARG_POINTER(0);
106  Datum *values = (Datum *) PG_GETARG_POINTER(1);
107  bool *isnull = (bool *) PG_GETARG_POINTER(2);
108  ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3);
109 
110 #ifdef NOT_USED
111  Relation heapRel = (Relation) PG_GETARG_POINTER(4);
112  IndexUniqueCheck checkUnique = (IndexUniqueCheck) PG_GETARG_INT32(5);
113 #endif
114  IndexTuple itup;
115  GISTSTATE *giststate;
116  MemoryContext oldCxt;
117 
118  giststate = initGISTstate(r);
119 
120  /*
121  * We use the giststate's scan context as temp context too. This means
122  * that any memory leaked by the support functions is not reclaimed until
123  * end of insert. In most cases, we aren't going to call the support
124  * functions very many times before finishing the insert, so this seems
125  * cheaper than resetting a temp context for each function call.
126  */
127  oldCxt = MemoryContextSwitchTo(giststate->tempCxt);
128 
129  itup = gistFormTuple(giststate, r,
130  values, isnull, true /* size is currently bogus */ );
131  itup->t_tid = *ht_ctid;
132 
133  gistdoinsert(r, itup, 0, giststate);
134 
135  /* cleanup */
136  MemoryContextSwitchTo(oldCxt);
137  freeGISTstate(giststate);
138 
139  PG_RETURN_BOOL(false);
140 }
141 
142 
143 /*
144  * Place tuples from 'itup' to 'buffer'. If 'oldoffnum' is valid, the tuple
145  * at that offset is atomically removed along with inserting the new tuples.
146  * This is used to replace a tuple with a new one.
147  *
148  * If 'leftchildbuf' is valid, we're inserting the downlink for the page
149  * to the right of 'leftchildbuf', or updating the downlink for 'leftchildbuf'.
150  * F_FOLLOW_RIGHT flag on 'leftchildbuf' is cleared and NSN is set.
151  *
152  * If 'markfollowright' is true and the page is split, the left child is
153  * marked with F_FOLLOW_RIGHT flag. That is the normal case. During buffered
154  * index build, however, there is no concurrent access and the page splitting
155  * is done in a slightly simpler fashion, and false is passed.
156  *
157  * If there is not enough room on the page, it is split. All the split
158  * pages are kept pinned and locked and returned in *splitinfo, the caller
159  * is responsible for inserting the downlinks for them. However, if
160  * 'buffer' is the root page and it needs to be split, gistplacetopage()
161  * performs the split as one atomic operation, and *splitinfo is set to NIL.
162  * In that case, we continue to hold the root page locked, and the child
163  * pages are released; note that new tuple(s) are *not* on the root page
164  * but in one of the new child pages.
165  *
166  * If 'newblkno' is not NULL, returns the block number of page the first
167  * new/updated tuple was inserted to. Usually it's the given page, but could
168  * be its right sibling if the page was split.
169  *
170  * Returns 'true' if the page was split, 'false' otherwise.
171  */
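/*
 * For example, a plain insertion of a single tuple into a leaf page, as done
 * from gistinserttuples() below, looks roughly like:
 *
 *		is_split = gistplacetopage(rel, freespace, giststate, buffer,
 *								   &itup, 1, InvalidOffsetNumber, NULL,
 *								   InvalidBuffer, &splitinfo, true);
 *
 * after which the caller must insert downlinks for any pages left in
 * *splitinfo.
 */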
172 bool
173 gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
174  Buffer buffer,
175  IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
176  BlockNumber *newblkno,
177  Buffer leftchildbuf,
178  List **splitinfo,
179  bool markfollowright)
180 {
181  BlockNumber blkno = BufferGetBlockNumber(buffer);
182  Page page = BufferGetPage(buffer);
183  bool is_leaf = (GistPageIsLeaf(page)) ? true : false;
184  XLogRecPtr recptr;
185  int i;
186  bool is_split;
187 
188  /*
189  * Refuse to modify a page that's incompletely split. This should not
190  * happen because we finish any incomplete splits while we walk down the
191  * tree. However, it's remotely possible that another concurrent inserter
192  * splits a parent page, and errors out before completing the split. We
193  * will just throw an error in that case, and leave any split we had in
194  * progress unfinished too. The next insert that comes along will clean up
195  * the mess.
196  */
197  if (GistFollowRight(page))
198  elog(ERROR, "concurrent GiST page split was incomplete");
199 
200  *splitinfo = NIL;
201 
202  /*
203  * if isupdate, remove old key: This node's key has been modified, either
204  * because a child split occurred or because we needed to adjust our key
205  * for an insert in a child node. Therefore, remove the old version of
206  * this node's key.
207  *
208  * for WAL replay, in the non-split case we handle this by setting up a
209  * one-element todelete array; in the split case, it's handled implicitly
210  * because the tuple vector passed to gistSplit won't include this tuple.
211  */
212  is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
213 
214  /*
215  * If the leaf page is full, first try to delete dead tuples, and then
216  * check again.
217  */
218  if (is_split && GistPageIsLeaf(page) && GistPageHasGarbage(page))
219  {
220  gistvacuumpage(rel, page, buffer);
221  is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
222  }
223 
224  if (is_split)
225  {
226  /* no space for insertion */
227  IndexTuple *itvec;
228  int tlen;
229  SplitedPageLayout *dist = NULL,
230  *ptr;
231  BlockNumber oldrlink = InvalidBlockNumber;
232  GistNSN oldnsn = 0;
233  SplitedPageLayout rootpg;
234  bool is_rootsplit;
235  int npage;
236 
237  is_rootsplit = (blkno == GIST_ROOT_BLKNO);
238 
239  /*
240  * Form index tuples vector to split. If we're replacing an old tuple,
241  * remove the old version from the vector.
242  */
243  itvec = gistextractpage(page, &tlen);
244  if (OffsetNumberIsValid(oldoffnum))
245  {
246  /* on inner page we should remove old tuple */
247  int pos = oldoffnum - FirstOffsetNumber;
248 
249  tlen--;
250  if (pos != tlen)
251  memmove(itvec + pos, itvec + pos + 1, sizeof(IndexTuple) * (tlen - pos));
252  }
253  itvec = gistjoinvector(itvec, &tlen, itup, ntup);
254  dist = gistSplit(rel, page, itvec, tlen, giststate);
255 
256  /*
257  * Check that split didn't produce too many pages.
258  */
259  npage = 0;
260  for (ptr = dist; ptr; ptr = ptr->next)
261  npage++;
262  /* in a root split, we'll add one more page to the list below */
263  if (is_rootsplit)
264  npage++;
265  if (npage > GIST_MAX_SPLIT_PAGES)
266  elog(ERROR, "GiST page split into too many halves (%d, maximum %d)",
267  npage, GIST_MAX_SPLIT_PAGES);
268 
269  /*
270  * Set up pages to work with. Allocate new buffers for all but the
271  * leftmost page. The original page becomes the new leftmost page, and
272  * is just replaced with the new contents.
273  *
274  * For a root-split, allocate new buffers for all child pages, the
275  * original page is overwritten with new root page containing
276  * downlinks to the new child pages.
277  */
278  ptr = dist;
279  if (!is_rootsplit)
280  {
281  /* save old rightlink and NSN */
282  oldrlink = GistPageGetOpaque(page)->rightlink;
283  oldnsn = GistPageGetNSN(page);
284 
285  dist->buffer = buffer;
286  dist->block.blkno = BufferGetBlockNumber(buffer);
287  dist->page = PageGetTempPageCopySpecial(BufferGetPage(buffer));
288 
289  /* clean all flags except F_LEAF */
290  GistPageGetOpaque(dist->page)->flags = (is_leaf) ? F_LEAF : 0;
291 
292  ptr = ptr->next;
293  }
294  for (; ptr; ptr = ptr->next)
295  {
296  /* Allocate new page */
297  ptr->buffer = gistNewBuffer(rel);
298  GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0);
299  ptr->page = BufferGetPage(ptr->buffer);
300  ptr->block.blkno = BufferGetBlockNumber(ptr->buffer);
301  }
302 
303  /*
304  * Now that we know which blocks the new pages go to, set up downlink
305  * tuples to point to them.
306  */
307  for (ptr = dist; ptr; ptr = ptr->next)
308  {
309  ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno);
310  GistTupleSetValid(ptr->itup);
311  }
312 
313  /*
314  * If this is a root split, we construct the new root page with the
315  * downlinks here directly, instead of requiring the caller to insert
316  * them. Add the new root page to the list along with the child pages.
317  */
318  if (is_rootsplit)
319  {
320  IndexTuple *downlinks;
321  int ndownlinks = 0;
322  int i;
323 
324  rootpg.buffer = buffer;
325  rootpg.page = PageGetTempPageCopySpecial(BufferGetPage(rootpg.buffer));
326  GistPageGetOpaque(rootpg.page)->flags = 0;
327 
328  /* Prepare a vector of all the downlinks */
329  for (ptr = dist; ptr; ptr = ptr->next)
330  ndownlinks++;
331  downlinks = palloc(sizeof(IndexTuple) * ndownlinks);
332  for (i = 0, ptr = dist; ptr; ptr = ptr->next)
333  downlinks[i++] = ptr->itup;
334 
335  rootpg.block.blkno = GIST_ROOT_BLKNO;
336  rootpg.block.num = ndownlinks;
337  rootpg.list = gistfillitupvec(downlinks, ndownlinks,
338  &(rootpg.lenlist));
339  rootpg.itup = NULL;
340 
341  rootpg.next = dist;
342  dist = &rootpg;
343  }
344  else
345  {
346  /* Prepare split-info to be returned to caller */
347  for (ptr = dist; ptr; ptr = ptr->next)
348  {
349  GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo));
350 
351  si->buf = ptr->buffer;
352  si->downlink = ptr->itup;
353  *splitinfo = lappend(*splitinfo, si);
354  }
355  }
356 
357  /*
358  * Fill all pages. All the pages are new, ie. freshly allocated empty
359  * pages, or a temporary copy of the old page.
360  */
361  for (ptr = dist; ptr; ptr = ptr->next)
362  {
363  char *data = (char *) (ptr->list);
364 
365  for (i = 0; i < ptr->block.num; i++)
366  {
367  IndexTuple thistup = (IndexTuple) data;
368 
369  if (PageAddItem(ptr->page, (Item) data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
370  elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(rel));
371 
372  /*
373  * If this is the first inserted/updated tuple, let the caller
374  * know which page it landed on.
375  */
376  if (newblkno && ItemPointerEquals(&thistup->t_tid, &(*itup)->t_tid))
377  *newblkno = ptr->block.blkno;
378 
379  data += IndexTupleSize(thistup);
380  }
381 
382  /* Set up rightlinks */
383  if (ptr->next && ptr->block.blkno != GIST_ROOT_BLKNO)
384  GistPageGetOpaque(ptr->page)->rightlink =
385  ptr->next->block.blkno;
386  else
387  GistPageGetOpaque(ptr->page)->rightlink = oldrlink;
388 
389  /*
390  * Mark the all but the right-most page with the follow-right
391  * flag. It will be cleared as soon as the downlink is inserted
392  * into the parent, but this ensures that if we error out before
393  * that, the index is still consistent. (in buffering build mode,
394  * any error will abort the index build anyway, so this is not
395  * needed.)
396  */
397  if (ptr->next && !is_rootsplit && markfollowright)
398  GistMarkFollowRight(ptr->page);
399  else
400  GistClearFollowRight(ptr->page);
401 
402  /*
403  * Copy the NSN of the original page to all pages. The
404  * F_FOLLOW_RIGHT flags ensure that scans will follow the
405  * rightlinks until the downlinks are inserted.
406  */
407  GistPageSetNSN(ptr->page, oldnsn);
408  }
409 
410  /*
411  * gistXLogSplit() needs to WAL log a lot of pages, prepare WAL
412  * insertion for that. NB: The number of pages and data segments
413  * specified here must match the calculations in gistXLogSplit()!
414  */
415  if (RelationNeedsWAL(rel))
416  XLogEnsureRecordSpace(npage, 1 + npage * 2);
417 
418  START_CRIT_SECTION();
419 
420  /*
421  * Must mark buffers dirty before XLogInsert, even though we'll still
422  * be changing their opaque fields below.
423  */
424  for (ptr = dist; ptr; ptr = ptr->next)
425  MarkBufferDirty(ptr->buffer);
426  if (BufferIsValid(leftchildbuf))
427  MarkBufferDirty(leftchildbuf);
428 
429  /*
430  * The first page in the chain was a temporary working copy meant to
431  * replace the old page. Copy it over the old page.
432  */
433  PageRestoreTempPage(dist->page, BufferGetPage(dist->buffer));
434  dist->page = BufferGetPage(dist->buffer);
435 
436  /* Write the WAL record */
437  if (RelationNeedsWAL(rel))
438  recptr = gistXLogSplit(rel->rd_node, blkno, is_leaf,
439  dist, oldrlink, oldnsn, leftchildbuf,
440  markfollowright);
441  else
442  recptr = gistGetFakeLSN(rel);
443 
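 /*
  * Stamp every page of the split with the LSN of the WAL record (or with
  * the fake LSN obtained above for a relation that doesn't need WAL).
  */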
444  for (ptr = dist; ptr; ptr = ptr->next)
445  {
446  PageSetLSN(ptr->page, recptr);
447  }
448 
449  /*
450  * Return the new child buffers to the caller.
451  *
452  * If this was a root split, we've already inserted the downlink
453  * pointers, in the form of a new root page. Therefore we can release
454  * all the new buffers, and keep just the root page locked.
455  */
456  if (is_rootsplit)
457  {
458  for (ptr = dist->next; ptr; ptr = ptr->next)
459  UnlockReleaseBuffer(ptr->buffer);
460  }
461  }
462  else
463  {
464  /*
465  * Enough space. We also get here if ntuples==0.
466  */
467  START_CRIT_SECTION();
468 
469  /*
470  * Although we delete only one tuple at a time here, it is fine to mix
471  * PageIndexTupleDelete() here with PageIndexMultiDelete() in
472  * gistRedoPageUpdateRecord().
473  */
474  if (OffsetNumberIsValid(oldoffnum))
475  PageIndexTupleDelete(page, oldoffnum);
476  gistfillbuffer(page, itup, ntup, InvalidOffsetNumber);
477 
478  MarkBufferDirty(buffer);
479 
480  if (BufferIsValid(leftchildbuf))
481  MarkBufferDirty(leftchildbuf);
482 
483  if (RelationNeedsWAL(rel))
484  {
485  OffsetNumber ndeloffs = 0,
486  deloffs[1];
487 
488  if (OffsetNumberIsValid(oldoffnum))
489  {
490  deloffs[0] = oldoffnum;
491  ndeloffs = 1;
492  }
493 
494  recptr = gistXLogUpdate(rel->rd_node, buffer,
495  deloffs, ndeloffs, itup, ntup,
496  leftchildbuf);
497 
498  PageSetLSN(page, recptr);
499  }
500  else
501  {
502  recptr = gistGetFakeLSN(rel);
503  PageSetLSN(page, recptr);
504  }
505 
506  if (newblkno)
507  *newblkno = blkno;
508  }
509 
510  /*
511  * If we inserted the downlink for a child page, set NSN and clear
512  * F_FOLLOW_RIGHT flag on the left child, so that concurrent scans know to
513  * follow the rightlink if and only if they looked at the parent page
514  * before we inserted the downlink.
515  *
516  * Note that we do this *after* writing the WAL record. That means that
517  * the possible full page image in the WAL record does not include these
518  * changes, and they must be replayed even if the page is restored from
519  * the full page image. There's a chicken-and-egg problem: if we updated
520  * the child pages first, we wouldn't know the recptr of the WAL record
521  * we're about to write.
522  */
523  if (BufferIsValid(leftchildbuf))
524  {
525  Page leftpg = BufferGetPage(leftchildbuf);
526 
527  GistPageSetNSN(leftpg, recptr);
528  GistClearFollowRight(leftpg);
529 
530  PageSetLSN(leftpg, recptr);
531  }
532 
533  END_CRIT_SECTION();
534 
535  return is_split;
536 }
537 
538 /*
539  * Workhorse routine for doing insertion into a GiST index. Note that
540  * this routine assumes it is invoked in a short-lived memory context,
541  * so it does not bother releasing palloc'd allocations.
542  */
543 void
544 gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
545 {
546  ItemId iid;
547  IndexTuple idxtuple;
548  GISTInsertStack firststack;
549  GISTInsertStack *stack;
550  GISTInsertState state;
551  bool xlocked = false;
552 
553  memset(&state, 0, sizeof(GISTInsertState));
554  state.freespace = freespace;
555  state.r = r;
556 
557  /* Start from the root */
558  firststack.blkno = GIST_ROOT_BLKNO;
559  firststack.lsn = 0;
560  firststack.parent = NULL;
561  firststack.downlinkoffnum = InvalidOffsetNumber;
562  state.stack = stack = &firststack;
563 
564  /*
565  * Walk down along the path of smallest penalty, updating the parent
566  * pointers with the key we're inserting as we go. If we crash in the
567  * middle, the tree is consistent, although the possible parent updates
568  * were a waste.
569  */
570  for (;;)
571  {
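 /*
  * stack->lsn is still zero only for entries we haven't visited yet; read
  * their buffer here. When we loop back to an entry we visited before, we
  * already hold a pin on its buffer from that earlier visit.
  */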
572  if (XLogRecPtrIsInvalid(stack->lsn))
573  stack->buffer = ReadBuffer(state.r, stack->blkno);
574 
575  /*
576  * Be optimistic and grab shared lock first. Swap it for an exclusive
577  * lock later if we need to update the page.
578  */
579  if (!xlocked)
580  {
581  LockBuffer(stack->buffer, GIST_SHARE);
582  gistcheckpage(state.r, stack->buffer);
583  }
584 
585  stack->page = (Page) BufferGetPage(stack->buffer);
586  stack->lsn = PageGetLSN(stack->page);
587  Assert(!RelationNeedsWAL(state.r) || !XLogRecPtrIsInvalid(stack->lsn));
588 
589  /*
590  * If this page was split but the downlink was never inserted to the
591  * parent because the inserting backend crashed before doing that, fix
592  * that now.
593  */
594  if (GistFollowRight(stack->page))
595  {
596  if (!xlocked)
597  {
598  LockBuffer(stack->buffer, GIST_UNLOCK);
599  LockBuffer(stack->buffer, GIST_EXCLUSIVE);
600  xlocked = true;
601  /* someone might've completed the split when we unlocked */
602  if (!GistFollowRight(stack->page))
603  continue;
604  }
605  gistfixsplit(&state, giststate);
606 
607  UnlockReleaseBuffer(stack->buffer);
608  xlocked = false;
609  state.stack = stack = stack->parent;
610  continue;
611  }
612 
613  if (stack->blkno != GIST_ROOT_BLKNO &&
614  stack->parent->lsn < GistPageGetNSN(stack->page))
615  {
616  /*
617  * Concurrent split detected. There's no guarantee that the
618  * downlink for this page is consistent with the tuple we're
619  * inserting anymore, so go back to parent and rechoose the best
620  * child.
621  */
622  UnlockReleaseBuffer(stack->buffer);
623  xlocked = false;
624  state.stack = stack = stack->parent;
625  continue;
626  }
627 
628  if (!GistPageIsLeaf(stack->page))
629  {
630  /*
631  * This is an internal page so continue to walk down the tree.
632  * Find the child node that has the minimum insertion penalty.
633  */
634  BlockNumber childblkno;
635  IndexTuple newtup;
636  GISTInsertStack *item;
637  OffsetNumber downlinkoffnum;
638 
639  downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate);
640  iid = PageGetItemId(stack->page, downlinkoffnum);
641  idxtuple = (IndexTuple) PageGetItem(stack->page, iid);
642  childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
643 
644  /*
645  * Check that it's not a leftover invalid tuple from pre-9.1
646  */
647  if (GistTupleIsInvalid(idxtuple))
648  ereport(ERROR,
649  (errmsg("index \"%s\" contains an inner tuple marked as invalid",
650  RelationGetRelationName(state.r)),
651  errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."),
652  errhint("Please REINDEX it.")));
653 
654  /*
655  * Check that the key representing the target child node is
656  * consistent with the key we're inserting. Update it if it's not.
657  */
658  newtup = gistgetadjusted(state.r, idxtuple, itup, giststate);
659  if (newtup)
660  {
661  /*
662  * Swap shared lock for an exclusive one. Beware, the page may
663  * change while we unlock/lock the page...
664  */
665  if (!xlocked)
666  {
667  LockBuffer(stack->buffer, GIST_UNLOCK);
668  LockBuffer(stack->buffer, GIST_EXCLUSIVE);
669  xlocked = true;
670  stack->page = (Page) BufferGetPage(stack->buffer);
671 
672  if (PageGetLSN(stack->page) != stack->lsn)
673  {
674  /* the page was changed while we unlocked it, retry */
675  continue;
676  }
677  }
678 
679  /*
680  * Update the tuple.
681  *
682  * We still hold the lock after gistinserttuple(), but it
683  * might have to split the page to make the updated tuple fit.
684  * In that case the updated tuple might migrate to the other
685  * half of the split, so we have to go back to the parent and
686  * descend back to the half that's a better fit for the new
687  * tuple.
688  */
689  if (gistinserttuple(&state, stack, giststate, newtup,
690  downlinkoffnum))
691  {
692  /*
693  * If this was a root split, the root page continues to be
694  * the parent and the updated tuple went to one of the
695  * child pages, so we just need to retry from the root
696  * page.
697  */
698  if (stack->blkno != GIST_ROOT_BLKNO)
699  {
700  UnlockReleaseBuffer(stack->buffer);
701  xlocked = false;
702  state.stack = stack = stack->parent;
703  }
704  continue;
705  }
706  }
707  LockBuffer(stack->buffer, GIST_UNLOCK);
708  xlocked = false;
709 
710  /* descend to the chosen child */
711  item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
712  item->blkno = childblkno;
713  item->parent = stack;
714  item->downlinkoffnum = downlinkoffnum;
715  state.stack = stack = item;
716  }
717  else
718  {
719  /*
720  * Leaf page. Insert the new key. We've already updated all the
721  * parents on the way down, but we might have to split the page if
722  * it doesn't fit. gistinserthere() will take care of that.
723  */
724 
725  /*
726  * Swap shared lock for an exclusive one. Be careful, the page may
727  * change while we unlock/lock the page...
728  */
729  if (!xlocked)
730  {
731  LockBuffer(stack->buffer, GIST_UNLOCK);
732  LockBuffer(stack->buffer, GIST_EXCLUSIVE);
733  xlocked = true;
734  stack->page = (Page) BufferGetPage(stack->buffer);
735  stack->lsn = PageGetLSN(stack->page);
736 
737  if (stack->blkno == GIST_ROOT_BLKNO)
738  {
739  /*
740  * the only page that can become inner instead of leaf is
741  * the root page, so for root we should recheck it
742  */
743  if (!GistPageIsLeaf(stack->page))
744  {
745  /*
746  * very rare situation: while we were unlocked, the index grew
747  * from a single (root) page to more than one page
748  */
749  LockBuffer(stack->buffer, GIST_UNLOCK);
750  xlocked = false;
751  continue;
752  }
753 
754  /*
755  * we don't need to check root split, because checking
756  * leaf/inner is enough to recognize split for root
757  */
758  }
759  else if (GistFollowRight(stack->page) ||
760  stack->parent->lsn < GistPageGetNSN(stack->page))
761  {
762  /*
763  * The page was split while we momentarily unlocked the
764  * page. Go back to parent.
765  */
766  UnlockReleaseBuffer(stack->buffer);
767  xlocked = false;
768  state.stack = stack = stack->parent;
769  continue;
770  }
771  }
772 
773  /* now state.stack->(page, buffer and blkno) points to leaf page */
774 
775  gistinserttuple(&state, stack, giststate, itup,
776  InvalidOffsetNumber);
777  LockBuffer(stack->buffer, GIST_UNLOCK);
778 
779  /* Release any pins we might still hold before exiting */
780  for (; stack; stack = stack->parent)
781  ReleaseBuffer(stack->buffer);
782  break;
783  }
784  }
785 }
786 
787 /*
788  * Traverse the tree to find path from root page to specified "child" block.
789  *
790  * returns a new insertion stack, starting from the parent of "child", up
791  * to the root. *downlinkoffnum is set to the offset of the downlink in the
792  * direct parent of child.
793  *
794  * To prevent deadlocks, this should lock only one page at a time.
795  */
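/*
 * The search is breadth-first: 'fifo' holds the pages still to be visited,
 * so internal pages are examined level by level; once a leaf page is seen,
 * the search can stop, because the queue can no longer hold internal pages.
 */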
796 static GISTInsertStack *
797 gistFindPath(Relation r, BlockNumber child, OffsetNumber *downlinkoffnum)
798 {
799  Page page;
800  Buffer buffer;
801  OffsetNumber i,
802  maxoff;
803  ItemId iid;
804  IndexTuple idxtuple;
805  List *fifo;
806  GISTInsertStack *top,
807  *ptr;
808  BlockNumber blkno;
809 
810  top = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
811  top->blkno = GIST_ROOT_BLKNO;
812  top->downlinkoffnum = InvalidOffsetNumber;
813 
814  fifo = list_make1(top);
815  while (fifo != NIL)
816  {
817  /* Get next page to visit */
818  top = linitial(fifo);
819  fifo = list_delete_first(fifo);
820 
821  buffer = ReadBuffer(r, top->blkno);
822  LockBuffer(buffer, GIST_SHARE);
823  gistcheckpage(r, buffer);
824  page = (Page) BufferGetPage(buffer);
825 
826  if (GistPageIsLeaf(page))
827  {
828  /*
829  * Because we scan the index top-down, all the rest of the pages
830  * in the queue must be leaf pages as well.
831  */
832  UnlockReleaseBuffer(buffer);
833  break;
834  }
835 
836  top->lsn = PageGetLSN(page);
837 
838  /*
839  * If F_FOLLOW_RIGHT is set, the page to the right doesn't have a
840  * downlink. This should not normally happen..
841  */
842  if (GistFollowRight(page))
843  elog(ERROR, "concurrent GiST page split was incomplete");
844 
845  if (top->parent && top->parent->lsn < GistPageGetNSN(page) &&
846  GistPageGetOpaque(page)->rightlink != InvalidBlockNumber /* sanity check */ )
847  {
848  /*
849  * Page was split while we looked elsewhere. We didn't see the
850  * downlink to the right page when we scanned the parent, so add
851  * it to the queue now.
852  *
853  * Put the right page ahead of the queue, so that we visit it
854  * next. That's important, because if this is the lowest internal
855  * level, just above leaves, we might already have queued up some
856  * leaf pages, and we assume that there can't be any non-leaf
857  * pages behind leaf pages.
858  */
859  ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
860  ptr->blkno = GistPageGetOpaque(page)->rightlink;
861  ptr->downlinkoffnum = InvalidOffsetNumber;
862  ptr->parent = top->parent;
863 
864  fifo = lcons(ptr, fifo);
865  }
866 
867  maxoff = PageGetMaxOffsetNumber(page);
868 
869  for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
870  {
871  iid = PageGetItemId(page, i);
872  idxtuple = (IndexTuple) PageGetItem(page, iid);
873  blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
874  if (blkno == child)
875  {
876  /* Found it! */
877  UnlockReleaseBuffer(buffer);
878  *downlinkoffnum = i;
879  return top;
880  }
881  else
882  {
883  /* Append this child to the list of pages to visit later */
884  ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
885  ptr->blkno = blkno;
886  ptr->downlinkoffnum = i;
887  ptr->parent = top;
888 
889  fifo = lappend(fifo, ptr);
890  }
891  }
892 
893  UnlockReleaseBuffer(buffer);
894  }
895 
896  elog(ERROR, "failed to re-find parent of a page in index \"%s\", block %u",
897  RelationGetRelationName(r), child);
898  return NULL; /* keep compiler quiet */
899 }
900 
901 /*
902  * Updates the stack so that child->parent is the correct parent of the
903  * child. child->parent must be exclusively locked on entry, and will
904  * remain so at exit, but it might not be the same page anymore.
905  */
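/*
 * If the parent page is unchanged (its LSN matches the one we remembered),
 * the cached downlink offset is still valid. Otherwise we first follow the
 * parent's rightlink chain looking for the downlink, and only if that fails
 * do we rebuild the whole stack from the root with gistFindPath().
 */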
906 static void
907 gistFindCorrectParent(Relation r, GISTInsertStack *child)
908 {
909  GISTInsertStack *parent = child->parent;
910 
911  gistcheckpage(r, parent->buffer);
912  parent->page = (Page) BufferGetPage(parent->buffer);
913 
914  /* here we don't need to distinguish between split and page update */
915  if (child->downlinkoffnum == InvalidOffsetNumber ||
916  parent->lsn != PageGetLSN(parent->page))
917  {
918  /* parent has changed, search for the child in the rightlink chain */
919  OffsetNumber i,
920  maxoff;
921  ItemId iid;
922  IndexTuple idxtuple;
923  GISTInsertStack *ptr;
924 
925  while (true)
926  {
927  maxoff = PageGetMaxOffsetNumber(parent->page);
928  for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
929  {
930  iid = PageGetItemId(parent->page, i);
931  idxtuple = (IndexTuple) PageGetItem(parent->page, iid);
932  if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno)
933  {
934  /* yes!!, found */
935  child->downlinkoffnum = i;
936  return;
937  }
938  }
939 
940  parent->blkno = GistPageGetOpaque(parent->page)->rightlink;
941  UnlockReleaseBuffer(parent->buffer);
942  if (parent->blkno == InvalidBlockNumber)
943  {
944  /*
945  * End of chain and still didn't find parent. This is a very rare
946  * situation, possible only when the root was split.
947  */
948  break;
949  }
950  parent->buffer = ReadBuffer(r, parent->blkno);
951  LockBuffer(parent->buffer, GIST_EXCLUSIVE);
952  gistcheckpage(r, parent->buffer);
953  parent->page = (Page) BufferGetPage(parent->buffer);
954  }
955 
956  /*
957  * Awful: we need to search the tree to find the new parent, but
958  * first we must release all the old parent buffers.
959  */
960 
961  ptr = child->parent->parent; /* child->parent already released
962  * above */
963  while (ptr)
964  {
965  ReleaseBuffer(ptr->buffer);
966  ptr = ptr->parent;
967  }
968 
969  /* ok, find new path */
970  ptr = parent = gistFindPath(r, child->blkno, &child->downlinkoffnum);
971 
972  /* read all buffers as expected by caller */
973  /* note we don't lock them or gistcheckpage them here! */
974  while (ptr)
975  {
976  ptr->buffer = ReadBuffer(r, ptr->blkno);
977  ptr->page = (Page) BufferGetPage(ptr->buffer);
978  ptr = ptr->parent;
979  }
980 
981  /* install new chain of parents to stack */
982  child->parent = parent;
983 
984  /* make recursive call to normal processing */
985  LockBuffer(child->parent->buffer, GIST_EXCLUSIVE);
986  gistFindCorrectParent(r, child);
987  }
988 
989  return;
990 }
991 
992 /*
993  * Form a downlink pointer for the page in 'buf'.
994  */
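/*
 * The downlink is built by starting from a copy of the page's first tuple
 * and repeatedly widening it with gistgetadjusted() until it covers every
 * key on the page.
 */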
995 static IndexTuple
996 gistformdownlink(Relation rel, Buffer buf, GISTSTATE *giststate,
997  GISTInsertStack *stack)
998 {
999  Page page = BufferGetPage(buf);
1000  OffsetNumber maxoff;
1001  OffsetNumber offset;
1002  IndexTuple downlink = NULL;
1003 
1004  maxoff = PageGetMaxOffsetNumber(page);
1005  for (offset = FirstOffsetNumber; offset <= maxoff; offset = OffsetNumberNext(offset))
1006  {
1007  IndexTuple ituple = (IndexTuple)
1008  PageGetItem(page, PageGetItemId(page, offset));
1009 
1010  if (downlink == NULL)
1011  downlink = CopyIndexTuple(ituple);
1012  else
1013  {
1014  IndexTuple newdownlink;
1015 
1016  newdownlink = gistgetadjusted(rel, downlink, ituple,
1017  giststate);
1018  if (newdownlink)
1019  downlink = newdownlink;
1020  }
1021  }
1022 
1023  /*
1024  * If the page is completely empty, we can't form a meaningful downlink
1025  * for it. But we have to insert a downlink for the page. Any key will do,
1026  * as long as it's consistent with the downlink of the parent page, so that
1027  * we can legally insert it into the parent. A minimal one that matches as few
1028  * scans as possible would be best, to keep scans from doing useless work,
1029  * but we don't know how to construct that. So we just use the downlink of
1030  * the original page that was split - that's as far from optimal as it can
1031  * get but will do..
1032  */
1033  if (!downlink)
1034  {
1035  ItemId iid;
1036 
1037  LockBuffer(stack->parent->buffer, GIST_EXCLUSIVE);
1038  gistFindCorrectParent(rel, stack);
1039  iid = PageGetItemId(stack->parent->page, stack->downlinkoffnum);
1040  downlink = (IndexTuple) PageGetItem(stack->parent->page, iid);
1041  downlink = CopyIndexTuple(downlink);
1042  LockBuffer(stack->parent->buffer, GIST_UNLOCK);
1043  }
1044 
1045  ItemPointerSetBlockNumber(&(downlink->t_tid), BufferGetBlockNumber(buf));
1046  GistTupleSetValid(downlink);
1047 
1048  return downlink;
1049 }
1050 
1051 
1052 /*
1053  * Complete the incomplete split of state->stack->page.
1054  */
1055 static void
1056 gistfixsplit(GISTInsertState *state, GISTSTATE *giststate)
1057 {
1058  GISTInsertStack *stack = state->stack;
1059  Buffer buf;
1060  Page page;
1061  List *splitinfo = NIL;
1062 
1063  elog(LOG, "fixing incomplete split in index \"%s\", block %u",
1064  RelationGetRelationName(state->r), stack->blkno);
1065 
1066  Assert(GistFollowRight(stack->page));
1067  Assert(GistPageGetOpaque(stack->page)->rightlink != InvalidBlockNumber);
1068 
1069  buf = stack->buffer;
1070 
1071  /*
1072  * Read the chain of split pages, following the rightlinks. Construct a
1073  * downlink tuple for each page.
1074  */
1075  for (;;)
1076  {
1077  GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo));
1078  IndexTuple downlink;
1079 
1080  page = BufferGetPage(buf);
1081 
1082  /* Form the new downlink tuples to insert to parent */
1083  downlink = gistformdownlink(state->r, buf, giststate, stack);
1084 
1085  si->buf = buf;
1086  si->downlink = downlink;
1087 
1088  splitinfo = lappend(splitinfo, si);
1089 
1090  if (GistFollowRight(page))
1091  {
1092  /* lock next page */
1093  buf = ReadBuffer(state->r, GistPageGetOpaque(page)->rightlink);
1094  LockBuffer(buf, GIST_EXCLUSIVE);
1095  }
1096  else
1097  break;
1098  }
1099 
1100  /* Insert the downlinks */
1101  gistfinishsplit(state, stack, giststate, splitinfo, false);
1102 }
1103 
1104 /*
1105  * Insert or replace a tuple in stack->buffer. If 'oldoffnum' is valid, the
1106  * tuple at 'oldoffnum' is replaced, otherwise the tuple is inserted as new.
1107  * 'stack' represents the path from the root to the page being updated.
1108  *
1109  * The caller must hold an exclusive lock on stack->buffer. The lock is still
1110  * held on return, but the page might not contain the inserted tuple if the
1111  * page was split. The function returns true if the page was split, false
1112  * otherwise.
1113  */
1114 static bool
1115 gistinserttuple(GISTInsertState *state, GISTInsertStack *stack,
1116  GISTSTATE *giststate, IndexTuple tuple, OffsetNumber oldoffnum)
1117 {
1118  return gistinserttuples(state, stack, giststate, &tuple, 1, oldoffnum,
1119  InvalidBuffer, InvalidBuffer, false, false);
1120 }
1121 
1122 /* ----------------
1123  * An extended workhorse version of gistinserttuple(). This version allows
1124  * inserting multiple tuples, or replacing a single tuple with multiple tuples.
1125  * This is used to recursively update the downlinks in the parent when a page
1126  * is split.
1127  *
1128  * If leftchild and rightchild are valid, we're inserting/replacing the
1129  * downlink for rightchild, and leftchild is its left sibling. We clear the
1130  * F_FOLLOW_RIGHT flag and update NSN on leftchild, atomically with the
1131  * insertion of the downlink.
1132  *
1133  * To avoid holding locks for longer than necessary, when recursing up the
1134  * tree to update the parents, the locking is a bit peculiar here. On entry,
1135  * the caller must hold an exclusive lock on stack->buffer, as well as
1136  * leftchild and rightchild if given. On return:
1137  *
1138  * - Lock on stack->buffer is released, if 'unlockbuf' is true. The page is
1139  * always kept pinned, however.
1140  * - Lock on 'leftchild' is released, if 'unlockleftchild' is true. The page
1141  * is kept pinned.
1142  * - Lock and pin on 'rightchild' are always released.
1143  *
1144  * Returns 'true' if the page had to be split. Note that if the page was
1145  * split, the inserted/updated tuples might've been inserted to a right
1146  * sibling of stack->buffer instead of stack->buffer itself.
1147  */
1148 static bool
1149 gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
1150  GISTSTATE *giststate,
1151  IndexTuple *tuples, int ntup, OffsetNumber oldoffnum,
1152  Buffer leftchild, Buffer rightchild,
1153  bool unlockbuf, bool unlockleftchild)
1154 {
1155  List *splitinfo;
1156  bool is_split;
1157 
1158  /* Insert the tuple(s) to the page, splitting the page if necessary */
1159  is_split = gistplacetopage(state->r, state->freespace, giststate,
1160  stack->buffer,
1161  tuples, ntup,
1162  oldoffnum, NULL,
1163  leftchild,
1164  &splitinfo,
1165  true);
1166 
1167  /*
1168  * Before recursing up in case the page was split, release locks on the
1169  * child pages. We don't need to keep them locked when updating the
1170  * parent.
1171  */
1172  if (BufferIsValid(rightchild))
1173  UnlockReleaseBuffer(rightchild);
1174  if (BufferIsValid(leftchild) && unlockleftchild)
1175  LockBuffer(leftchild, GIST_UNLOCK);
1176 
1177  /*
1178  * If we had to split, insert/update the downlinks in the parent. If the
1179  * caller requested us to release the lock on stack->buffer, tell
1180  * gistfinishsplit() to do that as soon as it's safe to do so. If we
1181  * didn't have to split, release it ourselves.
1182  */
1183  if (splitinfo)
1184  gistfinishsplit(state, stack, giststate, splitinfo, unlockbuf);
1185  else if (unlockbuf)
1186  LockBuffer(stack->buffer, GIST_UNLOCK);
1187 
1188  return is_split;
1189 }
1190 
1191 /*
1192  * Finish an incomplete split by inserting/updating the downlinks in parent
1193  * page. 'splitinfo' contains all the child pages involved in the split,
1194  * from left-to-right.
1195  *
1196  * On entry, the caller must hold a lock on stack->buffer and all the child
1197  * pages in 'splitinfo'. If 'unlockbuf' is true, the lock on stack->buffer is
1198  * released on return. The child pages are always unlocked and unpinned.
1199  */
1200 static void
1201 gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack,
1202  GISTSTATE *giststate, List *splitinfo, bool unlockbuf)
1203 {
1204  ListCell *lc;
1205  List *reversed;
1206  GISTPageSplitInfo *right;
1207  GISTPageSplitInfo *left;
1208  IndexTuple tuples[2];
1209 
1210  /* A split always contains at least two halves */
1211  Assert(list_length(splitinfo) >= 2);
1212 
1213  /*
1214  * We need to insert downlinks for each new page, and update the downlink
1215  * for the original (leftmost) page in the split. Begin at the rightmost
1216  * page, inserting one downlink at a time until there's only two pages
1217  * left. Finally insert the downlink for the last new page and update the
1218  * downlink for the original page as one operation.
1219  */
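 /*
  * For example, if the page was split into A (the original), B and C,
  * 'splitinfo' is [A, B, C]: we first insert the downlink for C, and then
  * insert B's downlink and update A's downlink in a single operation.
  */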
1220 
1221  /* for convenience, create a copy of the list in reverse order */
1222  reversed = NIL;
1223  foreach(lc, splitinfo)
1224  {
1225  reversed = lcons(lfirst(lc), reversed);
1226  }
1227 
1228  LockBuffer(stack->parent->buffer, GIST_EXCLUSIVE);
1229  gistFindCorrectParent(state->r, stack);
1230 
1231  /*
1232  * insert downlinks for the siblings from right to left, until there are
1233  * only two siblings left.
1234  */
1235  while (list_length(reversed) > 2)
1236  {
1237  right = (GISTPageSplitInfo *) linitial(reversed);
1238  left = (GISTPageSplitInfo *) lsecond(reversed);
1239 
1240  if (gistinserttuples(state, stack->parent, giststate,
1241  &right->downlink, 1,
1242  InvalidOffsetNumber,
1243  left->buf, right->buf, false, false))
1244  {
1245  /*
1246  * If the parent page was split, need to relocate the original
1247  * parent pointer.
1248  */
1249  gistFindCorrectParent(state->r, stack);
1250  }
1251  /* gistinserttuples() released the lock on right->buf. */
1252  reversed = list_delete_first(reversed);
1253  }
1254 
1255  right = (GISTPageSplitInfo *) linitial(reversed);
1256  left = (GISTPageSplitInfo *) lsecond(reversed);
1257 
1258  /*
1259  * Finally insert downlink for the remaining right page and update the
1260  * downlink for the original page to not contain the tuples that were
1261  * moved to the new pages.
1262  */
1263  tuples[0] = left->downlink;
1264  tuples[1] = right->downlink;
1265  gistinserttuples(state, stack->parent, giststate,
1266  tuples, 2,
1267  stack->downlinkoffnum,
1268  left->buf, right->buf,
1269  true, /* Unlock parent */
1270  unlockbuf /* Unlock stack->buffer if caller wants that */
1271  );
1272  Assert(left->buf == stack->buffer);
1273 }
1274 
1275 /*
1276  * gistSplit -- split a page in the tree and fill the structs used for
1277  * XLOG and the actual writes of buffers. The function is recursive, i.e.
1278  * it keeps splitting until the keys fit on every page.
1279  */
1280 SplitedPageLayout *
1281 gistSplit(Relation r,
1282  Page page,
1283  IndexTuple *itup, /* contains compressed entry */
1284  int len,
1285  GISTSTATE *giststate)
1286 {
1287  IndexTuple *lvectup,
1288  *rvectup;
1289  GistSplitVector v;
1290  int i;
1291  SplitedPageLayout *res = NULL;
1292 
1293  /* this should never recurse very deeply, but better safe than sorry */
1294  check_stack_depth();
1295 
1296  /* there's no point in splitting an empty page */
1297  Assert(len > 0);
1298 
1299  /*
1300  * If a single tuple doesn't fit on a page, no amount of splitting will
1301  * help.
1302  */
1303  if (len == 1)
1304  ereport(ERROR,
1305  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1306  errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
1307  IndexTupleSize(itup[0]), GiSTPageSize,
1308  RelationGetRelationName(r))));
1309 
1310  memset(v.spl_lisnull, TRUE, sizeof(bool) * giststate->tupdesc->natts);
1311  memset(v.spl_risnull, TRUE, sizeof(bool) * giststate->tupdesc->natts);
1312  gistSplitByKey(r, page, itup, len, giststate, &v, 0);
1313 
1314  /* form left and right vector */
1315  lvectup = (IndexTuple *) palloc(sizeof(IndexTuple) * (len + 1));
1316  rvectup = (IndexTuple *) palloc(sizeof(IndexTuple) * (len + 1));
1317 
1318  for (i = 0; i < v.splitVector.spl_nleft; i++)
1319  lvectup[i] = itup[v.splitVector.spl_left[i] - 1];
1320 
1321  for (i = 0; i < v.splitVector.spl_nright; i++)
1322  rvectup[i] = itup[v.splitVector.spl_right[i] - 1];
1323 
1324  /* finalize splitting (may need another split) */
1325  if (!gistfitpage(rvectup, v.splitVector.spl_nright))
1326  {
1327  res = gistSplit(r, page, rvectup, v.splitVector.spl_nright, giststate);
1328  }
1329  else
1330  {
1331  ROTATEDIST(res);
1332  res->block.num = v.splitVector.spl_nright;
1333  res->list = gistfillitupvec(rvectup, v.splitVector.spl_nright, &(res->lenlist));
1334  res->itup = gistFormTuple(giststate, r, v.spl_rattr, v.spl_risnull, false);
1335  }
1336 
1337  if (!gistfitpage(lvectup, v.splitVector.spl_nleft))
1338  {
1339  SplitedPageLayout *resptr,
1340  *subres;
1341 
1342  resptr = subres = gistSplit(r, page, lvectup, v.splitVector.spl_nleft, giststate);
1343 
1344  /* install on list's tail */
1345  while (resptr->next)
1346  resptr = resptr->next;
1347 
1348  resptr->next = res;
1349  res = subres;
1350  }
1351  else
1352  {
1353  ROTATEDIST(res);
1354  res->block.num = v.splitVector.spl_nleft;
1355  res->list = gistfillitupvec(lvectup, v.splitVector.spl_nleft, &(res->lenlist));
1356  res->itup = gistFormTuple(giststate, r, v.spl_lattr, v.spl_lisnull, false);
1357  }
1358 
1359  return res;
1360 }
1361 
1362 /*
1363  * Create a GISTSTATE and fill it with information about the index
1364  */
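/*
 * The returned GISTSTATE, and everything reachable from it, lives in its own
 * memory context (scanCxt), so it can be freed with a single call to
 * freeGISTstate().
 */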
1365 GISTSTATE *
1366 initGISTstate(Relation index)
1367 {
1368  GISTSTATE *giststate;
1369  MemoryContext scanCxt;
1370  MemoryContext oldCxt;
1371  int i;
1372 
1373  /* safety check to protect fixed-size arrays in GISTSTATE */
1374  if (index->rd_att->natts > INDEX_MAX_KEYS)
1375  elog(ERROR, "numberOfAttributes %d > %d",
1376  index->rd_att->natts, INDEX_MAX_KEYS);
1377 
1378  /* Create the memory context that will hold the GISTSTATE */
1379  scanCxt = AllocSetContextCreate(CurrentMemoryContext,
1380  "GiST scan context",
1381  ALLOCSET_DEFAULT_MINSIZE,
1382  ALLOCSET_DEFAULT_INITSIZE,
1383  ALLOCSET_DEFAULT_MAXSIZE);
1384  oldCxt = MemoryContextSwitchTo(scanCxt);
1385 
1386  /* Create and fill in the GISTSTATE */
1387  giststate = (GISTSTATE *) palloc(sizeof(GISTSTATE));
1388 
1389  giststate->scanCxt = scanCxt;
1390  giststate->tempCxt = scanCxt; /* caller must change this if needed */
1391  giststate->tupdesc = index->rd_att;
1392 
1393  for (i = 0; i < index->rd_att->natts; i++)
1394  {
1395  fmgr_info_copy(&(giststate->consistentFn[i]),
1396  index_getprocinfo(index, i + 1, GIST_CONSISTENT_PROC),
1397  scanCxt);
1398  fmgr_info_copy(&(giststate->unionFn[i]),
1399  index_getprocinfo(index, i + 1, GIST_UNION_PROC),
1400  scanCxt);
1401  fmgr_info_copy(&(giststate->compressFn[i]),
1402  index_getprocinfo(index, i + 1, GIST_COMPRESS_PROC),
1403  scanCxt);
1404  fmgr_info_copy(&(giststate->decompressFn[i]),
1405  index_getprocinfo(index, i + 1, GIST_DECOMPRESS_PROC),
1406  scanCxt);
1407  fmgr_info_copy(&(giststate->penaltyFn[i]),
1408  index_getprocinfo(index, i + 1, GIST_PENALTY_PROC),
1409  scanCxt);
1410  fmgr_info_copy(&(giststate->picksplitFn[i]),
1411  index_getprocinfo(index, i + 1, GIST_PICKSPLIT_PROC),
1412  scanCxt);
1413  fmgr_info_copy(&(giststate->equalFn[i]),
1414  index_getprocinfo(index, i + 1, GIST_EQUAL_PROC),
1415  scanCxt);
1416  /* opclasses are not required to provide a Distance method */
1417  if (OidIsValid(index_getprocid(index, i + 1, GIST_DISTANCE_PROC)))
1418  fmgr_info_copy(&(giststate->distanceFn[i]),
1419  index_getprocinfo(index, i + 1, GIST_DISTANCE_PROC),
1420  scanCxt);
1421  else
1422  giststate->distanceFn[i].fn_oid = InvalidOid;
1423 
1424  /* opclasses are not required to provide a Fetch method */
1425  if (OidIsValid(index_getprocid(index, i + 1, GIST_FETCH_PROC)))
1426  fmgr_info_copy(&(giststate->fetchFn[i]),
1427  index_getprocinfo(index, i + 1, GIST_FETCH_PROC),
1428  scanCxt);
1429  else
1430  giststate->fetchFn[i].fn_oid = InvalidOid;
1431 
1432  /*
1433  * If the index column has a specified collation, we should honor that
1434  * while doing comparisons. However, we may have a collatable storage
1435  * type for a noncollatable indexed data type. If there's no index
1436  * collation then specify default collation in case the support
1437  * functions need collation. This is harmless if the support
1438  * functions don't care about collation, so we just do it
1439  * unconditionally. (We could alternatively call get_typcollation,
1440  * but that seems like expensive overkill --- there aren't going to be
1441  * any cases where a GiST storage type has a nondefault collation.)
1442  */
1443  if (OidIsValid(index->rd_indcollation[i]))
1444  giststate->supportCollation[i] = index->rd_indcollation[i];
1445  else
1446  giststate->supportCollation[i] = DEFAULT_COLLATION_OID;
1447  }
1448 
1449  MemoryContextSwitchTo(oldCxt);
1450 
1451  return giststate;
1452 }
1453 
1454 void
1455 freeGISTstate(GISTSTATE *giststate)
1456 {
1457  /* It's sufficient to delete the scanCxt */
1458  MemoryContextDelete(giststate->scanCxt);
1459 }
1460 
1461 /*
1462  * gistvacuumpage() -- try to remove LP_DEAD items from the given page.
1463  * Function assumes that buffer is exclusively locked.
1464  */
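/*
 * This is an opportunistic cleanup: gistplacetopage() calls it when a full
 * leaf page is flagged F_HAS_GARBAGE, hoping to make enough room to avoid a
 * page split.
 */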
1465 static void
1466 gistvacuumpage(Relation rel, Page page, Buffer buffer)
1467 {
1468  OffsetNumber deletable[MaxIndexTuplesPerPage];
1469  int ndeletable = 0;
1470  OffsetNumber offnum, maxoff;
1471 
1472  Assert(GistPageIsLeaf(page));
1473 
1474  /*
1475  * Scan over all items to see which ones need to be deleted according to
1476  * LP_DEAD flags.
1477  */
1478  maxoff = PageGetMaxOffsetNumber(page);
1479  for (offnum = FirstOffsetNumber;
1480  offnum <= maxoff;
1481  offnum = OffsetNumberNext(offnum))
1482  {
1483  ItemId itemId = PageGetItemId(page, offnum);
1484 
1485  if (ItemIdIsDead(itemId))
1486  deletable[ndeletable++] = offnum;
1487  }
1488 
1489  if (ndeletable > 0)
1490  {
1491  START_CRIT_SECTION();
1492 
1493  PageIndexMultiDelete(page, deletable, ndeletable);
1494 
1495  /*
1496  * Mark the page as not containing any LP_DEAD items. This is not
1497  * necessarily true (there might be some that have recently been marked,
1498  * but weren't included in our target-item list), but it will almost
1499  * always be true and it doesn't seem worth an additional page scan to
1500  * check it. Remember that F_HAS_GARBAGE is only a hint anyway.
1501  */
1502  GistClearPageHasGarbage(page);
1503 
1504  MarkBufferDirty(buffer);
1505 
1506  /* XLOG stuff */
1507  if (RelationNeedsWAL(rel))
1508  {
1509  XLogRecPtr recptr;
1510 
1511  recptr = gistXLogUpdate(rel->rd_node, buffer,
1512  deletable, ndeletable,
1513  NULL, 0, InvalidBuffer);
1514 
1515  PageSetLSN(page, recptr);
1516  }
1517  else
1518  PageSetLSN(page, gistGetFakeLSN(rel));
1519 
1520  END_CRIT_SECTION();
1521  }
1522 
1523  /*
1524  * Note: if we didn't find any LP_DEAD items, then the page's
1525  * F_HAS_GARBAGE hint bit is falsely set. We do not bother expending a
1526  * separate write to clear it, however. We will clear it when we split
1527  * the page.
1528  */
1529 }