PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
gist.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * gist.c
4  * interface routines for the postgres GiST index access method.
5  *
6  *
7  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  * src/backend/access/gist/gist.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include "access/gist_private.h"
18 #include "access/gistscan.h"
19 #include "catalog/pg_collation.h"
20 #include "miscadmin.h"
21 #include "utils/index_selfuncs.h"
22 #include "utils/memutils.h"
23 #include "utils/rel.h"
24 
25 
26 /* non-export function prototypes */
27 static void gistfixsplit(GISTInsertState *state, GISTSTATE *giststate);
29  GISTSTATE *giststate, IndexTuple tuple, OffsetNumber oldoffnum);
31  GISTSTATE *giststate,
32  IndexTuple *tuples, int ntup, OffsetNumber oldoffnum,
34  bool unlockbuf, bool unlockleftchild);
36  GISTSTATE *giststate, List *splitinfo, bool releasebuf);
37 static void gistvacuumpage(Relation rel, Page page, Buffer buffer);
38 
39 
/*
 * ROTATEDIST(d): prepend a freshly palloc'd, zero-filled SplitedPageLayout
 * node to the head of list 'd'.  The new node's block number and buffer are
 * set to their Invalid sentinels until the caller assigns real ones.
 */
#define ROTATEDIST(d) do { \
	SplitedPageLayout *tmp=(SplitedPageLayout*)palloc(sizeof(SplitedPageLayout)); \
	memset(tmp,0,sizeof(SplitedPageLayout)); \
	tmp->block.blkno = InvalidBlockNumber; \
	tmp->buffer = InvalidBuffer; \
	tmp->next = (d); \
	(d)=tmp; \
} while(0)
48 
49 
50 /*
51  * GiST handler function: return IndexAmRoutine with access method parameters
52  * and callbacks.
53  */
54 Datum
56 {
58 
59  amroutine->amstrategies = 0;
60  amroutine->amsupport = GISTNProcs;
61  amroutine->amcanorder = false;
62  amroutine->amcanorderbyop = true;
63  amroutine->amcanbackward = false;
64  amroutine->amcanunique = false;
65  amroutine->amcanmulticol = true;
66  amroutine->amoptionalkey = true;
67  amroutine->amsearcharray = false;
68  amroutine->amsearchnulls = true;
69  amroutine->amstorage = true;
70  amroutine->amclusterable = true;
71  amroutine->ampredlocks = false;
72  amroutine->amkeytype = InvalidOid;
73 
74  amroutine->ambuild = gistbuild;
75  amroutine->ambuildempty = gistbuildempty;
76  amroutine->aminsert = gistinsert;
77  amroutine->ambulkdelete = gistbulkdelete;
79  amroutine->amcanreturn = gistcanreturn;
80  amroutine->amcostestimate = gistcostestimate;
81  amroutine->amoptions = gistoptions;
82  amroutine->amvalidate = gistvalidate;
83  amroutine->ambeginscan = gistbeginscan;
84  amroutine->amrescan = gistrescan;
85  amroutine->amgettuple = gistgettuple;
86  amroutine->amgetbitmap = gistgetbitmap;
87  amroutine->amendscan = gistendscan;
88  amroutine->ammarkpos = NULL;
89  amroutine->amrestrpos = NULL;
90 
91  PG_RETURN_POINTER(amroutine);
92 }
93 
94 /*
95  * Create and return a temporary memory context for use by GiST. We
96  * _always_ invoke user-provided methods in a temporary memory
97  * context, so that memory leaks in those functions cannot cause
98  * problems. Also, we use some additional temporary contexts in the
99  * GiST code itself, to avoid the need to do some awkward manual
100  * memory management.
101  */
104 {
106  "GiST temporary context",
110 }
111 
112 /*
113  * gistbuildempty() -- build an empty gist index in the initialization fork
114  */
115 void
117 {
118  Buffer buffer;
119 
120  /* Initialize the root page */
123 
124  /* Initialize and xlog buffer */
126  GISTInitBuffer(buffer, F_LEAF);
127  MarkBufferDirty(buffer);
128  log_newpage_buffer(buffer, true);
130 
131  /* Unlock and release the buffer */
132  UnlockReleaseBuffer(buffer);
133 }
134 
135 /*
136  * gistinsert -- wrapper for GiST tuple insertion.
137  *
138  * This is the public interface routine for tuple insertion in GiSTs.
139  * It doesn't do any work; just locks the relation and passes the buck.
140  */
141 bool
142 gistinsert(Relation r, Datum *values, bool *isnull,
143  ItemPointer ht_ctid, Relation heapRel,
144  IndexUniqueCheck checkUnique)
145 {
146  IndexTuple itup;
147  GISTSTATE *giststate;
148  MemoryContext oldCxt;
149 
150  giststate = initGISTstate(r);
151 
152  /*
153  * We use the giststate's scan context as temp context too. This means
154  * that any memory leaked by the support functions is not reclaimed until
155  * end of insert. In most cases, we aren't going to call the support
156  * functions very many times before finishing the insert, so this seems
157  * cheaper than resetting a temp context for each function call.
158  */
159  oldCxt = MemoryContextSwitchTo(giststate->tempCxt);
160 
161  itup = gistFormTuple(giststate, r,
162  values, isnull, true /* size is currently bogus */ );
163  itup->t_tid = *ht_ctid;
164 
165  gistdoinsert(r, itup, 0, giststate);
166 
167  /* cleanup */
168  MemoryContextSwitchTo(oldCxt);
169  freeGISTstate(giststate);
170 
171  return false;
172 }
173 
174 
175 /*
176  * Place tuples from 'itup' to 'buffer'. If 'oldoffnum' is valid, the tuple
177  * at that offset is atomically removed along with inserting the new tuples.
178  * This is used to replace a tuple with a new one.
179  *
180  * If 'leftchildbuf' is valid, we're inserting the downlink for the page
181  * to the right of 'leftchildbuf', or updating the downlink for 'leftchildbuf'.
182  * F_FOLLOW_RIGHT flag on 'leftchildbuf' is cleared and NSN is set.
183  *
184  * If 'markfollowright' is true and the page is split, the left child is
185  * marked with F_FOLLOW_RIGHT flag. That is the normal case. During buffered
186  * index build, however, there is no concurrent access and the page splitting
187  * is done in a slightly simpler fashion, and false is passed.
188  *
189  * If there is not enough room on the page, it is split. All the split
190  * pages are kept pinned and locked and returned in *splitinfo, the caller
191  * is responsible for inserting the downlinks for them. However, if
192  * 'buffer' is the root page and it needs to be split, gistplacetopage()
193  * performs the split as one atomic operation, and *splitinfo is set to NIL.
194  * In that case, we continue to hold the root page locked, and the child
195  * pages are released; note that new tuple(s) are *not* on the root page
196  * but in one of the new child pages.
197  *
198  * If 'newblkno' is not NULL, returns the block number of page the first
199  * new/updated tuple was inserted to. Usually it's the given page, but could
200  * be its right sibling if the page was split.
201  *
202  * Returns 'true' if the page was split, 'false' otherwise.
203  */
204 bool
205 gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
206  Buffer buffer,
207  IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
208  BlockNumber *newblkno,
209  Buffer leftchildbuf,
210  List **splitinfo,
211  bool markfollowright)
212 {
214  Page page = BufferGetPage(buffer);
215  bool is_leaf = (GistPageIsLeaf(page)) ? true : false;
216  XLogRecPtr recptr;
217  int i;
218  bool is_split;
219 
220  /*
221  * Refuse to modify a page that's incompletely split. This should not
222  * happen because we finish any incomplete splits while we walk down the
223  * tree. However, it's remotely possible that another concurrent inserter
224  * splits a parent page, and errors out before completing the split. We
225  * will just throw an error in that case, and leave any split we had in
226  * progress unfinished too. The next insert that comes along will clean up
227  * the mess.
228  */
229  if (GistFollowRight(page))
230  elog(ERROR, "concurrent GiST page split was incomplete");
231 
232  *splitinfo = NIL;
233 
234  /*
235  * if isupdate, remove old key: This node's key has been modified, either
236  * because a child split occurred or because we needed to adjust our key
237  * for an insert in a child node. Therefore, remove the old version of
238  * this node's key.
239  *
240  * for WAL replay, in the non-split case we handle this by setting up a
241  * one-element todelete array; in the split case, it's handled implicitly
242  * because the tuple vector passed to gistSplit won't include this tuple.
243  */
244  is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
245 
246  /*
247  * If leaf page is full, try at first to delete dead tuples. And then
248  * check again.
249  */
250  if (is_split && GistPageIsLeaf(page) && GistPageHasGarbage(page))
251  {
252  gistvacuumpage(rel, page, buffer);
253  is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
254  }
255 
256  if (is_split)
257  {
258  /* no space for insertion */
259  IndexTuple *itvec;
260  int tlen;
261  SplitedPageLayout *dist = NULL,
262  *ptr;
263  BlockNumber oldrlink = InvalidBlockNumber;
264  GistNSN oldnsn = 0;
265  SplitedPageLayout rootpg;
266  bool is_rootsplit;
267  int npage;
268 
269  is_rootsplit = (blkno == GIST_ROOT_BLKNO);
270 
271  /*
272  * Form index tuples vector to split. If we're replacing an old tuple,
273  * remove the old version from the vector.
274  */
275  itvec = gistextractpage(page, &tlen);
276  if (OffsetNumberIsValid(oldoffnum))
277  {
278  /* on inner page we should remove old tuple */
279  int pos = oldoffnum - FirstOffsetNumber;
280 
281  tlen--;
282  if (pos != tlen)
283  memmove(itvec + pos, itvec + pos + 1, sizeof(IndexTuple) * (tlen - pos));
284  }
285  itvec = gistjoinvector(itvec, &tlen, itup, ntup);
286  dist = gistSplit(rel, page, itvec, tlen, giststate);
287 
288  /*
289  * Check that split didn't produce too many pages.
290  */
291  npage = 0;
292  for (ptr = dist; ptr; ptr = ptr->next)
293  npage++;
294  /* in a root split, we'll add one more page to the list below */
295  if (is_rootsplit)
296  npage++;
297  if (npage > GIST_MAX_SPLIT_PAGES)
298  elog(ERROR, "GiST page split into too many halves (%d, maximum %d)",
299  npage, GIST_MAX_SPLIT_PAGES);
300 
301  /*
302  * Set up pages to work with. Allocate new buffers for all but the
303  * leftmost page. The original page becomes the new leftmost page, and
304  * is just replaced with the new contents.
305  *
306  * For a root-split, allocate new buffers for all child pages, the
307  * original page is overwritten with new root page containing
308  * downlinks to the new child pages.
309  */
310  ptr = dist;
311  if (!is_rootsplit)
312  {
313  /* save old rightlink and NSN */
314  oldrlink = GistPageGetOpaque(page)->rightlink;
315  oldnsn = GistPageGetNSN(page);
316 
317  dist->buffer = buffer;
318  dist->block.blkno = BufferGetBlockNumber(buffer);
320 
321  /* clean all flags except F_LEAF */
322  GistPageGetOpaque(dist->page)->flags = (is_leaf) ? F_LEAF : 0;
323 
324  ptr = ptr->next;
325  }
326  for (; ptr; ptr = ptr->next)
327  {
328  /* Allocate new page */
329  ptr->buffer = gistNewBuffer(rel);
330  GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0);
331  ptr->page = BufferGetPage(ptr->buffer);
332  ptr->block.blkno = BufferGetBlockNumber(ptr->buffer);
333  }
334 
335  /*
336  * Now that we know which blocks the new pages go to, set up downlink
337  * tuples to point to them.
338  */
339  for (ptr = dist; ptr; ptr = ptr->next)
340  {
341  ItemPointerSetBlockNumber(&(ptr->itup->t_tid), ptr->block.blkno);
342  GistTupleSetValid(ptr->itup);
343  }
344 
345  /*
346  * If this is a root split, we construct the new root page with the
347  * downlinks here directly, instead of requiring the caller to insert
348  * them. Add the new root page to the list along with the child pages.
349  */
350  if (is_rootsplit)
351  {
352  IndexTuple *downlinks;
353  int ndownlinks = 0;
354  int i;
355 
356  rootpg.buffer = buffer;
358  GistPageGetOpaque(rootpg.page)->flags = 0;
359 
360  /* Prepare a vector of all the downlinks */
361  for (ptr = dist; ptr; ptr = ptr->next)
362  ndownlinks++;
363  downlinks = palloc(sizeof(IndexTuple) * ndownlinks);
364  for (i = 0, ptr = dist; ptr; ptr = ptr->next)
365  downlinks[i++] = ptr->itup;
366 
367  rootpg.block.blkno = GIST_ROOT_BLKNO;
368  rootpg.block.num = ndownlinks;
369  rootpg.list = gistfillitupvec(downlinks, ndownlinks,
370  &(rootpg.lenlist));
371  rootpg.itup = NULL;
372 
373  rootpg.next = dist;
374  dist = &rootpg;
375  }
376  else
377  {
378  /* Prepare split-info to be returned to caller */
379  for (ptr = dist; ptr; ptr = ptr->next)
380  {
382 
383  si->buf = ptr->buffer;
384  si->downlink = ptr->itup;
385  *splitinfo = lappend(*splitinfo, si);
386  }
387  }
388 
389  /*
390  * Fill all pages. All the pages are new, ie. freshly allocated empty
391  * pages, or a temporary copy of the old page.
392  */
393  for (ptr = dist; ptr; ptr = ptr->next)
394  {
395  char *data = (char *) (ptr->list);
396 
397  for (i = 0; i < ptr->block.num; i++)
398  {
399  IndexTuple thistup = (IndexTuple) data;
400 
401  if (PageAddItem(ptr->page, (Item) data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
402  elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(rel));
403 
404  /*
405  * If this is the first inserted/updated tuple, let the caller
406  * know which page it landed on.
407  */
408  if (newblkno && ItemPointerEquals(&thistup->t_tid, &(*itup)->t_tid))
409  *newblkno = ptr->block.blkno;
410 
411  data += IndexTupleSize(thistup);
412  }
413 
414  /* Set up rightlinks */
415  if (ptr->next && ptr->block.blkno != GIST_ROOT_BLKNO)
416  GistPageGetOpaque(ptr->page)->rightlink =
417  ptr->next->block.blkno;
418  else
419  GistPageGetOpaque(ptr->page)->rightlink = oldrlink;
420 
421  /*
422  * Mark the all but the right-most page with the follow-right
423  * flag. It will be cleared as soon as the downlink is inserted
424  * into the parent, but this ensures that if we error out before
425  * that, the index is still consistent. (in buffering build mode,
426  * any error will abort the index build anyway, so this is not
427  * needed.)
428  */
429  if (ptr->next && !is_rootsplit && markfollowright)
430  GistMarkFollowRight(ptr->page);
431  else
432  GistClearFollowRight(ptr->page);
433 
434  /*
435  * Copy the NSN of the original page to all pages. The
436  * F_FOLLOW_RIGHT flags ensure that scans will follow the
437  * rightlinks until the downlinks are inserted.
438  */
439  GistPageSetNSN(ptr->page, oldnsn);
440  }
441 
442  /*
443  * gistXLogSplit() needs to WAL log a lot of pages, prepare WAL
444  * insertion for that. NB: The number of pages and data segments
445  * specified here must match the calculations in gistXLogSplit()!
446  */
447  if (RelationNeedsWAL(rel))
448  XLogEnsureRecordSpace(npage, 1 + npage * 2);
449 
451 
452  /*
453  * Must mark buffers dirty before XLogInsert, even though we'll still
454  * be changing their opaque fields below.
455  */
456  for (ptr = dist; ptr; ptr = ptr->next)
457  MarkBufferDirty(ptr->buffer);
458  if (BufferIsValid(leftchildbuf))
459  MarkBufferDirty(leftchildbuf);
460 
461  /*
462  * The first page in the chain was a temporary working copy meant to
463  * replace the old page. Copy it over the old page.
464  */
466  dist->page = BufferGetPage(dist->buffer);
467 
468  /* Write the WAL record */
469  if (RelationNeedsWAL(rel))
470  recptr = gistXLogSplit(rel->rd_node, blkno, is_leaf,
471  dist, oldrlink, oldnsn, leftchildbuf,
472  markfollowright);
473  else
474  recptr = gistGetFakeLSN(rel);
475 
476  for (ptr = dist; ptr; ptr = ptr->next)
477  {
478  PageSetLSN(ptr->page, recptr);
479  }
480 
481  /*
482  * Return the new child buffers to the caller.
483  *
484  * If this was a root split, we've already inserted the downlink
485  * pointers, in the form of a new root page. Therefore we can release
486  * all the new buffers, and keep just the root page locked.
487  */
488  if (is_rootsplit)
489  {
490  for (ptr = dist->next; ptr; ptr = ptr->next)
491  UnlockReleaseBuffer(ptr->buffer);
492  }
493  }
494  else
495  {
496  /*
497  * Enough space. We also get here if ntuples==0.
498  */
500 
501  /*
502  * While we delete only one tuple at once we could mix calls
503  * PageIndexTupleDelete() here and PageIndexMultiDelete() in
504  * gistRedoPageUpdateRecord()
505  */
506  if (OffsetNumberIsValid(oldoffnum))
507  PageIndexTupleDelete(page, oldoffnum);
508  gistfillbuffer(page, itup, ntup, InvalidOffsetNumber);
509 
510  MarkBufferDirty(buffer);
511 
512  if (BufferIsValid(leftchildbuf))
513  MarkBufferDirty(leftchildbuf);
514 
515  if (RelationNeedsWAL(rel))
516  {
517  OffsetNumber ndeloffs = 0,
518  deloffs[1];
519 
520  if (OffsetNumberIsValid(oldoffnum))
521  {
522  deloffs[0] = oldoffnum;
523  ndeloffs = 1;
524  }
525 
526  recptr = gistXLogUpdate(rel->rd_node, buffer,
527  deloffs, ndeloffs, itup, ntup,
528  leftchildbuf);
529 
530  PageSetLSN(page, recptr);
531  }
532  else
533  {
534  recptr = gistGetFakeLSN(rel);
535  PageSetLSN(page, recptr);
536  }
537 
538  if (newblkno)
539  *newblkno = blkno;
540  }
541 
542  /*
543  * If we inserted the downlink for a child page, set NSN and clear
544  * F_FOLLOW_RIGHT flag on the left child, so that concurrent scans know to
545  * follow the rightlink if and only if they looked at the parent page
546  * before we inserted the downlink.
547  *
548  * Note that we do this *after* writing the WAL record. That means that
549  * the possible full page image in the WAL record does not include these
550  * changes, and they must be replayed even if the page is restored from
551  * the full page image. There's a chicken-and-egg problem: if we updated
552  * the child pages first, we wouldn't know the recptr of the WAL record
553  * we're about to write.
554  */
555  if (BufferIsValid(leftchildbuf))
556  {
557  Page leftpg = BufferGetPage(leftchildbuf);
558 
559  GistPageSetNSN(leftpg, recptr);
560  GistClearFollowRight(leftpg);
561 
562  PageSetLSN(leftpg, recptr);
563  }
564 
566 
567  return is_split;
568 }
569 
570 /*
571  * Workhouse routine for doing insertion into a GiST index. Note that
572  * this routine assumes it is invoked in a short-lived memory context,
573  * so it does not bother releasing palloc'd allocations.
574  */
575 void
576 gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
577 {
578  ItemId iid;
579  IndexTuple idxtuple;
580  GISTInsertStack firststack;
581  GISTInsertStack *stack;
583  bool xlocked = false;
584 
585  memset(&state, 0, sizeof(GISTInsertState));
586  state.freespace = freespace;
587  state.r = r;
588 
589  /* Start from the root */
590  firststack.blkno = GIST_ROOT_BLKNO;
591  firststack.lsn = 0;
592  firststack.parent = NULL;
593  firststack.downlinkoffnum = InvalidOffsetNumber;
594  state.stack = stack = &firststack;
595 
596  /*
597  * Walk down along the path of smallest penalty, updating the parent
598  * pointers with the key we're inserting as we go. If we crash in the
599  * middle, the tree is consistent, although the possible parent updates
600  * were a waste.
601  */
602  for (;;)
603  {
604  if (XLogRecPtrIsInvalid(stack->lsn))
605  stack->buffer = ReadBuffer(state.r, stack->blkno);
606 
607  /*
608  * Be optimistic and grab shared lock first. Swap it for an exclusive
609  * lock later if we need to update the page.
610  */
611  if (!xlocked)
612  {
613  LockBuffer(stack->buffer, GIST_SHARE);
614  gistcheckpage(state.r, stack->buffer);
615  }
616 
617  stack->page = (Page) BufferGetPage(stack->buffer);
618  stack->lsn = PageGetLSN(stack->page);
619  Assert(!RelationNeedsWAL(state.r) || !XLogRecPtrIsInvalid(stack->lsn));
620 
621  /*
622  * If this page was split but the downlink was never inserted to the
623  * parent because the inserting backend crashed before doing that, fix
624  * that now.
625  */
626  if (GistFollowRight(stack->page))
627  {
628  if (!xlocked)
629  {
630  LockBuffer(stack->buffer, GIST_UNLOCK);
632  xlocked = true;
633  /* someone might've completed the split when we unlocked */
634  if (!GistFollowRight(stack->page))
635  continue;
636  }
637  gistfixsplit(&state, giststate);
638 
639  UnlockReleaseBuffer(stack->buffer);
640  xlocked = false;
641  state.stack = stack = stack->parent;
642  continue;
643  }
644 
645  if (stack->blkno != GIST_ROOT_BLKNO &&
646  stack->parent->lsn < GistPageGetNSN(stack->page))
647  {
648  /*
649  * Concurrent split detected. There's no guarantee that the
650  * downlink for this page is consistent with the tuple we're
651  * inserting anymore, so go back to parent and rechoose the best
652  * child.
653  */
654  UnlockReleaseBuffer(stack->buffer);
655  xlocked = false;
656  state.stack = stack = stack->parent;
657  continue;
658  }
659 
660  if (!GistPageIsLeaf(stack->page))
661  {
662  /*
663  * This is an internal page so continue to walk down the tree.
664  * Find the child node that has the minimum insertion penalty.
665  */
666  BlockNumber childblkno;
667  IndexTuple newtup;
668  GISTInsertStack *item;
669  OffsetNumber downlinkoffnum;
670 
671  downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate);
672  iid = PageGetItemId(stack->page, downlinkoffnum);
673  idxtuple = (IndexTuple) PageGetItem(stack->page, iid);
674  childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
675 
676  /*
677  * Check that it's not a leftover invalid tuple from pre-9.1
678  */
679  if (GistTupleIsInvalid(idxtuple))
680  ereport(ERROR,
681  (errmsg("index \"%s\" contains an inner tuple marked as invalid",
683  errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."),
684  errhint("Please REINDEX it.")));
685 
686  /*
687  * Check that the key representing the target child node is
688  * consistent with the key we're inserting. Update it if it's not.
689  */
690  newtup = gistgetadjusted(state.r, idxtuple, itup, giststate);
691  if (newtup)
692  {
693  /*
694  * Swap shared lock for an exclusive one. Beware, the page may
695  * change while we unlock/lock the page...
696  */
697  if (!xlocked)
698  {
699  LockBuffer(stack->buffer, GIST_UNLOCK);
701  xlocked = true;
702  stack->page = (Page) BufferGetPage(stack->buffer);
703 
704  if (PageGetLSN(stack->page) != stack->lsn)
705  {
706  /* the page was changed while we unlocked it, retry */
707  continue;
708  }
709  }
710 
711  /*
712  * Update the tuple.
713  *
714  * We still hold the lock after gistinserttuple(), but it
715  * might have to split the page to make the updated tuple fit.
716  * In that case the updated tuple might migrate to the other
717  * half of the split, so we have to go back to the parent and
718  * descend back to the half that's a better fit for the new
719  * tuple.
720  */
721  if (gistinserttuple(&state, stack, giststate, newtup,
722  downlinkoffnum))
723  {
724  /*
725  * If this was a root split, the root page continues to be
726  * the parent and the updated tuple went to one of the
727  * child pages, so we just need to retry from the root
728  * page.
729  */
730  if (stack->blkno != GIST_ROOT_BLKNO)
731  {
732  UnlockReleaseBuffer(stack->buffer);
733  xlocked = false;
734  state.stack = stack = stack->parent;
735  }
736  continue;
737  }
738  }
739  LockBuffer(stack->buffer, GIST_UNLOCK);
740  xlocked = false;
741 
742  /* descend to the chosen child */
743  item = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
744  item->blkno = childblkno;
745  item->parent = stack;
746  item->downlinkoffnum = downlinkoffnum;
747  state.stack = stack = item;
748  }
749  else
750  {
751  /*
752  * Leaf page. Insert the new key. We've already updated all the
753  * parents on the way down, but we might have to split the page if
754  * it doesn't fit. gistinserthere() will take care of that.
755  */
756 
757  /*
758  * Swap shared lock for an exclusive one. Be careful, the page may
759  * change while we unlock/lock the page...
760  */
761  if (!xlocked)
762  {
763  LockBuffer(stack->buffer, GIST_UNLOCK);
765  xlocked = true;
766  stack->page = (Page) BufferGetPage(stack->buffer);
767  stack->lsn = PageGetLSN(stack->page);
768 
769  if (stack->blkno == GIST_ROOT_BLKNO)
770  {
771  /*
772  * the only page that can become inner instead of leaf is
773  * the root page, so for root we should recheck it
774  */
775  if (!GistPageIsLeaf(stack->page))
776  {
777  /*
778  * very rare situation: during unlock/lock index with
779  * number of pages = 1 was increased
780  */
781  LockBuffer(stack->buffer, GIST_UNLOCK);
782  xlocked = false;
783  continue;
784  }
785 
786  /*
787  * we don't need to check root split, because checking
788  * leaf/inner is enough to recognize split for root
789  */
790  }
791  else if (GistFollowRight(stack->page) ||
792  stack->parent->lsn < GistPageGetNSN(stack->page))
793  {
794  /*
795  * The page was split while we momentarily unlocked the
796  * page. Go back to parent.
797  */
798  UnlockReleaseBuffer(stack->buffer);
799  xlocked = false;
800  state.stack = stack = stack->parent;
801  continue;
802  }
803  }
804 
805  /* now state.stack->(page, buffer and blkno) points to leaf page */
806 
807  gistinserttuple(&state, stack, giststate, itup,
809  LockBuffer(stack->buffer, GIST_UNLOCK);
810 
811  /* Release any pins we might still hold before exiting */
812  for (; stack; stack = stack->parent)
813  ReleaseBuffer(stack->buffer);
814  break;
815  }
816  }
817 }
818 
819 /*
820  * Traverse the tree to find path from root page to specified "child" block.
821  *
822  * returns a new insertion stack, starting from the parent of "child", up
823  * to the root. *downlinkoffnum is set to the offset of the downlink in the
824  * direct parent of child.
825  *
826  * To prevent deadlocks, this should lock only one page at a time.
827  */
828 static GISTInsertStack *
830 {
831  Page page;
832  Buffer buffer;
833  OffsetNumber i,
834  maxoff;
835  ItemId iid;
836  IndexTuple idxtuple;
837  List *fifo;
838  GISTInsertStack *top,
839  *ptr;
841 
842  top = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
843  top->blkno = GIST_ROOT_BLKNO;
845 
846  fifo = list_make1(top);
847  while (fifo != NIL)
848  {
849  /* Get next page to visit */
850  top = linitial(fifo);
851  fifo = list_delete_first(fifo);
852 
853  buffer = ReadBuffer(r, top->blkno);
854  LockBuffer(buffer, GIST_SHARE);
855  gistcheckpage(r, buffer);
856  page = (Page) BufferGetPage(buffer);
857 
858  if (GistPageIsLeaf(page))
859  {
860  /*
861  * Because we scan the index top-down, all the rest of the pages
862  * in the queue must be leaf pages as well.
863  */
864  UnlockReleaseBuffer(buffer);
865  break;
866  }
867 
868  top->lsn = PageGetLSN(page);
869 
870  /*
871  * If F_FOLLOW_RIGHT is set, the page to the right doesn't have a
872  * downlink. This should not normally happen..
873  */
874  if (GistFollowRight(page))
875  elog(ERROR, "concurrent GiST page split was incomplete");
876 
877  if (top->parent && top->parent->lsn < GistPageGetNSN(page) &&
878  GistPageGetOpaque(page)->rightlink != InvalidBlockNumber /* sanity check */ )
879  {
880  /*
881  * Page was split while we looked elsewhere. We didn't see the
882  * downlink to the right page when we scanned the parent, so add
883  * it to the queue now.
884  *
885  * Put the right page ahead of the queue, so that we visit it
886  * next. That's important, because if this is the lowest internal
887  * level, just above leaves, we might already have queued up some
888  * leaf pages, and we assume that there can't be any non-leaf
889  * pages behind leaf pages.
890  */
891  ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
892  ptr->blkno = GistPageGetOpaque(page)->rightlink;
894  ptr->parent = top->parent;
895 
896  fifo = lcons(ptr, fifo);
897  }
898 
899  maxoff = PageGetMaxOffsetNumber(page);
900 
901  for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
902  {
903  iid = PageGetItemId(page, i);
904  idxtuple = (IndexTuple) PageGetItem(page, iid);
905  blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
906  if (blkno == child)
907  {
908  /* Found it! */
909  UnlockReleaseBuffer(buffer);
910  *downlinkoffnum = i;
911  return top;
912  }
913  else
914  {
915  /* Append this child to the list of pages to visit later */
916  ptr = (GISTInsertStack *) palloc0(sizeof(GISTInsertStack));
917  ptr->blkno = blkno;
918  ptr->downlinkoffnum = i;
919  ptr->parent = top;
920 
921  fifo = lappend(fifo, ptr);
922  }
923  }
924 
925  UnlockReleaseBuffer(buffer);
926  }
927 
928  elog(ERROR, "failed to re-find parent of a page in index \"%s\", block %u",
929  RelationGetRelationName(r), child);
930  return NULL; /* keep compiler quiet */
931 }
932 
933 /*
934  * Updates the stack so that child->parent is the correct parent of the
935  * child. child->parent must be exclusively locked on entry, and will
936  * remain so at exit, but it might not be the same page anymore.
937  */
938 static void
940 {
941  GISTInsertStack *parent = child->parent;
942 
943  gistcheckpage(r, parent->buffer);
944  parent->page = (Page) BufferGetPage(parent->buffer);
945 
946  /* here we don't need to distinguish between split and page update */
947  if (child->downlinkoffnum == InvalidOffsetNumber ||
948  parent->lsn != PageGetLSN(parent->page))
949  {
950  /* parent is changed, look child in right links until found */
951  OffsetNumber i,
952  maxoff;
953  ItemId iid;
954  IndexTuple idxtuple;
955  GISTInsertStack *ptr;
956 
957  while (true)
958  {
959  maxoff = PageGetMaxOffsetNumber(parent->page);
960  for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
961  {
962  iid = PageGetItemId(parent->page, i);
963  idxtuple = (IndexTuple) PageGetItem(parent->page, iid);
964  if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno)
965  {
966  /* yes!!, found */
967  child->downlinkoffnum = i;
968  return;
969  }
970  }
971 
972  parent->blkno = GistPageGetOpaque(parent->page)->rightlink;
973  UnlockReleaseBuffer(parent->buffer);
974  if (parent->blkno == InvalidBlockNumber)
975  {
976  /*
977  * End of chain and still didn't find parent. It's a very-very
978  * rare situation when root splited.
979  */
980  break;
981  }
982  parent->buffer = ReadBuffer(r, parent->blkno);
983  LockBuffer(parent->buffer, GIST_EXCLUSIVE);
984  gistcheckpage(r, parent->buffer);
985  parent->page = (Page) BufferGetPage(parent->buffer);
986  }
987 
988  /*
989  * awful!!, we need search tree to find parent ... , but before we
990  * should release all old parent
991  */
992 
993  ptr = child->parent->parent; /* child->parent already released
994  * above */
995  while (ptr)
996  {
997  ReleaseBuffer(ptr->buffer);
998  ptr = ptr->parent;
999  }
1000 
1001  /* ok, find new path */
1002  ptr = parent = gistFindPath(r, child->blkno, &child->downlinkoffnum);
1003 
1004  /* read all buffers as expected by caller */
1005  /* note we don't lock them or gistcheckpage them here! */
1006  while (ptr)
1007  {
1008  ptr->buffer = ReadBuffer(r, ptr->blkno);
1009  ptr->page = (Page) BufferGetPage(ptr->buffer);
1010  ptr = ptr->parent;
1011  }
1012 
1013  /* install new chain of parents to stack */
1014  child->parent = parent;
1015 
1016  /* make recursive call to normal processing */
1018  gistFindCorrectParent(r, child);
1019  }
1020 
1021  return;
1022 }
1023 
/*
 * Form a downlink pointer for the page in 'buf'.
 *
 * The downlink key is the union of all index tuples currently on the page.
 * 'stack' is the insertion path to the page; it is only consulted in the
 * corner case of a completely empty page (see below).  On return the
 * downlink's TID points at the block of 'buf' and the tuple is marked valid.
 */
static IndexTuple
gistformdownlink(Relation rel, Buffer buf, GISTSTATE *giststate,
				 GISTInsertStack *stack)
{
	Page		page = BufferGetPage(buf);
	OffsetNumber maxoff;
	OffsetNumber offset;
	IndexTuple	downlink = NULL;

	/* Union all tuples on the page into a single key, one at a time. */
	maxoff = PageGetMaxOffsetNumber(page);
	for (offset = FirstOffsetNumber; offset <= maxoff; offset = OffsetNumberNext(offset))
	{
		IndexTuple	ituple = (IndexTuple)
		PageGetItem(page, PageGetItemId(page, offset));

		if (downlink == NULL)
			downlink = CopyIndexTuple(ituple);
		else
		{
			IndexTuple	newdownlink;

			/* gistgetadjusted() returns NULL if no enlargement is needed */
			newdownlink = gistgetadjusted(rel, downlink, ituple,
										  giststate);
			if (newdownlink)
				downlink = newdownlink;
		}
	}

	/*
	 * If the page is completely empty, we can't form a meaningful downlink
	 * for it. But we have to insert a downlink for the page. Any key will do,
	 * as long as it's consistent with the downlink of parent page, so that we
	 * can legally insert it to the parent. A minimal one that matches as few
	 * scans as possible would be best, to keep scans from doing useless work,
	 * but we don't know how to construct that. So we just use the downlink of
	 * the original page that was split - that's as far from optimal as it can
	 * get but will do..
	 */
	if (!downlink)
	{
		ItemId		iid;

		/* briefly lock the parent to copy our existing downlink key */
		LockBuffer(stack->parent->buffer, GIST_EXCLUSIVE);
		gistFindCorrectParent(rel, stack);
		iid = PageGetItemId(stack->parent->page, stack->downlinkoffnum);
		downlink = (IndexTuple) PageGetItem(stack->parent->page, iid);
		downlink = CopyIndexTuple(downlink);
		LockBuffer(stack->parent->buffer, GIST_UNLOCK);
	}

	/* Point the downlink at the child page, and mark the key as valid. */
	ItemPointerSetBlockNumber(&(downlink->t_tid), BufferGetBlockNumber(buf));
	GistTupleSetValid(downlink);

	return downlink;
}
1082 
1083 
/*
 * Complete the incomplete split of state->stack->page.
 *
 * Called when an insertion lands on a page whose F_FOLLOW_RIGHT flag is set,
 * meaning a previous split never got its downlinks inserted into the parent
 * (e.g. because of a crash).  We walk the chain of right-linked split pages,
 * build a downlink for each, and then insert them all into the parent.
 */
static void
gistfixsplit(GISTInsertState *state, GISTSTATE *giststate)
{
	GISTInsertStack *stack = state->stack;
	Buffer		buf;
	Page		page;
	List	   *splitinfo = NIL;

	elog(LOG, "fixing incomplete split in index \"%s\", block %u",
		 RelationGetRelationName(state->r), stack->blkno);

	/* We must have been called on a follow-right page with a known parent. */
	Assert(GistFollowRight(stack->page));
	Assert(OffsetNumberIsValid(stack->downlinkoffnum));

	buf = stack->buffer;

	/*
	 * Read the chain of split pages, following the rightlinks. Construct a
	 * downlink tuple for each page.
	 */
	for (;;)
	{
		GISTPageSplitInfo *si = palloc(sizeof(GISTPageSplitInfo));
		IndexTuple	downlink;

		page = BufferGetPage(buf);

		/* Form the new downlink tuples to insert to parent */
		downlink = gistformdownlink(state->r, buf, giststate, stack);

		si->buf = buf;
		si->downlink = downlink;

		splitinfo = lappend(splitinfo, si);

		if (GistFollowRight(page))
		{
			/* lock next page */
			buf = ReadBuffer(state->r, GistPageGetOpaque(page)->rightlink);
			LockBuffer(buf, GIST_EXCLUSIVE);
		}
		else
			break;
	}

	/* Insert the downlinks */
	gistfinishsplit(state, stack, giststate, splitinfo, false);
}
1135 
1136 /*
1137  * Insert or replace a tuple in stack->buffer. If 'oldoffnum' is valid, the
1138  * tuple at 'oldoffnum' is replaced, otherwise the tuple is inserted as new.
1139  * 'stack' represents the path from the root to the page being updated.
1140  *
1141  * The caller must hold an exclusive lock on stack->buffer. The lock is still
1142  * held on return, but the page might not contain the inserted tuple if the
1143  * page was split. The function returns true if the page was split, false
1144  * otherwise.
1145  */
1146 static bool
1148  GISTSTATE *giststate, IndexTuple tuple, OffsetNumber oldoffnum)
1149 {
1150  return gistinserttuples(state, stack, giststate, &tuple, 1, oldoffnum,
1151  InvalidBuffer, InvalidBuffer, false, false);
1152 }
1153 
/* ----------------
 * An extended workhorse version of gistinserttuple(). This version allows
 * inserting multiple tuples, or replacing a single tuple with multiple tuples.
 * This is used to recursively update the downlinks in the parent when a page
 * is split.
 *
 * If leftchild and rightchild are valid, we're inserting/replacing the
 * downlink for rightchild, and leftchild is its left sibling. We clear the
 * F_FOLLOW_RIGHT flag and update NSN on leftchild, atomically with the
 * insertion of the downlink.
 *
 * To avoid holding locks for longer than necessary, when recursing up the
 * tree to update the parents, the locking is a bit peculiar here. On entry,
 * the caller must hold an exclusive lock on stack->buffer, as well as
 * leftchild and rightchild if given. On return:
 *
 * - Lock on stack->buffer is released, if 'unlockbuf' is true. The page is
 *	 always kept pinned, however.
 * - Lock on 'leftchild' is released, if 'unlockleftchild' is true. The page
 *	 is kept pinned.
 * - Lock and pin on 'rightchild' are always released.
 *
 * Returns 'true' if the page had to be split. Note that if the page was
 * split, the inserted/updated tuples might've been inserted to a right
 * sibling of stack->buffer instead of stack->buffer itself.
 */
static bool
gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
				 GISTSTATE *giststate,
				 IndexTuple *tuples, int ntup, OffsetNumber oldoffnum,
				 Buffer leftchild, Buffer rightchild,
				 bool unlockbuf, bool unlockleftchild)
{
	List	   *splitinfo;
	bool		is_split;

	/* Insert the tuple(s) to the page, splitting the page if necessary */
	is_split = gistplacetopage(state->r, state->freespace, giststate,
							   stack->buffer,
							   tuples, ntup,
							   oldoffnum, NULL,
							   leftchild,
							   &splitinfo,
							   true);

	/*
	 * Before recursing up in case the page was split, release locks on the
	 * child pages. We don't need to keep them locked when updating the
	 * parent.
	 */
	if (BufferIsValid(rightchild))
		UnlockReleaseBuffer(rightchild);
	if (BufferIsValid(leftchild) && unlockleftchild)
		LockBuffer(leftchild, GIST_UNLOCK);

	/*
	 * If we had to split, insert/update the downlinks in the parent. If the
	 * caller requested us to release the lock on stack->buffer, tell
	 * gistfinishsplit() to do that as soon as it's safe to do so. If we
	 * didn't have to split, release it ourselves.
	 */
	if (splitinfo)
		gistfinishsplit(state, stack, giststate, splitinfo, unlockbuf);
	else if (unlockbuf)
		LockBuffer(stack->buffer, GIST_UNLOCK);

	return is_split;
}
1222 
/*
 * Finish an incomplete split by inserting/updating the downlinks in parent
 * page. 'splitinfo' contains all the child pages involved in the split,
 * from left-to-right.
 *
 * On entry, the caller must hold a lock on stack->buffer and all the child
 * pages in 'splitinfo'. If 'unlockbuf' is true, the lock on stack->buffer is
 * released on return. The child pages are always unlocked and unpinned.
 */
static void
gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack,
				GISTSTATE *giststate, List *splitinfo, bool unlockbuf)
{
	ListCell   *lc;
	List	   *reversed;
	GISTPageSplitInfo *right;
	GISTPageSplitInfo *left;
	IndexTuple	tuples[2];

	/* A split always contains at least two halves */
	Assert(list_length(splitinfo) >= 2);

	/*
	 * We need to insert downlinks for each new page, and update the downlink
	 * for the original (leftmost) page in the split. Begin at the rightmost
	 * page, inserting one downlink at a time until there's only two pages
	 * left. Finally insert the downlink for the last new page and update the
	 * downlink for the original page as one operation.
	 */

	/* for convenience, create a copy of the list in reverse order */
	reversed = NIL;
	foreach(lc, splitinfo)
	{
		reversed = lcons(lfirst(lc), reversed);
	}

	/* Lock the parent and locate our downlink in it before updating it. */
	LockBuffer(stack->parent->buffer, GIST_EXCLUSIVE);
	gistFindCorrectParent(state->r, stack);

	/*
	 * insert downlinks for the siblings from right to left, until there are
	 * only two siblings left.
	 */
	while (list_length(reversed) > 2)
	{
		right = (GISTPageSplitInfo *) linitial(reversed);
		left = (GISTPageSplitInfo *) lsecond(reversed);

		if (gistinserttuples(state, stack->parent, giststate,
							 &right->downlink, 1,
							 InvalidOffsetNumber,
							 left->buf, right->buf, false, false))
		{
			/*
			 * If the parent page was split, need to relocate the original
			 * parent pointer.
			 */
			gistFindCorrectParent(state->r, stack);
		}
		/* gistinserttuples() released the lock on right->buf. */
		reversed = list_delete_first(reversed);
	}

	right = (GISTPageSplitInfo *) linitial(reversed);
	left = (GISTPageSplitInfo *) lsecond(reversed);

	/*
	 * Finally insert downlink for the remaining right page and update the
	 * downlink for the original page to not contain the tuples that were
	 * moved to the new pages.
	 */
	tuples[0] = left->downlink;
	tuples[1] = right->downlink;
	gistinserttuples(state, stack->parent, giststate,
					 tuples, 2,
					 stack->downlinkoffnum,
					 left->buf, right->buf,
					 true,		/* Unlock parent */
					 unlockbuf	/* Unlock stack->buffer if caller wants that */
		);
	Assert(left->buf == stack->buffer);
}
1306 
/*
 * gistSplit -- split a page in the tree and fill struct
 * used for XLOG and real writes buffers. Function is recursive, ie
 * it will split page until keys will fit in every page.
 *
 * Returns a linked list of SplitedPageLayout, one entry per resulting page,
 * in left-to-right order.  'itup' must contain already-compressed entries.
 */
SplitedPageLayout *
gistSplit(Relation r,
		  Page page,
		  IndexTuple *itup,		/* contains compressed entry */
		  int len,
		  GISTSTATE *giststate)
{
	IndexTuple *lvectup,
			   *rvectup;
	GistSplitVector v;
	int			i;
	SplitedPageLayout *res = NULL;

	/* this should never recurse very deeply, but better safe than sorry */
	check_stack_depth();

	/* there's no point in splitting an empty page */
	Assert(len > 0);

	/*
	 * If a single tuple doesn't fit on a page, no amount of splitting will
	 * help.
	 */
	if (len == 1)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
			errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
				   IndexTupleSize(itup[0]), GiSTPageSize,
				   RelationGetRelationName(r))));

	/* Ask the opclass picksplit support to divide the tuples in two. */
	memset(v.spl_lisnull, TRUE, sizeof(bool) * giststate->tupdesc->natts);
	memset(v.spl_risnull, TRUE, sizeof(bool) * giststate->tupdesc->natts);
	gistSplitByKey(r, page, itup, len, giststate, &v, 0);

	/* form left and right vector */
	lvectup = (IndexTuple *) palloc(sizeof(IndexTuple) * (len + 1));
	rvectup = (IndexTuple *) palloc(sizeof(IndexTuple) * (len + 1));

	/* spl_left/spl_right hold 1-based indexes into itup[] */
	for (i = 0; i < v.splitVector.spl_nleft; i++)
		lvectup[i] = itup[v.splitVector.spl_left[i] - 1];

	for (i = 0; i < v.splitVector.spl_nright; i++)
		rvectup[i] = itup[v.splitVector.spl_right[i] - 1];

	/* finalize splitting (may need another split) */
	if (!gistfitpage(rvectup, v.splitVector.spl_nright))
	{
		/* right half still too big: split it recursively */
		res = gistSplit(r, page, rvectup, v.splitVector.spl_nright, giststate);
	}
	else
	{
		ROTATEDIST(res);
		res->block.num = v.splitVector.spl_nright;
		res->list = gistfillitupvec(rvectup, v.splitVector.spl_nright, &(res->lenlist));
		res->itup = gistFormTuple(giststate, r, v.spl_rattr, v.spl_risnull, false);
	}

	if (!gistfitpage(lvectup, v.splitVector.spl_nleft))
	{
		SplitedPageLayout *resptr,
				   *subres;

		/* left half too big as well: split it and prepend its pages */
		resptr = subres = gistSplit(r, page, lvectup, v.splitVector.spl_nleft, giststate);

		/* install on list's tail */
		while (resptr->next)
			resptr = resptr->next;

		resptr->next = res;
		res = subres;
	}
	else
	{
		ROTATEDIST(res);
		res->block.num = v.splitVector.spl_nleft;
		res->list = gistfillitupvec(lvectup, v.splitVector.spl_nleft, &(res->lenlist));
		res->itup = gistFormTuple(giststate, r, v.spl_lattr, v.spl_lisnull, false);
	}

	return res;
}
1393 
/*
 * Create a GISTSTATE and fill it with information about the index
 *
 * The GISTSTATE (and everything it points to) is allocated in a dedicated
 * memory context, so it can be freed as a unit by freeGISTstate().
 */
GISTSTATE *
initGISTstate(Relation index)
{
	GISTSTATE  *giststate;
	MemoryContext scanCxt;
	MemoryContext oldCxt;
	int			i;

	/* safety check to protect fixed-size arrays in GISTSTATE */
	if (index->rd_att->natts > INDEX_MAX_KEYS)
		elog(ERROR, "numberOfAttributes %d > %d",
			 index->rd_att->natts, INDEX_MAX_KEYS);

	/* Create the memory context that will hold the GISTSTATE */
	scanCxt = AllocSetContextCreate(CurrentMemoryContext,
									"GiST scan context",
									ALLOCSET_DEFAULT_MINSIZE,
									ALLOCSET_DEFAULT_INITSIZE,
									ALLOCSET_DEFAULT_MAXSIZE);
	oldCxt = MemoryContextSwitchTo(scanCxt);

	/* Create and fill in the GISTSTATE */
	giststate = (GISTSTATE *) palloc(sizeof(GISTSTATE));

	giststate->scanCxt = scanCxt;
	giststate->tempCxt = scanCxt;		/* caller must change this if needed */
	giststate->tupdesc = index->rd_att;

	/* Look up the support procedures for each index column. */
	for (i = 0; i < index->rd_att->natts; i++)
	{
		fmgr_info_copy(&(giststate->consistentFn[i]),
					   index_getprocinfo(index, i + 1, GIST_CONSISTENT_PROC),
					   scanCxt);
		fmgr_info_copy(&(giststate->unionFn[i]),
					   index_getprocinfo(index, i + 1, GIST_UNION_PROC),
					   scanCxt);
		fmgr_info_copy(&(giststate->compressFn[i]),
					   index_getprocinfo(index, i + 1, GIST_COMPRESS_PROC),
					   scanCxt);
		fmgr_info_copy(&(giststate->decompressFn[i]),
					   index_getprocinfo(index, i + 1, GIST_DECOMPRESS_PROC),
					   scanCxt);
		fmgr_info_copy(&(giststate->penaltyFn[i]),
					   index_getprocinfo(index, i + 1, GIST_PENALTY_PROC),
					   scanCxt);
		fmgr_info_copy(&(giststate->picksplitFn[i]),
					   index_getprocinfo(index, i + 1, GIST_PICKSPLIT_PROC),
					   scanCxt);
		fmgr_info_copy(&(giststate->equalFn[i]),
					   index_getprocinfo(index, i + 1, GIST_EQUAL_PROC),
					   scanCxt);
		/* opclasses are not required to provide a Distance method */
		if (OidIsValid(index_getprocid(index, i + 1, GIST_DISTANCE_PROC)))
			fmgr_info_copy(&(giststate->distanceFn[i]),
						   index_getprocinfo(index, i + 1, GIST_DISTANCE_PROC),
						   scanCxt);
		else
			giststate->distanceFn[i].fn_oid = InvalidOid;

		/* opclasses are not required to provide a Fetch method */
		if (OidIsValid(index_getprocid(index, i + 1, GIST_FETCH_PROC)))
			fmgr_info_copy(&(giststate->fetchFn[i]),
						   index_getprocinfo(index, i + 1, GIST_FETCH_PROC),
						   scanCxt);
		else
			giststate->fetchFn[i].fn_oid = InvalidOid;

		/*
		 * If the index column has a specified collation, we should honor that
		 * while doing comparisons. However, we may have a collatable storage
		 * type for a noncollatable indexed data type. If there's no index
		 * collation then specify default collation in case the support
		 * functions need collation. This is harmless if the support
		 * functions don't care about collation, so we just do it
		 * unconditionally. (We could alternatively call get_typcollation,
		 * but that seems like expensive overkill --- there aren't going to be
		 * any cases where a GiST storage type has a nondefault collation.)
		 */
		if (OidIsValid(index->rd_indcollation[i]))
			giststate->supportCollation[i] = index->rd_indcollation[i];
		else
			giststate->supportCollation[i] = DEFAULT_COLLATION_OID;
	}

	MemoryContextSwitchTo(oldCxt);

	return giststate;
}
1485 
/*
 * Release a GISTSTATE built by initGISTstate(), along with everything
 * allocated in its memory context.
 */
void
freeGISTstate(GISTSTATE *giststate)
{
	/* It's sufficient to delete the scanCxt */
	MemoryContextDelete(giststate->scanCxt);
}
1492 
/*
 * gistvacuumpage() -- try to remove LP_DEAD items from the given page.
 * Function assumes that buffer is exclusively locked.
 *
 * 'page' must be the leaf page contained in 'buffer'; the deletions are
 * WAL-logged (or a fake LSN is set for unlogged/temp relations).
 */
static void
gistvacuumpage(Relation rel, Page page, Buffer buffer)
{
	OffsetNumber deletable[MaxIndexTuplesPerPage];
	int ndeletable = 0;
	OffsetNumber offnum, maxoff;

	Assert(GistPageIsLeaf(page));

	/*
	 * Scan over all items to see which ones need to be deleted according to
	 * LP_DEAD flags.
	 */
	maxoff = PageGetMaxOffsetNumber(page);
	for (offnum = FirstOffsetNumber;
		 offnum <= maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		itemId = PageGetItemId(page, offnum);

		if (ItemIdIsDead(itemId))
			deletable[ndeletable++] = offnum;
	}

	if (ndeletable > 0)
	{
		/* page modification and its WAL record must happen atomically */
		START_CRIT_SECTION();

		PageIndexMultiDelete(page, deletable, ndeletable);

		/*
		 * Mark the page as not containing any LP_DEAD items. This is not
		 * certainly true (there might be some that have recently been marked,
		 * but weren't included in our target-item list), but it will almost
		 * always be true and it doesn't seem worth an additional page scan to
		 * check it. Remember that F_HAS_GARBAGE is only a hint anyway.
		 */
		GistClearPageHasGarbage(page);

		MarkBufferDirty(buffer);

		/* XLOG stuff */
		if (RelationNeedsWAL(rel))
		{
			XLogRecPtr	recptr;

			recptr = gistXLogUpdate(rel->rd_node, buffer,
									deletable, ndeletable,
									NULL, 0, InvalidBuffer);

			PageSetLSN(page, recptr);
		}
		else
			PageSetLSN(page, gistGetFakeLSN(rel));

		END_CRIT_SECTION();
	}

	/*
	 * Note: if we didn't find any LP_DEAD items, then the page's
	 * F_HAS_GARBAGE hint bit is falsely set. We do not bother expending a
	 * separate write to clear it, however. We will clear it when we split
	 * the page.
	 */
}
#define GistFollowRight(page)
Definition: gist.h:147
ambeginscan_function ambeginscan
Definition: amapi.h:158
#define PG_RETURN_POINTER(x)
Definition: fmgr.h:305
#define NIL
Definition: pg_list.h:69
TupleDesc tupdesc
Definition: gist_private.h:82
BlockNumber blkno
Definition: gist_private.h:255
#define GistPageGetNSN(page)
Definition: gist.h:151
ambulkdelete_function ambulkdelete
Definition: amapi.h:152
bool amcanmulticol
Definition: amapi.h:132
static bool gistinserttuple(GISTInsertState *state, GISTInsertStack *stack, GISTSTATE *giststate, IndexTuple tuple, OffsetNumber oldoffnum)
Definition: gist.c:1147
uint16 amsupport
Definition: amapi.h:122
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:203
int errhint(const char *fmt,...)
Definition: elog.c:987
#define GIST_FETCH_PROC
Definition: gist.h:36
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)
Definition: xloginsert.c:974
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:729
Datum spl_lattr[INDEX_MAX_KEYS]
Definition: gist_private.h:277
void PageRestoreTempPage(Page tempPage, Page oldPage)
Definition: bufpage.c:423
static void gistfixsplit(GISTInsertState *state, GISTSTATE *giststate)
Definition: gist.c:1088
OffsetNumber PageAddItem(Page page, Item item, Size size, OffsetNumber offsetNumber, bool overwrite, bool is_heap)
Definition: bufpage.c:350
amgettuple_function amgettuple
Definition: amapi.h:160
bool gistinsert(Relation r, Datum *values, bool *isnull, ItemPointer ht_ctid, Relation heapRel, IndexUniqueCheck checkUnique)
Definition: gist.c:142
BlockNumber blkno
Definition: gist_private.h:231
FmgrInfo fetchFn[INDEX_MAX_KEYS]
Definition: gist_private.h:94
IndexBulkDeleteResult * gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
Definition: gistvacuum.c:29
bool amcanorderbyop
Definition: amapi.h:126
#define GistClearPageHasGarbage(page)
Definition: gist.h:145
#define GIST_EQUAL_PROC
Definition: gist.h:34
void PageIndexTupleDelete(Page page, OffsetNumber offnum)
Definition: bufpage.c:717
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:1445
MemoryContext createTempGistContext(void)
Definition: gist.c:103
Oid supportCollation[INDEX_MAX_KEYS]
Definition: gist_private.h:97
FmgrInfo compressFn[INDEX_MAX_KEYS]
Definition: gist_private.h:88
IndexBulkDeleteResult * gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
Definition: gistvacuum.c:139
Datum gisthandler(PG_FUNCTION_ARGS)
Definition: gist.c:55
FmgrInfo equalFn[INDEX_MAX_KEYS]
Definition: gist_private.h:92
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:640
ItemPointerData t_tid
Definition: itup.h:37
void gistfillbuffer(Page page, IndexTuple *itup, int len, OffsetNumber off)
Definition: gistutil.c:29
#define END_CRIT_SECTION()
Definition: miscadmin.h:132
bool amstorage
Definition: amapi.h:140
bool gistgettuple(IndexScanDesc scan, ScanDirection dir)
Definition: gistget.c:623
OffsetNumber * spl_left
Definition: gist.h:105
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:109
Pointer Item
Definition: item.h:17
#define GistPageSetNSN(page, val)
Definition: gist.h:152
#define InvalidBuffer
Definition: buf.h:25
GIST_SPLITVEC splitVector
Definition: gist_private.h:275
#define START_CRIT_SECTION()
Definition: miscadmin.h:130
int errcode(int sqlerrcode)
Definition: elog.c:575
bool ampredlocks
Definition: amapi.h:144
#define GistTupleIsInvalid(itup)
Definition: gist_private.h:324
#define GIST_UNLOCK
Definition: gist_private.h:46
#define GistPageHasGarbage(page)
Definition: gist.h:143
uint32 BlockNumber
Definition: block.h:31
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3292
#define P_NEW
Definition: bufmgr.h:96
aminsert_function aminsert
Definition: amapi.h:151
void gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
Definition: gist.c:576
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:103
int spl_nleft
Definition: gist.h:106
#define LOG
Definition: elog.h:26
bytea * gistoptions(Datum reloptions, bool validate)
Definition: gistutil.c:812
IndexTupleData * list
Definition: gist_private.h:239
#define ItemIdIsDead(itemId)
Definition: itemid.h:112
Oid amkeytype
Definition: amapi.h:146
#define OidIsValid(objectId)
Definition: c.h:530
#define PageGetMaxOffsetNumber(page)
Definition: bufpage.h:354
static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack, GISTSTATE *giststate, List *splitinfo, bool releasebuf)
Definition: gist.c:1233
IndexTuple gistgetadjusted(Relation r, IndexTuple oldtup, IndexTuple addtup, GISTSTATE *giststate)
Definition: gistutil.c:310
bool amoptionalkey
Definition: amapi.h:134
void gistSplitByKey(Relation r, Page page, IndexTuple *itup, int len, GISTSTATE *giststate, GistSplitVector *v, int attno)
Definition: gistsplit.c:623
amvalidate_function amvalidate
Definition: amapi.h:157
Datum spl_rattr[INDEX_MAX_KEYS]
Definition: gist_private.h:281
int natts
Definition: tupdesc.h:73
XLogRecPtr gistGetFakeLSN(Relation rel)
Definition: gistutil.c:845
#define lsecond(l)
Definition: pg_list.h:114
#define ALLOCSET_DEFAULT_MINSIZE
Definition: memutils.h:142
uint16 OffsetNumber
Definition: off.h:24
Definition: type.h:90
Page PageGetTempPageCopySpecial(Page page)
Definition: bufpage.c:401
gistxlogPage block
Definition: gist_private.h:238
#define list_make1(x1)
Definition: pg_list.h:133
IndexBuildResult * gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
Definition: gistbuild.c:113
IndexUniqueCheck
Definition: genam.h:106
IndexTuple * gistextractpage(Page page, int *len)
Definition: gistutil.c:90
amgetbitmap_function amgetbitmap
Definition: amapi.h:161
FmgrInfo consistentFn[INDEX_MAX_KEYS]
Definition: gist_private.h:86
#define linitial(l)
Definition: pg_list.h:110
#define GIST_PICKSPLIT_PROC
Definition: gist.h:33
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:3315
Oid * rd_indcollation
Definition: rel.h:148
#define ERROR
Definition: elog.h:43
static IndexTuple gistformdownlink(Relation rel, Buffer buf, GISTSTATE *giststate, GISTInsertStack *stack)
Definition: gist.c:1028
ambuild_function ambuild
Definition: amapi.h:149
#define GIST_COMPRESS_PROC
Definition: gist.h:30
#define GIST_MAX_SPLIT_PAGES
Definition: gist_private.h:41
amoptions_function amoptions
Definition: amapi.h:156
int spl_nright
Definition: gist.h:111
void fmgr_info_copy(FmgrInfo *dstinfo, FmgrInfo *srcinfo, MemoryContext destcxt)
Definition: fmgr.c:584
MemoryContext tempCxt
Definition: gist_private.h:80
BlockNumber blkno
Definition: ginvacuum.c:290
amcostestimate_function amcostestimate
Definition: amapi.h:155
IndexTuple downlink
Definition: gist_private.h:447
bool amcanunique
Definition: amapi.h:130
void gistrescan(IndexScanDesc scan, ScanKey key, int nkeys, ScanKey orderbys, int norderbys)
Definition: gistscan.c:111
bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate, Buffer buffer, IndexTuple *itup, int ntup, OffsetNumber oldoffnum, BlockNumber *newblkno, Buffer leftchildbuf, List **splitinfo, bool markfollowright)
Definition: gist.c:205
GISTInsertStack * stack
Definition: gist_private.h:294
IndexTuple CopyIndexTuple(IndexTuple source)
Definition: indextuple.c:434
static char * buf
Definition: pg_test_fsync.c:65
amvacuumcleanup_function amvacuumcleanup
Definition: amapi.h:153
amendscan_function amendscan
Definition: amapi.h:162
#define memmove(d, s, c)
Definition: c.h:1038
bool amcanbackward
Definition: amapi.h:128
#define DEFAULT_COLLATION_OID
Definition: pg_collation.h:68
void check_stack_depth(void)
Definition: postgres.c:3110
#define FirstOffsetNumber
Definition: off.h:27
IndexTupleData * IndexTuple
Definition: itup.h:53
int errdetail(const char *fmt,...)
Definition: elog.c:873
FmgrInfo picksplitFn[INDEX_MAX_KEYS]
Definition: gist_private.h:91
bool gistnospace(Page page, IndexTuple *itvec, int len, OffsetNumber todelete, Size freespace)
Definition: gistutil.c:54
#define RelationGetRelationName(relation)
Definition: rel.h:361
FmgrInfo penaltyFn[INDEX_MAX_KEYS]
Definition: gist_private.h:90
MemoryContext CurrentMemoryContext
Definition: mcxt.c:37
struct SplitedPageLayout * next
Definition: gist_private.h:245
#define BufferGetPage(buffer)
Definition: bufmgr.h:174
void freeGISTstate(GISTSTATE *giststate)
Definition: gist.c:1487
#define ereport(elevel, rest)
Definition: elog.h:122
amrescan_function amrescan
Definition: amapi.h:159
FmgrInfo decompressFn[INDEX_MAX_KEYS]
Definition: gist_private.h:89
static void gistvacuumpage(Relation rel, Page page, Buffer buffer)
Definition: gist.c:1498
int64 gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
Definition: gistget.c:753
List * lappend(List *list, void *datum)
Definition: list.c:128
OffsetNumber gistchoose(Relation r, Page p, IndexTuple it, GISTSTATE *giststate)
Definition: gistutil.c:368
void gistbuildempty(Relation index)
Definition: gist.c:116
#define GistPageIsLeaf(page)
Definition: gist.h:132
#define GistTupleSetValid(itup)
Definition: gist_private.h:325
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
OffsetNumber downlinkoffnum
Definition: gist_private.h:266
#define PageGetItemId(page, offsetNumber)
Definition: bufpage.h:232
static void gistFindCorrectParent(Relation r, GISTInsertStack *child)
Definition: gist.c:939
bool amsearchnulls
Definition: amapi.h:138
#define GiSTPageSize
Definition: gist_private.h:489
#define GistClearFollowRight(page)
Definition: gist.h:149
IndexTuple * gistjoinvector(IndexTuple *itvec, int *len, IndexTuple *additvec, int addlen)
Definition: gistutil.c:109
static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack, GISTSTATE *giststate, IndexTuple *tuples, int ntup, OffsetNumber oldoffnum, Buffer leftchild, Buffer rightchild, bool unlockbuf, bool unlockleftchild)
Definition: gist.c:1181
MemoryContext AllocSetContextCreate(MemoryContext parent, const char *name, Size minContextSize, Size initBlockSize, Size maxBlockSize)
Definition: aset.c:436
void * palloc0(Size size)
Definition: mcxt.c:923
uintptr_t Datum
Definition: postgres.h:374
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:3529
#define ROTATEDIST(d)
Definition: gist.c:40
bool amclusterable
Definition: amapi.h:142
TupleDesc rd_att
Definition: rel.h:84
GISTSTATE * initGISTstate(Relation index)
Definition: gist.c:1398
struct DataPageDeleteStack * child
Definition: ginvacuum.c:287
bool amsearcharray
Definition: amapi.h:136
SplitedPageLayout * gistSplit(Relation r, Page page, IndexTuple *itup, int len, GISTSTATE *giststate)
Definition: gist.c:1313
#define rightchild(x)
Definition: fsmpage.c:30
#define InvalidOffsetNumber
Definition: off.h:26
static GISTInsertStack * gistFindPath(Relation r, BlockNumber child, OffsetNumber *downlinkoffnum)
Definition: gist.c:829
#define InvalidOid
Definition: postgres_ext.h:36
Oid fn_oid
Definition: fmgr.h:56
#define GIST_CONSISTENT_PROC
Definition: gist.h:28
bool gistfitpage(IndexTuple *itvec, int len)
Definition: gistutil.c:74
#define GIST_UNION_PROC
Definition: gist.h:29
bool gistcanreturn(Relation index, int attno)
Definition: gistget.c:800
#define GistPageGetOpaque(page)
Definition: gist.h:130
List * lcons(void *datum, List *list)
Definition: list.c:259
RelFileNode rd_node
Definition: rel.h:55
#define makeNode(_type_)
Definition: nodes.h:539
#define NULL
Definition: c.h:226
uint64 XLogRecPtr
Definition: xlogdefs.h:21
#define Assert(condition)
Definition: c.h:667
#define lfirst(lc)
Definition: pg_list.h:106
Definition: regguts.h:313
struct DataPageDeleteStack * parent
Definition: ginvacuum.c:288
void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
Definition: bufpage.c:823
void gistcheckpage(Relation rel, Buffer buf)
Definition: gistutil.c:719
void gistcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, Cost *indexStartupCost, Cost *indexTotalCost, Selectivity *indexSelectivity, double *indexCorrelation)
Definition: selfuncs.c:6794
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:594
#define INDEX_MAX_KEYS
#define OffsetNumberNext(offsetNumber)
Definition: off.h:53
size_t Size
Definition: c.h:352
IndexTupleData * gistfillitupvec(IndexTuple *vec, int veclen, int *memlen)
Definition: gistutil.c:122
#define GIST_PENALTY_PROC
Definition: gist.h:32
#define InvalidBlockNumber
Definition: block.h:33
static int list_length(const List *l)
Definition: pg_list.h:89
void XLogEnsureRecordSpace(int max_block_id, int ndatas)
Definition: xloginsert.c:146
#define leftchild(x)
Definition: fsmpage.c:29
#define GIST_DISTANCE_PROC
Definition: gist.h:35
OffsetNumber * spl_right
Definition: gist.h:110
#define BufferIsValid(bufnum)
Definition: bufmgr.h:128
#define GIST_SHARE
Definition: gist_private.h:44
ammarkpos_function ammarkpos
Definition: amapi.h:163
bool amcanorder
Definition: amapi.h:124
FmgrInfo distanceFn[INDEX_MAX_KEYS]
Definition: gist_private.h:93
#define ItemPointerSetBlockNumber(pointer, blockNumber)
Definition: itemptr.h:101
#define GistMarkFollowRight(page)
Definition: gist.h:148
#define RelationNeedsWAL(relation)
Definition: rel.h:430
bool ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2)
Definition: itemptr.c:29
XLogRecPtr GistNSN
Definition: gist.h:50
#define PageGetLSN(page)
Definition: bufpage.h:363
#define GISTNProcs
Definition: gist.h:37
static Datum values[MAXATTR]
Definition: bootstrap.c:160
uint16 amstrategies
Definition: amapi.h:120
bool spl_risnull[INDEX_MAX_KEYS]
Definition: gist_private.h:283
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:2588
bool gistvalidate(Oid opclassoid)
Definition: gistvalidate.c:33
#define MaxIndexTuplesPerPage
Definition: itup.h:137
void * palloc(Size size)
Definition: mcxt.c:894
int errmsg(const char *fmt,...)
Definition: elog.c:797
#define F_LEAF
Definition: gist.h:42
ambuildempty_function ambuildempty
Definition: amapi.h:150
#define ALLOCSET_DEFAULT_INITSIZE
Definition: memutils.h:143
int i
#define GIST_ROOT_BLKNO
Definition: gist_private.h:298
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:40
MemoryContext scanCxt
Definition: gist_private.h:79
void GISTInitBuffer(Buffer b, uint32 f)
Definition: gistutil.c:697
IndexTuple gistFormTuple(GISTSTATE *giststate, Relation r, Datum attdata[], bool isnull[], bool isleaf)
Definition: gistutil.c:563
#define PG_FUNCTION_ARGS
Definition: fmgr.h:150
#define ALLOCSET_DEFAULT_MAXSIZE
Definition: memutils.h:144
#define TRUE
Definition: c.h:214
#define GIST_DECOMPRESS_PROC
Definition: gist.h:31
IndexScanDesc gistbeginscan(Relation r, int nkeys, int norderbys)
Definition: gistscan.c:58
void gistendscan(IndexScanDesc scan)
Definition: gistscan.c:322
#define elog
Definition: elog.h:218
#define ItemPointerGetBlockNumber(pointer)
Definition: itemptr.h:70
XLogRecPtr gistXLogUpdate(RelFileNode node, Buffer buffer, OffsetNumber *todelete, int ntodelete, IndexTuple *itup, int ituplen, Buffer leftchildbuf)
Definition: gistxlog.c:390
bool spl_lisnull[INDEX_MAX_KEYS]
Definition: gist_private.h:279
FmgrInfo unionFn[INDEX_MAX_KEYS]
Definition: gist_private.h:87
#define GIST_EXCLUSIVE
Definition: gist_private.h:45
#define PageSetLSN(page, lsn)
Definition: bufpage.h:365
Definition: pg_list.h:45
int Buffer
Definition: buf.h:23
struct GISTInsertStack * parent
Definition: gist_private.h:269
amcanreturn_function amcanreturn
Definition: amapi.h:154
Buffer gistNewBuffer(Relation r)
Definition: gistutil.c:758
#define PageGetItem(page, itemId)
Definition: bufpage.h:337
Pointer Page
Definition: bufpage.h:74
#define IndexTupleSize(itup)
Definition: itup.h:70
List * list_delete_first(List *list)
Definition: list.c:666
XLogRecPtr gistXLogSplit(RelFileNode node, BlockNumber blkno, bool page_is_leaf, SplitedPageLayout *dist, BlockNumber origrlink, GistNSN orignsn, Buffer leftchildbuf, bool markfollowright)
Definition: gistxlog.c:326
amrestrpos_function amrestrpos
Definition: amapi.h:164
RegProcedure index_getprocid(Relation irel, AttrNumber attnum, uint16 procnum)
Definition: indexam.c:695