PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
array_selfuncs.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * array_selfuncs.c
4  * Functions for selectivity estimation of array operators
5  *
6  * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/utils/adt/array_selfuncs.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16 
17 #include <math.h>
18 
19 #include "access/htup_details.h"
20 #include "catalog/pg_collation.h"
21 #include "catalog/pg_operator.h"
22 #include "catalog/pg_statistic.h"
23 #include "optimizer/clauses.h"
24 #include "utils/array.h"
25 #include "utils/lsyscache.h"
26 #include "utils/selfuncs.h"
27 #include "utils/typcache.h"
28 
29 
30 /* Default selectivity constant for "@>" and "<@" operators */
31 #define DEFAULT_CONTAIN_SEL 0.005
32 
33 /* Default selectivity constant for "&&" operator */
34 #define DEFAULT_OVERLAP_SEL 0.01
35 
36 /* Default selectivity for given operator */
37 #define DEFAULT_SEL(operator) \
38  ((operator) == OID_ARRAY_OVERLAP_OP ? \
39  DEFAULT_OVERLAP_SEL : DEFAULT_CONTAIN_SEL)
40 
41 static Selectivity calc_arraycontsel(VariableStatData *vardata, Datum constval,
42  Oid elemtype, Oid operator);
44  TypeCacheEntry *typentry,
45  Datum *mcelem, int nmcelem,
46  float4 *numbers, int nnumbers,
47  float4 *hist, int nhist,
48  Oid operator, FmgrInfo *cmpfunc);
49 static Selectivity mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem,
50  float4 *numbers, int nnumbers,
51  Datum *array_data, int nitems,
52  Oid operator, FmgrInfo *cmpfunc);
53 static Selectivity mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
54  float4 *numbers, int nnumbers,
55  Datum *array_data, int nitems,
56  float4 *hist, int nhist,
57  Oid operator, FmgrInfo *cmpfunc);
58 static float *calc_hist(const float4 *hist, int nhist, int n);
59 static float *calc_distr(const float *p, int n, int m, float rest);
60 static int floor_log2(uint32 n);
61 static bool find_next_mcelem(Datum *mcelem, int nmcelem, Datum value,
62  int *index, FmgrInfo *cmpfunc);
63 static int element_compare(const void *key1, const void *key2, void *arg);
64 static int float_compare_desc(const void *key1, const void *key2);
65 
66 
67 /*
68  * scalararraysel_containment
69  * Estimate selectivity of ScalarArrayOpExpr via array containment.
70  *
71  * If we have const =/<> ANY/ALL (array_var) then we can estimate the
72  * selectivity as though this were an array containment operator,
73  * array_var op ARRAY[const].
74  *
75  * scalararraysel() has already verified that the ScalarArrayOpExpr's operator
76  * is the array element type's default equality or inequality operator, and
77  * has aggressively simplified both inputs to constants.
78  *
79  * Returns selectivity (0..1), or -1 if we fail to estimate selectivity.
80  */
83  Node *leftop, Node *rightop,
84  Oid elemtype, bool isEquality, bool useOr,
85  int varRelid)
86 {
87  Selectivity selec;
88  VariableStatData vardata;
89  Datum constval;
90  TypeCacheEntry *typentry;
91  FmgrInfo *cmpfunc;
92 
93  /*
94  * rightop must be a variable, else punt.
95  */
96  examine_variable(root, rightop, varRelid, &vardata);
97  if (!vardata.rel)
98  {
99  ReleaseVariableStats(vardata);
100  return -1.0;
101  }
102 
103  /*
104  * leftop must be a constant, else punt.
105  */
106  if (!IsA(leftop, Const))
107  {
108  ReleaseVariableStats(vardata);
109  return -1.0;
110  }
111  if (((Const *) leftop)->constisnull)
112  {
113  /* qual can't succeed if null on left */
114  ReleaseVariableStats(vardata);
115  return (Selectivity) 0.0;
116  }
117  constval = ((Const *) leftop)->constvalue;
118 
119  /* Get element type's default comparison function */
120  typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO);
121  if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
122  {
123  ReleaseVariableStats(vardata);
124  return -1.0;
125  }
126  cmpfunc = &typentry->cmp_proc_finfo;
127 
128  /*
129  * If the operator is <>, swap ANY/ALL, then invert the result later.
130  */
131  if (!isEquality)
132  useOr = !useOr;
133 
134  /* Get array element stats for var, if available */
135  if (HeapTupleIsValid(vardata.statsTuple))
136  {
137  Form_pg_statistic stats;
138  Datum *values;
139  int nvalues;
140  float4 *numbers;
141  int nnumbers;
142  float4 *hist;
143  int nhist;
144 
145  stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
146 
147  /* MCELEM will be an array of same type as element */
148  if (get_attstatsslot(vardata.statsTuple,
149  elemtype, vardata.atttypmod,
151  NULL,
152  &values, &nvalues,
153  &numbers, &nnumbers))
154  {
155  /* For ALL case, also get histogram of distinct-element counts */
156  if (useOr ||
157  !get_attstatsslot(vardata.statsTuple,
158  elemtype, vardata.atttypmod,
160  NULL,
161  NULL, NULL,
162  &hist, &nhist))
163  {
164  hist = NULL;
165  nhist = 0;
166  }
167 
168  /*
169  * For = ANY, estimate as var @> ARRAY[const].
170  *
171  * For = ALL, estimate as var <@ ARRAY[const].
172  */
173  if (useOr)
174  selec = mcelem_array_contain_overlap_selec(values, nvalues,
175  numbers, nnumbers,
176  &constval, 1,
178  cmpfunc);
179  else
180  selec = mcelem_array_contained_selec(values, nvalues,
181  numbers, nnumbers,
182  &constval, 1,
183  hist, nhist,
185  cmpfunc);
186 
187  if (hist)
188  free_attstatsslot(elemtype, NULL, 0, hist, nhist);
189  free_attstatsslot(elemtype, values, nvalues, numbers, nnumbers);
190  }
191  else
192  {
193  /* No most-common-elements info, so do without */
194  if (useOr)
196  NULL, 0,
197  &constval, 1,
199  cmpfunc);
200  else
202  NULL, 0,
203  &constval, 1,
204  NULL, 0,
206  cmpfunc);
207  }
208 
209  /*
210  * MCE stats count only non-null rows, so adjust for null rows.
211  */
212  selec *= (1.0 - stats->stanullfrac);
213  }
214  else
215  {
216  /* No stats at all, so do without */
217  if (useOr)
219  NULL, 0,
220  &constval, 1,
222  cmpfunc);
223  else
225  NULL, 0,
226  &constval, 1,
227  NULL, 0,
229  cmpfunc);
230  /* we assume no nulls here, so no stanullfrac correction */
231  }
232 
233  ReleaseVariableStats(vardata);
234 
235  /*
236  * If the operator is <>, invert the results.
237  */
238  if (!isEquality)
239  selec = 1.0 - selec;
240 
241  CLAMP_PROBABILITY(selec);
242 
243  return selec;
244 }
245 
246 /*
247  * arraycontsel -- restriction selectivity for array @>, &&, <@ operators
248  */
249 Datum
251 {
253  Oid operator = PG_GETARG_OID(1);
254  List *args = (List *) PG_GETARG_POINTER(2);
255  int varRelid = PG_GETARG_INT32(3);
256  VariableStatData vardata;
257  Node *other;
258  bool varonleft;
259  Selectivity selec;
260  Oid element_typeid;
261 
262  /*
263  * If expression is not (variable op something) or (something op
264  * variable), then punt and return a default estimate.
265  */
266  if (!get_restriction_variable(root, args, varRelid,
267  &vardata, &other, &varonleft))
268  PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
269 
270  /*
271  * Can't do anything useful if the something is not a constant, either.
272  */
273  if (!IsA(other, Const))
274  {
275  ReleaseVariableStats(vardata);
276  PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
277  }
278 
279  /*
280  * The "&&", "@>" and "<@" operators are strict, so we can cope with a
281  * NULL constant right away.
282  */
283  if (((Const *) other)->constisnull)
284  {
285  ReleaseVariableStats(vardata);
286  PG_RETURN_FLOAT8(0.0);
287  }
288 
289  /*
290  * If var is on the right, commute the operator, so that we can assume the
291  * var is on the left in what follows.
292  */
293  if (!varonleft)
294  {
295  if (operator == OID_ARRAY_CONTAINS_OP)
296  operator = OID_ARRAY_CONTAINED_OP;
297  else if (operator == OID_ARRAY_CONTAINED_OP)
298  operator = OID_ARRAY_CONTAINS_OP;
299  }
300 
301  /*
302  * OK, there's a Var and a Const we're dealing with here. We need the
303  * Const to be an array with same element type as column, else we can't do
304  * anything useful. (Such cases will likely fail at runtime, but here
305  * we'd rather just return a default estimate.)
306  */
307  element_typeid = get_base_element_type(((Const *) other)->consttype);
308  if (element_typeid != InvalidOid &&
309  element_typeid == get_base_element_type(vardata.vartype))
310  {
311  selec = calc_arraycontsel(&vardata, ((Const *) other)->constvalue,
312  element_typeid, operator);
313  }
314  else
315  {
316  selec = DEFAULT_SEL(operator);
317  }
318 
319  ReleaseVariableStats(vardata);
320 
321  CLAMP_PROBABILITY(selec);
322 
323  PG_RETURN_FLOAT8((float8) selec);
324 }
325 
326 /*
327  * arraycontjoinsel -- join selectivity for array @>, &&, <@ operators
328  */
329 Datum
331 {
332  /* For the moment this is just a stub */
333  Oid operator = PG_GETARG_OID(1);
334 
335  PG_RETURN_FLOAT8(DEFAULT_SEL(operator));
336 }
337 
338 /*
339  * Calculate selectivity for "arraycolumn @> const", "arraycolumn && const"
340  * or "arraycolumn <@ const" based on the statistics
341  *
342  * This function is mainly responsible for extracting the pg_statistic data
343  * to be used; we then pass the problem on to mcelem_array_selec().
344  */
345 static Selectivity
347  Oid elemtype, Oid operator)
348 {
349  Selectivity selec;
350  TypeCacheEntry *typentry;
351  FmgrInfo *cmpfunc;
352  ArrayType *array;
353 
354  /* Get element type's default comparison function */
355  typentry = lookup_type_cache(elemtype, TYPECACHE_CMP_PROC_FINFO);
356  if (!OidIsValid(typentry->cmp_proc_finfo.fn_oid))
357  return DEFAULT_SEL(operator);
358  cmpfunc = &typentry->cmp_proc_finfo;
359 
360  /*
361  * The caller made sure the const is an array with same element type, so
362  * get it now
363  */
364  array = DatumGetArrayTypeP(constval);
365 
366  if (HeapTupleIsValid(vardata->statsTuple))
367  {
368  Form_pg_statistic stats;
369  Datum *values;
370  int nvalues;
371  float4 *numbers;
372  int nnumbers;
373  float4 *hist;
374  int nhist;
375 
376  stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
377 
378  /* MCELEM will be an array of same type as column */
379  if (get_attstatsslot(vardata->statsTuple,
380  elemtype, vardata->atttypmod,
382  NULL,
383  &values, &nvalues,
384  &numbers, &nnumbers))
385  {
386  /*
387  * For "array <@ const" case we also need histogram of distinct
388  * element counts.
389  */
390  if (operator != OID_ARRAY_CONTAINED_OP ||
391  !get_attstatsslot(vardata->statsTuple,
392  elemtype, vardata->atttypmod,
394  NULL,
395  NULL, NULL,
396  &hist, &nhist))
397  {
398  hist = NULL;
399  nhist = 0;
400  }
401 
402  /* Use the most-common-elements slot for the array Var. */
403  selec = mcelem_array_selec(array, typentry,
404  values, nvalues,
405  numbers, nnumbers,
406  hist, nhist,
407  operator, cmpfunc);
408 
409  if (hist)
410  free_attstatsslot(elemtype, NULL, 0, hist, nhist);
411  free_attstatsslot(elemtype, values, nvalues, numbers, nnumbers);
412  }
413  else
414  {
415  /* No most-common-elements info, so do without */
416  selec = mcelem_array_selec(array, typentry,
417  NULL, 0, NULL, 0, NULL, 0,
418  operator, cmpfunc);
419  }
420 
421  /*
422  * MCE stats count only non-null rows, so adjust for null rows.
423  */
424  selec *= (1.0 - stats->stanullfrac);
425  }
426  else
427  {
428  /* No stats at all, so do without */
429  selec = mcelem_array_selec(array, typentry,
430  NULL, 0, NULL, 0, NULL, 0,
431  operator, cmpfunc);
432  /* we assume no nulls here, so no stanullfrac correction */
433  }
434 
435  /* If constant was toasted, release the copy we made */
436  if (PointerGetDatum(array) != constval)
437  pfree(array);
438 
439  return selec;
440 }
441 
442 /*
443  * Array selectivity estimation based on most common elements statistics
444  *
445  * This function just deconstructs and sorts the array constant's contents,
446  * and then passes the problem on to mcelem_array_contain_overlap_selec or
447  * mcelem_array_contained_selec depending on the operator.
448  */
449 static Selectivity
451  Datum *mcelem, int nmcelem,
452  float4 *numbers, int nnumbers,
453  float4 *hist, int nhist,
454  Oid operator, FmgrInfo *cmpfunc)
455 {
456  Selectivity selec;
457  int num_elems;
458  Datum *elem_values;
459  bool *elem_nulls;
460  bool null_present;
461  int nonnull_nitems;
462  int i;
463 
464  /*
465  * Prepare constant array data for sorting. Sorting lets us find unique
466  * elements and efficiently merge with the MCELEM array.
467  */
468  deconstruct_array(array,
469  typentry->type_id,
470  typentry->typlen,
471  typentry->typbyval,
472  typentry->typalign,
473  &elem_values, &elem_nulls, &num_elems);
474 
475  /* Collapse out any null elements */
476  nonnull_nitems = 0;
477  null_present = false;
478  for (i = 0; i < num_elems; i++)
479  {
480  if (elem_nulls[i])
481  null_present = true;
482  else
483  elem_values[nonnull_nitems++] = elem_values[i];
484  }
485 
486  /*
487  * Query "column @> '{anything, null}'" matches nothing. For the other
488  * two operators, presence of a null in the constant can be ignored.
489  */
490  if (null_present && operator == OID_ARRAY_CONTAINS_OP)
491  {
492  pfree(elem_values);
493  pfree(elem_nulls);
494  return (Selectivity) 0.0;
495  }
496 
497  /* Sort extracted elements using their default comparison function. */
498  qsort_arg(elem_values, nonnull_nitems, sizeof(Datum),
499  element_compare, cmpfunc);
500 
501  /* Separate cases according to operator */
502  if (operator == OID_ARRAY_CONTAINS_OP || operator == OID_ARRAY_OVERLAP_OP)
503  selec = mcelem_array_contain_overlap_selec(mcelem, nmcelem,
504  numbers, nnumbers,
505  elem_values, nonnull_nitems,
506  operator, cmpfunc);
507  else if (operator == OID_ARRAY_CONTAINED_OP)
508  selec = mcelem_array_contained_selec(mcelem, nmcelem,
509  numbers, nnumbers,
510  elem_values, nonnull_nitems,
511  hist, nhist,
512  operator, cmpfunc);
513  else
514  {
515  elog(ERROR, "arraycontsel called for unrecognized operator %u",
516  operator);
517  selec = 0.0; /* keep compiler quiet */
518  }
519 
520  pfree(elem_values);
521  pfree(elem_nulls);
522  return selec;
523 }
524 
525 /*
526  * Estimate selectivity of "column @> const" and "column && const" based on
527  * most common element statistics. This estimation assumes element
528  * occurrences are independent.
529  *
530  * mcelem (of length nmcelem) and numbers (of length nnumbers) are from
531  * the array column's MCELEM statistics slot, or are NULL/0 if stats are
532  * not available. array_data (of length nitems) is the constant's elements.
533  *
534  * Both the mcelem and array_data arrays are assumed presorted according
535  * to the element type's cmpfunc. Null elements are not present.
536  *
537  * TODO: this estimate probably could be improved by using the distinct
538  * elements count histogram. For example, excepting the special case of
539  * "column @> '{}'", we can multiply the calculated selectivity by the
540  * fraction of nonempty arrays in the column.
541  */
542 static Selectivity
544  float4 *numbers, int nnumbers,
545  Datum *array_data, int nitems,
546  Oid operator, FmgrInfo *cmpfunc)
547 {
548  Selectivity selec,
549  elem_selec;
550  int mcelem_index,
551  i;
552  bool use_bsearch;
553  float4 minfreq;
554 
555  /*
556  * There should be three more Numbers than Values, because the last three
557  * cells should hold minimal and maximal frequency among the non-null
558  * elements, and then the frequency of null elements. Ignore the Numbers
559  * if not right.
560  */
561  if (nnumbers != nmcelem + 3)
562  {
563  numbers = NULL;
564  nnumbers = 0;
565  }
566 
567  if (numbers)
568  {
569  /* Grab the lowest observed frequency */
570  minfreq = numbers[nmcelem];
571  }
572  else
573  {
574  /* Without statistics make some default assumptions */
575  minfreq = 2 * (float4) DEFAULT_CONTAIN_SEL;
576  }
577 
578  /* Decide whether it is faster to use binary search or not. */
579  if (nitems * floor_log2((uint32) nmcelem) < nmcelem + nitems)
580  use_bsearch = true;
581  else
582  use_bsearch = false;
583 
584  if (operator == OID_ARRAY_CONTAINS_OP)
585  {
586  /*
587  * Initial selectivity for "column @> const" query is 1.0, and it will
588  * be decreased with each element of constant array.
589  */
590  selec = 1.0;
591  }
592  else
593  {
594  /*
595  * Initial selectivity for "column && const" query is 0.0, and it will
596  * be increased with each element of constant array.
597  */
598  selec = 0.0;
599  }
600 
601  /* Scan mcelem and array in parallel. */
602  mcelem_index = 0;
603  for (i = 0; i < nitems; i++)
604  {
605  bool match = false;
606 
607  /* Ignore any duplicates in the array data. */
608  if (i > 0 &&
609  element_compare(&array_data[i - 1], &array_data[i], cmpfunc) == 0)
610  continue;
611 
612  /* Find the smallest MCELEM >= this array item. */
613  if (use_bsearch)
614  {
615  match = find_next_mcelem(mcelem, nmcelem, array_data[i],
616  &mcelem_index, cmpfunc);
617  }
618  else
619  {
620  while (mcelem_index < nmcelem)
621  {
622  int cmp = element_compare(&mcelem[mcelem_index],
623  &array_data[i],
624  cmpfunc);
625 
626  if (cmp < 0)
627  mcelem_index++;
628  else
629  {
630  if (cmp == 0)
631  match = true; /* mcelem is found */
632  break;
633  }
634  }
635  }
636 
637  if (match && numbers)
638  {
639  /* MCELEM matches the array item; use its frequency. */
640  elem_selec = numbers[mcelem_index];
641  mcelem_index++;
642  }
643  else
644  {
645  /*
646  * The element is not in MCELEM. Punt, but assume that the
647  * selectivity cannot be more than minfreq / 2.
648  */
649  elem_selec = Min(DEFAULT_CONTAIN_SEL, minfreq / 2);
650  }
651 
652  /*
653  * Update overall selectivity using the current element's selectivity
654  * and an assumption of element occurrence independence.
655  */
656  if (operator == OID_ARRAY_CONTAINS_OP)
657  selec *= elem_selec;
658  else
659  selec = selec + elem_selec - selec * elem_selec;
660 
661  /* Clamp intermediate results to stay sane despite roundoff error */
662  CLAMP_PROBABILITY(selec);
663  }
664 
665  return selec;
666 }
667 
668 /*
669  * Estimate selectivity of "column <@ const" based on most common element
670  * statistics.
671  *
672  * mcelem (of length nmcelem) and numbers (of length nnumbers) are from
673  * the array column's MCELEM statistics slot, or are NULL/0 if stats are
674  * not available. array_data (of length nitems) is the constant's elements.
675  * hist (of length nhist) is from the array column's DECHIST statistics slot,
676  * or is NULL/0 if those stats are not available.
677  *
678  * Both the mcelem and array_data arrays are assumed presorted according
679  * to the element type's cmpfunc. Null elements are not present.
680  *
681  * Independent element occurrence would imply a particular distribution of
682  * distinct element counts among matching rows. Real data usually falsifies
683  * that assumption. For example, in a set of 11-element integer arrays having
684  * elements in the range [0..10], element occurrences are typically not
685  * independent. If they were, a sufficiently-large set would include all
686  * distinct element counts 0 through 11. We correct for this using the
687  * histogram of distinct element counts.
688  *
689  * In the "column @> const" and "column && const" cases, we usually have a
690  * "const" with low number of elements (otherwise we have selectivity close
691  * to 0 or 1 respectively). That's why the effect of dependence related
692  * to distinct element count distribution is negligible there. In the
693  * "column <@ const" case, number of elements is usually high (otherwise we
694  * have selectivity close to 0). That's why we should do a correction with
695  * the array distinct element count distribution here.
696  *
697  * Using the histogram of distinct element counts produces a different
698  * distribution law than independent occurrences of elements. This
699  * distribution law can be described as follows:
700  *
701  * P(o1, o2, ..., on) = f1^o1 * (1 - f1)^(1 - o1) * f2^o2 *
702  * (1 - f2)^(1 - o2) * ... * fn^on * (1 - fn)^(1 - on) * hist[m] / ind[m]
703  *
704  * where:
705  * o1, o2, ..., on - occurrences of elements 1, 2, ..., n
706  * (1 - occurrence, 0 - no occurrence) in row
707  * f1, f2, ..., fn - frequencies of elements 1, 2, ..., n
708  * (scalar values in [0..1]) according to collected statistics
709  * m = o1 + o2 + ... + on = total number of distinct elements in row
710  * hist[m] - histogram data for occurrence of m elements.
711  * ind[m] - probability of m occurrences from n events assuming their
712  * probabilities to be equal to frequencies of array elements.
713  *
714  * ind[m] = sum(f1^o1 * (1 - f1)^(1 - o1) * f2^o2 * (1 - f2)^(1 - o2) *
715  * ... * fn^on * (1 - fn)^(1 - on), o1, o2, ..., on) | o1 + o2 + .. on = m
716  */
717 static Selectivity
718 mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
719  float4 *numbers, int nnumbers,
720  Datum *array_data, int nitems,
721  float4 *hist, int nhist,
722  Oid operator, FmgrInfo *cmpfunc)
723 {
724  int mcelem_index,
725  i,
726  unique_nitems = 0;
727  float selec,
728  minfreq,
729  nullelem_freq;
730  float *dist,
731  *mcelem_dist,
732  *hist_part;
733  float avg_count,
734  mult,
735  rest;
736  float *elem_selec;
737 
738  /*
739  * There should be three more Numbers than Values in the MCELEM slot,
740  * because the last three cells should hold minimal and maximal frequency
741  * among the non-null elements, and then the frequency of null elements.
742  * Punt if not right, because we can't do much without the element freqs.
743  */
744  if (numbers == NULL || nnumbers != nmcelem + 3)
745  return DEFAULT_CONTAIN_SEL;
746 
747  /* Can't do much without a count histogram, either */
748  if (hist == NULL || nhist < 3)
749  return DEFAULT_CONTAIN_SEL;
750 
751  /*
752  * Grab some of the summary statistics that compute_array_stats() stores:
753  * lowest frequency, frequency of null elements, and average distinct
754  * element count.
755  */
756  minfreq = numbers[nmcelem];
757  nullelem_freq = numbers[nmcelem + 2];
758  avg_count = hist[nhist - 1];
759 
760  /*
761  * "rest" will be the sum of the frequencies of all elements not
762  * represented in MCELEM. The average distinct element count is the sum
763  * of the frequencies of *all* elements. Begin with that; we will proceed
764  * to subtract the MCELEM frequencies.
765  */
766  rest = avg_count;
767 
768  /*
769  * mult is a multiplier representing estimate of probability that each
770  * mcelem that is not present in constant doesn't occur.
771  */
772  mult = 1.0f;
773 
774  /*
775  * elem_selec is array of estimated frequencies for elements in the
776  * constant.
777  */
778  elem_selec = (float *) palloc(sizeof(float) * nitems);
779 
780  /* Scan mcelem and array in parallel. */
781  mcelem_index = 0;
782  for (i = 0; i < nitems; i++)
783  {
784  bool match = false;
785 
786  /* Ignore any duplicates in the array data. */
787  if (i > 0 &&
788  element_compare(&array_data[i - 1], &array_data[i], cmpfunc) == 0)
789  continue;
790 
791  /*
792  * Iterate over MCELEM until we find an entry greater than or equal to
793  * this element of the constant. Update "rest" and "mult" for mcelem
794  * entries skipped over.
795  */
796  while (mcelem_index < nmcelem)
797  {
798  int cmp = element_compare(&mcelem[mcelem_index],
799  &array_data[i],
800  cmpfunc);
801 
802  if (cmp < 0)
803  {
804  mult *= (1.0f - numbers[mcelem_index]);
805  rest -= numbers[mcelem_index];
806  mcelem_index++;
807  }
808  else
809  {
810  if (cmp == 0)
811  match = true; /* mcelem is found */
812  break;
813  }
814  }
815 
816  if (match)
817  {
818  /* MCELEM matches the array item. */
819  elem_selec[unique_nitems] = numbers[mcelem_index];
820  /* "rest" is decremented for all mcelems, matched or not */
821  rest -= numbers[mcelem_index];
822  mcelem_index++;
823  }
824  else
825  {
826  /*
827  * The element is not in MCELEM. Punt, but assume that the
828  * selectivity cannot be more than minfreq / 2.
829  */
830  elem_selec[unique_nitems] = Min(DEFAULT_CONTAIN_SEL,
831  minfreq / 2);
832  }
833 
834  unique_nitems++;
835  }
836 
837  /*
838  * If we handled all constant elements without exhausting the MCELEM
839  * array, finish walking it to complete calculation of "rest" and "mult".
840  */
841  while (mcelem_index < nmcelem)
842  {
843  mult *= (1.0f - numbers[mcelem_index]);
844  rest -= numbers[mcelem_index];
845  mcelem_index++;
846  }
847 
848  /*
849  * The presence of many distinct rare elements materially decreases
850  * selectivity. Use the Poisson distribution to estimate the probability
851  * of a column value having zero occurrences of such elements. See above
852  * for the definition of "rest".
853  */
854  mult *= exp(-rest);
855 
856  /*----------
857  * Using the distinct element count histogram requires
858  * O(unique_nitems * (nmcelem + unique_nitems))
859  * operations. Beyond a certain computational cost threshold, it's
860  * reasonable to sacrifice accuracy for decreased planning time. We limit
861  * the number of operations to EFFORT * nmcelem; since nmcelem is limited
862  * by the column's statistics target, the work done is user-controllable.
863  *
864  * If the number of operations would be too large, we can reduce it
865  * without losing all accuracy by reducing unique_nitems and considering
866  * only the most-common elements of the constant array. To make the
867  * results exactly match what we would have gotten with only those
868  * elements to start with, we'd have to remove any discarded elements'
869  * frequencies from "mult", but since this is only an approximation
870  * anyway, we don't bother with that. Therefore it's sufficient to qsort
871  * elem_selec[] and take the largest elements. (They will no longer match
872  * up with the elements of array_data[], but we don't care.)
873  *----------
874  */
875 #define EFFORT 100
876 
877  if ((nmcelem + unique_nitems) > 0 &&
878  unique_nitems > EFFORT * nmcelem / (nmcelem + unique_nitems))
879  {
880  /*
881  * Use the quadratic formula to solve for largest allowable N. We
882  * have A = 1, B = nmcelem, C = - EFFORT * nmcelem.
883  */
884  double b = (double) nmcelem;
885  int n;
886 
887  n = (int) ((sqrt(b * b + 4 * EFFORT * b) - b) / 2);
888 
889  /* Sort, then take just the first n elements */
890  qsort(elem_selec, unique_nitems, sizeof(float),
892  unique_nitems = n;
893  }
894 
895  /*
896  * Calculate probabilities of each distinct element count for both mcelems
897  * and constant elements. At this point, assume independent element
898  * occurrence.
899  */
900  dist = calc_distr(elem_selec, unique_nitems, unique_nitems, 0.0f);
901  mcelem_dist = calc_distr(numbers, nmcelem, unique_nitems, rest);
902 
903  /* ignore hist[nhist-1], which is the average not a histogram member */
904  hist_part = calc_hist(hist, nhist - 1, unique_nitems);
905 
906  selec = 0.0f;
907  for (i = 0; i <= unique_nitems; i++)
908  {
909  /*
910  * mult * dist[i] / mcelem_dist[i] gives us probability of qual
911  * matching from assumption of independent element occurrence with the
912  * condition that distinct element count = i.
913  */
914  if (mcelem_dist[i] > 0)
915  selec += hist_part[i] * mult * dist[i] / mcelem_dist[i];
916  }
917 
918  pfree(dist);
919  pfree(mcelem_dist);
920  pfree(hist_part);
921  pfree(elem_selec);
922 
923  /* Take into account occurrence of NULL element. */
924  selec *= (1.0f - nullelem_freq);
925 
926  CLAMP_PROBABILITY(selec);
927 
928  return selec;
929 }
930 
931 /*
932  * Calculate the first n distinct element count probabilities from a
933  * histogram of distinct element counts.
934  *
935  * Returns a palloc'd array of n+1 entries, with array[k] being the
936  * probability of element count k, k in [0..n].
937  *
938  * We assume that a histogram box with bounds a and b gives 1 / ((b - a + 1) *
939  * (nhist - 1)) probability to each value in (a,b) and an additional half of
940  * that to a and b themselves.
941  */
942 static float *
943 calc_hist(const float4 *hist, int nhist, int n)
944 {
945  float *hist_part;
946  int k,
947  i = 0;
948  float prev_interval = 0,
949  next_interval;
950  float frac;
951 
952  hist_part = (float *) palloc((n + 1) * sizeof(float));
953 
954  /*
955  * frac is a probability contribution for each interval between histogram
956  * values. We have nhist - 1 intervals, so contribution of each one will
957  * be 1 / (nhist - 1).
958  */
959  frac = 1.0f / ((float) (nhist - 1));
960 
961  for (k = 0; k <= n; k++)
962  {
963  int count = 0;
964 
965  /*
966  * Count the histogram boundaries equal to k. (Although the histogram
967  * should theoretically contain only exact integers, entries are
968  * floats so there could be roundoff error in large values. Treat any
969  * fractional value as equal to the next larger k.)
970  */
971  while (i < nhist && hist[i] <= k)
972  {
973  count++;
974  i++;
975  }
976 
977  if (count > 0)
978  {
979  /* k is an exact bound for at least one histogram box. */
980  float val;
981 
982  /* Find length between current histogram value and the next one */
983  if (i < nhist)
984  next_interval = hist[i] - hist[i - 1];
985  else
986  next_interval = 0;
987 
988  /*
989  * count - 1 histogram boxes contain k exclusively. They
990  * contribute a total of (count - 1) * frac probability. Also
991  * factor in the partial histogram boxes on either side.
992  */
993  val = (float) (count - 1);
994  if (next_interval > 0)
995  val += 0.5f / next_interval;
996  if (prev_interval > 0)
997  val += 0.5f / prev_interval;
998  hist_part[k] = frac * val;
999 
1000  prev_interval = next_interval;
1001  }
1002  else
1003  {
1004  /* k does not appear as an exact histogram bound. */
1005  if (prev_interval > 0)
1006  hist_part[k] = frac / prev_interval;
1007  else
1008  hist_part[k] = 0.0f;
1009  }
1010  }
1011 
1012  return hist_part;
1013 }
1014 
1015 /*
1016  * Consider n independent events with probabilities p[]. This function
1017  * calculates probabilities of exact k of events occurrence for k in [0..m].
1018  * Returns a palloc'd array of size m+1.
1019  *
1020  * "rest" is the sum of the probabilities of all low-probability events not
1021  * included in p.
1022  *
1023  * Imagine matrix M of size (n + 1) x (m + 1). Element M[i,j] denotes the
1024  * probability that exactly j of first i events occur. Obviously M[0,0] = 1.
1025  * For any constant j, each increment of i increases the probability iff the
1026  * event occurs. So, by the law of total probability:
1027  * M[i,j] = M[i - 1, j] * (1 - p[i]) + M[i - 1, j - 1] * p[i]
1028  * for i > 0, j > 0.
1029  * M[i,0] = M[i - 1, 0] * (1 - p[i]) for i > 0.
1030  */
1031 static float *
1032 calc_distr(const float *p, int n, int m, float rest)
1033 {
1034  float *row,
1035  *prev_row,
1036  *tmp;
1037  int i,
1038  j;
1039 
1040  /*
1041  * Since we return only the last row of the matrix and need only the
1042  * current and previous row for calculations, allocate two rows.
1043  */
1044  row = (float *) palloc((m + 1) * sizeof(float));
1045  prev_row = (float *) palloc((m + 1) * sizeof(float));
1046 
1047  /* M[0,0] = 1 */
1048  row[0] = 1.0f;
1049  for (i = 1; i <= n; i++)
1050  {
1051  float t = p[i - 1];
1052 
1053  /* Swap rows */
1054  tmp = row;
1055  row = prev_row;
1056  prev_row = tmp;
1057 
1058  /* Calculate next row */
1059  for (j = 0; j <= i && j <= m; j++)
1060  {
1061  float val = 0.0f;
1062 
1063  if (j < i)
1064  val += prev_row[j] * (1.0f - t);
1065  if (j > 0)
1066  val += prev_row[j - 1] * t;
1067  row[j] = val;
1068  }
1069  }
1070 
1071  /*
1072  * The presence of many distinct rare (not in "p") elements materially
1073  * decreases selectivity. Model their collective occurrence with the
1074  * Poisson distribution.
1075  */
1076  if (rest > DEFAULT_CONTAIN_SEL)
1077  {
1078  float t;
1079 
1080  /* Swap rows */
1081  tmp = row;
1082  row = prev_row;
1083  prev_row = tmp;
1084 
1085  for (i = 0; i <= m; i++)
1086  row[i] = 0.0f;
1087 
1088  /* Value of Poisson distribution for 0 occurrences */
1089  t = exp(-rest);
1090 
1091  /*
1092  * Calculate convolution of previously computed distribution and the
1093  * Poisson distribution.
1094  */
1095  for (i = 0; i <= m; i++)
1096  {
1097  for (j = 0; j <= m - i; j++)
1098  row[j + i] += prev_row[j] * t;
1099 
1100  /* Get Poisson distribution value for (i + 1) occurrences */
1101  t *= rest / (float) (i + 1);
1102  }
1103  }
1104 
1105  pfree(prev_row);
1106  return row;
1107 }
1108 
1109 /* Fast function for floor value of 2 based logarithm calculation. */
1110 static int
1112 {
1113  int logval = 0;
1114 
1115  if (n == 0)
1116  return -1;
1117  if (n >= (1 << 16))
1118  {
1119  n >>= 16;
1120  logval += 16;
1121  }
1122  if (n >= (1 << 8))
1123  {
1124  n >>= 8;
1125  logval += 8;
1126  }
1127  if (n >= (1 << 4))
1128  {
1129  n >>= 4;
1130  logval += 4;
1131  }
1132  if (n >= (1 << 2))
1133  {
1134  n >>= 2;
1135  logval += 2;
1136  }
1137  if (n >= (1 << 1))
1138  {
1139  logval += 1;
1140  }
1141  return logval;
1142 }
1143 
1144 /*
1145  * find_next_mcelem binary-searches a most common elements array, starting
1146  * from *index, for the first member >= value. It saves the position of the
1147  * match into *index and returns true if it's an exact match. (Note: we
1148  * assume the mcelem elements are distinct so there can't be more than one
1149  * exact match.)
1150  */
1151 static bool
1152 find_next_mcelem(Datum *mcelem, int nmcelem, Datum value, int *index,
1153  FmgrInfo *cmpfunc)
1154 {
1155  int l = *index,
1156  r = nmcelem - 1,
1157  i,
1158  res;
1159 
1160  while (l <= r)
1161  {
1162  i = (l + r) / 2;
1163  res = element_compare(&mcelem[i], &value, cmpfunc);
1164  if (res == 0)
1165  {
1166  *index = i;
1167  return true;
1168  }
1169  else if (res < 0)
1170  l = i + 1;
1171  else
1172  r = i - 1;
1173  }
1174  *index = l;
1175  return false;
1176 }
1177 
1178 /*
1179  * Comparison function for elements.
1180  *
1181  * We use the element type's default btree opclass, and the default collation
1182  * if the type is collation-sensitive.
1183  *
1184  * XXX consider using SortSupport infrastructure
1185  */
1186 static int
1187 element_compare(const void *key1, const void *key2, void *arg)
1188 {
1189  Datum d1 = *((const Datum *) key1);
1190  Datum d2 = *((const Datum *) key2);
1191  FmgrInfo *cmpfunc = (FmgrInfo *) arg;
1192  Datum c;
1193 
1194  c = FunctionCall2Coll(cmpfunc, DEFAULT_COLLATION_OID, d1, d2);
1195  return DatumGetInt32(c);
1196 }
1197 
1198 /*
1199  * Comparison function for sorting floats into descending order.
1200  */
1201 static int
1202 float_compare_desc(const void *key1, const void *key2)
1203 {
1204  float d1 = *((const float *) key1);
1205  float d2 = *((const float *) key2);
1206 
1207  if (d1 > d2)
1208  return -1;
1209  else if (d1 < d2)
1210  return 1;
1211  else
1212  return 0;
1213 }
#define PG_GETARG_INT32(n)
Definition: fmgr.h:225
Definition: fmgr.h:53
#define IsA(nodeptr, _type_)
Definition: nodes.h:515
#define GETSTRUCT(TUP)
Definition: htup_details.h:631
static int element_compare(const void *key1, const void *key2, void *arg)
#define DatumGetInt32(X)
Definition: postgres.h:480
HeapTuple statsTuple
Definition: selfuncs.h:71
#define PointerGetDatum(X)
Definition: postgres.h:564
bool get_restriction_variable(PlannerInfo *root, List *args, int varRelid, VariableStatData *vardata, Node **other, bool *varonleft)
Definition: selfuncs.c:4227
#define DEFAULT_CONTAIN_SEL
RelOptInfo * rel
Definition: selfuncs.h:70
#define PG_RETURN_FLOAT8(x)
Definition: fmgr.h:310
#define Min(x, y)
Definition: c.h:787
Definition: nodes.h:464
static float * calc_hist(const float4 *hist, int nhist, int n)
#define PG_GETARG_POINTER(n)
Definition: fmgr.h:232
double Selectivity
Definition: nodes.h:551
bool get_attstatsslot(HeapTuple statstuple, Oid atttype, int32 atttypmod, int reqkind, Oid reqop, Oid *actualop, Datum **values, int *nvalues, float4 **numbers, int *nnumbers)
Definition: lsyscache.c:2808
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Definition: fmgr.c:1307
unsigned int Oid
Definition: postgres_ext.h:31
int16 typlen
Definition: typcache.h:35
FormData_pg_statistic * Form_pg_statistic
Definition: pg_statistic.h:128
bool typbyval
Definition: typcache.h:36
#define OidIsValid(objectId)
Definition: c.h:519
int32 atttypmod
Definition: selfuncs.h:76
Datum arraycontsel(PG_FUNCTION_ARGS)
Definition: type.h:90
#define CLAMP_PROBABILITY(p)
Definition: selfuncs.h:57
FmgrInfo cmp_proc_finfo
Definition: typcache.h:68
#define OID_ARRAY_CONTAINS_OP
Definition: pg_operator.h:1542
void pfree(void *pointer)
Definition: mcxt.c:993
#define ERROR
Definition: elog.h:41
double float8
Definition: c.h:366
#define STATISTIC_KIND_DECHIST
Definition: pg_statistic.h:269
static struct @72 value
#define OID_ARRAY_OVERLAP_OP
Definition: pg_operator.h:1539
char * c
#define PG_GETARG_OID(n)
Definition: fmgr.h:231
#define DEFAULT_COLLATION_OID
Definition: pg_collation.h:68
unsigned int uint32
Definition: c.h:254
static float * calc_distr(const float *p, int n, int m, float rest)
static Selectivity mcelem_array_selec(ArrayType *array, TypeCacheEntry *typentry, Datum *mcelem, int nmcelem, float4 *numbers, int nnumbers, float4 *hist, int nhist, Oid operator, FmgrInfo *cmpfunc)
void qsort_arg(void *base, size_t nel, size_t elsize, qsort_arg_comparator cmp, void *arg)
Definition: qsort_arg.c:113
#define OID_ARRAY_CONTAINED_OP
Definition: pg_operator.h:1545
float float4
Definition: c.h:365
uintptr_t Datum
Definition: postgres.h:374
TypeCacheEntry * lookup_type_cache(Oid type_id, int flags)
Definition: typcache.c:182
#define InvalidOid
Definition: postgres_ext.h:36
Oid fn_oid
Definition: fmgr.h:56
static bool find_next_mcelem(Datum *mcelem, int nmcelem, Datum value, int *index, FmgrInfo *cmpfunc)
#define HeapTupleIsValid(tuple)
Definition: htup.h:77
void examine_variable(PlannerInfo *root, Node *node, int varRelid, VariableStatData *vardata)
Definition: selfuncs.c:4346
#define EFFORT
#define NULL
Definition: c.h:215
Selectivity scalararraysel_containment(PlannerInfo *root, Node *leftop, Node *rightop, Oid elemtype, bool isEquality, bool useOr, int varRelid)
static int floor_log2(uint32 n)
static Selectivity calc_arraycontsel(VariableStatData *vardata, Datum constval, Oid elemtype, Oid operator)
static Selectivity mcelem_array_contained_selec(Datum *mcelem, int nmcelem, float4 *numbers, int nnumbers, Datum *array_data, int nitems, float4 *hist, int nhist, Oid operator, FmgrInfo *cmpfunc)
void deconstruct_array(ArrayType *array, Oid elmtype, int elmlen, bool elmbyval, char elmalign, Datum **elemsp, bool **nullsp, int *nelemsp)
Definition: arrayfuncs.c:3446
static Datum values[MAXATTR]
Definition: bootstrap.c:159
Oid get_base_element_type(Oid typid)
Definition: lsyscache.c:2479
#define DEFAULT_SEL(operator)
#define ReleaseVariableStats(vardata)
Definition: selfuncs.h:80
char typalign
Definition: typcache.h:37
void * palloc(Size size)
Definition: mcxt.c:892
int i
#define STATISTIC_KIND_MCELEM
Definition: pg_statistic.h:256
void * arg
#define PG_FUNCTION_ARGS
Definition: fmgr.h:150
#define TYPECACHE_CMP_PROC_FINFO
Definition: typcache.h:116
#define elog
Definition: elog.h:228
#define qsort(a, b, c, d)
Definition: port.h:437
Definition: pg_list.h:45
long val
Definition: informix.c:689
void free_attstatsslot(Oid atttype, Datum *values, int nvalues, float4 *numbers, int nnumbers)
Definition: lsyscache.c:2932
Datum arraycontjoinsel(PG_FUNCTION_ARGS)
static int cmp(const chr *x, const chr *y, size_t len)
Definition: regc_locale.c:676
static int float_compare_desc(const void *key1, const void *key2)
static Selectivity mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem, float4 *numbers, int nnumbers, Datum *array_data, int nitems, Oid operator, FmgrInfo *cmpfunc)
#define DatumGetArrayTypeP(X)
Definition: array.h:242