PostgreSQL Source Code  git master
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros
regexp.c
Go to the documentation of this file.
1 /*-------------------------------------------------------------------------
2  *
3  * regexp.c
4  * Postgres' interface to the regular expression package.
5  *
6  * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/utils/adt/regexp.c
12  *
13  * Alistair Crooks added the code for the regex caching
14  * agc - cached the regular expressions used - there's a good chance
15  * that we'll get a hit, so this saves a compile step for every
16  * attempted match. I haven't actually measured the speed improvement,
17  * but it `looks' a lot quicker visually when watching regression
18  * test output.
19  *
20  * agc - incorporated Keith Bostic's Berkeley regex code into
21  * the tree for all ports. To distinguish this regex code from any that
22  * is existent on a platform, I've prepended the string "pg_" to
23  * the functions regcomp, regerror, regexec and regfree.
24  * Fixed a bug that was originally a typo by me, where `i' was used
25  * instead of `oldest' when compiling regular expressions - benign
26  * results mostly, although occasionally it bit you...
27  *
28  *-------------------------------------------------------------------------
29  */
30 #include "postgres.h"
31 
32 #include "catalog/pg_type.h"
33 #include "funcapi.h"
34 #include "regex/regex.h"
35 #include "utils/array.h"
36 #include "utils/builtins.h"
37 
/*
 * Fetch function argument _n with PG_GETARG_TEXT_PP when the call supplied
 * more than _n arguments; otherwise evaluate to NULL.
 */
#define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \
	((_n) < PG_NARGS() ? PG_GETARG_TEXT_PP(_n) : NULL)
40 
41 
/*
 * Parsed form of the option letters accepted by the regex functions.
 */
typedef struct pg_re_flags
{
	/* flag bits handed to the regex compiler (Spencer's regex code) */
	int			cflags;
	/* true to process every occurrence rather than only the first */
	bool		glob;
} pg_re_flags;
48 
49 /* cross-call state for regexp_matches(), also regexp_split() */
50 typedef struct regexp_matches_ctx
51 {
52  text *orig_str; /* data string in original TEXT form */
53  int nmatches; /* number of places where pattern matched */
54  int npatterns; /* number of capturing subpatterns */
55  /* We store start char index and end+1 char index for each match */
56  /* so the number of entries in match_locs is nmatches * npatterns * 2 */
57  int *match_locs; /* 0-based character indexes */
58  int next_match; /* 0-based index of next match to process */
59  /* workspace for build_regexp_matches_result() */
60  Datum *elems; /* has npatterns elements */
61  bool *nulls; /* has npatterns elements */
63 
64 /*
65  * We cache precompiled regular expressions using a "self organizing list"
66  * structure, in which recently-used items tend to be near the front.
67  * Whenever we use an entry, it's moved up to the front of the list.
68  * Over time, an item's average position corresponds to its frequency of use.
69  *
70  * When we first create an entry, it's inserted at the front of
71  * the array, dropping the entry at the end of the array if necessary to
72  * make room. (This might seem to be weighting the new entry too heavily,
73  * but if we insert new entries further back, we'll be unable to adjust to
74  * a sudden shift in the query mix where we are presented with MAX_CACHED_RES
75  * never-before-seen items used circularly. We ought to be able to handle
76  * that case, so we have to insert at the front.)
77  *
78  * Knuth mentions a variant strategy in which a used item is moved up just
79  * one place in the list. Although he says this uses fewer comparisons on
80  * average, it seems not to adapt very well to the situation where you have
81  * both some reusable patterns and a steady stream of non-reusable patterns.
82  * A reusable pattern that isn't used at least as often as non-reusable
83  * patterns are seen will "fail to keep up" and will drop off the end of the
84  * cache. With move-to-front, a reusable pattern is guaranteed to stay in
85  * the cache as long as it's used at least once in every MAX_CACHED_RES uses.
86  */
87 
/*
 * Capacity of the compiled-regex cache.  A definition supplied before this
 * point (for instance on the compiler command line) takes precedence.
 */
#if !defined(MAX_CACHED_RES)
#define MAX_CACHED_RES 32
#endif
92 
93 /* this structure describes one cached regular expression */
94 typedef struct cached_re_str
95 {
96  char *cre_pat; /* original RE (not null terminated!) */
97  int cre_pat_len; /* length of original RE, in bytes */
98  int cre_flags; /* compile flags: extended,icase etc */
99  Oid cre_collation; /* collation to use */
100  regex_t cre_re; /* the compiled regular expression */
101 } cached_re_str;
102 
103 static int num_res = 0; /* # of cached re's */
104 static cached_re_str re_array[MAX_CACHED_RES]; /* cached re's */
105 
106 
107 /* Local functions */
108 static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
109  text *flags,
110  Oid collation,
111  bool force_glob,
112  bool use_subpatterns,
113  bool ignore_degenerate);
114 static void cleanup_regexp_matches(regexp_matches_ctx *matchctx);
117 
118 
119 /*
120  * RE_compile_and_cache - compile a RE, caching if possible
121  *
122  * Returns regex_t *
123  *
124  * text_re --- the pattern, expressed as a TEXT object
125  * cflags --- compile options for the pattern
126  * collation --- collation to use for LC_CTYPE-dependent behavior
127  *
128  * Pattern is given in the database encoding. We internally convert to
129  * an array of pg_wchar, which is what Spencer's regex package wants.
130  */
131 static regex_t *
132 RE_compile_and_cache(text *text_re, int cflags, Oid collation)
133 {
134  int text_re_len = VARSIZE_ANY_EXHDR(text_re);
135  char *text_re_val = VARDATA_ANY(text_re);
136  pg_wchar *pattern;
137  int pattern_len;
138  int i;
139  int regcomp_result;
140  cached_re_str re_temp;
141  char errMsg[100];
142 
143  /*
144  * Look for a match among previously compiled REs. Since the data
145  * structure is self-organizing with most-used entries at the front, our
146  * search strategy can just be to scan from the front.
147  */
148  for (i = 0; i < num_res; i++)
149  {
150  if (re_array[i].cre_pat_len == text_re_len &&
151  re_array[i].cre_flags == cflags &&
152  re_array[i].cre_collation == collation &&
153  memcmp(re_array[i].cre_pat, text_re_val, text_re_len) == 0)
154  {
155  /*
156  * Found a match; move it to front if not there already.
157  */
158  if (i > 0)
159  {
160  re_temp = re_array[i];
161  memmove(&re_array[1], &re_array[0], i * sizeof(cached_re_str));
162  re_array[0] = re_temp;
163  }
164 
165  return &re_array[0].cre_re;
166  }
167  }
168 
169  /*
170  * Couldn't find it, so try to compile the new RE. To avoid leaking
171  * resources on failure, we build into the re_temp local.
172  */
173 
174  /* Convert pattern string to wide characters */
175  pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar));
176  pattern_len = pg_mb2wchar_with_len(text_re_val,
177  pattern,
178  text_re_len);
179 
180  regcomp_result = pg_regcomp(&re_temp.cre_re,
181  pattern,
182  pattern_len,
183  cflags,
184  collation);
185 
186  pfree(pattern);
187 
188  if (regcomp_result != REG_OKAY)
189  {
190  /* re didn't compile (no need for pg_regfree, if so) */
191  pg_regerror(regcomp_result, &re_temp.cre_re, errMsg, sizeof(errMsg));
192  ereport(ERROR,
193  (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
194  errmsg("invalid regular expression: %s", errMsg)));
195  }
196 
197  /*
198  * We use malloc/free for the cre_pat field because the storage has to
199  * persist across transactions, and because we want to get control back on
200  * out-of-memory. The Max() is because some malloc implementations return
201  * NULL for malloc(0).
202  */
203  re_temp.cre_pat = malloc(Max(text_re_len, 1));
204  if (re_temp.cre_pat == NULL)
205  {
206  pg_regfree(&re_temp.cre_re);
207  ereport(ERROR,
208  (errcode(ERRCODE_OUT_OF_MEMORY),
209  errmsg("out of memory")));
210  }
211  memcpy(re_temp.cre_pat, text_re_val, text_re_len);
212  re_temp.cre_pat_len = text_re_len;
213  re_temp.cre_flags = cflags;
214  re_temp.cre_collation = collation;
215 
216  /*
217  * Okay, we have a valid new item in re_temp; insert it into the storage
218  * array. Discard last entry if needed.
219  */
220  if (num_res >= MAX_CACHED_RES)
221  {
222  --num_res;
223  Assert(num_res < MAX_CACHED_RES);
224  pg_regfree(&re_array[num_res].cre_re);
225  free(re_array[num_res].cre_pat);
226  }
227 
228  if (num_res > 0)
229  memmove(&re_array[1], &re_array[0], num_res * sizeof(cached_re_str));
230 
231  re_array[0] = re_temp;
232  num_res++;
233 
234  return &re_array[0].cre_re;
235 }
236 
237 /*
238  * RE_wchar_execute - execute a RE on pg_wchar data
239  *
240  * Returns TRUE on match, FALSE on no match
241  *
242  * re --- the compiled pattern as returned by RE_compile_and_cache
243  * data --- the data to match against (need not be null-terminated)
244  * data_len --- the length of the data string
245  * start_search -- the offset in the data to start searching
246  * nmatch, pmatch --- optional return area for match details
247  *
248  * Data is given as array of pg_wchar which is what Spencer's regex package
249  * wants.
250  */
251 static bool
252 RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len,
253  int start_search, int nmatch, regmatch_t *pmatch)
254 {
255  int regexec_result;
256  char errMsg[100];
257 
258  /* Perform RE match and return result */
259  regexec_result = pg_regexec(re,
260  data,
261  data_len,
262  start_search,
263  NULL, /* no details */
264  nmatch,
265  pmatch,
266  0);
267 
268  if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
269  {
270  /* re failed??? */
271  pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
272  ereport(ERROR,
273  (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
274  errmsg("regular expression failed: %s", errMsg)));
275  }
276 
277  return (regexec_result == REG_OKAY);
278 }
279 
280 /*
281  * RE_execute - execute a RE
282  *
283  * Returns TRUE on match, FALSE on no match
284  *
285  * re --- the compiled pattern as returned by RE_compile_and_cache
286  * dat --- the data to match against (need not be null-terminated)
287  * dat_len --- the length of the data string
288  * nmatch, pmatch --- optional return area for match details
289  *
290  * Data is given in the database encoding. We internally
291  * convert to array of pg_wchar which is what Spencer's regex package wants.
292  */
293 static bool
294 RE_execute(regex_t *re, char *dat, int dat_len,
295  int nmatch, regmatch_t *pmatch)
296 {
297  pg_wchar *data;
298  int data_len;
299  bool match;
300 
301  /* Convert data string to wide characters */
302  data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
303  data_len = pg_mb2wchar_with_len(dat, data, dat_len);
304 
305  /* Perform RE match and return result */
306  match = RE_wchar_execute(re, data, data_len, 0, nmatch, pmatch);
307 
308  pfree(data);
309  return match;
310 }
311 
312 /*
313  * RE_compile_and_execute - compile and execute a RE
314  *
315  * Returns TRUE on match, FALSE on no match
316  *
317  * text_re --- the pattern, expressed as a TEXT object
318  * dat --- the data to match against (need not be null-terminated)
319  * dat_len --- the length of the data string
320  * cflags --- compile options for the pattern
321  * collation --- collation to use for LC_CTYPE-dependent behavior
322  * nmatch, pmatch --- optional return area for match details
323  *
324  * Both pattern and data are given in the database encoding. We internally
325  * convert to array of pg_wchar which is what Spencer's regex package wants.
326  */
327 static bool
328 RE_compile_and_execute(text *text_re, char *dat, int dat_len,
329  int cflags, Oid collation,
330  int nmatch, regmatch_t *pmatch)
331 {
332  regex_t *re;
333 
334  /* Compile RE */
335  re = RE_compile_and_cache(text_re, cflags, collation);
336 
337  return RE_execute(re, dat, dat_len, nmatch, pmatch);
338 }
339 
340 
341 /*
342  * parse_re_flags - parse the options argument of regexp_matches and friends
343  *
344  * flags --- output argument, filled with desired options
345  * opts --- TEXT object, or NULL for defaults
346  *
347  * This accepts all the options allowed by any of the callers; callers that
348  * don't want some have to reject them after the fact.
349  */
350 static void
352 {
353  /* regex flavor is always folded into the compile flags */
354  flags->cflags = REG_ADVANCED;
355  flags->glob = false;
356 
357  if (opts)
358  {
359  char *opt_p = VARDATA_ANY(opts);
360  int opt_len = VARSIZE_ANY_EXHDR(opts);
361  int i;
362 
363  for (i = 0; i < opt_len; i++)
364  {
365  switch (opt_p[i])
366  {
367  case 'g':
368  flags->glob = true;
369  break;
370  case 'b': /* BREs (but why???) */
371  flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED | REG_QUOTE);
372  break;
373  case 'c': /* case sensitive */
374  flags->cflags &= ~REG_ICASE;
375  break;
376  case 'e': /* plain EREs */
377  flags->cflags |= REG_EXTENDED;
378  flags->cflags &= ~(REG_ADVANCED | REG_QUOTE);
379  break;
380  case 'i': /* case insensitive */
381  flags->cflags |= REG_ICASE;
382  break;
383  case 'm': /* Perloid synonym for n */
384  case 'n': /* \n affects ^ $ . [^ */
385  flags->cflags |= REG_NEWLINE;
386  break;
387  case 'p': /* ~Perl, \n affects . [^ */
388  flags->cflags |= REG_NLSTOP;
389  flags->cflags &= ~REG_NLANCH;
390  break;
391  case 'q': /* literal string */
392  flags->cflags |= REG_QUOTE;
393  flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED);
394  break;
395  case 's': /* single line, \n ordinary */
396  flags->cflags &= ~REG_NEWLINE;
397  break;
398  case 't': /* tight syntax */
399  flags->cflags &= ~REG_EXPANDED;
400  break;
401  case 'w': /* weird, \n affects ^ $ only */
402  flags->cflags &= ~REG_NLSTOP;
403  flags->cflags |= REG_NLANCH;
404  break;
405  case 'x': /* expanded syntax */
406  flags->cflags |= REG_EXPANDED;
407  break;
408  default:
409  ereport(ERROR,
410  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
411  errmsg("invalid regexp option: \"%c\"",
412  opt_p[i])));
413  break;
414  }
415  }
416  }
417 }
418 
419 
420 /*
421  * interface routines called by the function manager
422  */
423 
424 Datum
426 {
427  Name n = PG_GETARG_NAME(0);
428  text *p = PG_GETARG_TEXT_PP(1);
429 
431  NameStr(*n),
432  strlen(NameStr(*n)),
433  REG_ADVANCED,
435  0, NULL));
436 }
437 
438 Datum
440 {
441  Name n = PG_GETARG_NAME(0);
442  text *p = PG_GETARG_TEXT_PP(1);
443 
445  NameStr(*n),
446  strlen(NameStr(*n)),
447  REG_ADVANCED,
449  0, NULL));
450 }
451 
452 Datum
454 {
455  text *s = PG_GETARG_TEXT_PP(0);
456  text *p = PG_GETARG_TEXT_PP(1);
457 
459  VARDATA_ANY(s),
461  REG_ADVANCED,
463  0, NULL));
464 }
465 
466 Datum
468 {
469  text *s = PG_GETARG_TEXT_PP(0);
470  text *p = PG_GETARG_TEXT_PP(1);
471 
473  VARDATA_ANY(s),
475  REG_ADVANCED,
477  0, NULL));
478 }
479 
480 
481 /*
482  * routines that use the regexp stuff, but ignore the case.
483  * for this, we use the REG_ICASE flag to pg_regcomp
484  */
485 
486 
487 Datum
489 {
490  Name n = PG_GETARG_NAME(0);
491  text *p = PG_GETARG_TEXT_PP(1);
492 
494  NameStr(*n),
495  strlen(NameStr(*n)),
498  0, NULL));
499 }
500 
501 Datum
503 {
504  Name n = PG_GETARG_NAME(0);
505  text *p = PG_GETARG_TEXT_PP(1);
506 
508  NameStr(*n),
509  strlen(NameStr(*n)),
512  0, NULL));
513 }
514 
515 Datum
517 {
518  text *s = PG_GETARG_TEXT_PP(0);
519  text *p = PG_GETARG_TEXT_PP(1);
520 
522  VARDATA_ANY(s),
526  0, NULL));
527 }
528 
529 Datum
531 {
532  text *s = PG_GETARG_TEXT_PP(0);
533  text *p = PG_GETARG_TEXT_PP(1);
534 
536  VARDATA_ANY(s),
540  0, NULL));
541 }
542 
543 
544 /*
545  * textregexsubstr()
546  * Return a substring matched by a regular expression.
547  */
548 Datum
550 {
551  text *s = PG_GETARG_TEXT_PP(0);
552  text *p = PG_GETARG_TEXT_PP(1);
553  regex_t *re;
554  regmatch_t pmatch[2];
555  int so,
556  eo;
557 
558  /* Compile RE */
560 
561  /*
562  * We pass two regmatch_t structs to get info about the overall match and
563  * the match for the first parenthesized subexpression (if any). If there
564  * is a parenthesized subexpression, we return what it matched; else
565  * return what the whole regexp matched.
566  */
567  if (!RE_execute(re,
569  2, pmatch))
570  PG_RETURN_NULL(); /* definitely no match */
571 
572  if (re->re_nsub > 0)
573  {
574  /* has parenthesized subexpressions, use the first one */
575  so = pmatch[1].rm_so;
576  eo = pmatch[1].rm_eo;
577  }
578  else
579  {
580  /* no parenthesized subexpression, use whole match */
581  so = pmatch[0].rm_so;
582  eo = pmatch[0].rm_eo;
583  }
584 
585  /*
586  * It is possible to have a match to the whole pattern but no match for a
587  * subexpression; for example 'foo(bar)?' is considered to match 'foo' but
588  * there is no subexpression match. So this extra test for match failure
589  * is not redundant.
590  */
591  if (so < 0 || eo < 0)
592  PG_RETURN_NULL();
593 
595  PointerGetDatum(s),
596  Int32GetDatum(so + 1),
597  Int32GetDatum(eo - so));
598 }
599 
600 /*
601  * textregexreplace_noopt()
602  * Return a string matched by a regular expression, with replacement.
603  *
604  * This version doesn't have an option argument: we default to case
605  * sensitive match, replace the first instance only.
606  */
607 Datum
609 {
610  text *s = PG_GETARG_TEXT_PP(0);
611  text *p = PG_GETARG_TEXT_PP(1);
612  text *r = PG_GETARG_TEXT_PP(2);
613  regex_t *re;
614 
616 
617  PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, false));
618 }
619 
620 /*
621  * textregexreplace()
622  * Return a string matched by a regular expression, with replacement.
623  */
624 Datum
626 {
627  text *s = PG_GETARG_TEXT_PP(0);
628  text *p = PG_GETARG_TEXT_PP(1);
629  text *r = PG_GETARG_TEXT_PP(2);
630  text *opt = PG_GETARG_TEXT_PP(3);
631  regex_t *re;
632  pg_re_flags flags;
633 
634  parse_re_flags(&flags, opt);
635 
636  re = RE_compile_and_cache(p, flags.cflags, PG_GET_COLLATION());
637 
638  PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, flags.glob));
639 }
640 
641 /*
642  * similar_escape()
643  * Convert a SQL:2008 regexp pattern to POSIX style, so it can be used by
644  * our regexp engine.
645  */
646 Datum
648 {
649  text *pat_text;
650  text *esc_text;
651  text *result;
652  char *p,
653  *e,
654  *r;
655  int plen,
656  elen;
657  bool afterescape = false;
658  bool incharclass = false;
659  int nquotes = 0;
660 
661  /* This function is not strict, so must test explicitly */
662  if (PG_ARGISNULL(0))
663  PG_RETURN_NULL();
664  pat_text = PG_GETARG_TEXT_PP(0);
665  p = VARDATA_ANY(pat_text);
666  plen = VARSIZE_ANY_EXHDR(pat_text);
667  if (PG_ARGISNULL(1))
668  {
669  /* No ESCAPE clause provided; default to backslash as escape */
670  e = "\\";
671  elen = 1;
672  }
673  else
674  {
675  esc_text = PG_GETARG_TEXT_PP(1);
676  e = VARDATA_ANY(esc_text);
677  elen = VARSIZE_ANY_EXHDR(esc_text);
678  if (elen == 0)
679  e = NULL; /* no escape character */
680  else if (elen != 1)
681  ereport(ERROR,
682  (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
683  errmsg("invalid escape string"),
684  errhint("Escape string must be empty or one character.")));
685  }
686 
687  /*----------
688  * We surround the transformed input string with
689  * ^(?: ... )$
690  * which requires some explanation. We need "^" and "$" to force
691  * the pattern to match the entire input string as per SQL99 spec.
692  * The "(?:" and ")" are a non-capturing set of parens; we have to have
693  * parens in case the string contains "|", else the "^" and "$" will
694  * be bound into the first and last alternatives which is not what we
695  * want, and the parens must be non capturing because we don't want them
696  * to count when selecting output for SUBSTRING.
697  *----------
698  */
699 
700  /*
701  * We need room for the prefix/postfix plus as many as 3 output bytes per
702  * input byte; since the input is at most 1GB this can't overflow
703  */
704  result = (text *) palloc(VARHDRSZ + 6 + 3 * plen);
705  r = VARDATA(result);
706 
707  *r++ = '^';
708  *r++ = '(';
709  *r++ = '?';
710  *r++ = ':';
711 
712  while (plen > 0)
713  {
714  char pchar = *p;
715 
716  if (afterescape)
717  {
718  if (pchar == '"' && !incharclass) /* for SUBSTRING patterns */
719  *r++ = ((nquotes++ % 2) == 0) ? '(' : ')';
720  else
721  {
722  *r++ = '\\';
723  *r++ = pchar;
724  }
725  afterescape = false;
726  }
727  else if (e && pchar == *e)
728  {
729  /* SQL99 escape character; do not send to output */
730  afterescape = true;
731  }
732  else if (incharclass)
733  {
734  if (pchar == '\\')
735  *r++ = '\\';
736  *r++ = pchar;
737  if (pchar == ']')
738  incharclass = false;
739  }
740  else if (pchar == '[')
741  {
742  *r++ = pchar;
743  incharclass = true;
744  }
745  else if (pchar == '%')
746  {
747  *r++ = '.';
748  *r++ = '*';
749  }
750  else if (pchar == '_')
751  *r++ = '.';
752  else if (pchar == '(')
753  {
754  /* convert to non-capturing parenthesis */
755  *r++ = '(';
756  *r++ = '?';
757  *r++ = ':';
758  }
759  else if (pchar == '\\' || pchar == '.' ||
760  pchar == '^' || pchar == '$')
761  {
762  *r++ = '\\';
763  *r++ = pchar;
764  }
765  else
766  *r++ = pchar;
767  p++, plen--;
768  }
769 
770  *r++ = ')';
771  *r++ = '$';
772 
773  SET_VARSIZE(result, r - ((char *) result));
774 
775  PG_RETURN_TEXT_P(result);
776 }
777 
778 /*
779  * regexp_matches()
780  * Return a table of matches of a pattern within a string.
781  */
782 Datum
784 {
785  FuncCallContext *funcctx;
786  regexp_matches_ctx *matchctx;
787 
788  if (SRF_IS_FIRSTCALL())
789  {
790  text *pattern = PG_GETARG_TEXT_PP(1);
791  text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
792  MemoryContext oldcontext;
793 
794  funcctx = SRF_FIRSTCALL_INIT();
795  oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
796 
797  /* be sure to copy the input string into the multi-call ctx */
798  matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
799  flags,
801  false, true, false);
802 
803  /* Pre-create workspace that build_regexp_matches_result needs */
804  matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
805  matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns);
806 
807  MemoryContextSwitchTo(oldcontext);
808  funcctx->user_fctx = (void *) matchctx;
809  }
810 
811  funcctx = SRF_PERCALL_SETUP();
812  matchctx = (regexp_matches_ctx *) funcctx->user_fctx;
813 
814  if (matchctx->next_match < matchctx->nmatches)
815  {
816  ArrayType *result_ary;
817 
818  result_ary = build_regexp_matches_result(matchctx);
819  matchctx->next_match++;
820  SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
821  }
822 
823  /* release space in multi-call ctx to avoid intraquery memory leak */
824  cleanup_regexp_matches(matchctx);
825 
826  SRF_RETURN_DONE(funcctx);
827 }
828 
829 /* This is separate to keep the opr_sanity regression test from complaining */
830 Datum
832 {
833  return regexp_matches(fcinfo);
834 }
835 
836 /*
837  * setup_regexp_matches --- do the initial matching for regexp_matches()
838  * or regexp_split()
839  *
840  * To avoid having to re-find the compiled pattern on each call, we do
841  * all the matching in one swoop. The returned regexp_matches_ctx contains
842  * the locations of all the substrings matching the pattern.
843  *
844  * The three bool parameters have only two patterns (one for each caller)
845  * but it seems clearer to distinguish the functionality this way than to
846  * key it all off one "is_split" flag.
847  */
848 static regexp_matches_ctx *
849 setup_regexp_matches(text *orig_str, text *pattern, text *flags,
850  Oid collation,
851  bool force_glob, bool use_subpatterns,
852  bool ignore_degenerate)
853 {
854  regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
855  int orig_len;
856  pg_wchar *wide_str;
857  int wide_len;
858  pg_re_flags re_flags;
859  regex_t *cpattern;
860  regmatch_t *pmatch;
861  int pmatch_len;
862  int array_len;
863  int array_idx;
864  int prev_match_end;
865  int start_search;
866 
867  /* save original string --- we'll extract result substrings from it */
868  matchctx->orig_str = orig_str;
869 
870  /* convert string to pg_wchar form for matching */
871  orig_len = VARSIZE_ANY_EXHDR(orig_str);
872  wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1));
873  wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);
874 
875  /* determine options */
876  parse_re_flags(&re_flags, flags);
877  if (force_glob)
878  {
879  /* user mustn't specify 'g' for regexp_split */
880  if (re_flags.glob)
881  ereport(ERROR,
882  (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
883  errmsg("regexp_split does not support the global option")));
884  /* but we find all the matches anyway */
885  re_flags.glob = true;
886  }
887 
888  /* set up the compiled pattern */
889  cpattern = RE_compile_and_cache(pattern, re_flags.cflags, collation);
890 
891  /* do we want to remember subpatterns? */
892  if (use_subpatterns && cpattern->re_nsub > 0)
893  {
894  matchctx->npatterns = cpattern->re_nsub;
895  pmatch_len = cpattern->re_nsub + 1;
896  }
897  else
898  {
899  use_subpatterns = false;
900  matchctx->npatterns = 1;
901  pmatch_len = 1;
902  }
903 
904  /* temporary output space for RE package */
905  pmatch = palloc(sizeof(regmatch_t) * pmatch_len);
906 
907  /* the real output space (grown dynamically if needed) */
908  array_len = re_flags.glob ? 256 : 32;
909  matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
910  array_idx = 0;
911 
912  /* search for the pattern, perhaps repeatedly */
913  prev_match_end = 0;
914  start_search = 0;
915  while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search,
916  pmatch_len, pmatch))
917  {
918  /*
919  * If requested, ignore degenerate matches, which are zero-length
920  * matches occurring at the start or end of a string or just after a
921  * previous match.
922  */
923  if (!ignore_degenerate ||
924  (pmatch[0].rm_so < wide_len &&
925  pmatch[0].rm_eo > prev_match_end))
926  {
927  /* enlarge output space if needed */
928  while (array_idx + matchctx->npatterns * 2 > array_len)
929  {
930  array_len *= 2;
931  matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
932  sizeof(int) * array_len);
933  }
934 
935  /* save this match's locations */
936  if (use_subpatterns)
937  {
938  int i;
939 
940  for (i = 1; i <= matchctx->npatterns; i++)
941  {
942  matchctx->match_locs[array_idx++] = pmatch[i].rm_so;
943  matchctx->match_locs[array_idx++] = pmatch[i].rm_eo;
944  }
945  }
946  else
947  {
948  matchctx->match_locs[array_idx++] = pmatch[0].rm_so;
949  matchctx->match_locs[array_idx++] = pmatch[0].rm_eo;
950  }
951  matchctx->nmatches++;
952  }
953  prev_match_end = pmatch[0].rm_eo;
954 
955  /* if not glob, stop after one match */
956  if (!re_flags.glob)
957  break;
958 
959  /*
960  * Advance search position. Normally we start the next search at the
961  * end of the previous match; but if the match was of zero length, we
962  * have to advance by one character, or we'd just find the same match
963  * again.
964  */
965  start_search = prev_match_end;
966  if (pmatch[0].rm_so == pmatch[0].rm_eo)
967  start_search++;
968  if (start_search > wide_len)
969  break;
970  }
971 
972  /* Clean up temp storage */
973  pfree(wide_str);
974  pfree(pmatch);
975 
976  return matchctx;
977 }
978 
979 /*
980  * cleanup_regexp_matches - release memory of a regexp_matches_ctx
981  */
982 static void
984 {
985  pfree(matchctx->orig_str);
986  pfree(matchctx->match_locs);
987  if (matchctx->elems)
988  pfree(matchctx->elems);
989  if (matchctx->nulls)
990  pfree(matchctx->nulls);
991  pfree(matchctx);
992 }
993 
994 /*
995  * build_regexp_matches_result - build output array for current match
996  */
997 static ArrayType *
999 {
1000  Datum *elems = matchctx->elems;
1001  bool *nulls = matchctx->nulls;
1002  int dims[1];
1003  int lbs[1];
1004  int loc;
1005  int i;
1006 
1007  /* Extract matching substrings from the original string */
1008  loc = matchctx->next_match * matchctx->npatterns * 2;
1009  for (i = 0; i < matchctx->npatterns; i++)
1010  {
1011  int so = matchctx->match_locs[loc++];
1012  int eo = matchctx->match_locs[loc++];
1013 
1014  if (so < 0 || eo < 0)
1015  {
1016  elems[i] = (Datum) 0;
1017  nulls[i] = true;
1018  }
1019  else
1020  {
1022  PointerGetDatum(matchctx->orig_str),
1023  Int32GetDatum(so + 1),
1024  Int32GetDatum(eo - so));
1025  nulls[i] = false;
1026  }
1027  }
1028 
1029  /* And form an array */
1030  dims[0] = matchctx->npatterns;
1031  lbs[0] = 1;
1032  /* XXX: this hardcodes assumptions about the text type */
1033  return construct_md_array(elems, nulls, 1, dims, lbs,
1034  TEXTOID, -1, false, 'i');
1035 }
1036 
1037 /*
1038  * regexp_split_to_table()
1039  * Split the string at matches of the pattern, returning the
1040  * split-out substrings as a table.
1041  */
1042 Datum
1044 {
1045  FuncCallContext *funcctx;
1046  regexp_matches_ctx *splitctx;
1047 
1048  if (SRF_IS_FIRSTCALL())
1049  {
1050  text *pattern = PG_GETARG_TEXT_PP(1);
1051  text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
1052  MemoryContext oldcontext;
1053 
1054  funcctx = SRF_FIRSTCALL_INIT();
1055  oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
1056 
1057  /* be sure to copy the input string into the multi-call ctx */
1058  splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
1059  flags,
1060  PG_GET_COLLATION(),
1061  true, false, true);
1062 
1063  MemoryContextSwitchTo(oldcontext);
1064  funcctx->user_fctx = (void *) splitctx;
1065  }
1066 
1067  funcctx = SRF_PERCALL_SETUP();
1068  splitctx = (regexp_matches_ctx *) funcctx->user_fctx;
1069 
1070  if (splitctx->next_match <= splitctx->nmatches)
1071  {
1072  Datum result = build_regexp_split_result(splitctx);
1073 
1074  splitctx->next_match++;
1075  SRF_RETURN_NEXT(funcctx, result);
1076  }
1077 
1078  /* release space in multi-call ctx to avoid intraquery memory leak */
1079  cleanup_regexp_matches(splitctx);
1080 
1081  SRF_RETURN_DONE(funcctx);
1082 }
1083 
1084 /* This is separate to keep the opr_sanity regression test from complaining */
1085 Datum
1087 {
1088  return regexp_split_to_table(fcinfo);
1089 }
1090 
1091 /*
1092  * regexp_split_to_array()
1093  * Split the string at matches of the pattern, returning the
1094  * split-out substrings as an array.
1095  */
1096 Datum
1098 {
1099  ArrayBuildState *astate = NULL;
1100  regexp_matches_ctx *splitctx;
1101 
1103  PG_GETARG_TEXT_PP(1),
1105  PG_GET_COLLATION(),
1106  true, false, true);
1107 
1108  while (splitctx->next_match <= splitctx->nmatches)
1109  {
1110  astate = accumArrayResult(astate,
1111  build_regexp_split_result(splitctx),
1112  false,
1113  TEXTOID,
1115  splitctx->next_match++;
1116  }
1117 
1118  /*
1119  * We don't call cleanup_regexp_matches here; it would try to pfree the
1120  * input string, which we didn't copy. The space is not in a long-lived
1121  * memory context anyway.
1122  */
1123 
1125 }
1126 
1127 /* This is separate to keep the opr_sanity regression test from complaining */
1128 Datum
1130 {
1131  return regexp_split_to_array(fcinfo);
1132 }
1133 
1134 /*
1135  * build_regexp_split_result - build output string for current match
1136  *
1137  * We return the string between the current match and the previous one,
1138  * or the string after the last match when next_match == nmatches.
1139  */
1140 static Datum
1142 {
1143  int startpos;
1144  int endpos;
1145 
1146  if (splitctx->next_match > 0)
1147  startpos = splitctx->match_locs[splitctx->next_match * 2 - 1];
1148  else
1149  startpos = 0;
1150  if (startpos < 0)
1151  elog(ERROR, "invalid match ending position");
1152 
1153  if (splitctx->next_match < splitctx->nmatches)
1154  {
1155  endpos = splitctx->match_locs[splitctx->next_match * 2];
1156  if (endpos < startpos)
1157  elog(ERROR, "invalid match starting position");
1159  PointerGetDatum(splitctx->orig_str),
1160  Int32GetDatum(startpos + 1),
1161  Int32GetDatum(endpos - startpos));
1162  }
1163  else
1164  {
1165  /* no more matches, return rest of string */
1167  PointerGetDatum(splitctx->orig_str),
1168  Int32GetDatum(startpos + 1));
1169  }
1170 }
1171 
1172 /*
1173  * regexp_fixed_prefix - extract fixed prefix, if any, for a regexp
1174  *
1175  * The result is NULL if there is no fixed prefix, else a palloc'd string.
1176  * If it is an exact match, not just a prefix, *exact is returned as TRUE.
1177  */
1178 char *
1179 regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation,
1180  bool *exact)
1181 {
1182  char *result;
1183  regex_t *re;
1184  int cflags;
1185  int re_result;
1186  pg_wchar *str;
1187  size_t slen;
1188  size_t maxlen;
1189  char errMsg[100];
1190 
1191  *exact = false; /* default result */
1192 
1193  /* Compile RE */
1194  cflags = REG_ADVANCED;
1195  if (case_insensitive)
1196  cflags |= REG_ICASE;
1197 
1198  re = RE_compile_and_cache(text_re, cflags, collation);
1199 
1200  /* Examine it to see if there's a fixed prefix */
1201  re_result = pg_regprefix(re, &str, &slen);
1202 
1203  switch (re_result)
1204  {
1205  case REG_NOMATCH:
1206  return NULL;
1207 
1208  case REG_PREFIX:
1209  /* continue with wchar conversion */
1210  break;
1211 
1212  case REG_EXACT:
1213  *exact = true;
1214  /* continue with wchar conversion */
1215  break;
1216 
1217  default:
1218  /* re failed??? */
1219  pg_regerror(re_result, re, errMsg, sizeof(errMsg));
1220  ereport(ERROR,
1221  (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
1222  errmsg("regular expression failed: %s", errMsg)));
1223  break;
1224  }
1225 
1226  /* Convert pg_wchar result back to database encoding */
1227  maxlen = pg_database_encoding_max_length() * slen + 1;
1228  result = (char *) palloc(maxlen);
1229  slen = pg_wchar2mb_with_len(str, result, slen);
1230  Assert(slen < maxlen);
1231 
1232  free(str);
1233 
1234  return result;
1235 }