libarmutils  1.4
 All Data Structures Files Functions Variables Typedefs Enumerations Macros Groups
regex_utils.c
Go to the documentation of this file.
1 /*******************************************************************************
2 *
3 * COPYRIGHT (C) 2010 Battelle Memorial Institute. All Rights Reserved.
4 *
5 ********************************************************************************
6 *
7 * Author:
8 * name: Brian Ermold
9 * phone: (509) 375-2277
10 * email: brian.ermold@pnl.gov
11 *
12 ********************************************************************************
13 *
14 * REPOSITORY INFORMATION:
15 * $Revision: 60943 $
16 * $Author: ermold $
17 * $Date: 2015-04-01 19:47:36 +0000 (Wed, 01 Apr 2015) $
18 *
19 ********************************************************************************
20 *
21 * NOTE: DOXYGEN is used to generate documentation for this file.
22 *
23 *******************************************************************************/
24 
25 /** @file regex_utils.c
26  * Regular Expression Utilities
27  */
28 
29 #include "armutils.h"
30 
31 /*******************************************************************************
32  * Private Functions
33  */
34 /** @privatesection */
35 
36 /**
37  * PRIVATE: Free the results from the last regular expression match.
38  *
39  * @param re_list - pointer to the regular expressions list
40  */
41 static void __relist_free_results(REList *re_list)
42 {
43  if (re_list->string) free(re_list->string);
44  if (re_list->offsets) free(re_list->offsets);
45 
46  if (re_list->substrs)
47  re_free_substrings((re_list->nsubs + 1), re_list->substrs);
48 
49  re_list->string = (char *)NULL;
50  re_list->eflags = 0;
51  re_list->mindex = -1;
52  re_list->nsubs = 0;
53  re_list->offsets = (regmatch_t *)NULL;
54  re_list->substrs = (char **)NULL;
55 }
56 
57 /*******************************************************************************
58  * Public Functions
59  */
60 /** @publicsection */
61 
62 /**
63  * Wrapper function for regcomp().
64  *
65  * See the regcomp man page for more detailed argument descriptions.
66  *
67  * Error messages from this function are sent to the message handler
68  * (see msngr_init_log() and msngr_init_mail()).
69  *
70  * @param preg - pointer to the regex structure to use
71  * @param pattern - pattern string to compile
72  * @param cflags - compile flags
73  *
74  * @return
75  * - 1 if successful
76  * - 0 if an error occurred
77  */
78 int re_compile(regex_t *preg, const char *pattern, int cflags)
79 {
80  int errcode;
81  char *errstr;
82 
83  errcode = regcomp(preg, pattern, cflags);
84  if (errcode) {
85 
86  errstr = re_error(errcode, preg);
87 
88  if (errstr) {
89 
91  "Could not compile regular expression: '%s'\n"
92  " -> %s\n", pattern, errstr);
93 
94  free(errstr);
95  return(0);
96  }
97  }
98 
99  return(1);
100 }
101 
102 /**
103  * Wrapper function for regerror().
104  *
105  * The memory used by the returned string is dynamically allocated.
106  * It is the responsibility of the calling process to free this memory.
107  *
108  * Error messages from this function are sent to the message handler
109  * (see msngr_init_log() and msngr_init_mail()).
110  *
111  * @param errcode - error code returned by re_comp() or re_exec()
112  * @param preg - pointer to the regular expression that failed
113  *
114  * @return
115  * - regex error message
116  * - NULL if an error occurred
117  */
118 char *re_error(int errcode, regex_t *preg)
119 {
120  size_t errlen;
121  char *errstr;
122 
123  errlen = regerror(errcode, preg, NULL, 0);
124  if (errlen == 0) {
125 
127  "Could not create regular expression error message\n"
128  " -> regerror not implemented\n");
129 
130  return((char *)NULL);
131  }
132 
133  errlen++;
134 
135  errstr = (char *)malloc(errlen * sizeof(char));
136  if (!errstr) {
137 
139  "Could not create regular expression error message\n"
140  " -> memory allocation error\n");
141 
142  return((char *)NULL);
143  }
144 
145  regerror(errcode, preg, errstr, errlen);
146 
147  return(errstr);
148 }
149 
150 /**
151  * Wrapper function for regexec().
152  *
153  * See the regexec man page for more detailed argument descriptions.
154  *
155  * Error messages from this function are sent to the message handler
156  * (see msngr_init_log() and msngr_init_mail()).
157  *
158  * @param preg - pointer to the compiled regular expression
159  * @param string - string to compare with the regular expression
160  * @param nmatch - number of substrings to match
161  * @param pmatch - arrary to store the matching substring offsets
162  * @param eflags - execute flags
163  *
164  * @return
165  * - 1 if match
166  * - 0 if no match
167  * - -1 if an error occurred
168  */
170  regex_t *preg,
171  const char *string,
172  size_t nmatch,
173  regmatch_t pmatch[],
174  int eflags)
175 {
176  int errcode;
177  char *errstr;
178 
179  errcode = regexec(preg, string, nmatch, pmatch, eflags);
180 
181  if (errcode == 0) {
182  return(1);
183  }
184  else if (errcode == REG_NOMATCH) {
185  return(0);
186  }
187 
188  errstr = re_error(errcode, preg);
189 
190  if (errstr) {
191 
193  "Could not execute regular expression for string: '%s'\n"
194  " -> %s\n", string, errstr);
195 
196  free(errstr);
197  }
198 
199  return(-1);
200 }
201 
/**
 *  Wrapper function for regfree().
 *
 *  Only the compiled contents are released by regfree(); the regex_t
 *  structure itself is not freed here. A NULL pointer is ignored.
 *
 *  @param  preg - pointer to the compiled regular expression (may be NULL)
 */
void re_free(regex_t *preg)
{
    if (preg == NULL) {
        return;
    }

    regfree(preg);
}
211 
/**
 *  Free the substring list returned by re_substrings().
 *
 *  Every entry of the array is freed (NULL entries are harmless), followed
 *  by the array itself. Nothing is done when the array pointer is NULL.
 *
 *  @param  nmatch     - number of substrings in match request
 *  @param  substrings - array of substrings returned by re_substrings()
 */
void re_free_substrings(size_t nmatch, char **substrings)
{
    char **entry;
    char **end;

    if (substrings == NULL) {
        return;
    }

    end = substrings + nmatch;

    for (entry = substrings; entry != end; entry++) {
        /* free(NULL) is a no-op, so unmatched entries need no check */
        free(*entry);
    }

    free(substrings);
}
229 
230 /**
231  * Extract the substrings from a regular expression match.
232  *
233  * The array of substrings returned by this function must me freed
234  * with re_free_substrings().
235  *
236  * Error messages from this function are sent to the message handler
237  * (see msngr_init_log() and msngr_init_mail()).
238  *
239  * @param string - string that matched the regular expression
240  * @param nmatch - number of substrings in match request
241  * @param pmatch - arrary that the matching substring offsets were stored in
242  *
243  * @return
244  * - pointer to array of substrings
245  * - NULL if an error occurred
246  */
247 char **re_substrings(const char *string, size_t nmatch, regmatch_t *pmatch)
248 {
249  char **substrings;
250  size_t mi;
251  int length;
252 
253  substrings = (char **)calloc(nmatch, sizeof(char *));
254  if (!substrings) {
255  goto MEMORY_ERROR;
256  }
257 
258  for (mi = 0; mi < nmatch; mi++) {
259 
260  if (pmatch[mi].rm_so == -1) {
261  substrings[mi] = (char *)NULL;
262  }
263  else {
264 
265  length = pmatch[mi].rm_eo - pmatch[mi].rm_so;
266 
267  substrings[mi] = (char *)malloc((length + 1) * sizeof(char));
268  if (!substrings[mi]) {
269  goto MEMORY_ERROR;
270  }
271 
272  substrings[mi][length] = '\0';
273 
274  if (length) {
275  strncpy(substrings[mi], (string + pmatch[mi].rm_so), length);
276  }
277  }
278  }
279 
280  return(substrings);
281 
282 MEMORY_ERROR:
283 
284  if (substrings) {
285  for (mi--; mi > 0; mi--) {
286  if (substrings[mi]) free(substrings[mi]);
287  }
288  if (substrings[0]) free(substrings[0]);
289  free(substrings);
290  }
291 
293  "Could not extract subtrings from regular expression match for: '%s'\n"
294  " -> memory allocation error\n", string);
295 
296  return((char **)NULL);
297 }
298 
299 /**
300  * Compile a list of regular expression patterns.
301  *
302  * See the regcomp man page for the descriptions of the pattern
303  * strings and compile flags.
304  *
305  * Error messages from this function are sent to the message handler
306  * (see msngr_init_log() and msngr_init_mail()).
307  *
308  * @param re_list - pointer to the regular expressions list to add the
309  * patterns to, or NULL to create a new list
310  * @param npatterns - number of patterns to compile
311  * @param patterns - list of patterns to compile
312  * @param cflags - compile flags
313  *
314  * @return
315  * - pointer to the regular expressions list
316  * - NULL if an error occurred
317  */
319  REList *re_list,
320  int npatterns,
321  const char **patterns,
322  int cflags)
323 {
324  REList *new_re_list = (REList *)NULL;
325  int new_nregs;
326  char **new_patterns;
327  int *new_cflags;
328  regex_t **new_regs;
329  regex_t *preg;
330  int pi;
331 
332  /* Create a new REList if one was not specified */
333 
334  if (!re_list) {
335  new_re_list = (REList *)calloc(1, sizeof(REList));
336  if (!new_re_list) {
337  goto MEMORY_ERROR;
338  }
339  re_list = new_re_list;
340  re_list->mindex = -1;
341  }
342 
343  /* Allocate space for the new patterns list */
344 
345  new_nregs = re_list->nregs + npatterns;
346 
347  new_patterns = (char **)realloc(
348  re_list->patterns, new_nregs * sizeof(char *));
349 
350  if (!new_patterns) {
351  goto MEMORY_ERROR;
352  }
353 
354  re_list->patterns = new_patterns;
355 
356  /* Allocate space for the new cflags list */
357 
358  new_cflags = (int *)realloc(
359  re_list->cflags, new_nregs * sizeof(int));
360 
361  if (!new_cflags) {
362  goto MEMORY_ERROR;
363  }
364 
365  re_list->cflags = new_cflags;
366 
367  /* Allocate space for the new regs list */
368 
369  new_regs = (regex_t **)realloc(
370  re_list->regs, new_nregs * sizeof(regex_t *));
371 
372  if (!new_regs) {
373  goto MEMORY_ERROR;
374  }
375 
376  re_list->regs = new_regs;
377 
378  /* Compile the new regular expressions */
379 
380  for (pi = 0; pi < npatterns; pi++) {
381 
382  preg = (regex_t *)calloc(1, sizeof(regex_t));
383  if (!preg) {
384  goto MEMORY_ERROR;
385  }
386 
387  if (!re_compile(preg, patterns[pi], cflags)) {
388  free(preg);
389  goto REGCOMP_ERROR;
390  }
391 
392  re_list->patterns[re_list->nregs] = strdup(patterns[pi]);
393  if (!re_list->patterns[re_list->nregs]) {
394  regfree(preg);
395  free(preg);
396  goto MEMORY_ERROR;
397  }
398 
399  re_list->cflags[re_list->nregs] = cflags;
400  re_list->regs[re_list->nregs] = preg;
401  re_list->nregs++;
402  }
403 
404  return(re_list);
405 
406 MEMORY_ERROR:
407 
409  "Could not compile list of regular expression patterns\n"
410  " -> memory allocation error\n");
411 
412 REGCOMP_ERROR:
413 
414  if (new_re_list) relist_free(new_re_list);
415 
416  return((REList *)NULL);
417 }
418 
419 /**
420  * Compare a string with a list of regular expressions.
421  *
422  * For the outputs that are not NULL, this function will return the results
423  * for the first regular expression that matches the specified string. The
424  * returned pointers to the pmatch and substrings arrays will be valid until
425  * the next call to relist_execute() or relist_free().
426  *
427  * The pmatch and substrings arrays will have an entry for every parenthesised
428  * subexpression for the pattern that was matched (starting at index 1).
429  * Entries at index 0 correspond to the entire regular expression. For
430  * subexpressions that were not matched, the offsets in the pmatch entry will
431  * be -1 and the substrings entry will be NULL.
432  *
433  * See the regexec man page for more detailed descriptions of the execute
434  * flags and output pmatch array.
435  *
436  * Error messages from this function are sent to the message handler
437  * (see msngr_init_log() and msngr_init_mail()).
438  *
439  * @param re_list - pointer to the regular expressions list
440  * @param string - string to compare with the regular expression
441  * @param eflags - execute flags
442  * @param mindex - output: index of the pattern that was matched
443  * @param nsubs - output: number of parenthesised subexpressions
444  * @param pmatch - output: pointer to array of substring offsets
445  * @param substrings - output: pointer to array of substrings
446  *
447  * @return
448  * - 1 if match
449  * - 0 if no match
450  * - -1 if an error occurred
451  */
453  REList *re_list,
454  const char *string,
455  int eflags,
456  int *mindex,
457  size_t *nsubs,
458  regmatch_t **pmatch,
459  char ***substrings)
460 {
461  size_t max_nsubs;
462  size_t max_nmatch;
463  regex_t *preg;
464  regmatch_t *offsets;
465  int status;
466  int ri;
467 
468  /* Initialize outputs */
469 
470  if (mindex) *mindex = -1;
471  if (nsubs) *nsubs = 0;
472  if (pmatch) *pmatch = (regmatch_t *)NULL;
473  if (substrings) *substrings = (char **)NULL;
474 
475  /* Free results from previous match */
476 
477  __relist_free_results(re_list);
478 
479  /* Set string and eflags */
480 
481  re_list->eflags = eflags;
482  re_list->string = strdup(string);
483  if (!re_list->string) {
484  goto MEMORY_ERROR;
485  }
486 
487  /* Determine the maximum number of parenthesised subexpressions */
488 
489  max_nsubs = 0;
490 
491  for (ri = 0; ri < re_list->nregs; ri++) {
492 
493  if (re_list->cflags[ri] & REG_NOSUB) continue;
494 
495  if (max_nsubs < re_list->regs[ri]->re_nsub) {
496  max_nsubs = re_list->regs[ri]->re_nsub;
497  }
498  }
499 
500  /* Create array to store substring offsets */
501 
502  max_nmatch = max_nsubs + 1;
503 
504  offsets = (regmatch_t *)malloc(max_nmatch * sizeof(regmatch_t));
505  if (!offsets) {
506  goto MEMORY_ERROR;
507  }
508 
509  /* Find the first matching regular expression */
510 
511  for (ri = 0; ri < re_list->nregs; ri++) {
512 
513  preg = re_list->regs[ri];
514  status = re_execute(preg, string, max_nmatch, offsets, eflags);
515 
516  if (status > 0) break;
517  if (status == 0) continue;
518  if (status < 0) {
519  free(offsets);
520  return(-1);
521  }
522  }
523 
524  if (ri == re_list->nregs) {
525  free(offsets);
526  return(0);
527  }
528 
529  /* Set the results in the REList structure */
530 
531  re_list->mindex = ri;
532  re_list->offsets = offsets;
533 
534  if (re_list->cflags[ri] & REG_NOSUB) {
535  re_list->nsubs = 0;
536  }
537  else {
538  re_list->nsubs = re_list->regs[ri]->re_nsub;
539  }
540 
541  if (substrings) {
542 
543  re_list->substrs = re_substrings(
544  string, (re_list->nsubs + 1), re_list->offsets);
545 
546  if (!re_list->substrs) {
547  return(-1);
548  }
549  }
550 
551  /* Set outputs */
552 
553  if (mindex) *mindex = re_list->mindex;
554  if (nsubs) *nsubs = re_list->nsubs;
555  if (pmatch) *pmatch = re_list->offsets;
556  if (substrings) *substrings = re_list->substrs;
557 
558  return(1);
559 
560 MEMORY_ERROR:
561 
563  "Could not compare string to regular expressions list: '%s'\n"
564  " -> memory allocation error\n", string);
565 
566  return(-1);
567 }
568 
569 /**
570  * Free a regular expressions list.
571  *
572  * @param re_list - pointer to the regular expressions list
573  */
574 void relist_free(REList *re_list)
575 {
576  int ri;
577 
578  if (re_list) {
579 
580  __relist_free_results(re_list);
581 
582  if (re_list->patterns) {
583  for (ri = 0; ri < re_list->nregs; ri++) {
584  free(re_list->patterns[ri]);
585  }
586  free(re_list->patterns);
587  }
588 
589  if (re_list->regs) {
590  for (ri = 0; ri < re_list->nregs; ri++) {
591  regfree(re_list->regs[ri]);
592  free(re_list->regs[ri]);
593  }
594  free(re_list->regs);
595  }
596 
597  free(re_list->cflags);
598  free(re_list);
599  }
600 }