[PATCH 1/6] [lib/port]: Added a unicode POSIX compliant regular expression library
Eric Pouech
eric.pouech at orange.fr
Mon Jan 2 14:05:48 CST 2012
A+
---
include/wine/regexw.h | 66 ++++++
libs/port/Makefile.in | 1
libs/port/regexw.c | 513 +++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 580 insertions(+), 0 deletions(-)
create mode 100644 include/wine/regexw.h
create mode 100644 libs/port/regexw.c
diff --git a/include/wine/regexw.h b/include/wine/regexw.h
new file mode 100644
index 0000000..126e8fd
--- /dev/null
+++ b/include/wine/regexw.h
@@ -0,0 +1,66 @@
+#include <unistd.h>
+
+#ifndef REGEX_H
+#define REGEX_H
+
+typedef unsigned short WCHAR;
+typedef WCHAR re_char_t;
+
+#define MAXTAG 10
+
+/*
+ * The following defines are not meant to be changeable.
+ * They are for readability only.
+ */
+#define MAXCHR 65536
+#define BITSHFT 4
+#define CHRBIT (1 << BITSHFT)
+#define BITBLK MAXCHR/CHRBIT
+#define BITIND (CHRBIT - 1)
+
+typedef struct {
+ unsigned re_nsub; /* mandatory */
+
+ WCHAR tagstk[MAXTAG]; /* subpat tag stack..*/
+ size_t max_nfa; /* max size of nfa */
+ WCHAR *nfa; /* automaton.. */
+} regexw_t;
+
+typedef int regoff_t;
+
+typedef struct {
+ regoff_t rm_so;
+ regoff_t rm_eo;
+} regmatch_t;
+
+/* regcomp flags */
+#define REG_EXTENDED 1
+#define REG_ICASE 2
+#define REG_NEWLINE 4
+#define REG_NOSUB 8
+
+/* regexec flags */
+#define REG_NOTBOL 1
+#define REG_NOTEOL 2
+
+/* error codes */
+#define REG_NOMATCH 1
+#define REG_BADPAT 2
+#define REG_ECOLLATE 3
+#define REG_ECTYPE 4
+#define REG_EESCAPE 5
+#define REG_ESUBREG 6
+#define REG_EBRACK 7
+#define REG_EPAREN 8
+#define REG_EBRACE 9
+#define REG_BADBR 10
+#define REG_ERANGE 11
+#define REG_ESPACE 12
+#define REG_BADRPT 13
+
+extern int regcompW(regexw_t*, const WCHAR*, int);
+extern int regexecW(const regexw_t*, const WCHAR*, size_t, regmatch_t*, int);
+extern void regfreeW(regexw_t*);
+extern void regerror(int errcode, const regexw_t *preg, char *errbuf, size_t errbuf_size);
+
+#endif /* REGEX_H */
diff --git a/libs/port/Makefile.in b/libs/port/Makefile.in
index f2f6db0..bbcd9f5 100644
--- a/libs/port/Makefile.in
+++ b/libs/port/Makefile.in
@@ -20,6 +20,7 @@ C_SRCS = \
pread.c \
pwrite.c \
readlink.c \
+ regexw.c \
spawn.c \
statvfs.c \
strcasecmp.c \
diff --git a/libs/port/regexw.c b/libs/port/regexw.c
new file mode 100644
index 0000000..60ba60c
--- /dev/null
+++ b/libs/port/regexw.c
@@ -0,0 +1,513 @@
+/*
+ * regex - Regular expression pattern matching and replacement
+ *
+ * By: Ozan S. Yigit (oz)
+ * Dept. of Computer Science
+ * York University
+ *
+ * These routines are the PUBLIC DOMAIN equivalents of regex
+ * routines as found in 4.nBSD UN*X, with minor extensions.
+ *
+ * These routines are derived from various implementations found
+ * in software tools books, and Conroy's grep. They are NOT derived
+ * from licensed/restricted software.
+ * For more interesting/academic/complicated implementations,
+ * see Henry Spencer's regexp routines, or GNU Emacs pattern
+ * matching module.
+ *
+ * Code base (version 1.4) modified by Eric Pouech, with:
+ * + mode interface to POSIX
+ * + fix a couple of bugs
+ * + add UNICODE support
+ * Modification submitted under same license as original code.
+ */
+
+#include <string.h>
+#include <stdlib.h>
+#include "wine/regexw.h"
+
+#define CHR 1
+#define ANY 2
+#define CCL 3
+#define BOL 4
+#define EOL 5
+#define BOT 6
+#define EOT 7
+#define REF 8
+#define CLO 9
+
+#define END 0
+
+static void chset(WCHAR* bittab, re_char_t c)
+{
+ bittab[(c) >> BITSHFT] |= (WCHAR)(1u << (c & BITIND));
+}
+
+static int re_grow(regexw_t *re, size_t m)
+{
+ while (m >= re->max_nfa)
+ {
+ size_t sz = re->max_nfa ? (2 * re->max_nfa) : 1024;
+ WCHAR *tmp = realloc(re->nfa, sz * sizeof(WCHAR));
+ if (!tmp) return 0;
+ re->nfa = tmp;
+ re->max_nfa = sz;
+ }
+ return 1;
+}
+
+#define store(x) do {if (!re_grow(re, mp)) return REG_ESPACE; re->nfa[mp++] = (WCHAR)x;} while (0)
+
+/******************************************************************
+ * regcompW
+ *
+ */
+int regcompW(regexw_t* re, const WCHAR *pat, int flags)
+{
+ const re_char_t *p; /* pattern pointer */
+ size_t mp = 0; /* nfa pointer */
+ size_t lp; /* saved pointer.. */
+ size_t sp = 0; /* another one.. */
+
+ int tagi = 0; /* tag stack index */
+ WCHAR tagc = 1; /* actual tag count */
+
+ int i;
+ WCHAR n, mask; /* xor mask -CCL/NCL */
+ WCHAR c1, c2;
+ WCHAR* bittab = NULL;
+ int ret = 0;
+
+ if (!pat || !*pat)
+ {
+ return REG_BADPAT;
+ }
+ re->nfa = NULL;
+ re->max_nfa = 0;
+ re->re_nsub = 0;
+
+ for (p = pat; *p; p++) {
+ lp = mp;
+ switch (*p) {
+
+ case '.': /* match any char.. */
+ store(ANY);
+ break;
+
+ case '^': /* match beginning.. */
+ if (p == pat)
+ store(BOL);
+ else {
+ store(CHR);
+ store(*p);
+ }
+ break;
+
+ case '$': /* match endofline.. */
+ if (!*(p+1))
+ store(EOL);
+ else {
+ store(CHR);
+ store(*p);
+ }
+ break;
+
+ case '[': /* match char class..*/
+ store(CCL);
+
+ if (!bittab) bittab = malloc(BITBLK * sizeof(WCHAR));
+ if (!bittab) return REG_ESPACE;
+ memset(bittab, 0, BITBLK * sizeof(WCHAR));
+ if (*++p == '^') {
+ mask = (WCHAR)~0;
+ p++;
+ }
+ else
+ mask = 0;
+
+ if (*p == '-') /* real dash */
+ chset(bittab, *p++);
+ else if (*p == ']') /* real brac */
+ chset(bittab, *p++);
+ while (*p && *p != ']') {
+ if (*p == '-' && *(p+1) && *(p+1) != ']') {
+ p++;
+ c1 = (re_char_t)(*(p-2) + 1);
+ c2 = *p++;
+ while (c1 <= c2)
+ chset(bittab, c1++);
+ }
+ else if (*p == '\\' && *(p+1)) {
+ p++;
+ chset(bittab, *p++);
+ }
+ else if (*p == '[' && (*(p+1) == '.' || *(p+1) == ':' || *(p+1) == '='))
+ {
+ ret = REG_ECOLLATE;
+ goto do_finish;
+ }
+ else
+ chset(bittab, *p++);
+ }
+ if (!*p)
+ {
+ ret = REG_EBRACK;
+ goto do_finish;
+ }
+
+ for (i = 0; i < BITBLK; i++)
+ store(mask ^ bittab[i]);
+
+ break;
+
+ case '*': /* match 0 or more.. */
+ case '+': /* match 1 or more.. */
+ if (p == pat)
+ {
+ ret = REG_BADRPT;
+ goto do_finish;
+ }
+ lp = sp; /* previous opcode */
+ if (re->nfa[lp] == CLO) /* equivalence.. */
+ break;
+ switch (re->nfa[lp]) {
+
+ case BOL:
+ case BOT:
+ case EOT:
+ case REF:
+ ret = REG_BADRPT;
+ goto do_finish;
+ default:
+ break;
+ }
+
+ if (*p == '+')
+ for (sp = mp; lp < sp; lp++)
+ store(re->nfa[lp]);
+
+ store(END);
+ store(END);
+ sp = mp;
+ while (--mp > lp)
+ re->nfa[mp] = re->nfa[mp - 1];
+ store(CLO);
+ mp = sp;
+ break;
+
+ case '\\': /* tags, backrefs .. */
+ switch (*++p) {
+
+ case '(':
+ if (tagc < MAXTAG) {
+ re->tagstk[++tagi] = tagc;
+ store(BOT);
+ store(tagc++);
+ }
+ else
+ {
+ ret = REG_ERANGE;
+ goto do_finish;
+ }
+ break;
+ case ')':
+ if (!re->nfa || re->nfa[sp] == BOT)
+ {
+ ret = REG_BADPAT;
+ goto do_finish;
+ }
+ if (tagi > 0) {
+ store(EOT);
+ store(re->tagstk[tagi--]);
+ }
+ else
+ {
+ ret = REG_BADPAT;
+ goto do_finish;
+ }
+ break;
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ n = (WCHAR)(unsigned)(*p-'0');
+ if (tagi > 0 && re->tagstk[tagi] == n)
+ {
+ ret = REG_BADPAT;
+ goto do_finish;
+ }
+ if (tagc > n) {
+ store(REF);
+ store(n);
+ }
+ else
+ {
+ ret = REG_BADPAT;
+ goto do_finish;
+ }
+ break;
+ case 'b':
+ store(CHR);
+ store('\b');
+ break;
+ case 'n':
+ store(CHR);
+ store('\n');
+ break;
+ case 'f':
+ store(CHR);
+ store('\f');
+ break;
+ case 'r':
+ store(CHR);
+ store('\r');
+ break;
+ case 't':
+ store(CHR);
+ store('\t');
+ break;
+ default:
+ store(CHR);
+ store(*p);
+ }
+ break;
+
+ default : /* an ordinary char */
+ store(CHR);
+ store(*p);
+ break;
+ }
+ sp = lp;
+ }
+ if (tagi > 0)
+ {
+ ret = REG_EPAREN;
+ goto do_finish;
+ }
+ store(END);
+ re->re_nsub = 1;
+
+do_finish:
+ free(bittab);
+ return ret;
+}
+
+struct pmatch_data
+{
+ const re_char_t *bol;
+ const re_char_t *bopat[MAXTAG];
+ const re_char_t *eopat[MAXTAG];
+};
+
+static const WCHAR *pmatch(struct pmatch_data *, const WCHAR *, const WCHAR *);
+
+/******************************************************************
+ * regexecW
+ *
+ */
+int regexecW(const regexw_t* re, const WCHAR *lp, size_t nb, regmatch_t* rm, int flags)
+{
+ WCHAR c;
+ const re_char_t *ep = 0;
+ WCHAR *ap = re->nfa;
+ struct pmatch_data pmd;
+
+ pmd.bol = lp;
+
+ pmd.bopat[0] = 0;
+ pmd.bopat[1] = 0;
+ pmd.bopat[2] = 0;
+ pmd.bopat[3] = 0;
+ pmd.bopat[4] = 0;
+ pmd.bopat[5] = 0;
+ pmd.bopat[6] = 0;
+ pmd.bopat[7] = 0;
+ pmd.bopat[8] = 0;
+ pmd.bopat[9] = 0;
+
+ switch (*ap) {
+
+ case BOL: /* anchored: match from BOL only */
+ ep = pmatch(&pmd, lp, ap);
+ break;
+ case CHR: /* ordinary char: locate it fast */
+ c = *(ap+1);
+ while (*lp && *lp != c)
+ lp++;
+ if (!*lp) /* if EOS, fail, else fall thru. */
+ return REG_NOMATCH;
+ default: /* regular matching all the way. */
+ do {
+ if ((ep = pmatch(&pmd, lp, ap)))
+ break;
+ } while (*lp++);
+ break;
+ case END: /* munged automaton. fail always */
+ return REG_NOMATCH;
+ }
+ if (!ep)
+ return REG_NOMATCH;
+
+ if (nb)
+ {
+ int i;
+ for (i = 0; i < nb; i++)
+ rm[i].rm_so = rm[i].rm_eo = -1;
+ rm[0].rm_so = (int)(lp - pmd.bol);
+ rm[0].rm_eo = (int)(ep - pmd.bol);
+ }
+
+ return 0;
+}
+
+#define isinset(x,y) ((x)[(y) >> BITSHFT] & (1u << ((y) & BITIND)))
+
+/*
+ * skip values for CLO XXX to skip past the closure
+ */
+
+#define ANYSKIP 2 /* [CLO] ANY END ... */
+#define CHRSKIP 3 /* [CLO] CHR chr END ... */
+#define CCLSKIP (2+BITBLK) /* [CLO] CCL BITBLK-bytes END ... */
+
+static const WCHAR *pmatch(struct pmatch_data *pmd, const WCHAR *lp, const WCHAR *ap)
+{
+ WCHAR op;
+ int c, n;
+ const re_char_t *e; /* extra pointer for CLO */
+ const re_char_t *bp; /* beginning of subpat.. */
+ const re_char_t *ep; /* ending of subpat.. */
+ const re_char_t *are; /* to save the line ptr. */
+
+ while ((op = *ap++) != END)
+ switch (op) {
+
+ case CHR:
+ if (*lp++ != *ap++)
+ return 0;
+ break;
+ case ANY:
+ if (!*lp++)
+ return 0;
+ break;
+ case CCL:
+ c = *lp++;
+ if (!isinset(ap,c))
+ return 0;
+ ap += BITBLK;
+ break;
+ case BOL:
+ if (lp != pmd->bol)
+ return 0;
+ break;
+ case EOL:
+ if (*lp)
+ return 0;
+ break;
+ case BOT:
+ pmd->bopat[*ap++] = lp;
+ break;
+ case EOT:
+ pmd->eopat[*ap++] = lp;
+ break;
+ case REF:
+ n = *ap++;
+ bp = pmd->bopat[n];
+ ep = pmd->eopat[n];
+ while (bp < ep)
+ if (*bp++ != *lp++)
+ return 0;
+ break;
+ case CLO:
+ are = lp;
+ switch (*ap) {
+
+ case ANY:
+ while (*lp)
+ lp++;
+ n = ANYSKIP;
+ break;
+ case CHR:
+ c = *(ap+1);
+ while (*lp && c == *lp)
+ lp++;
+ n = CHRSKIP;
+ break;
+ case CCL:
+ while ((c = *lp) && isinset(ap+1,c))
+ lp++;
+ n = CCLSKIP;
+ break;
+ default:
+ /* closure: bad nfa. */
+ return 0;
+ }
+
+ ap += n;
+
+ while (lp >= are) {
+ if ((e = pmatch(pmd, lp, ap)))
+ return e;
+ --lp;
+ }
+ return 0;
+ default:
+ /* bad nfa. */
+ return 0;
+ }
+ return lp;
+}
+
+/******************************************************************
+ * regfreeW
+ *
+ *
+ */
+void regfreeW(regexw_t* re)
+{
+ free(re->nfa);
+ /* just in case */
+ re->nfa = NULL;
+ re->max_nfa = 0;
+}
+
+/******************************************************************
+ * regerrorW
+ *
+ *
+ */
+void regerrorW(int errcode, const regexw_t *preg, char *errbuf, size_t errbuf_size)
+{
+ const char* msg = NULL;
+ size_t len;
+
+ switch (errcode)
+ {
+ case 0: msg = "<<no error>"; break;
+ case REG_NOMATCH: msg = "regexec: no match"; break;
+ case REG_BADPAT: msg = "regcomp: bad pattern"; break;
+ case REG_ECOLLATE: msg = "regcomp: unsupported collate"; break;
+ case REG_ECTYPE: msg = "regcomp: unsupported class type"; break;
+ case REG_EESCAPE: msg = "regcomp: trailing backslash"; break;
+ case REG_ESUBREG: msg = "regcomp: invalid reference to subexpression"; break;
+ case REG_EBRACK: msg = "regcomp: unmatched '['"; break;
+ case REG_EPAREN: msg = "regcomp: unmatched '('"; break;
+ case REG_EBRACE: msg = "regcomp: unmatched '{'"; break;
+ case REG_BADBR: msg = "regcomp: invalid back reference"; break;
+ case REG_ERANGE: msg = "regcomp: invalid range operator"; break;
+ case REG_ESPACE: msg = "regcomp: out of memory"; break;
+ case REG_BADRPT: msg = "regcomp: invalid use of + or *"; break;
+ default: msg = "unknown error code"; break;
+ }
+ if (errbuf && errbuf_size)
+ {
+ if ((len = strlen(msg)) < errbuf_size - 1) len = errbuf_size - 1;
+ memcpy(errbuf, msg, len);
+ errbuf[len] = '\0';
+ }
+}
More information about the wine-patches
mailing list