POSIX character-class support for mawk 1.3.3 (rexp/rexp0.c).

Lets bracket expressions contain [:alnum:], [:alpha:], [:blank:], [:cntrl:],
[:digit:], [:graph:], [:lower:], [:print:], [:punct:], [:space:], [:upper:]
and [:xdigit:], using fixed C/ASCII definitions (no locale support).

The scan for the closing ']' is rewritten so that the ":]" ending a class
name is not mistaken for the end of the bracket expression, and [:space:]
covers all six POSIX white-space characters (space, \t, \n, \v, \f, \r).

NOTE(review): line breaks and indentation in this copy were reconstructed
from a whitespace-collapsed version so that each hunk agrees with its @@
line counts; the original tab/space runs could not be recovered.  Apply
with patch's -l (ignore white space) option and confirm against the
upstream source.

--- mawk-1.3.3/rexp/rexp0.c.orig	2006-11-01 09:06:13.000000000 -0500
+++ mawk-1.3.3/rexp/rexp0.c	2006-11-01 09:26:03.000000000 -0500
@@ -2,6 +2,7 @@
 /********************************************
 rexp0.c
 copyright 1991, Michael D. Brennan
+copyright 2006, Sam Trenholme
 
 This is a source file for mawk, an implementation
 of the AWK programming language.
@@ -11,6 +12,9 @@
 ********************************************/
 
 /*$Log: rexp0.c,v $
+ *Revision 1.6 2006/11/01 08:00:12 sam
+ *Added support for POSIX character classes (POSIX 9.3.5.6)
+ *
  *Revision 1.5 1996/11/08 15:39:27 mike
  *While cleaning up block_on, I introduced a bug. Now fixed.
  *
@@ -355,6 +359,7 @@
    char *q, *t ;
    int cnt ;
    int comp_flag ;
+   int posix_class;
 
    p = t = (*start) + 1 ;
 
@@ -364,27 +369,50 @@
    if (*p == ']') p++ ;
    else if (*p == '^' && *(p + 1) == ']') p += 2 ;
 
-   while (1) /* find the back of the class */
+   q = p;
+   prev = -1;
+   posix_class = 0;
+   cnt = 0;
+
+   while(*q != '\0') /* find the back of the class */
    {
-      if (!(q = strchr(p, ']')))
+      if(prev == '[' && *q == ':') /* POSIX character class */
       {
-         /* no closing bracket */
-         RE_error_trap(-E3) ;
+         posix_class = 1;
       }
-      p = q - 1 ;
-      cnt = 0 ;
-      while (*p == '\\')
+      if(*q == ']' && prev != ':' && ((cnt & 1) == 0))
       {
-         cnt++ ; p-- ;
+         break;
       }
-      if ((cnt & 1) == 0)
+      if(*q == ']' && prev == ':' && posix_class == 0)
       {
-         /* even number of \ */
-         break ;
+         break;
+      }
+      if(prev == ':' && *q == ']')
+      {
+         posix_class = 0;
       }
-      p = q + 1 ;
+      if(*q == '\\' && prev != '\\')
+      {
+         cnt = 1;
+      }
+      if(*q == '\\' && prev == '\\')
+      {
+         cnt++;
+      }
+      if(*q != '\\')
+      {
+         cnt = 0;
+      }
+      prev = *q;
+      q++;
    }
 
+   if(*q != ']')
+   {
+      RE_error_trap(-E3) ;
+   }
+
    /* q now pts at the back of the class */
    p = t ;
    *start = q + 1 ;
@@ -447,6 +475,109 @@
             }
             break ;
 
+         case '[':
+            /* This is code to make Mawk more
+               POSIX compliant.  Basically, POSIX has the following
+               character class expressions:
+
+               [:alnum:] -> A-Za-z0-9
+               [:alpha:] -> A-Za-z
+               [:blank:] -> space, tab
+               [:cntrl:] -> Control characters (ASCII 1-31; 127)
+               [:digit:] -> 0-9
+               [:graph:] -> !-~ (printable and visible)
+               [:lower:] -> a-z
+               [:print:] -> non-control characters (ASCII 32-126)
+               [:punct:] -> !-/:-@\[-`\{-~
+               [:space:] -> space, tab, newline, vertical tab, formfeed, carriage return
+               [:upper:] -> A-Z
+               [:xdigit:] -> A-Fa-f0-9 (hex digit)
+
+               Note that this code does not give Mawk the ability
+               to handle non-C/ASCII locales; all this does is allow
+               Mawk to handle the POSIX character class expressions.
+               This lets us write case-sensitive regular expressions
+               in Gawk that do not break in non-C locales, and have
+               the REs work in Mawk.
+            */
+            if(strncmp(p,"[:upper:]",9) == 0) {
+               block_on(*bvp,'A','Z');
+               prev = -1;
+               p += 9;
+               break;
+            } else if (strncmp(p,"[:lower:]",9) == 0) {
+               block_on(*bvp,'a','z');
+               prev = -1;
+               p += 9;
+               break;
+            } else if (strncmp(p,"[:alnum:]",9) == 0) {
+               block_on(*bvp,'a','z');
+               block_on(*bvp,'A','Z');
+               block_on(*bvp,'0','9');
+               prev = -1;
+               p += 9;
+               break;
+            } else if (strncmp(p,"[:alpha:]",9) == 0) {
+               block_on(*bvp,'a','z');
+               block_on(*bvp,'A','Z');
+               prev = -1;
+               p += 9;
+               break;
+            } else if (strncmp(p,"[:digit:]",9) == 0) {
+               block_on(*bvp,'0','9');
+               prev = -1;
+               p += 9;
+               break;
+            } else if (strncmp(p,"[:graph:]",9) == 0) {
+               block_on(*bvp,'!','~');
+               prev = -1;
+               p += 9;
+               break;
+            } else if (strncmp(p,"[:print:]",9) == 0) {
+               block_on(*bvp,32,126);
+               prev = -1;
+               p += 9;
+               break;
+            } else if (strncmp(p,"[:punct:]",9) == 0) {
+               block_on(*bvp,'!','/');
+               block_on(*bvp,':','@');
+               block_on(*bvp,'[','`');
+               block_on(*bvp,'{','~');
+               prev = -1;
+               p += 9;
+               break;
+            } else if (strncmp(p,"[:blank:]",9) == 0) {
+               on(*bvp,' ');
+               on(*bvp,'\t');
+               prev = -1;
+               p += 9;
+               break;
+            } else if (strncmp(p,"[:space:]",9) == 0) {
+               on(*bvp,' ');
+               on(*bvp,'\t');
+               on(*bvp,'\n');
+               on(*bvp,'\v');
+               on(*bvp,'\f');
+               on(*bvp,'\r');
+               prev = -1;
+               p += 9;
+               break;
+            } else if (strncmp(p,"[:cntrl:]",9) == 0) {
+               block_on(*bvp,1,31);
+               on(*bvp,127);
+               prev = -1;
+               p += 9;
+               break;
+            } else if (strncmp(p,"[:xdigit:]",10) == 0) {
+               block_on(*bvp,'a','f');
+               block_on(*bvp,'A','F');
+               block_on(*bvp,'0','9');
+               prev = -1;
+               p += 10;
+               break;
+            }
+            /* no class name matched: fall through so '[' is taken literally */
+
          default:
             prev = *(unsigned char *) p++ ;
             on(*bvp, prev) ;