/* Copyright 1996 Acorn Computers Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * File: txtfind.c * Purpose: search/replace given possibly wild patterns * Author: IDJ * History: 24-Jan-90: IDJ Created * 29-Jan-90: IDJ Added twin-like MOST feature * Performance seems usually < 3 times slower than Twin * 30-Jan-90: IDJ Fixed some bugs in MOST, and added initial sub-string * optimisation (nicked from 'find', including the goto's!) * 16-Feb-90: IDJ Added hex char stuff * 01-Mar-90: IDJ Fixed bug in escaped magic chars in replacement string * 05-Apr-90: IDJ Fixed hexchar in replacement string bug * 19-Jul-90: IDJ copy of ambig pats is now inclusive of end-points * 23-Jul-90: IDJ fixed bug in use of | in replacement string * 21-Aug-90: IDJ introduced old !Edit patterns to satisfy marketting * who want backwards compatibility * 22-Aug-90: IDJ now you can do any size replacements!! * 31-Aug-90: IDJ bug-fix/bodge - only create 'most' node when no * following patterns (otherwise %x == ^x) * eg. *y%x --> *y^x~x * *y%xz --> *y^xz * side-effect of this is that * %@t matches the text aaaat (unlike Twin which says no match) * We believe this is better (well Harry does!) * 11-Sep-90: IDJ fixed strange behaviour in sets containing normal ch (ie \) * correctly parse [a-\~] as range a to ~ * 1-Feb-91: IDJ: fixed two bugs in old pattern matching * resulted in crash whenever 'magic' picked * 05-Apr-91: IDJ: allow \X in old patterns for hex chs. * 06-Jun-91: IDJ: fixed RO-6058 - use of special chars as end of ranges * 27-Jun-91: IDJ: further fixing to RO-6058: * - fault [a-] * - handle [#-z] and [@-z] correctly * - handle [a-\\] correctly * 01-Jul-91: IDJ: fixed \ca bug to find |A not ~|A !!!!! */ /* Find Syntax: . matches any single character $ matches end-of-line @ matches any identifier character (a-z,A-Z,_) # matches any digit |c matches CTRL-c \c matches c, even if it is special [abc] matches any one of a, b, c. c1-c2 matches any char between c1 and c2 eg. [a-g] matches lower-case a to g ~x if x is any of the above simple character patterns, then ~x matches any character other than x *x if x is any of the above simple character patterns, then *x matches 0 or more occurrences of x ^x if x is any of the above simple character patterns, then ^x matches 1 or more occurrences of x %x if x is any of the above simple character patterns, then %x matches the most contiguous characters matching x (like MOST in Twin) & matches found string ?n matches field n of the found string (0<=n<=9) Xnn matches hex char 0xnn (X is in fact char 0x84, but C disallows non-printables) */ #define BOOL int #define TRUE 1 #define FALSE 0 #include <stdio.h> #include <string.h> #include <stdlib.h> #include <ctype.h> #include "h.werr" #include "h.txt" #include "h.EditIntern.txtregexp" #include "h.EditIntern.txtfind" #include "h.trace" #include "h.msgs" #include "h.visdelay" #include "h.flex" #include "h.verintern.messages" #ifdef LIB_DEBUGGING extern void print_nfa(NFA *nfa); #endif #define txtfind__CTRL_SHIFT 128 #define txtfind__DEL 127 #define BAD (-1) #define OK 0 static int txtfind__hexval(char c1) { /* returns -1 if no good. */ if (c1 >= '0' && c1 <= '9') return c1 - '0'; if (c1 >= 'A' && c1 <= 'F') return 10 + c1 - 'A'; if (c1 >= 'a' && c1 <= 'f') return 10 + c1 - 'a'; return BAD; } static int txtfind__hexpair(char c1, char c2) { /* returns BAD if no good. */ int d1 = txtfind__hexval(c1); int d2 = txtfind__hexval(c2); if (d1 == BAD || d2 == BAD) return BAD; return (16*d1) + d2; } static int txtfind__ctrl_char(char *s) { int ctrl = 0 - '@'; char *s0 = s; int ch = *++s; if (ch == '?') ch = txtfind__DEL; else { if (ch == '!') { ctrl = txtfind__CTRL_SHIFT; ch = *++s; if (ch == '|') { ctrl -= '@'; ch = *++s; } } if (islower(ch)) ch = toupper(ch); else if (ch < '@' || ch > '_') return BAD; ch += ctrl; } return ch + ((s - s0) << 8); } void txtfind_replace(txt t, txt_index start, txt_index *end, char *rtemplate, BOOL magic #ifdef FIELDNUM , Ambiguous_entry *ambiguous #endif #ifdef ALLOW_OLD_PATTERNS , BOOL old_patterns #endif ) /* * Called when we have found a match for rtemplate in txt t at position at * We construct the appropriate string and replace the one we found. */ { char *repls=0; int i, j, k; int ii = 0, repl_space = 0; /* IDJ 23-Aug-90: first we see how much space we need for expanded replacement string */ while (rtemplate[ii] != '\0') { if (magic) { #ifdef ALLOW_OLD_PATTERNS if (old_patterns) { if (rtemplate[ii] == txtfind_old_magic_ch) { ii++; if (rtemplate[ii] == '\0') continue; switch (rtemplate[ii]) { case txtfind_old_hex_ch: case txtfind_old_upper_hex_ch: ii++; if (rtemplate[ii] == '\0') continue; ii++; if (rtemplate[ii] == '\0') continue; repl_space++; break; case txtfind_old_newline_ch: repl_space++; break; case txtfind_old_ctrl_ch: ii++; if (rtemplate[ii] == '\0') continue; repl_space++; break; case txtfind_old_foundstring_ch: repl_space+= *end-start; break; default: repl_space++; break; } } else repl_space++; } else #endif { switch (rtemplate[ii]) { case txtfind_newline_ch: repl_space++; break; case txtfind_ctrl_ch: { int ch = txtfind__ctrl_char(&rtemplate[ii]); ii += ch>>8; repl_space++; } break; case txtfind_hex_ch: ii++; if (rtemplate[ii] == '\0') continue; ii++; if (rtemplate[ii] == '\0') continue; repl_space++; break; case txtfind_normal_ch: ii++; if (rtemplate[ii] == '\0') continue; repl_space++; break; case txtfind_found_ch: repl_space+= *end-start; break; #ifdef FIELDNUM case txtfind_field_ch: { int s; ii++; s = rtemplate[ii] - '0'; repl_space+=ambiguous[s].end-ambiguous[s].start; } break; #endif default: repl_space++; break; } } } else { repl_space++; } ii++; } if ((repls = malloc(repl_space+1)) == 0) werr(FALSE, msgs_lookup(MSGS_txtfind1)); i = 0; j = 0; while (rtemplate[i] != '\0') { if (magic) { #ifdef ALLOW_OLD_PATTERNS if (old_patterns) { if (rtemplate[i] == txtfind_old_magic_ch) { i++; switch(rtemplate[i]) { case txtfind_old_hex_ch: case txtfind_old_upper_hex_ch: { char c1, c2; i++; c1 = rtemplate[i++]; if (c1 == '\0' || !isxdigit(c1)) { werr(FALSE, msgs_lookup(MSGS_txtfind2)); if (repls) free(repls); return; } c2 = rtemplate[i++]; if (c2 == '\0' || !isxdigit(c2)) { werr(FALSE, msgs_lookup(MSGS_txtfind2)); if (repls) free(repls); return; } repls[j++] = txtfind__hexpair(c1, c2); } break; case txtfind_old_newline_ch: repls[j] = '\n'; j++; i++; break; case txtfind_old_ctrl_ch: { char ch; ch = rtemplate[++i]; if (ch == 0) { werr(FALSE, msgs_lookup(MSGS_txtfind2)); return; } if (islower(ch)) ch = toupper(ch); else if (ch < '@' || ch > '_') { werr(FALSE, msgs_lookup(MSGS_txtfind2)); if (repls) free(repls); return; } ch -= '@'; ch |= (1<<8); repls[j] = ch; j++; } i++; break; case txtfind_old_foundstring_ch: for (k = start; k < *end; k++) { repls[j] = txt_charat(t, k); j++; } i++; break; default: repls[j] = rtemplate[i]; j++; i++; break; } } else { repls[j] = rtemplate[i]; j++; i++; } } else /* new-style regular expressions */ #endif { switch (rtemplate[i]) { case txtfind_newline_ch: repls[j] = '\n'; j++; i++; break; case txtfind_ctrl_ch: { int ch = txtfind__ctrl_char(&rtemplate[i]); if (ch >= 0) { i += ch >> 8; /* txtfind__ctl_char tells us how far to advance in string */ ch &= 255; if (j < 255) repls[j++] = ch; } i++; } break; case txtfind_found_ch: for (k = start; k < *end; k++) { repls[j] = txt_charat(t, k); j++; } i++; break; case txtfind_hex_ch: { char c1, c2; i++; c1 = rtemplate[i++]; if (c1 == '\0' || !isxdigit(c1)) { werr(FALSE, msgs_lookup(MSGS_txtfind2)); if (repls) free(repls); return; } c2 = rtemplate[i++]; if (c2 == '\0' || !isxdigit(c2)) { werr(FALSE, msgs_lookup(MSGS_txtfind2)); if (repls) free(repls); return; } repls[j++] = txtfind__hexpair(c1, c2); } break; #ifdef FIELDNUM case txtfind_field_ch: { int s; i++; if (rtemplate[i] == '\0' || !isdigit(rtemplate[i])) { werr(FALSE, msgs_lookup(MSGS_txtfind2)); if (repls) free(repls); return; } s = rtemplate[i] - '0'; i++; if (ambiguous[s].start != -1 && ambiguous[s].end != -1) { for (k = start + ambiguous[s].start; k < start + ambiguous[s].end; k++) { #ifdef LIB_DEBUGGING werr(FALSE, "Replace repls[%d] %c", j, txt_charat(t,k)); #endif repls[j] = txt_charat(t, k); j++; } } } break; #endif case txtfind_normal_ch: repls[j++] = rtemplate[++i]; i++; break; default : repls[j] = rtemplate[i]; j++; i++; break; } } } else { repls[j] = rtemplate[i]; j++; i++; } } repls[j] = '\0'; /* dot should be here anyway, but take no chances */ txt_setdot(t, start); txt_replacechars(t, *end - start, repls, j); *end = start + j; if (repls) free(repls); } static void txtfind__charset_include(CharSet *cs, int ch) { (*cs)[ch >> 5] |= (1L << (ch & 31)); } static int txtfind__charset_insert(CharSet *cs, char *s, BOOL nocase) { int j, ch, lastch; lastch = -1; for (ch = *s; ch != 0; ch = *++s) { if ((ch == txtfind_to_ch) && (lastch >= 0)) { ch = *(s + 1); if (ch == 0) { #if FALSE ch = txtfind_to_ch; #endif return BAD; } else { ++s; if (ch == txtfind_normal_ch) { ch = *(s+1); if (ch == 0) { ch = txtfind_normal_ch; } else { ++s; } } else if (ch == txtfind_ctrl_ch) { ch = txtfind__ctrl_char(s); if (ch < 0) return BAD; s += ch >> 8; ch &= 255; } else if (ch == txtfind_newline_ch) { ch = '\n'; } else if (ch == txtfind_hex_ch) /* hex pattern */ { char c1, c2; c1 = *++s; if (c1 == 0) return BAD; else c2 = *++s; if (c2 == 0) return BAD; if ((ch = txtfind__hexpair(c1, c2)) == BAD) return BAD; } if (lastch > ch) return BAD; /* fault c1-c2, where c1>c2 */ for (j = lastch; j <= ch; ++j) txtfind__charset_include(cs, j); lastch = -1; continue; } } if (ch == txtfind_alphanum_ch && *(s+1) != txtfind_to_ch) { for (j = 'A'; j <= 'Z'; ++j) txtfind__charset_include(cs, j); for (j = 'a'; j <= 'z'; ++j) txtfind__charset_include(cs, j); txtfind__charset_include(cs, '_'); } if (((ch == txtfind_alphanum_ch) || (ch == txtfind_digit_ch)) && *(s+1) != txtfind_to_ch) { lastch = ch; for (j = '0'; j <= '9'; ++j) txtfind__charset_include(cs, j); continue; } if (ch == txtfind_normal_ch) { ch = *(s + 1); if (ch == 0) ch = txtfind_normal_ch; else ++s; } else if (ch == txtfind_ctrl_ch) { ch = txtfind__ctrl_char(s); if (ch < 0) return BAD; s += ch >> 8; ch &= 255; } else if (ch == txtfind_newline_ch) { ch = '\n'; } else if (ch == txtfind_hex_ch) /* hex pattern */ { char c1, c2; c1 = *++s; if (c1 == 0) return BAD; else c2 = *++s; if (c2 == 0) return BAD; if ((ch = txtfind__hexpair(c1, c2)) == BAD) return BAD; } txtfind__charset_include(cs, ch); lastch = ch; } return OK; } static void txtfind__charset_empty(CharSet *cs) { int j; for (j = 0; j < 8; ++j) (*cs)[j] = 0; } static void txtfind__charset_negate(CharSet *cs) { int j; for (j = 0; j < 8; ++j) (*cs)[j] = ~(*cs)[j]; } static int txtfind__parse_pattern(REHandle *h, char *pattern, BOOL magic, BOOL nocase #ifdef ALLOW_OLD_PATTERNS , BOOL old_patterns #endif ) { int ch, not, modifier, most=0; for (ch = *pattern; ch != 0; ch = *++pattern) { if (magic) { #ifdef ALLOW_OLD_PATTERNS if (old_patterns) { if (ch == txtfind_old_magic_ch) { ch = *++pattern; if (ch == 0) return BAD; if (ch == txtfind_old_anystring_ch) { ch = RE_ANY; re_char(h, ch, '*'); re_modify(h, '*'); } else if (ch == txtfind_old_alphanum_ch) { CharSet cs; txtfind__charset_empty((CharSet *)(&cs)); txtfind__charset_insert((CharSet *)(&cs), "@", nocase); re_charset(h, (CharSet *)(&cs), 0); } else if (ch == txtfind_old_digit_ch) { ch = RE_DIGIT; re_char(h, ch, 0); } else if (ch == txtfind_old_any_ch) { ch = RE_ANY; re_char(h, ch, 0); } else if (ch == txtfind_old_newline_ch) { ch = '\n'; re_char(h, ch, 0); } else if (ch == txtfind_old_ctrl_ch) { ch = *++pattern; if (ch == 0) return BAD; if (islower(ch)) ch = toupper(ch); else if (ch < '@' || ch > '_') return BAD; ch -= '@'; ch &= 255; re_char(h, ch, 0); } else if (ch == txtfind_old_magic_ch) { ch = '\\'; re_char(h, ch, 0); } else if (ch == txtfind_old_hex_ch || ch == txtfind_old_upper_hex_ch) { char c1, c2; c1 = *++pattern; if (c1 == 0) return BAD; else c2 = *++pattern; if (c2 == 0) return BAD; if ((ch = txtfind__hexpair(c1, c2)) == BAD) return BAD; re_char(h, ch, 0); } } else re_char(h, ch, 0); } else /* new regular expression style patterns */ #endif { if ((ch == txtfind_0ormore_ch) || (ch == txtfind_1ormore_ch) || (ch == txtfind_most_ch)) { if (ch == txtfind_0ormore_ch) modifier = '*'; else if (ch == txtfind_1ormore_ch) modifier = '+'; else {modifier = '+'; most = '%';} ch = *++pattern; if (ch == 0) return BAD; } else {modifier = 0; most = 0;} if (ch == txtfind_not_ch) { not = 1; ch = *++pattern; if (ch == 0) return BAD; } else not = 0; if ((ch == txtfind_setbra_ch) || (ch == txtfind_alphanum_ch)) { /* CharClass... */ CharSet cs; txtfind__charset_empty((CharSet *)(&cs)); if (ch == txtfind_alphanum_ch) { txtfind__charset_insert((CharSet *)(&cs), "@", nocase); } else { char *ket = pattern; int lastch = 0, beforelastch = 0; for (;;) { ch = *++ket; if ((ch == 0) || (ch == txtfind_setket_ch) && (lastch != txtfind_normal_ch || lastch == txtfind_normal_ch && beforelastch == txtfind_normal_ch) && (lastch != txtfind_ctrl_ch)) break; beforelastch = lastch; lastch =ch; } if (ch == 0) return BAD; *ket = 0; if (txtfind__charset_insert((CharSet *)(&cs), pattern+1, nocase) != OK) return BAD; *ket = txtfind_setket_ch; pattern = ket; } if (not) txtfind__charset_negate((CharSet *)(&cs)); re_charset(h, (CharSet *)(&cs), modifier); if (modifier) re_modify(h, modifier); /* now the 'orrible hack - add a not of previous pattern!!! */ /* IDJ: 27-Jun-91: only if no pattern follows */ if (most == '%' && *(pattern+1) == 0) { txtfind__charset_negate(&cs); re_charset(h, &cs, most); } } else { /* single char */ if (ch == txtfind_any_ch) ch = RE_ANY; else if (ch == txtfind_newline_ch) ch = '\n'; else if (ch == txtfind_digit_ch) ch = RE_DIGIT; else if (ch == txtfind_hex_ch) { char c1, c2; c1 = *++pattern; if (c1 == 0) return BAD; else c2 = *++pattern; if (c2 == 0) return BAD; if ((ch = txtfind__hexpair(c1, c2)) == BAD) return BAD; } else if (ch == txtfind_normal_ch) { ch = *++pattern; if (ch == 0) return BAD; } else if (ch == txtfind_ctrl_ch) { ch = txtfind__ctrl_char(pattern); if (ch < 0) return BAD; pattern += ch >> 8; ch &= 255; } if (not) ch += RE_NOT; re_char(h, ch, modifier); if (modifier) re_modify(h, modifier); if (most == '%' && *(pattern+1) == 0) { if (ch == RE_ANY) return(BAD); /* disallow %. */ if (not) ch -= RE_NOT; else ch += RE_NOT; re_char(h, ch, most); } } } } else /* not magic */ re_char(h, ch, 0); } return OK; } extern Pattern *txtfind_build_pattern(char *pattern, BOOL magic, BOOL nocase #ifdef ALLOW_OLD_PATTERNS , BOOL old_patterns #endif #ifdef FIELDNUM , SubPattern *sub_patterns #endif ) { /* this could be parameterised to call other parsers for different wild-carding */ REHandle h; Pattern *pat; NFA *pattern_nfa = NULL; if ((pat = malloc(sizeof(Pattern))) == 0) { werr(FALSE, msgs_lookup(MSGS_txtfind3)); return 0; } /* first pass of RE assembler */ re_begin1(&h); /* now parse the pattern */ if (txtfind__parse_pattern(&h, pattern, magic, nocase #ifdef ALLOW_OLD_PATTERNS , old_patterns #endif ) == OK) { /* second pass of RE assembler */ re_begin2(&h); if (txtfind__parse_pattern(&h, pattern, magic, nocase #ifdef ALLOW_OLD_PATTERNS , old_patterns #endif ) == OK) pattern_nfa = re_end(&h); } if (pattern_nfa == NULL) { werr(FALSE, msgs_lookup(MSGS_txtfind4)); return NULL; } else { pat->nfa = pattern_nfa; pat->p = pattern; } #ifdef FIELDNUM if (sub_patterns) re_make_subpattern_table(pat->nfa, sub_patterns); #endif return pat; } /* --- ************************* functions for enhanced Boyer-Moore algorithm ******** --- */ extern void txtfind_build_TD1(int TD1[], Pattern *pat, BOOL nocase) { int l, i; char *pstr, *p; l = strlen(pat->p); re_head(pat->nfa, pat->p, l+1); pat->len = strlen(pat->p); pstr = pat->p; for (i = 0; i < 256; i++) TD1[i] = pat->len+1; for(p = pstr; *p; p++) TD1[(nocase)?tolower(*p):(*p)] = pat->len-(p-pstr); } static txt_index txtfind_match_pattern(txt t, Pattern *p, txt_index *nowat, BOOL nocase #ifdef FIELDNUM , Ambiguous_entry *ambiguous , SubPattern *sub_patterns , BOOL rescanning #endif , int TD1[] ) { int l, size, i = 0; char *text; txt_index start = (txt_index)-1, wasat = *nowat, end = 0; /* get head of pattern for initial sub-string search */ l = strlen(p->p); re_head(p->nfa, p->p, l+1); p->len = strlen(p->p); /* get a chunk of text */ #ifdef LIB_DEBUGGING werr(FALSE, "match pattern, nowat == %d", *nowat); #endif txt_arrayseg(t, *nowat, &text, &size); /* --- initial substring optimisation --- */ if (p->len > 0) { /* --- try a Quick Search on initial substring (CACM Aug. 1990, p.132) --- */ char *t, *tx = text; int plen = p->len; char *ps; if (nocase) { while(tx + plen <= text + size) { for (ps = p->p, t = tx; *ps; ps++, t++) if (tolower(*ps) != tolower(*t)) break; if (*ps == 0) {i = tx - text; goto match_pattern;} tx += TD1[tolower(*(tx+plen))]; } goto set_nowat; } else { while(tx + plen <= text + size) { for (ps = p->p, t = tx; *ps; ps++, t++) if (*ps != *t) break; if (*ps == 0) {i = tx - text; goto match_pattern;} tx += TD1[*(tx+plen)]; } goto set_nowat; } } /* do a search using the NFA for pattern */ match_pattern: #ifdef LIB_DEBUGGING werr(FALSE, "match_pattern text start %c @ %p", *(text+i), text+i); #endif /* DANGER: we must not pass the pointer "text" beacuse it may be wrong due to stack/heap/slot extension!!! */ start = (txt_index)re_match(t, *nowat+i, p->nfa, 1, &end, nocase #ifdef FIELDNUM , ambiguous , sub_patterns , rescanning #endif ); set_nowat: *nowat = wasat + end + i; #ifdef LIB_DEBUGGING werr(FALSE, "match pattern start %d, wasat %d", start, wasat); #endif if (start >= 0) return (wasat + start + i); else return (txt_index)-1; } extern txt_index txtfind_find(txt t, txt_index *nowat, Pattern *p, BOOL nocase #ifdef FIELDNUM , Ambiguous_entry *ambiguous , SubPattern *sub_patterns , BOOL rescanning #endif , int TD1[] ) { txt_index at; /* check for bad pattern */ if (p == 0) return BAD; #ifdef LIB_DEBUGGING werr(FALSE, "txtfind, nowat == %d", *nowat); #endif /* match pattern against it */ at = txtfind_match_pattern(t, p, nowat, nocase #ifdef FIELDNUM , ambiguous , sub_patterns , rescanning #endif , TD1 ); /* return start of match BAD ==> failure */ #ifdef LIB_DEBUGGING werr(FALSE, "txtfind ==> at %d, nowat %d", at, *nowat); #endif return at; } /***************************** Client of txtfind_find should: i) build a pattern from search string ii) call find with start at point to start search (ends up pointing at start of match) ******************************/