/* Copyright 1996 Acorn Computers Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/* File:    txtregexp.c
 * Purpose: Regular Expression pattern matching by NFA.
 *
 * History:
 *      LDS, 22-Jan-89  created
 *      IDJ, 24-Jan-90  made re_match return end of match through 'end'
 *      IDJ, 24-Jan-90  added case-insensitivity to re_match (I hope)
 *      IDJ, 29-Jan-90  horrible hack to re_match/char/charset to get twin-like MOST working,
 *                      it goes like this:
 *                          ....%a....  ==> ....^a~a....
 *                      when ~a is found, backtrack posn in text (and pop head of dequeue)
 *      IDJ, 6-Feb-90   further hack to deal with %x when reaching end of buffer!!
 *      IDJ, 8-Feb-90   added field# code. Put & 255 on end of access fns (fed up with
 *                      bugs associated with this!)
 *                      Ambiguous fields work like this:
 *                         As we traverse the NFA nodes, we need to know whether an
 *                         ambiguous node (or nodes) is being processed, and also need
 *                         to know when such processing is finished (to be able to store
 *                         its start and end in the text).
 *                         Some magic patterns are trivial:
 *                              ~x
 *                              .
 *                              @
 *                              #
 *                              [...]
 *                         These are all represented by a SINGLE node in the NFA.
 *                         The following are more difficult:
 *                              %x
 *                              ^x
 *                              *x
 *                         Note that these have the following NFA nodes created for them:
 *                         %x:
 *                                       --------------------
 *                                      |                    |
 *                                      |             next -->
 *                                      x -------> OR
 *                                                    alt_next ------> ~x(MOSTNODE) ----....
 *
 *
 *
 *                         ^x and %xy:
 *                                       --------------------
 *                                      |                    |
 *                                      |             next -->
 *                                      x -------> OR
 *                                                    alt_next ------> ....
 *
 *
 *                                       --------------------
 *                         *x:          |                    |
 *                                         next -------> x -->
 *                                      OR
 *                                         alt_next --------------> ......
 *
 *
 *
 *                         During NFA traversal, we check to see if any of these patterns
 *                         are found; if so we remember where the "exit" node from such an
 *                         ambiguous node is (so we know when to mark the end of this match
 *                         in the text).
 *      IDJ, 5-Apr-90 : check to see if ambiguous patterns are necessary before keeping
 *                      track of ambiguous starts/ends.  txtedit indicates no special
 *                      ? strings used in replacement by ambiguous == 0
 *      IDJ,19-Jul-90 : fixed some real tricky bugs in end-conditions for ambiguous pats.
 *
 *      IDJ, 31-Aug-90: guarded against stack/heap/wimpslot extension when getting
 *                      pointer into flex block (previously char *buffer was passed
 *                      to re_match, which may be invalid pointer after flex__budge
 *                      called from C library memory allocator
 *      IDJ, 22-Aug-91: removed last string literals into messages lookup
 *
 * Copyright (C) Acorn Computers Ltd., 1989
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "h.EditIntern.txtregexp"
#include "h.verintern.messages"
#include "msgs.h"
#include "txt.h"
#include "werr.h"
#define FALSE 0
#define TRUE  1

#ifndef NULL
#  define NULL 0
#endif

#define  NULL_NODE          0
#define  CHAR_NODE          1
#define  NOT                2
#define  NOT_CHAR_NODE      (NOT + CHAR_NODE)
#define  SPECIAL_NODE       4
#define  NOT_SPECIAL        (NOT + SPECIAL_NODE)
#define  CHAR_SET           8
#define  OR_NODE           16
#define  END_NODE          32
#define  SIGN_BIT          0x80000000
#define  TOP_CHAR_BIT      0x8000
#define  VISITED_NODE      64
#define  MOST_NODE         128 /* IDJ 29-Jan-90 special node inserted due to MOST */
#define  ON_PATH           0x80000000  /* to mark node as on goal path */


/*
 * An NFA is represented in store as this header, followed by a (possibly
 * empty) array of CharSets (each 32 bytes long) followed by an array of
 * 4-byte NFA nodes. In fact, the matching algorithm doesn't need to know
 * the number of nodes - only the number of arcs in the NFA and the index
 * of the entry node - so we don't retain n_nodes.
 */
struct re_nfa {
    int n_arcs;             /* no of arcs in the NFA - i.e. deque size */
    int n_charsets;         /* no of charsets in the NFA... */
    int entry;              /* Node offset to entry node... */
};

/*
 * Next and alt_next values - like the entry field of an re_nfa - are
 * indexes into the array of nodes. All nodes have exactly one successor,
 * except for OR ndes which have two and the END node which has none.
 */
typedef struct node {
    char type;              /* the node's type */
    char val;               /* ch val, charset idx */
    unsigned char next;     /* pointer to next node in NFA */
    unsigned char alt_next; /* if type == OR_NODE */
} Node;

/*
 * Nodes are also treated as unsigned ints with the following access fns
 */
#define  type_(n)           ((n) & 255)
#define  val_(n)            (((n) >> 8) & 255)
#define  next_(n)           (((n) >> 16) & 255)
#define  alt_next_(n)       (((n) >> 24) & 255)


typedef struct re_handle {  /* Used while building an NFA */
    int n_nodes;            /* no of nodes in the NFA */
    int n_ors;              /* of which this many OR nodes */
    int n_charsets;         /* and this many CharSets... */
    NFA *nfa;               /* during pass-2, if pass-1 is OK, else NULL */
    Node *space;            /* during pass-2, points to nfa's node array */
    Node *free;             /* during pass-2, points to next free node */
    Node *orend;            /* end of last OR-branch */
    unsigned begin;         /* offset of start of current sub-R.E. */
    unsigned bra;           /* offset of last sub-R.E. opening bracket */
    unsigned last;          /* ... start of last R.E. (for modification...) */
    unsigned pass;          /* pass-1 or pass-2... */
    unsigned modifier;      /* index of node after last modifier node... */
};

static Node *start_or;

extern void re_begin1(REHandle *hh)
{
/*
 * Initialise the REHandle for the information gathering pass.
 */
  struct re_handle *h = (struct re_handle *) hh;
  h->n_nodes = 1;                 /* 1 for the END node */
  h->n_ors = 0;
  h->n_charsets = 0;
  h->nfa = NULL;
  h->pass = 1;
  h->modifier = 0;
}

extern void re_begin2(REHandle *hh)
{
/*
 * Initialise the REHandle for the construction phase, allocating store
 * to hold the NFA's compact representation.
 */
  struct re_handle *h = (struct re_handle *) hh;
  NFA *nfa;
  int sz;
/*
 * If n_arcs > 254 there is a constraint violation, so we just
 * repeat pass-1 and return NULL via re_end().
 */
  if ((h->n_nodes + h->n_ors) <= 254)
  {
    sz = sizeof(NFA) + sizeof(CharSet)*h->n_charsets + sizeof(Node)*h->n_nodes;
    nfa = (NFA *) malloc(sz);
    memset(nfa, 0, sz);
    nfa->n_arcs = h->n_nodes + h->n_ors;
    nfa->n_charsets = 0;
    h->nfa = nfa;
    h->space = h->free = (Node *)((char *)nfa + sizeof(NFA) +
                                                sizeof(CharSet)*h->n_charsets);
    h->orend = NULL;
    h->begin = 0;
    h->bra   = 255;    /* cunning choice... +1 == 0 mod 256... */
    h->pass  = 2;
    h->modifier = -1;  /* an impossible value... */
  }
}

static void re_close_or(struct re_handle *h)
{
/*
 * Close the currently open OR structure - if there is one.
 * This requires back-patching the node.next values of the nodes at the ends
 * of each arm of the OR structure so they point just past the entire OR.
 * These nodes are chained together via their next fields, with the head of
 * the chain - the end of the penultimate branch of the OR - identified by
 * h->orend.
 */
  Node *n = h->orend;
  if (n != NULL)
  {
    h->begin = (n - h->space) + 1;
    while (n != NULL)
    { int prev = n->next;
      n->next = h->free - h->space;        /* just past the whole OR... */
      n = (prev ? h->space + prev : NULL);
    }
    h->orend = NULL;
  }
}

static int follow_possible_loop(char *base, unsigned char n, int next_link)
{
  Node *node = (Node *) (base) + n;
  int found = 0;
  if (node == start_or) {                 /* found a loop */
    start_or->type = NULL_NODE;
    return 2;                             /* signal loop found to parent proc */
  } else if (node->type & VISITED_NODE) { /* been here before */
    return 0;                             /* no loop, will be one later */
  } else if (node->type == NULL_NODE) {   /* keep going */
    node->type |= VISITED_NODE;
    found = follow_possible_loop(base, node->next, next_link);
    node->type &= ~VISITED_NODE;
    return found;
  } else if (node->type == OR_NODE) {     /* try each branch for a loop */
    node->type |= VISITED_NODE;
    found = follow_possible_loop(base, node->next, next_link);
    if (found) {
      node->type &= ~VISITED_NODE;
      if (found > 1) {                    /* first OR in loop before start_or */
        if (next_link)                    /* start_or->next is loop path */
          node->next = start_or->alt_next;
        else                              /* start_or->alt_next is loop path */
          node->next = start_or->next;
      }
      return 1;
    }
    found = follow_possible_loop(base, node->alt_next, next_link);
    node->type &= ~VISITED_NODE;
    if (found) {
      if (found > 1) {                    /* first OR in loop before start_or */
        if (next_link)                    /* start_or->next is loop path */
          node->alt_next = start_or->alt_next;
        else                              /* start_or->alt_next is loop path */
          node->alt_next = start_or->next;
      }
      return 1;
    }
  }
  return 0;                               /* no loop */
}

static void remove_or_loops(NFA *nfa)
{
  int found_loop;
  int posn;
  char *base = (char *) nfa + (sizeof(NFA) + sizeof(CharSet) * nfa->n_charsets);
  do {
    posn = 0; found_loop = 0;
    for (;;)
    { Node *n = (Node *)(base) + posn;
      if (n->type == END_NODE) break;
      if (n->type == OR_NODE) {
        start_or = n;
        n->type |= VISITED_NODE;
        found_loop = follow_possible_loop(base, n->next, 1);
        if (found_loop) break;
        found_loop = follow_possible_loop(base, n->alt_next, 0);
        if (found_loop) {
          /* n is now a NULL_NODE so the link should be via next not alt_next */
          n->next = n->alt_next;
          break;
        }
        n->type &= ~VISITED_NODE;
      }
      ++posn;
    }
    /*
     * Every time a loop is found, the nfa should be re-traversed from the start
     * in case the structure has changed for something that was ok earlier.
     */
  } while (found_loop);
}

extern NFA *re_end(REHandle *hh)
{
  struct re_handle *h = (struct re_handle *) hh;
  Node *n;
  NFA *nfa;

  re_close_or(h);                          /* just in case... */
  n = h->free;                             /* add the END node... */
  n->type = END_NODE;
  nfa = h->nfa;                            /* pick up the NFA header... */
  if (nfa) {                               /* and enter at the start of */
    nfa->entry = h->begin;                 /* the first sub-R.E... */
#ifdef LIB_DEBUGGING
  {
    werr(FALSE, "NFA before any looping NULLS/ORS removed\n");
    print_nfa(nfa);
  }
#endif
    remove_or_loops(nfa);                  /* remove looping ors/nulls */
  }
  return nfa;
}


extern void re_modify(REHandle *hh, int modifier)
{
/*
 * This procedure is best understood in pictures (so go draw some).
 * In the cases of closure (*) and optionality (?), pre-modification
 * is used. That is, an OR mode is made to precede the sub-R.E. being
 * modified. This can always be done because the sub-R.E. is either the
 * preceding simple node (which is swapped with the following free node)
 * or is a bracketed expression (the NULL node of which is swapped with
 * the following free node). In both cases, the alt_next field of the OR
 * becomes the next free node after that. In the case of closure, the next
 * field of the end of the sub-R.E. (prior to this call, the node immediately
 * before h->free) is made to point back to the OR node.
 * In the case of 1-closure, the OR node simply follows the sub-R.E. being
 * modified and its next filed points to the beginning of that sub-R.E.
 */
  struct re_handle *h = (struct re_handle *) hh;
  if (h->pass == 1)
  {
    h->modifier = (h->n_nodes += 1);   /* modification costs an OR node... */
    h->n_ors += 1;                     /* and a NULL node if followed by OR */
  }
  else
  {
    Node *l = h->space + h->last;
    Node *n = h->free++;
    if (modifier != '+')
    {
      *n = *l;
      n = l;
      l = h->free - 1;
      l->next = ((modifier == '?') ? h->free : n) - h->space;
    }
    else
    {
      n->next = h->last;
    }
    n->type = OR_NODE;
    n->alt_next = h->modifier = h->free - h->space;
  }
}

extern void re_char(REHandle *hh, int ch, int most)
{
/*
 * Add a node capable of matching a character or one of the 'special',
 * builtin character classes.
 * 30-Jan-90 IDJ most == '%' means that this is a ~-node, created as a result of %a
 */
  struct re_handle *h = (struct re_handle *) hh;
  if (h->pass == 1)
  {
    h->n_nodes += 1;
  }
  else
  { Node *n = h->free++;
    h->last = n - h->space;
    n->next = h->free - h->space;
    if (ch > (255+RE_NOT))
    {
      n->type = (most == '%')?SPECIAL_NODE+MOST_NODE:SPECIAL_NODE;
      /* ANY, Start Of Buffer and End Of Buffer can't be negated... */
      if (ch == RE_ANY || ch == RE_SOB || ch == RE_EOB) ch &= ~RE_NOT;
      ch &= (255+RE_NOT);
    }
    else
    {
      n->type = (most == '%')?CHAR_NODE+MOST_NODE:CHAR_NODE;
    }
    if (ch & RE_NOT) n->type += NOT;
    n->val = ch;
  }
}

extern void re_charset(REHandle *hh, CharSet *charset, int most)
{
/*
 * Add a node capable of matching a custom-built class of characters.
 */
  struct re_handle *h = (struct re_handle *) hh;
  if (h->pass == 1)
  {
    h->n_nodes += 1;
    h->n_charsets += 1;
  }
  else
  { Node  *n = h->free++;
    NFA *nfa = h->nfa;
    char *cs = (char *)nfa + sizeof (NFA) + sizeof(CharSet)*nfa->n_charsets;
    h->last = n - h->space;
    n->type = (most == '%')?CHAR_SET+MOST_NODE:CHAR_SET;
    n->val  = nfa->n_charsets++;
    n->next = h->free - h->space;
    memcpy(cs, charset, sizeof(CharSet));
  }
}

extern void re_head(NFA *nfa, char *buf, int buflen)
{
  Node *nodes, *n;
  nodes = (Node *)((char *)nfa +
                   sizeof(NFA) + sizeof(CharSet)*nfa->n_charsets);
  n = nodes + nfa->entry;
  while ((n->type == CHAR_NODE) && (buflen > 1))
  {
    *buf++ = n->val;
    n = nodes + n->next;
    --buflen;
  }
  *buf = 0;
}


/* ------------------------------------ code to deal with ambiguous patterns ------------- */
/* yuk */

extern int re_make_subpattern_table(NFA *nfa, SubPattern *sub_patterns)
{
   int np = 0;
   unsigned int n;
   Node *nodes;
   int s = 0;

   nodes = (Node *)((char *)nfa +
                    sizeof(NFA) + sizeof(CharSet) * nfa->n_charsets);
   n = *((unsigned *)(nodes + np));

   for (n = *((unsigned *)(nodes + np)); !(type_(n) & END_NODE); n = *((unsigned *)(nodes + np)))
   {
      /* check for *x */
      if (type_(n) == OR_NODE)
      {
#ifdef LIB_DEBUGGING
         werr(FALSE, "*x node subpattern %d", s);
#endif
         /* set up table entry */
         sub_patterns[s].start_node = np;
         sub_patterns[s].flags |= F_AMBIGUOUS|F_0ORMORE;
         s++;

         np = alt_next_(n);
         continue;
      }

      /* check for %x or ^x */
      if (type_(*((unsigned *)(nodes + next_(n)))) == OR_NODE)
      {
         unsigned int ornode = *((unsigned *)(nodes + next_(n)));

         /* check further... */
         if (next_(ornode) == np)  /* certain it is %x or ^x */
         {
#ifdef LIB_DEBUGGING
               werr(FALSE, "%%x or ^x subpattern %d", s);
#endif

            /* set up table entry */
            sub_patterns[s].start_node = np;
            sub_patterns[s].flags |= F_AMBIGUOUS;
            s++;

            if (type_(*((unsigned *)(nodes + alt_next_(ornode)))) & MOST_NODE)  /* %x */
            {
               unsigned int mostnode = *((unsigned *)(nodes + alt_next_(ornode)));

               sub_patterns[s-1].flags |= F_MOST;
               np = next_(mostnode);
            }
            else  /* ^x */
            {
               sub_patterns[s-1].flags |= F_1ORMORE;
               np = alt_next_(ornode);
            }
            continue;
         }
      }

      /* check for other special node */
      if (!(type_(n) & MOST_NODE) &&

         (((type_(n) & SPECIAL_NODE) && (val_(n) == RE_ANY-512 ||
                                         val_(n) == RE_DIGIT-512))
         ||
           (type_(n) & CHAR_SET)
         ||
           ((type_(n) & CHAR_NODE) && (type_(n) & NOT))))
      {

#ifdef LIB_DEBUGGING
               werr(FALSE, "special subpattern %d", s);
#endif
         /* set up table entry */
         sub_patterns[s].start_node = np;
         sub_patterns[s].flags |= F_AMBIGUOUS;
         s++;

         np = next_(n);
         continue;
      }

      /* we only get here if it's not a 'special' node */
      sub_patterns[s].start_node = np;
      s++;

      np = next_(n);
   }

#ifdef LIB_DEBUGGING
   { int ss=0;
     while(sub_patterns[ss].start_node != 0xffffffff)
     {
        werr(FALSE, "sub[%d] start_node %d flags %d", ss, sub_patterns[ss].start_node, sub_patterns[ss].flags );
        ss++;
     }
     werr(FALSE, "num subs %d", s);
   }
#endif

   return s;
}


static void re__setup_ambiguous_table(Ambiguous_entry *ambiguous, SubPattern *sub_patterns, int end)
{
   int s = 0, a = 0;

   while (s < 128 && a < MAX_AMBIGUOUS && sub_patterns[s].start_node != 0xffffffff)
   {
      if (sub_patterns[s].flags & F_AMBIGUOUS)
      {
         ambiguous[a].start = sub_patterns[s].start_pos;
         ambiguous[a].end = (sub_patterns[s+1].start_node != 0xffffffff)?sub_patterns[s+1].start_pos:end;
         a++;
      }
      s++;
   }
#ifdef LIB_DEBUGGING
   for (a = 0; a < MAX_AMBIGUOUS; a++)
       werr(FALSE, "amb[%d] start %d end %d", a, ambiguous[a].start, ambiguous[a].end);
#endif
}


/*
 * To Understand the following, read the relevant chapter of
 * Robert Sedgewick's masterpiece entitled "Algorithms".
 * (But beware: if you read the first edition, the code he gives
 *  is wrong! This is corrected in the second edition. Because
 *  there are 2 editions I don't quote page or chapter numbers.)
 */

typedef struct
{
   unsigned int node_number;
   int pusher;
} deque_entry;

typedef struct
{
   unsigned int node_number;
   int pusher;
   int p;      /* pos. in text buffer relative to start of search */
} path_entry;


#define  DQ_SIZE        256
#define  SCAN           255    /* advance one character in this state */
/* now define the deque operations */
#define  rm_hd_()       (head = (head + 1) & (DQ_SIZE-1))

#ifdef FIELDNUM
#define  add_hd_(x)     (head = (head - 1) & (DQ_SIZE-1), dq[head].node_number = (x), dq[head].pusher = pusher)
#define  add_tl_(x)     (tail = (tail + 1) & (DQ_SIZE-1), dq[tail].node_number = (x), dq[tail].pusher = pusher)
#else
#define  add_hd_(x)     (head = (head - 1) & (DQ_SIZE-1), dq[head].node_number = (x))
#define  add_tl_(x)     (tail = (tail + 1) & (DQ_SIZE-1), dq[tail].node_number = (x))
#endif

#ifdef FIELDNUM
static BOOL re__is_start_sub_pattern(SubPattern *sub_patterns, unsigned int node, int *s)
{
   *s = 0;

   while (sub_patterns[*s].start_node != 0xffffffff)
   {
      if (sub_patterns[*s].start_node == node) return TRUE;
      (*s)++;
   }

   return FALSE;
}

static void re__trace_back(path_entry *path, int goal_node, Ambiguous_entry *ambiguous, SubPattern *sub_patterns, int end)
{
   int p = goal_node;
   int i, sub;

   /* --- trace goal path back through array (marking members) --- */
   while (p != -1)
   {
      path[p].node_number |= ON_PATH;   /* mark node as on goal path */
      p = path[p].pusher;
   }

   /* --- ... now process array forwards setting start points for sub-patterns --- */
   for (i = 0; i<=goal_node; i++)
   {
      if (path[i].node_number & ON_PATH)
      {
         if (re__is_start_sub_pattern(sub_patterns, path[i].node_number & ~ON_PATH, &sub))
         {
             if (sub_patterns[sub].start_pos == -1) sub_patterns[sub].start_pos = path[i].p;
          }
      }
   }

   /* --- ... and construct ambiguous table from sub-pattern table --- */
   re__setup_ambiguous_table(ambiguous, sub_patterns, end);
}
#endif

extern int re_match(txt t, txt_index at, NFA *nfa, int scanning, int *end, int nocase
#ifdef FIELDNUM
, Ambiguous_entry *ambiguous
, SubPattern *sub_patterns
, BOOL rescanning
#endif
)
{
  int j, p;
  unsigned head, tail, dqh;
  Node *nodes;
  deque_entry dq[DQ_SIZE];
  char *buf;
  int len;
#ifdef FIELDNUM
  path_entry *path = 0;
  int path_size = 0;
  int pusher = -1;
  int i;
  BOOL met_most_node = FALSE;
#endif

  nodes = (Node *)((char *)nfa +
                   sizeof(NFA) + sizeof(CharSet) * nfa->n_charsets);

  j = 0;


  /* get pointer into flex block */
  txt_arrayseg(t, at, &buf, &len);

#ifdef FIELDNUM
  if (rescanning)
  {
     if ((path=malloc(DQ_SIZE * sizeof(path_entry))) == 0)
     {
        werr(FALSE, msgs_lookup(MSGS_txtfind3));
        path_size = DQ_SIZE;
        return -1;
     }
  }
#endif

  do
  {
#ifdef FIELDNUM
    if (ambiguous)
    {
       for (i = 0; i < 128; i++) sub_patterns[i].start_pos = -1;
       for (i = 0; i < MAX_AMBIGUOUS; i++) ambiguous[i].start = ambiguous[i].end = -1;
    }
    pusher = -1;
    dq[0].pusher = -1;
    dq[1].pusher = 0;
#endif

    head = 0;  tail = 1;  dq[0].node_number = nfa->entry;  dq[1].node_number = SCAN;
    met_most_node = FALSE;

    p = j;

    do
    {
      dqh = dq[head].node_number;
#ifdef FIELDNUM
      if (rescanning)
      {
         if (pusher+1 >= path_size)
         {
            path_entry *old_path = path;

            path_size += DQ_SIZE;
            if ((path = realloc(path, path_size * sizeof(path_entry))) == 0)
            {
               werr(FALSE, msgs_lookup(MSGS_txtfind3));
               free(old_path);
               return -1;
            }
         }
         path[++pusher].node_number = dqh;
         path[pusher].pusher = dq[head].pusher;
         path[pusher].p = p;
      }
#endif

      /* get pointer into flex block (do it again cos it may have moved due to slot extension) */
      if (rescanning) txt_arrayseg(t, at, &buf, &len);

      rm_hd_();

#ifdef LIB_DEBUGGING
      werr(FALSE, "processing node %d, buf %x", dqh, buf);
#endif
      if (dqh == SCAN)
      {
        ++p;  add_tl_(SCAN);  j &= ~SIGN_BIT;  /* turn off exact match */
      }
      else
      { unsigned int n = *((unsigned *)(nodes + dqh));

        if (type_(n) & (CHAR_NODE + SPECIAL_NODE + CHAR_SET))
        { int ch;
          /* another IDJ hack 6-Feb-90 */
          /* if we just had a most node and reached EOB, and no more nodes in pattern */
          /* this is a match!!! */
          if (p >= len)
          {
             if ((type_(n) & MOST_NODE) && type_(*((unsigned *)(nodes + next_(n)))) == END_NODE)
             {
                *end = p;
#ifdef FIELDNUM
                if (rescanning)
                {
                   if (pusher+1 >= path_size)
                   {
                      path_size += DQ_SIZE;
                      if (realloc(path, path_size) == 0)
                      {
                         werr(FALSE, msgs_lookup(MSGS_txtfind3));
                         free(path);
                         return -1;
                      }
                   }
                   path[++pusher].node_number = next_(n);
                   path[pusher].pusher = pusher-1;
                   path[pusher].p = p;

                   re__trace_back(path, pusher, ambiguous, sub_patterns, p);
                   free(path);
                }
#endif
                return j;
             }
             else
                continue;
          }
          /* before IDJ hack:  if (p >= len) continue;*/
          /* we can safely access array, cos no extension can have happened, since
             we called txt_arrayseg */

          ch = buf[p];

          if (type_(n) & CHAR_NODE)
          {
            int lch = (nocase)?tolower(ch):ch;
            unsigned int lvaln = (nocase)?tolower(val_(n)):val_(n);
            if (((lch == lvaln) && (type_(n) & NOT) != 0) ||
                (lch != lvaln && (type_(n) & NOT) == 0)) continue;
          }
          else if (type_(n) & SPECIAL_NODE)
          { int match;
            switch (val_(n))
            {
case  RE_ANY-512:      match = 1;            break;
case  RE_SOB-512:      if (p == 0) {match = 1;  --p;} else match = 0;
                                             break;
case  RE_EOB-512:      match = (p == (len-1));   break;
case  RE_ALPHA-512:    match = isalpha(ch);  break;
case  RE_ALPHANUM-512: match = isalnum(ch);  break;
case  RE_DIGIT-512:    match = isdigit(ch);  break;
case  RE_XDIGIT-512:   match = isxdigit(ch); break;
case  RE_UPPER-512:    match = isupper(ch);  break;
case  RE_LOWER-512:    match = islower(ch);  break;
case  RE_SPACE-512:    match = isspace(ch);  break;
case  RE_CNTRL-512:    match = iscntrl(ch);  break;
case  RE_GRAPHIC-512:  match = isgraph(ch);  break;
case  RE_PRINT-512:    match = isprint(ch);  break;
case  RE_PUNCT-512:    match = ispunct(ch);  break;
default:               match = 0;
            }
            if (type_(n) & NOT) match = !match;
            if (!match) continue;
          }
          else /* CHAR_SET */
          { CharSet *s = (CharSet *)((char *) nfa +
                         sizeof(NFA)) + val_(n);
            if (((*s)[ch >> 5] & (1L << (int)(ch & 31))) == 0) continue;
          }
          /* matched a character... */
          /* 29-Jan-90 IDJ hack for MOST -- rematch same char!!!!! */
#ifdef LIB_DEBUGGING
          werr(FALSE, "matched a char");
#endif
          if (type_(n) & MOST_NODE)
          {
              /* yuk!!!! can I do better?*/
              met_most_node = TRUE;
#if FALSE
              p--;        /* backtrack one char in the text */
              if (((head+1)&(DQ_SIZE-1)) <= tail) rm_hd_();   /* ignore next state (it will cause infinite loop!) unless queue has only one node */
#endif
          }

          add_tl_(next_(n));
        }
        else if (type_(n) == NULL_NODE)
        {
          add_hd_(next_(n));
        }
        else if (type_(n) == OR_NODE)
        {
          if ((type_(next_(n)) == OR_NODE) || (type_(next_(n)) == NULL_NODE)) {
            add_hd_(next_(n));
            add_hd_(alt_next_(n));
          } else {
            add_hd_(alt_next_(n));
            add_hd_(next_(n));
          }
        }
        else if (type_(n) == END_NODE)
        {
           if (scanning)
           {
               *end = (met_most_node)?(p-1):p;
#ifdef FIELDNUM
               if (rescanning)
               {
                  re__trace_back(path, pusher, ambiguous, sub_patterns, (met_most_node)?(p-1):p);
                  free(path);
               }
#endif
               return j;
           }
           j |= SIGN_BIT;            /* try for exact match... */
        }
      }
    } while (head != tail);

    if (!scanning)
    {
      /* LH.
       * p is incremented if a complete match was made but:
       * I think j & SIGN_BIT means 'the nfa ran out of state (ENDNODE reached)
       * and so a match was found'. It seems that it does not matter that the
       * end of the input string was not also reached. So I have added the
       * extra test that the end of input string was reached as well. It seems
       * to work, but I don't understand all that is happening. Not enough time.
       */
#ifdef LIB_DEBUGGING
  {
    if (j & SIGN_BIT) {
      if (p == len) werr(FALSE, "'%s' matches the nfa completely\n", buf);
      else werr(FALSE, "end of nfa reached at character %d in '%s'\n", p+1, buf);
    } else
      werr(FALSE, "end of nfa not reached, mismatch at character %d in '%s'\n", p+1, buf);
  }
#endif
      if ((j & SIGN_BIT) && (p == len)) ++p;
#ifdef FIELDNUM
      if (ambiguous) re__trace_back(path, pusher, ambiguous, sub_patterns, p);
#endif
      return p;
    }

    ++j;
  } while (j < len);
#ifdef LIB_DEBUGGING
  werr(FALSE, "re_match returning -1");
#endif
  return -1;
}

#ifdef LIB_DEBUGGING

extern void print_nfa(NFA *nfa)
{
  int posn;
  char *base;
  {
    posn = sizeof(NFA) + sizeof(CharSet) * nfa->n_charsets;
    werr(FALSE, "n_arcs = %d, entry = %d, NFA at %d\n",
           nfa->n_arcs, nfa->entry, posn);
    base = (char *) nfa + posn;
    posn = 0;
    for (;;)
    { Node *n = (Node *)(base) + posn;
      werr(FALSE, "%d: ", posn);
      if (n->type & MOST_NODE) werr(FALSE, "Most node");
      switch (n->type & ~MOST_NODE)
      {
case NULL_NODE:
        werr(FALSE, "NULL");
        break;
case CHAR_NODE:
        werr(FALSE, "CHAR '%c'", n->val);
        break;
case NOT_CHAR_NODE:
        werr(FALSE, "NOT CHAR '%c'", n->val);
        break;
case CHAR_SET:
        werr(FALSE, "CHARSET[%u]", n->val);
        break;
case SPECIAL_NODE:
        werr(FALSE, "SPECIAL[%u]", n->val);
        break;
case NOT_SPECIAL:
        werr(FALSE, "NOT SPECIAL[%u]", n->val);
        break;
case OR_NODE:
        werr(FALSE, "OR, alt_next = %u", n->alt_next);
        break;
case END_NODE:
        werr(FALSE, "END\n");
        return;
        break;
default:
        werr(FALSE, "Unrecognised node\n");
      }
      werr(FALSE, ", next = %u\n", n->next);
      if (n->type == CHAR_SET)
      { CharSet *cs = (CharSet *) ((char *)nfa + sizeof(NFA)) + n->val;
        int j;
        for (j = 0;  j < 8;  ++j) werr(FALSE, "|%.8lx", (*cs)[j]);
        werr(FALSE, "|\n");
      }
      ++posn;
    }
  }
}


#endif