/* Implementation of regular expression matching interface.
 *
 * Copyright 2001, KUN.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

/* $Id: pattern.c,v 1.16 2005/09/06 10:03:16 marcs Exp $ */

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include <sys/types.h>
#include "regex.h"

/* libabase includes */
#include <abase_error.h>
#include <abase_memalloc.h>

/* local include */
#include "pattern.h"

/*
 * Use the DFA implementation of regular expressions (from dfa.c)
 * instead of the NFA implementation (from regex.c).
 * If USE_COMBINED_REGEX (from rtslex.c) is set then USE_DFA_REGEXP
 * must be too.
 * Depending on the number and complexity of the regexps in the grammar,
 * this can easily be 100 times as fast.
 * The NFA should be removed, and then this option too.
 */
#define USE_DFA_REGEXP	1

/*
 * If neither AGFL nor RTS is defined, build the stand-alone test driver
 * (prompt loop and main) found at the end of this file.
 */
#if !defined(AGFL) && !defined(RTS)
#define	STAND_ALONE
#endif

/*------------------------------------------------------------------------------
// Regexp wrapper functions.
//----------------------------------------------------------------------------*/
#ifdef RTS
#include "rtscode.h"
#else
/*
 * Characters that count as word terminators. convert_regexp() uses them
 * to build the character classes substituted for ' ' and '.'.
 * When RTS is defined nothing is defined here; presumably rtscode.h
 * supplies a definition of the same name (TODO confirm).
 */
static unsigned char* word_terminator_chars = " ";
#endif /* RTS */

#if USE_DFA_REGEXP
#include "dfa.h"
#include "abase_lex_input.h"
#else
#include "regex.h"
#endif

/*
 * Last message handed to dfaerror(). new_regexp() and
 * add_alternative_to_regexp() reset it to NULL before calling into the
 * DFA code and treat a non-NULL value afterwards as "an error occurred".
 * regexp_error() formats it for the caller.
 */
const char *dfa_error_message;

#if USE_DFA_REGEXP

/*
 * Syntax bits handed to dfasyntax().
 * See regex.h for the meaning of these bits.
 * Based on RE_SYNTAX_EGREP with some additions and removals.
 * In short:
 * 	RE_CHAR_CLASSES
 * 		[[:alpha:]] etc are supported
 * 	RE_CONTEXT_INDEP_ANCHORS
 * 		^ and $ are always anchors, even in the middle
 * 		(but they are probably quite useless anyway in AGFL;
 * 		there is no option to turn them off completely)
 * 	RE_CONTEXT_INDEP_OPS
 * 		*?+ are operators even if there is nothing before them
 * 		(which then is an error)
 * 	not RE_HAT_LISTS_NOT_NEWLINE
 *		[^something] does match a newline
 * 	not RE_NEWLINE_ALT
 * 		newline is not an alternation operator
 * 	RE_NO_BK_PARENS
 * 		no \ needed before ( and ) as operators
 * 	RE_NO_BK_VBAR
 * 		no \ needed before | as operator
 * 	RE_NO_BK_REFS
 *		\1 is not a back reference
 *	RE_NO_GNU_OPS
 *		\w\W\b\B\<\>\`\' are not operators but ordinary characters
 */
#define SYNTAX (RE_CHAR_CLASSES | RE_CONTEXT_INDEP_ANCHORS | \
		RE_CONTEXT_INDEP_OPS | RE_NO_BK_PARENS | \
		RE_NO_BK_VBAR | RE_NO_BK_REFS | RE_NO_GNU_OPS)

/*
 * Cached replacement texts used by convert_regexp():
 *	re_space	substituted for a space: [terminators]+
 *	re_dot		substituted for a dot:   [^terminators]
 * Both start out NULL, are built on first use from word_terminator_chars
 * and are then kept; nothing in this file frees them.
 */
static char *re_space, *re_dot;

/*
 * Build the text of a bracket expression: 'pre', then the characters of
 * 'chars' re-arranged so that each one is taken literally, then 'post'.
 *
 * Characters that need care inside a bracket expression:
 *	']'  is literal only as the very first member (an empty class
 *	     would not make much sense, so it cannot close the class there);
 *	'-'  is literal as the very first or the very last member, where
 *	     it has no neighbour on one side to form a range with;
 *	'^'  negates the class when it directly follows the opening '[',
 *	     so it must never be emitted as the first member.
 * Nothing else is treated specially here; a backslash is just a backslash
 * (SYNTAX does not contain RE_BACKSLASH_ESCAPE_IN_LISTS).
 *
 * Resulting member order: ']' (if any), all ordinary characters in their
 * original order, '^' (if any), '-' (if any). If '^' would otherwise come
 * first and a '-' is available, the '-' is moved to the front instead of
 * the back so that it shields the '^'.
 *
 * Limitations: when 'chars' consists of nothing but '^', the caret cannot
 * be protected and ends up first; an empty 'chars' yields an empty class.
 * A "[:" sequence in 'chars' is not protected against being read as the
 * start of a named class such as [:alpha:].
 *
 * Repeated occurrences of ']', '-' and '^' are collapsed into one.
 *
 * Returns a string allocated with abs_malloc(); the caller owns it.
 */

static
char *
make_charclass(const char *pre, const char *chars, const char *post)
{
    int has_close_bracket = strchr(chars, ']') != NULL;
    int has_minus = strchr(chars, '-') != NULL;
    int has_caret = strchr(chars, '^') != NULL;
    char *result;
    char *members;	/* where the first class member goes */
    char *here;
    char c;

    /*
     * Members are only re-ordered or collapsed, never duplicated, so the
     * sum of the three lengths is always sufficient.
     */
    result = abs_malloc(strlen(pre) + strlen(chars) + strlen(post) + 1,
	    "make_charclass");
    strcpy(result, pre);
    members = result + strlen(pre);
    here = members;

    if (has_close_bracket)
	*here++ = ']';		/* must be very first */

    while ((c = *chars++)) {
	if (c != ']' && c != '-' && c != '^')
	    *here++ = c;
    }

    if (has_caret) {
	if (here == members && has_minus) {
	    /* nothing precedes the caret yet: let the '-' go first */
	    *here++ = '-';
	    has_minus = 0;
	}
	*here++ = '^';		/* must not be very first */
    }

    if (has_minus)
	*here++ = '-';		/* must be very last or very first */

    strcpy(here, post);

    return result;
}

/*
 * Convert a regular expression such that it conforms to our internal
 * idea about word terminators.
 * Specifically:
 * 	a space matches one or more terminators: [terminators]+
 * 	a . does not match a terminator: [^terminators]
 *
 * Neither substitution is made inside a bracket expression, nor when the
 * character is preceded by an (unescaped) backslash: "\." and "\ " keep
 * their literal meaning.
 *
 * Bracket expressions are tracked as follows: an unescaped '[' seen
 * outside a class opens one; an optional '^' directly after it and the
 * first member after that are copied unseen, since neither can close the
 * class; the next ']' closes it (a backslash before it does not count,
 * because SYNTAX lacks RE_BACKSLASH_ESCAPE_IN_LISTS). A '[' met inside an
 * open class is an ordinary member.
 *
 * NOTE(review): named classes such as [:alpha:] inside a bracket
 * expression are not recognised here; their closing ":]" is taken as the
 * end of the enclosing class. Harmless for "[[:alpha:]]", but a space or
 * dot following a named class inside the same brackets gets substituted.
 *
 * Returns the new regular expression.
 * Must be freed with abs_free().
 */
static
char *
convert_regexp(const char *re)
{
    size_t space_len, dot_len;
    int len;
    int margin;
    int i, j;
    int backslash, prevbackslash, cclass;
    char c;
    char *copy;

    if (!re_space) {
	re_space = make_charclass("[", word_terminator_chars ,"]+");
    }
    if (!re_dot) {
	re_dot = make_charclass("[^", word_terminator_chars ,"]");
    }
    space_len = strlen(re_space);
    dot_len = strlen(re_dot);

    /*
     * One loop iteration appends at most the longer of the two
     * replacement texts (or 3 characters for "[^x"); the margin keeps
     * room for that plus the final terminator.
     */
    margin = (int)(space_len > dot_len ? space_len : dot_len) + 2;
    len = 2 * (int)strlen(re) + 2 * margin;
    copy = abs_malloc(len, "convert_regexp");

    j = 0;
    backslash = 0;
    prevbackslash = 0;
    cclass = 0;

    for (i = 0; (c = re[i]); i++) {
	if (j > len - margin) {
	    len = 2 * len;
	    copy = abs_realloc(copy, len, "convert_regexp");
	}

	/* backslash: is this character an unescaped backslash? */
	prevbackslash = backslash;
	if (c == '\\') {
	    backslash = !backslash;
	} else {
	    backslash = 0;
	}

	switch (c) {
	case '[':
	    copy[j++] = c;
	    if (!cclass && !prevbackslash) {
		cclass = 1;
		/* a leading ^ only negates the class */
		if (re[i + 1] == '^')
		    copy[j++] = re[++i];
		/*
		 * Copy the first member too, since it can never end the
		 * cclass. Stop at the end of the input, so that the loop
		 * still finds the terminating NUL.
		 */
		if (re[i + 1] != '\0')
		    copy[j++] = re[++i];
	    }
	    break;
	case ']':
	    if (SYNTAX & RE_BACKSLASH_ESCAPE_IN_LISTS) {
		/* Backslashes inside [ ] are escapes. */
		if (!prevbackslash)
		    cclass = 0;
	    } else {
		/* Backslashes inside [ ] are not escapes.  */
		cclass = 0;
	    }
	    copy[j++] = c;
	    break;
	case '.':
	    if (cclass || prevbackslash) {
		copy[j++] = c;
	    } else {
		memcpy(&copy[j], re_dot, dot_len);
		j += (int)dot_len;
	    }
	    break;
	case ' ':
	    if (cclass || prevbackslash) {
		/* "\ " must stay a literal space, not become "\[ ]+" */
		copy[j++] = c;
	    } else {
		memcpy(&copy[j], re_space, space_len);
		j += (int)space_len;
	    }
	    break;
	default:
	    copy[j++] = c;
	    break;
	}
    }
    copy[j++] = '\0';
    /*
     * Don't bother to re-allocate the memory again to an exact size,
     * since the copy is going to be freed very soon anyway.
     */
    return copy;
}

#endif


/*
 * Compile 'regexp_txt' into a newly allocated RegExp.
 *
 * DFA build: the text is first passed through convert_regexp(), then
 * compiled with dfacomp(); *error_p becomes 1 if dfaerror() was called
 * during compilation and 0 otherwise.
 * NFA build: the text is compiled with regcomp(REG_EXTENDED) and
 * *error_p receives its return value.
 *
 * The allocated object is returned in either case, also when an error
 * was reported; release it with delete_regexp().
 */
RegExp* new_regexp(const char* regexp_txt, int* error_p)
{
#if USE_DFA_REGEXP
    struct dfa *automaton;
    char *converted;

    automaton = abs_malloc (sizeof(struct dfa), "new_regexp");
    converted = convert_regexp(regexp_txt);

    /* dfaerror() will set this again if anything goes wrong below */
    dfa_error_message = NULL;
    dfasyntax(SYNTAX, /* fold = */0, /* eolchar = */abs_EosMark);
    dfacomp(converted, strlen(converted), automaton, /* searchflag = */ 0);
    abs_free(converted, "new_regexp");

    if (dfa_error_message != NULL)
	*error_p = 1;
    else
	*error_p = 0;

    return (RegExp *) automaton;
#else
    regex_t* compiled;

    compiled = abs_malloc (sizeof(regex_t), "new_regexp");
    *error_p = regcomp(compiled, regexp_txt, REG_EXTENDED);

    return (RegExp*) compiled;
#endif
}

/*
 * Error hook: remember 'message' in dfa_error_message and return
 * normally. Only the pointer is stored, the text is not copied.
 * Presumably called by the DFA code (dfa.c) when it rejects a
 * regular expression -- TODO confirm.
 */
void
dfaerror(const char *message)
{
    dfa_error_message = message;
}

#if USE_DFA_REGEXP

/*
 * Allocate a RegExp and initialise it with dfainit(), without compiling
 * any pattern into it. Intended to be filled with
 * add_alternative_to_regexp() and completed with finalize_regexp().
 *
 * The result is owned by the caller; release it with delete_regexp().
 */
RegExp *
empty_regexp(void)
{
    /* tag the allocation with this function's own name */
    struct dfa *d = abs_malloc (sizeof *d, "empty_regexp");

    dfasyntax(SYNTAX, /* fold = */0, /* eolchar = */abs_EosMark);
    dfainit(d);
    return (RegExp *) d;
}

/*
 * Parse 'regexp_txt' (after conversion by convert_regexp()) into the
 * existing DFA object 'regexp' using dfaparse().
 *
 * 'regexp' must not be NULL (asserted).
 * Returns 1 if dfaerror() was called while parsing, 0 otherwise.
 */
int
add_alternative_to_regexp(RegExp *regexp, const char* regexp_txt)
{
    struct dfa *automaton = (struct dfa *)regexp;
    char *converted;

    assert(regexp);

    converted = convert_regexp(regexp_txt);
    dfa_error_message = NULL;	/* set again by dfaerror() on failure */
    dfaparse(converted, strlen(converted), automaton);
    abs_free(converted, "add_alternative_to_regexp");

    return (dfa_error_message == NULL) ? 0 : 1;
}

/*
 * Complete a RegExp by running dfamust() and then dfaanalyze() on it.
 *
 * 'regexp' must not be NULL (asserted). Always returns 0.
 */
int 
finalize_regexp(RegExp *regexp)
{
    struct dfa *automaton;

    assert(regexp);
    automaton = (struct dfa *)regexp;

    dfamust(automaton);
    dfaanalyze(automaton, /*searchflag = */0);

    return 0;
}

#endif

/*
 * Release a RegExp: first the backend's internal data (dfafree() or
 * regfree()), then the object itself. A NULL argument is ignored.
 */
void delete_regexp(RegExp* regexp)
{
    if (!regexp)
	return;

#if USE_DFA_REGEXP
    dfafree((struct dfa *) regexp);
#else
    regfree((regex_t*)regexp);
#endif
    abs_free (regexp, "delete_regexp");
}

/* Number of regmatch_t slots passed to regexec() in the NFA build. */
#define NMATCH	1

/*
 * Match 'regexp' against the start of 'str'.
 *
 * Returns a pointer into 'str' just past the matched text, or NULL when
 * there is no match.
 *
 * DFA build: the offset is whatever dfaexec() returns (negative means no
 * match); a back-reference report from dfaexec() trips an assertion.
 * NFA build: regexec() must succeed with the match starting at offset 0.
 */
const char*
match_regexp(const char* str, RegExp* regexp)
{
#if USE_DFA_REGEXP
  int backref = 0;
  int consumed;

  /*
   * Originally, dfaexec required that s[size-1] == eolchar
   * but with my necessary modification it now relies purely 
   * on finding eolchar in the text.
   */
  consumed = dfaexec((struct dfa *)regexp, str,
				/* size = */0, &backref);
  assert(!backref);

  return (consumed < 0) ? NULL : str + consumed;
#else
  regmatch_t pmatch[NMATCH];

  if (regexec((regex_t*)regexp, str, NMATCH, pmatch, 0) != 0)
    return NULL;
  if (pmatch[0].rm_so != 0)
    return NULL;

  return str + pmatch[0].rm_eo;
#endif
}

#if USE_DFA_REGEXP
/*
 * Like match_regexp(), but runs dfaexec_allmatches() and hands it
 * 'nalts' and 'alts' unchanged.
 * NOTE(review): presumably 'alts' is an array of 'nalts' entries that
 * receives per-alternative match results -- confirm against dfa.c.
 *
 * Returns a pointer into 'str' just past the matched text, or NULL when
 * dfaexec_allmatches() returns a negative value. A back-reference report
 * trips an assertion.
 */
const char*
match_regexp_all_alternatives(const char* str, RegExp* regexp, int nalts, int *alts)
{
  int backref = 0;
  int consumed;

  consumed = dfaexec_allmatches((struct dfa *)regexp, str,
				&backref, nalts, alts);
  assert(!backref);

  if (consumed < 0)
      return NULL;

  return str + consumed;
}
#endif

/*
 * Write a human-readable description of the last regexp error into
 * 'buf' (capacity 'sz' bytes, including the terminating NUL).
 *
 * DFA build: 'error' and 'regexp' are not used; the text comes from
 * dfa_error_message, or a fixed placeholder if no message was recorded.
 * Returns the length of the message as reported by snprintf() (which may
 * exceed sz - 1 when the text was truncated); on WIN32 with RTS, the
 * number of characters actually stored. Returns 0 if formatting failed.
 *
 * NFA build: delegates to regerror() and returns its result.
 */
size_t
regexp_error(int error, RegExp* regexp, char* buf, size_t sz)
{
#if USE_DFA_REGEXP
  /* never hand a NULL pointer to %s or strlen() */
  const char *message = dfa_error_message ? dfa_error_message
					  : "unknown regexp error";

  (void) error;
  (void) regexp;

# if defined(WIN32) && defined(RTS)
  {
    /* no snprintf() here: do the bounded copy by hand */
    size_t n = strlen(message);

    if (sz == 0)
      return 0;
    if (n > sz - 1)
      n = sz - 1;
    memcpy(buf, message, n);
    buf[n] = '\0';
    return n;
  }
# else
  {
    int written = snprintf(buf, sz, "%s", message);

    /* a negative result must not turn into a huge size_t */
    return (written < 0) ? 0 : (size_t) written;
  }
# endif
#else
  return regerror(error, (regex_t*)regexp, buf, sz);
#endif
}

/*------------------------------------------------------------------------------
// Define AGFL for CDL3 interface for Agfl compiler.
//
// EXTERNAL TEST validate regexp (>TEXT, INT>):400
//
// Description:
//	Check validity of regular expression TEXT. If valid, assign
//	the number of fields in regexp to INT, and succeed.
//	Note: number of fields not implemented.
//
// Side effects:
//	If the expression TEXT is erroneous, print an error message
//	on standard error.
//----------------------------------------------------------------------------*/
#ifdef AGFL

#include <cdl3rts.h>

/*
 * CDL3 external test: compile the regular expression in v_TEXT.
 *
 * On success stores 1 in *v_INT (the number of fields is not implemented)
 * and returns 1. On failure returns 0 after reporting: a negative error
 * code goes to abs_fatal(), any other code is formatted with
 * regexp_error() and printed on standard error. The compiled object is
 * released in every case.
 */
int E400_validate_regexp_TEXT_INT(value v_TEXT, value* v_INT)
{
  enum { BufSz = 256 };
  char buf[BufSz];
  int error;
  RegExp* comp;

  comp = new_regexp(Text(v_TEXT), &error);

  if (error != 0)
  {
    if (error < 0)
    {
      abs_fatal ("regexp not supported, please contact agfl@cs.kun.nl!");
    }
    else
    {
      regexp_error(error, comp, buf, BufSz);
      fprintf(stderr, "agfl: regexp error: %s\n", buf);
    }
    delete_regexp(comp);
    return 0;
  }

  *v_INT = C_INT(1L);		/* number of fields: not implemented */
  delete_regexp(comp);
  return 1;
}

#endif /* AGFL */

/*
//------------------------------------------------------------------------------
// Define STAND_ALONE for stand-alone executable with simple
// user interface for testing regexps.
//------------------------------------------------------------------------------
*/

#ifdef STAND_ALONE

/*
 * Strip one trailing newline from 'p', in place, and return 'p'.
 *
 * A string that does not end in a newline is left untouched: this
 * includes the empty string (where blindly writing to p[strlen(p) - 1]
 * would land outside the buffer) and a final input line that was cut off
 * by end-of-file or by the buffer size, which must keep its last
 * character.
 */
static char* chop(char* p)
{
  size_t len = strlen(p);

  if (len > 0 && p[len - 1] == '\n')
    p[len - 1] = '\0';
  return p;
}

/*
 * Print the "pattern> " prompt on standard error and read one line from
 * standard input into 'buf', which must have room for at least MaxIn
 * bytes.
 *
 * Returns 'buf', or NULL when fgets() fails (end of input or read error)
 * or when the line read is empty, i.e. starts with a newline.
 */
static char* prompt(char* buf)
{
  enum { MaxIn = 1024 };

  fprintf(stderr, "pattern> ");

  if (fgets(buf, MaxIn, stdin) == NULL)
    return NULL;
  if (buf[0] == '\n')
    return NULL;

  return buf;
}

/*
 * Stand-alone test driver.
 *
 * Compiles the regular expression given as the single command line
 * argument, then repeatedly prompts for a line and reports whether the
 * expression matches the whole line, a prefix of it, or nothing. The
 * loop ends on an empty line or when no more input can be read.
 *
 * Exits with status 1 on a usage error or when the expression does not
 * compile; returns 0 otherwise.
 */
int main(int argc, char *argv[])
{
  enum { BufSz = 1024 };

  char		buf[BufSz];
  RegExp*	regexp;
  int		error;

  if (argc != 2)
  {
    fprintf(stderr, "Usage: %s <regexp>\n", argv[0]);
    exit(1);
  }

  regexp = new_regexp(argv[1], &error);
  if (error < 0)
  {
    fprintf(stderr, "regex functions not available... update your OS!\n");
    exit(1);
  }
  if (error != 0)
  {
    regexp_error(error, regexp, buf, BufSz);
    fprintf(stderr, "Error %d: %s\n", error, buf);
    exit(1);
  }

  while (prompt(buf) != NULL)
  {
    char* rest;

    chop(buf);
    rest = (char*)match_regexp(buf, regexp);

    if (rest == NULL)
    {
      printf("No match\n");
      continue;
    }

    if (*rest != '\0')
    {
      /* cut the line off where the match ended, to show just the prefix */
      *rest = '\0';
      printf("Matched prefix: `%s\'\n", buf);
    }
    else
    {
      printf("Matched: `%s\'\n", buf);
    }
  }

  delete_regexp(regexp);
  return 0;
}

#endif /* STAND_ALONE */
