/*
   File: datfct_parser.c
   Parses dat and fact files

   Copyright 2008-2009 Radboud University of Nijmegen
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Library General Public License for more details.
 
   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

   CVS ID: "$Id: datfct_parser.c,v 1.5 2009/02/05 10:07:02 marcs Exp $"
*/

/* system includes */
#include <stdio.h>
#include <string.h>
#include <stdarg.h>

/* libabase includes */
#include <abase_repr.h>
#include <abase_error.h>
#include <abase_memalloc.h>
#include <abase_fileutil.h>

/* liblexicon includes */
#include <lxcn_input.h>
#include <lxcn_vocabulary.h>

/* local includes */
#include "options.h"
#include "globals.h"
#include "dyn_array.h"
#include "lif_parser.h"
#include "datfct_parser.h"
#include "affix_values.h"
#include "nonterminals.h"
#include "entries.h"

/*
   A dat line has the form
   "WORDFORM"	nonterminal name [(PARAMS)] [NUMBER]

   A fact line has the form
   "STRING"	nonterminal name [(PARAMS)] [NUMBER]
   where the STRING will be ignored
*/

/*
   Character reading administration
*/
#define MAX_DAT_LINE_LEN 4095
static char dat_line_buffer[MAX_DAT_LINE_LEN + 1];
static char *dat_fname;
static char *line_ptr;
static int has_errors;
static int has_facts;
static int linenr;
static FILE *dat;

static void dat_error (char *format,...)
{ va_list argp;
  abs_printf ("File '%s', line %d: ", dat_fname, linenr);
  va_start (argp, format);
  abs_vprintf (format, argp);
  va_end (argp);
  abs_printf ("\n");
  has_errors = 1;
}

static void dat_warning (char *format,...)
{ va_list argp;
  if (!verbose) return;
  abs_printf ("    File '%s', line %d: ", dat_fname, linenr);
  va_start (argp, format);
  abs_vprintf (format, argp);
  va_end (argp);
  abs_printf ("\n");
}

static int ch_is_white_space (char ch)
{ return ((ch == ' ') || (ch == '\n') || (ch == '\f') || (ch == '\r') || (ch == '\t'));
}

/* Read line and eat all trailing white space */
static void read_line ()
{ char *line_end;
  int len;
  line_ptr = fgets (dat_line_buffer, MAX_DAT_LINE_LEN, dat);
  linenr++;
  if (line_ptr == NULL) return;
  len = (int) strlen (line_ptr);
  line_end = line_ptr + len - 1;
  while ((len != 0) && ch_is_white_space (*line_end))
    { line_end--; len--; }
  *++line_end = '\0';
}

/* Opening and closing of the dat/fact file */
static void try_open_file (int facts, char *lexicon_name)
{ dat_fname = abs_new_fmtd_string ("try_open_file", "%s.%s", lexicon_name,
				   (facts)?FCT_SUFFIX:DAT_SUFFIX);
  dat = abs_fopen (dat_fname, "r");

  /* Prepare line buffer */
  has_errors = 0;
  has_facts = facts;
  linenr = 0;
  read_line ();
}

static void close_file ()
{ if (has_errors)
    abs_abort ("close_file", "%s file '%s' contains errors",
	       (has_facts)?"Fact":"Lexicon", dat_fname);
  fclose (dat);
  abs_free (dat_fname, "close_file");
}

static void may_skip_white_space ()
{ while (ch_is_white_space (*line_ptr)) line_ptr++;
}

static int is_eof ()
{ return (line_ptr == NULL);
}

static int is_eoln ()
{ return (*line_ptr == '\0');
}

static void should_be_eoln ()
{ if (!is_eoln ())
    dat_error ("End of line expected");
  read_line ();
}

static void skip_eoln ()
{ while (!is_eof () && !is_eoln ()) line_ptr++;
}

static int ahead_letter ()
{ if (('a' <= (*line_ptr)) && (*line_ptr <= 'z')) return (1);
  if (('A' <= (*line_ptr)) && (*line_ptr <= 'Z')) return (1);
  return (0);
}

static int ahead_name_char ()
{ if (('a' <= (*line_ptr)) && (*line_ptr <= 'z')) return (1);
  if (('A' <= (*line_ptr)) && (*line_ptr <= 'Z')) return (1);
  if (('0' <= (*line_ptr)) && (*line_ptr <= '9')) return (1);
  if ((((int) (*line_ptr)) & 0xff) >= 128) return (1);	/* Accept all extended ASCII */
  switch (*line_ptr)
    { case '$':
      case '+':
      case '-':
      case '?':
      case '@':
      case '^':
      case '~':
      case '_': return (1);
      default: break;
    };
  return (0);
}

static int is_char (char ch)
{ if (ch == *line_ptr)
    { line_ptr++;
      may_skip_white_space ();
      return (1);
    };
  return (0);
}

static void should_be_char (char ch)
{ if (is_char (ch)) return;
  dat_error ("Special character '%c'", *line_ptr);
  skip_eoln ();
}

/*
   Basic LL(1) parsing of the dat file
*/
static int is_comment ()
{ if (*line_ptr == '#')
    { read_line ();
      return (1);
    };
  return (0);
}

/*
   Reading of numbers:
   For parameters we must be able to distinguish between a signed number
   and a - as the first character of an affix name. Frequencies should
   always be positive
*/
static int ahead_signed_number ()
{ char *ptr;
  if (*line_ptr != '-') return (0);
  for (ptr = line_ptr + 1; ch_is_white_space (*ptr); ptr++) ;
  return (('0' <= *ptr) && (*ptr <= '9'));
}

static int is_digit (int *ret_val)
{ if (('0' <= (*line_ptr)) && (*line_ptr <= '9'))
    { *ret_val = (int) (((*line_ptr) - '0'));
      line_ptr++;
      return (1);
    };
  return (0);
}

static int is_number (int *ret_nr)
{ int value, digit;
  if (!is_digit (&value)) return (0);
  while (is_digit (&digit)) value = value * 10 + digit;
  may_skip_white_space ();
  *ret_nr = value;
  return (1);
}

static void should_be_number (int *ret_nr)
{ if (is_number (ret_nr)) return;
  dat_error ("Number expected");
  skip_eoln ();
};

static int is_signed_number (int *ret_nr)
{ int value;
  if (ahead_signed_number ())
    { should_be_char ('-');
      should_be_number (&value);
      *ret_nr = -value;
      return (1);
    };
  return (is_number (ret_nr));
}

/*
   Recognizing strings
*/
static int is_string (char *dbuf)
{ char *dptr = dbuf;
  int done = 0;
  if (*line_ptr != '"') return (0);
  line_ptr++;
  while (!done)
    switch (*line_ptr)
      { case '\0': dat_error ("Unterminated string"); done = 1; break;
	case '"':  line_ptr++; done = 1; break;
	case '\\':
	  { line_ptr++;
	    if (is_eoln ()) break;
	    *dptr++ = '\\';
	    *dptr++ = *line_ptr++;
	  }; break;
	default:
	  *dptr++ = *line_ptr++;
      };
  *dptr = '\0';
  may_skip_white_space ();
  return (1);
}

/*
   When the hyphen convention is active, a prefix, suffix and infix hyphen
   becomes special with the interpretation as below

   Convention: "-" at start of wordform -> Suffix
	       "-" at start and end of wordform -> Infix
	       "-" at end of wordform -> Prefix
	       "-" in middle of wordform -> Soft Hyphen
	       "\!" at start of wordform -> Literal match
	       "\-" -> a literal hyphen
	       "\n" -> a newline
	       "\t" -> a tab
	       "\r" -> a return
	       "\\" -> a backslash
	       "\"" -> a double quote
	       "\uHHHH"     -> UTF8 encoding of unicode character HHHH
	       "\UHHHHHHHH" -> UTF8 encoding of unicode character HHHHHHHH

   Special markers are orthogonal:
   "-\!oxy-" denotes a literal infix "oxy"

   Note: all leading and trailing white space in word forms is removed

   UTF8 encoding:           | 1st byte | 2nd byte | 3rd byte | 4th byte |
   U00000000 - U0000007F -> | 0xxxxxxx |          |          |          |
   U00000080 - U000007FF -> | 110yyyyy | 10xxxxxx |          |          |
   U00000800 - U0000FFFF -> | 1110zzzz | 10yyyyyy | 10xxxxxx |          |
   U00010000 - U001FFFFF -> | 11110uuu | 10uuzzzz | 10yyyyyy | 10zzzzzz |
*/
static int is_hex_digit (char ch, int *val)
{ if (('0' <= ch) && (ch <= '9'))
    { *val = (int) (ch - '0');
      return (1);
    };
  if (('A' <= ch) && (ch <= 'F'))
    { *val = (int) (ch - 'A') + 10;
      return (1);
    };
  if (('a' <= ch) && (ch <= 'f'))
    { *val = (int) (ch - 'a') + 10;
      return (1);
    };
  return (0);
}

static void add_utf8_character (char **sptr, char **dptr, int nr)
{ char *ptr = *sptr;
  char *tptr = *dptr;
  int value = 0;
  int ix, dig, hdr, nr_bytes;

  /* scan character */
  for (ix = 0; ix < nr; ix++)
    { if (is_hex_digit (*ptr, &dig))
	{ value = value * 16 + dig;
	  ptr++;
	}
      else
	{ dat_error ("Hex digit expected in unicode sequence");
	  break;
	};
    };
  *sptr = ptr;

  /* Encode character */
  if    (value & 0x1F0000) { hdr = 0xF0; nr_bytes = 4; }
  else if (value & 0xF800) { hdr = 0xE0; nr_bytes = 3; }
  else if (value & 0x0780) { hdr = 0xC0; nr_bytes = 2; }
  else { hdr = 0; nr_bytes = 1; };

  do
    { nr_bytes--;
      *tptr++ = (char) (hdr | (value >> (nr_bytes * 6)));
      value &= ((1 << (nr_bytes * 6)) - 1);
      hdr = 0x80;
    }
  while (nr_bytes > 0);
  *dptr = tptr;
}

static char lexeme_buf[MAX_DAT_LINE_LEN + 1];
static int is_word_form (char **ret_ptr, int *ret_marker)
{ char string_buf[MAX_DAT_LINE_LEN + 1];
  char *sptr = string_buf;
  char *dptr = lexeme_buf;
  int marker = 0;
  int len;
  char ch;
  if (!is_string (string_buf)) return (0);
  len = (int) strlen (string_buf);
  
  /* Strip leading and trailing layout */
  while (ch = *sptr, ((ch == ' ') || (ch == '\t'))) { sptr++; len--; };
  while (ch = sptr[len - 1], ((ch == ' ') || (ch == '\t'))) sptr[--len] = '\0';

  /* Copy lexeme from source to lexeme_buf while expanding escapes */
  while ((ch = *sptr++))
    switch (ch)
      { case ' ':
	case '\t':
	  { /* Eat all other white space */
	    while ((ch = *sptr), (ch == ' ') || (ch == '\t')) sptr++;
	    *dptr++ = ' ';
	    marker |= LexemeMultiWordBit;
	  }; break;
	case '-':
	  { if (!hyphen_convention_active)
	      *dptr++ = '-';
	    else if ((dptr == lexeme_buf) && !(marker & LexemeSuffixBit))
	      /* "-" is leading the lexeme, hence a suffix marker */
	      marker |= LexemeSuffixBit;
	    else if (!(*sptr))
	      /* "-" is trailing the lexeme, hence a prefix marker */
	      marker |= LexemePrefixBit;
	    else
	      /* In the middle of a lexeme, the - denotes a soft hyphen */
	      *dptr++ = SoftHyphenChar;
	  }; break;
	case '\\':
	  { ch = *sptr++;
	    switch (ch)
	      { case 'n': *dptr++ = '\n'; break;
		case 't': *dptr++ = '\t'; break;
		case 'r': *dptr++ = '\r'; break;
		case '\\': *dptr++ = '\\'; break;
		case '"': *dptr++ = '"'; break;
		case '!':
		  if (dptr != lexeme_buf)
		    { dat_warning ("Literal escape '\\!' is only recognized at start of lexeme");
		      *dptr++ = ch;
		    }
		  else if (marker & LexemeLiteralBit) /* "\!" is leading the lexeme */
		    dat_warning ("Multiple literal marking");
		  else marker |= LexemeLiteralBit;
		  break;
		case '-': *dptr++ = '-'; break;
		case 'u': add_utf8_character (&sptr, &dptr, 4); break;
		case 'U': add_utf8_character (&sptr, &dptr, 8); break;
		default:
		  dat_warning ("Unknown escape sequence '\\%c'", ch);
		  *dptr++ = ch;
	      };
	  }; break;
	default:
	  *dptr++ = ch;
      };
  *dptr = '\0';

  /* Check for empty lexeme */
  if (!strlen (lexeme_buf))
    dat_error ("Lexeme only consists of white space and literal/hyphen marks");

  /* Return with success */
  *ret_ptr = lexeme_buf;
  *ret_marker = marker;
  return (1);
}

/*
   Recognition of nonterminals, either as fact or a lexicon nonterminal
*/
static char nonterminal_buf[MAX_DAT_LINE_LEN + 1];
static int is_nonterminal_name ()
{ char *dptr = nonterminal_buf;
  if (!ahead_letter ()) return (0);
  while (!is_eoln () && ahead_name_char ())
    { *dptr++ = *line_ptr++;
      if (ch_is_white_space (*line_ptr))
        { may_skip_white_space ();

          /* Check for a new idpart */
	  if (is_eoln () || !ahead_letter ())
	    break;
          *dptr++ = ' ';
        };
    };
  *dptr = '\0';
  may_skip_white_space ();
  return (line_ptr != nonterminal_buf);
}

static void should_be_nonterminal_name ()
{ if (is_nonterminal_name ()) return;
  nonterminal_buf[0] = '\0';
  dat_error ("Nonterminal name expected");
  skip_eoln ();
}

/*
   Parameter recognization
   Actual int or text affixes of a call must be saved locally until it
   is clear whether they are critical or non critical parameters.
   So a limit is set to maximal 20 actual positions
   Set affixes are always directly registered through their index.

   We also store the formal typing, the affix indexes, and possible unions into
   three static separate arrays to avoid reallocating over and over again.
   The code for the punions array is quite dirty....

   Following the identification, non critical affixes are entered into
   their appropriate affix table. Critical affixes will be stored into
   their appropriate primary lookup tables.
*/
#define MAX_PARAMS 20
#define DEFAULT_SIZE 256
#define DEFAULT_MAX_MARKERS 8
#define DEFAULT_MAX_UNION 16
static int param_types[MAX_PARAMS];
static int actual_idxs[MAX_PARAMS];

static int act_int_params[MAX_PARAMS];
static char act_text_params[MAX_PARAMS][MAX_DAT_LINE_LEN + 1];
static int_array actual_unions[MAX_PARAMS];

static void init_datfct_parser ()
{ int ix;

  /* Explicitly initialize the unions array */
  for (ix = 0; ix < MAX_PARAMS; ix++)
    actual_unions[ix] = init_int_array (DEFAULT_MAX_UNION);
}

/*
   Integral affix recognition
*/
static int is_number_parameter (int idx)
{ int value;
  if (!is_signed_number (&value)) return (0);
  act_int_params[idx] = value;
  return (1);
}

/*
   Text affix recognition
*/
static int is_string_parameter (int idx)
{ char string_buf[MAX_DAT_LINE_LEN + 1];
  char *sptr = string_buf;
  char *dptr = act_text_params[idx];
  char ch;
  if (!is_string (string_buf)) return (0);

  /* Copy string from source to actual text param buffer while expanding escapes */
  while ((ch = *sptr++))
    switch (ch)
      { case '\\':
	  { ch = *sptr++;
	    switch (ch)
	      { case 'n': *dptr++ = '\n'; break;
		case 't': *dptr++ = '\t'; break;
		case 'r': *dptr++ = '\r'; break;

		/* To add: handling \u */
		/* To add: handling \x */
		default:
		  *dptr++ = ch;
	      };
	  }; break;
        default:
	  *dptr++ = ch;
      };
  *dptr = '\0';
  // *index = register_new_text_affix (cstring_buf);
  return (1);
}

/*
   Recognition of set affixes
   For the reading of affix names, we accept letters, digits and ornaments
*/
static int is_affix_name (int *index)
{ char buf[MAX_DAT_LINE_LEN+1];
  char *dptr = buf;
  while (!is_eoln () && ahead_name_char ())
    *dptr++ = *line_ptr++;
  *dptr = '\0';
  if (dptr == buf) return (0);
  may_skip_white_space ();
  *index = lookup_set_affix (buf);
  if (*index < 0)
    { dat_error ("Unknown set affix '%s'", buf);
      *index = FormalERROR;
    };
  return (1);
}

static void should_be_affix_name (int *index)
{ if (is_affix_name (index)) return;
  dat_error ("affix name expected");
  skip_eoln ();
  *index = FormalERROR;
}

static int is_parameter (int idx)
{ int_array new_union;
  int index;
  if (is_number_parameter (idx))
    { param_types[idx] = FormalINT;
      return (1);
    }
  else if (is_string_parameter (idx))
    { param_types[idx] = FormalTEXT;
      return (1);
    };
  if (!is_affix_name (&index)) return (0);
  if (!is_char ('|'))
    { param_types[idx] = FormalSET;
      actual_idxs[idx] = index;
      return (1);
    };

  /* So we have a union of some set affixes */
  param_types[idx] = FormalUNION;
  new_union = actual_unions[idx];
  new_union -> size = 0;
  app_int_array (new_union, index);

  /* Recognize the rest of the union and directly uniquify the affix indices */
  should_be_affix_name (&index);
  app_sorted_int_array (new_union, index);
  while (is_char ('|'))
    { should_be_affix_name (&index);
      app_sorted_int_array (new_union, index);
    };
  return (1);
}
 
static void should_be_parameter (int idx)
{ if (is_parameter (idx)) return;
  dat_error ("parameter expected");
  skip_eoln ();
}

static int is_parameter_pack (int *nr_formals)
{ int idx = 0;
  if (is_char ('('))
    { should_be_parameter (idx++);
      while (is_char (','))
        { if (idx == MAX_PARAMS)
            dat_error ("Too many actual affix positions");
	  should_be_parameter (idx++);
	};
      should_be_char (')');

      /* Register parameter pack */
      *nr_formals = idx;
      return (1);
    };

  *nr_formals = idx;
  return (0);
}

static void may_be_parameter_pack (int *nr_of_formals)
{ (void) is_parameter_pack (nr_of_formals);
}

static void should_be_parameter_pack (int *nr_of_formals)
{ if (is_parameter_pack (nr_of_formals)) return;
  dat_error ("parameter pack expected");
  skip_eoln ();
}

/*
   After the nonterminal has been identified by name and nr of formals,
   the actual parameter pack is checked against the formal pack. The
   second task of the type check is to enter the int, string and union
   affixes into their proper critical/noncritical storage while registering
   their indexes.
*/
static void type_check_union_and_unify (int ix, int formal)
{ int_array act_union = actual_unions[ix];
  int part_error = 0;
  int aff_nr;
  int iy;

  /* Check if we have only one affix to unify */
  if (act_union -> size == 1)
    { aff_nr = act_union -> array[0];
      if (!affix_belongs_to_domain (aff_nr, formal))
	dat_error ("Type mismatch for parameter %d: union part %s against formal type %s",
                    ix + 1, affix_name_from_index (aff_nr), affix_name_from_index (formal));

      /* We only have a single affix in the union */
      dat_warning ("Union affix resolved into single affix %s", affix_name_from_index (aff_nr));
      param_types[ix] = FormalSET;
      actual_idxs[ix] = aff_nr;
      return;
    };

  /* Type check the union of the affixes against  */
  for (iy = 0; iy < act_union -> size; iy++)
    { aff_nr = act_union -> array[iy];
      if (!affix_belongs_to_domain (aff_nr, formal))
	{ dat_error ("Type mismatch for parameter %d: union part %s against formal type %s",
		     ix + 1, affix_name_from_index (aff_nr), affix_name_from_index (formal));
	  part_error = 1;
	};
    };

  /* Someone in error, resolve */
  if (part_error)
    { actual_idxs[ix] = FormalERROR;
      param_types[ix] = FormalERROR;
    };

  /* Turn it into a single affix */
  actual_idxs[ix] = register_new_union_affix (act_union, formal);
  param_types[ix] = FormalSET;
}

static void type_check_parameter (int ix, int formal, int critical)
{ int ptype = param_types[ix];
  switch (ptype)
    { case FormalINT:
	if (formal != FormalINT)
	  { dat_error ("Type mismatch for parameter %d: %s against formal type %s",
		       ix + 1, affix_name_from_index (ptype), affix_name_from_index (formal));
	    actual_idxs[ix] = FormalERROR;
	  }
	else if (critical)
	  { // Enter critical int affix as itself
	    actual_idxs[ix] = act_int_params[ix];
	  }
	else actual_idxs[ix] = register_new_int_affix (act_int_params[ix]);
	break;
      case FormalTEXT:
	if (formal != FormalTEXT)
	  { dat_error ("Type mismatch for parameter %d: %s against formal type %s",
		       ix + 1, affix_name_from_index (ptype), affix_name_from_index (formal));
	    actual_idxs[ix] = FormalERROR;
	  }
	else if (critical)
	  { // Enter critical text affix in trie of all critical text affixes
	    int *info_ptr = lxcn_enter_into_vocabulary (&all_critical_texts, act_text_params[ix]);
	    if (*info_ptr == 0)
	      { char *saved_crit = abs_new_string (act_text_params[ix], "type_check_parameter");
		*info_ptr = crit_text_vector -> size;
		app_text_array (crit_text_vector, saved_crit);
	      }
	    actual_idxs[ix] = *info_ptr;
	  }
	else actual_idxs[ix] = register_new_text_affix (act_text_params[ix]);
        break;
      case FormalSET:
	{ int aff_nr = actual_idxs[ix];
	  if ((formal < 0) || !affix_belongs_to_domain (aff_nr, formal))
	    dat_error ("Type mismatch for parameter %d: %s against formal type %s",
		       ix + 1, affix_name_from_index (aff_nr), affix_name_from_index (formal));
	}; break;
      case FormalUNION:
	{ if (formal < 0)
	    { dat_error ("Type mismatch for parameter %d: union type against formal type %s",
		         ix + 1, affix_name_from_index (formal));
	      break;
	    };
	  type_check_union_and_unify (ix, formal);
        };
      default: break;
    };
}

static void type_check_parameter_pack (int nont_index)
{ int_array formals = formals_from_nonterminal (nont_index);
  int_array criticals = criticals_from_nonterminal (nont_index);
  int ix;
  for (ix = 0; ix < formals -> size; ix++)
    type_check_parameter (ix, formals -> array[ix], criticals -> array[ix]);
}

/*-----------------------------------------------------------------------------
// FREQ		PENALTY
// -2..-10	+2
// -1		+1
// 0		0	(Penalty van een MATCH)
// 1		-1	(Eigenlijk een bonus)
// 2..10	-2
//
// Bovenop de frequentie penalty komt een malus (penalty van 5)
// Deze geldt ook voor regexp matches. Hier bovenop komt een
// bonus voor collocaties gelijk aan hun aantal woorden in het kwadraat
//----------------------------------------------------------------------------*/
static int penalty_from_frequency (int freq)
{ int sign, max_freq, penalty;
  if (freq < 0)
    { sign = +1;
      freq = -freq;
    }
  else if (freq > 0) sign = -1;
  else return (0);
  for (penalty = 1, max_freq = 1;
       (penalty < 9) && (freq > max_freq);
       penalty++, max_freq *= 10) ;
  return (sign * penalty + 5);
}

static void may_be_frequency_or_bonus (int *bonus)
{ /* May be parse number */
  int nr;
  if (is_signed_number (&nr)) ;
  else if (lexicon_has_frequencies) nr = 1;
  else nr = 0;

  if (lexicon_has_frequencies)
    *bonus = -penalty_from_frequency (nr);
  else /* lexicon has penalties */
    *bonus = nr;
}

#ifdef notyet
static int calculate_multiword_bonus (char *lexeme)
{ int nwords = 1;

  while (lexeme[0])
    { if (((unsigned char) lexeme[0] == SoftHyphenChar) || ch_is_white_space(lexeme[0]))
	{ nwords++;
	  while (((unsigned char) lexeme[1] == SoftHyphenChar) || ch_is_white_space(lexeme[1]))
	    lexeme++;
        };
      lexeme++;
    }

  return (nwords * nwords - 1 * 1);
}
#endif

static int is_rule ()
{ int lexeme_marker;
  char *lexeme_ptr;

  if (is_word_form (&lexeme_ptr, &lexeme_marker))
    { /* Read remainder of rule */
      int nont_index, nr_params, bonus, fact_nr, call_id;
      int_array criticals;
      int *info_ptr;
      should_be_nonterminal_name ();
      may_be_parameter_pack (&nr_params);
      may_be_frequency_or_bonus (&bonus);
#ifdef notyet
      bonus += calculate_multiword_bonus (lexeme_ptr);
#endif

      /* Identification and typecheck */
      nont_index = lookup_nonterminal (nonterminal_buf, nr_params);
      if (nont_index < 0)
	{ dat_error ("No such nonterminal '%s/%d'", nonterminal_buf, nr_params);
	  return (1);
	}

      fact_nr = fact_nr_from_nonterminal (nont_index);
      if (fact_nr >= 0)
        { dat_error ("Nonterminal '%s/%d' is a fact", nonterminal_buf, nr_params);
          return (1);
        };

      /* Type check and register the entry */
      type_check_parameter_pack (nont_index);
      criticals = criticals_from_nonterminal (nont_index);
      call_id = register_new_call (nont_index, criticals, actual_idxs);
      info_ptr = enter_into_lexicon (lexeme_ptr, lexeme_marker);
      register_new_entry (info_ptr, call_id, bonus);
      return (1);
    };
  return (0);
}

static int *collect_crits_and_enter_into_fact_table (int fact_nr, int_array criticals)
{ int crits[MAX_PARAMS];
  int ix, cidx, total;
  for (ix = 0, cidx = 1, total = 0; ix < criticals -> size; ix++)
    if (criticals -> array[ix])
      { crits[cidx] = actual_idxs[ix];
        cidx++;
	total++;
      };
  crits[0] = total;	/* Note: should always be the same for one fact */

  /* Enter into right fact table, with crits and total */
  return (enter_into_fact_table (fact_nr, crits));
}

static int is_fact ()
{ char string_buf[MAX_DAT_LINE_LEN + 1];
  if (is_string (string_buf))
    { int nont_index, nr_params, bonus, fact_nr, call_id;
      int_array criticals;
      int *info_ptr;

      /* Parse parameters and bonus */
      should_be_nonterminal_name ();
      should_be_parameter_pack (&nr_params);
      may_be_frequency_or_bonus (&bonus);

      /* Identification and typecheck */
      nont_index = lookup_nonterminal (nonterminal_buf, nr_params);
      if (nont_index < 0)
	{ dat_error ("No such nonterminal '%s/%d'", nonterminal_buf, nr_params);
	  return (1);
	};

      fact_nr = fact_nr_from_nonterminal (nont_index);
      if (fact_nr < 0)
        { dat_error ("Nonterminal '%s/%d' is not a fact", nonterminal_buf, nr_params);
          return (1);
        };

      /* Type check and register the entry */
      type_check_parameter_pack (nont_index);
      criticals = criticals_from_nonterminal (nont_index);
      call_id = register_new_call (nont_index, criticals, actual_idxs);
      info_ptr = collect_crits_and_enter_into_fact_table (fact_nr, criticals);
      register_new_entry (info_ptr, call_id, bonus);
      return (1);
    };
  return (0);
}

/*
   Drive the lexicon and fact file parsing
*/
static void parse_dat_file (char *lname)
{ abs_message ("  reading lexicon \"%s\"", lname);
  try_open_file (0, lname);
  while (!is_eof ())
    { /* Body should eat line */
      may_skip_white_space ();
      if (is_eoln ()) read_line ();
      else if (is_comment ()) ;
      else if (is_rule ())
        { /* May still be followed by a comment */
	  if (is_comment ()) ;
          else should_be_eoln ();
	}
      else
	{ dat_error ("incomprehensible syntax");
	  read_line ();
        };
    }
  close_file ();
}

static void parse_fct_file (char *fname)
{ abs_message ("  reading fact table \"%s\"", fname);
  try_open_file (1, fname);
  while (!is_eof ())
    { /* Body should eat line */
      may_skip_white_space ();
      if (is_eoln ()) read_line ();
      else if (is_comment ()) ;
      else if (is_fact ())
        { /* May still be followed by a comment */
	  if (is_comment ()) ;
          else should_be_eoln ();
	}
      else
	{ dat_error ("incomprehensible syntax");
	  read_line ();
        };
    }
  close_file ();
}

static void report_datfct_stats ()
{ abs_message ("    collected %d INT affixes, %d TEXT affixes, %d set affixes",
	       nr_of_int_affixes (), nr_of_text_affixes (), nr_of_set_affixes ());
  abs_message ("    collected %d different calls of lexicon/fact nonterminals", nr_of_calls ());
  abs_message ("    collected %d critical text affixes", crit_text_vector -> size - 1);
  abs_message ("    collected %d entries, %d feature entries",
	       nr_of_entry_lists (), nr_of_entries ());
}

void parse_datfct_files ()
{ int ix;
  init_datfct_parser ();
  for (ix = 0; ix < lexicon_names -> size; ix++)
    parse_dat_file (lexicon_names -> array[ix]);
  for (ix = 0; ix < fact_table_names -> size; ix++)
    parse_fct_file (fact_table_names -> array[ix]);
  if (verbose) report_datfct_stats ();
}
