/*
   File: lif_parser.c
   Parses the lexicon interface file

   Copyright 2009-2010 Radboud University of Nijmegen

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.

   CVS ID: "$Id$"
*/

/* system includes */
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>

/* libabase includes */
#include <abase_repr.h>
#include <abase_error.h>
#include <abase_memalloc.h>
#include <abase_fileutil.h>

/* local includes */
#include "options.h"
#include "globals.h"
#include "lexgen_lexer.h"
#include "dyn_array.h"
#include "lif_parser.h"
#include "affix_values.h"
#include "nonterminals.h"

/*
   A lexicon interface file consists of 8 parts separated by a specific line
   containing '*****'. The 8 parts are
   1) A list of lexicon names (to be read by the dat parser)
   2) A list of fact table names (to be read by the dat? parser)
   3) A list of triples file names (to be read by the trp parser)
   4) The affix table (defining all affix names)
   5) The lexicon nonterminal table (defining all nonterminals with
      appropriate typing)
   6) The fact nonterminal table (defining all nonterminals with
      appropriate typing)
   7) Pragmats which influence the form or meaning of the lexicon in some way
   8) The terminal table (defining all grammar terminals including
      placeholders like $end of sentence$)
*/

/*
   The lexicon, fact table and triples parts are each just a list of
   names, one name per line. This reader consumes such a list up to
   the end of the input or the next section separator and hands every
   non-empty name to app_uniq_text_array to be stored in 'names'.
*/
static void read_name_array (text_array names)
{
  for (;;)
    {
      char line_name[MAX_LEX_LINE_LEN + 1];

      /* stop at the end of the input or at the section separator */
      if (is_end () || is_lif_section_separator ())
        break;

      may_skip_white_space ();
      should_be_name (line_name);

      /* a line that yields an empty name adds nothing to the list */
      if (line_name[0] != '\0')
        {
          char *copy = abs_new_string (line_name, "read_name_array");
          app_uniq_text_array (names, copy);
        }

      should_be_eoln ();
    }
}

/* Size argument given to init_text_array for the lexicon name list */
#define MAX_INIT_LEXICA 16

/* Creates the lexicon_names array and fills it from the current part */
static void read_lexicon_names ()
{
  lexicon_names = init_text_array (MAX_INIT_LEXICA);
  read_name_array (lexicon_names);
}

/* Size argument given to init_text_array for the fact table name list */
#define MAX_INIT_FACTS 16

/* Creates the fact_table_names array and fills it from the current part */
static void read_fact_table_names ()
{
  fact_table_names = init_text_array (MAX_INIT_FACTS);
  read_name_array (fact_table_names);
}

/* Size argument given to init_text_array for the triples database name list */
#define MAX_INIT_TRPS 16

/*
   Creates the triples_database_names array and fills it from the
   current part. When the part lists no names at all, a copy of
   basename is stored as the single default entry.
*/
static void read_triples_database_names ()
{
  triples_database_names = init_text_array (MAX_INIT_TRPS);
  read_name_array (triples_database_names);

  /* at least one name was given: no default needed */
  if (triples_database_names -> size != 0)
    return;

  /* Default: use basename */
  app_uniq_text_array (triples_database_names,
                       abs_new_string (basename, "read_triples_database_names"));
}

/*
   The affix table is a sequence of lines; every line holds 4 columns
   separated by TABs (the column separator):
   STRING     affix name
   NUMBER     coder index of affix
   HEXNUMBER  value of affix
   STRING     lhs/rhs indicating if the affix is a nonterminal or terminal

   Each line is registered through register_new_set_affix, which gets
   the result returned for the most recent lhs affix as extra argument
   (0 as long as no lhs affix has been registered).
*/
static void read_affix_table ()
{
  int last_lhs = 0;

  for (;;)
    {
      char name[MAX_LEX_LINE_LEN + 1];
      int coder_nr;
      Bitset64 value;
      int is_lhs;
      int result;

      /* stop at the end of the input or at the section separator */
      if (is_end () || is_lif_section_separator ())
        break;

      /* name TAB coder index TAB value TAB lhs/rhs EOLN */
      may_skip_white_space ();
      should_be_name (name);
      should_be_lif_column_separator ();
      should_be_unsigned_number (&coder_nr);
      should_be_lif_column_separator ();
      should_be_bitset (&value);
      should_be_lif_column_separator ();
      should_be_lhs_or_rhs (&is_lhs);
      should_be_eoln ();

      /* register the new set affix: either the index is returned or < 0 */
      result = register_new_set_affix (name, coder_nr, value, is_lhs, last_lhs);
      if (result < 0)
        {
          parse_error ("inconsistent declaration of set affix '%s'", name);
          continue;
        }

      /* remember the latest lhs affix for the lines that follow */
      if (is_lhs)
        last_lhs = result;
    }
}

/*
   The nonterminal part consists of a number of lines, each containing
   a variable number of entries separated by TABs (as column separator):
   STRING	nonterminal name (which may contain spaces)
   NUMBER	nr of affix positions
   NUMBER	coder index (nonterminal_nr)
   STRING*	formal parameter type by name (may be INT or TEXT)
		as many as necessary, optionally prefixed with a >
		to indicate its criticalness.
*/
#define streq(s1,s2) (strcmp((s1),(s2)) == 0)
static void read_nonterminal_table (int facts)
{ while (!is_end () && !is_lif_section_separator ())
    { char nonterminal_name[MAX_LEX_LINE_LEN + 1];
      char fpar_name[MAX_LEX_LINE_LEN + 1];
      int nr_of_positions;
      int nonterminal_nr;
      int_array critical;
      int_array formals;
      int ix;

      may_skip_white_space ();
      should_be_nonterminal_name (nonterminal_name);
      should_be_lif_column_separator ();
      should_be_unsigned_number (&nr_of_positions);
      should_be_lif_column_separator ();
      formals = init_int_array (nr_of_positions);
      critical = init_int_array (nr_of_positions);
      should_be_unsigned_number (&nonterminal_nr);
      for (ix = 0; ix < nr_of_positions; ix++)
        { int crit;
	  should_be_lif_column_separator ();
	  crit = is_char ('>');
	  app_int_array (critical, crit);
	  should_be_name (fpar_name);
	  if (streq (fpar_name, "INT"))
	    app_int_array (formals, FormalINT);
	  else if (streq (fpar_name, "TEXT"))
	    app_int_array (formals, FormalTEXT);
	  else
	    { int set_affix = lookup_set_affix (fpar_name); 
	      if (crit)
		parse_error ("Critical affix '%s' may only be a TEXT or INT affix", fpar_name);
	      if (set_affix < 0)
		{ app_int_array (formals, -42);
		  parse_error ("Unknown affix '%s'", fpar_name);
		}
	      else app_int_array (formals, set_affix);
	    };
	};
      if (register_new_nonterminal (facts, nonterminal_name, nonterminal_nr, formals, critical) < 0)
	parse_error ("Incorrect (re-)declaration of '%s/%d'", nonterminal_name, formals -> size);
      should_be_eoln ();
    };
}

/*
   Reads the pragmat part: one pragmat name per line, up to the end of
   the input or the next section separator. Three names have an effect:
   "hyphen_convention" sets hyphen_convention_active, "hybrid_parsing"
   sets hybrid_parsing and "encoding" is followed by a string of which
   a copy is stored in encoding. Any other name is read without
   further action.
*/
static void read_pragmat_table ()
{
  for (;;)
    {
      char text[MAX_LEX_LINE_LEN + 1];

      if (is_end () || is_lif_section_separator ())
        break;

      should_be_name (text);

      if (streq (text, "hyphen_convention"))
        {
          hyphen_convention_active = 1;
        }
      else if (streq (text, "hybrid_parsing"))
        {
          hybrid_parsing = 1;
        }
      else if (streq (text, "encoding"))
        {
          /* the same buffer is reused for the string that follows the name */
          may_skip_white_space ();
          should_be_string (text);
          encoding = abs_new_string (text, "read_pragmat_table");
        }

      should_be_eoln ();
    }
}

/*
   Reads the terminal part: one grammar terminal per line, up to the
   end of the input or the next section separator. A line is first
   tried as a placeholder terminal; when that does not match, it must
   be a word form. The resulting text and marker are registered with
   register_new_terminal, a negative result being reported as an
   incorrect redeclaration.
*/
static void read_terminal_table ()
{
  for (;;)
    {
      char terminal[MAX_LEX_LINE_LEN + 1];
      int flags;

      if (is_end () || is_lif_section_separator ())
        break;

      /* placeholder first, otherwise an ordinary word form */
      if (!is_placeholder_terminal (terminal, &flags))
        should_be_word_form (terminal, &flags);

      if (register_new_terminal (terminal, flags) < 0)
        {
          parse_error ("Incorrect redeclaration of grammar terminal '%s', marker %01x",
                       terminal, flags);
        }

      should_be_eoln ();
    }
}

/* exported actions */
/*
   Parses the complete lexicon interface file: the lexer file is opened
   through try_open_lexer_file (basename, lexicon_interface), the parts
   are read one after the other in their fixed order, the input must
   then be at its end, and the lexer file is closed again.
*/
void parse_lexicon_interface ()
{
  int facts;

  try_open_lexer_file (basename, lexicon_interface);

  /* the three name lists */
  read_lexicon_names ();
  read_fact_table_names ();
  read_triples_database_names ();

  /* the affix table precedes the nonterminal tables */
  read_affix_table ();

  /* two nonterminal tables: first with facts == 0, then with facts == 1 */
  for (facts = 0; facts <= 1; facts++)
    read_nonterminal_table (facts);

  /* pragmats and grammar terminals close the file */
  read_pragmat_table ();
  read_terminal_table ();

  should_be_eof ();
  close_lexer_file ();
}
