/*
   File: rtslex.h
   Interface to lexical analysis module.

   Copyright 2005 Radboud University of Nijmegen
 
   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.
 
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Library General Public License for more details.
 
   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

   CVS ID: "$Id: rtslex.h,v 1.56 2005/09/28 11:07:10 olafs Exp $"
*/
#ifndef rtslex_h
#define rtslex_h

#include <limits.h>
//#include <abase_string.h>
#include <abase_lex_input.h>
#include <lexicon.h>
#include <lexicon_search.h>

/*  USE_RTSLEX_DEFINES selects how the accessors further down in this header
 *  (STATE_POS, SET_FIRST_POS, TRANSITION_TEXT, ...) are provided:
 *                     1 -> #define x(y) ...
 *                     0 -> sometype x(othertype y); see also: rtslex.c
 */
#define USE_RTSLEX_DEFINES 0

/* Basic scalar types shared by the lexer and its clients. */
typedef long ARG;			/* type of the `cls` argument of GET_STATE_TRANSLIST */

typedef unsigned	Position;	/* not completely abstracted yet */
typedef	unsigned long	Terminal;	/* should be used by rtsagfl.c */
typedef long		StateBits;	/* holds the *Bit masks defined below */
typedef	long		NegMemo;	/* presumably one of the MEMO_* values -- confirm in rtslex.c */

/*
   One positive-memo production: the description of a recognized rule plus
   the links that chain it to other productions (next / equiv / prime).
*/
typedef struct _PMPROD
{ /* Description of the recognized rule */
  unsigned nont_nr;
  unsigned nr_formals;
  unsigned nr_variables;		/* formals + locals */
  unsigned nr_sons;

  /* Contains formals, locals, posmemos of sons and typing of sons */
  LXCN_VALUE* variables;

  /* Next info */
  struct StateNode* next_state;		/* Next trellis state */
  long penalty;				/* Penalty of this rule */
  void* failcont;			/* opaque here; meaning not visible in this header */
  void* pass2;				/* opaque here; meaning not visible in this header */

  /* Posmemo admin */
  struct _PMPROD* next;			/* Next solution pointer */
  struct _PMPROD* equiv;		/* Pointer to equivalent nodes */
  struct _PMPROD* prime;		/* Pointer to first equivalent node */
} PMPROD;

/*
   For each position in the trellis, a positive memo per syntax nonterminal
   is maintained. Initially all positive memos will be set to POSMEMO_UNKNOWN.

   POSMEMO_BLOCKED and POSMEMO_UNKNOWN are the integers 0 and 1 cast to
   PosMemo: they are sentinel values, not addresses of PMPROD objects, so a
   PosMemo has to be compared against both before it is dereferenced.
*/
typedef PMPROD* PosMemo;
#define POSMEMO_BLOCKED ((PosMemo) 0L)
#define POSMEMO_UNKNOWN ((PosMemo) 1L)

/* Currently, TRANS_BY_POS may only be 1 if NONT_CLASSES==1 */
/* 
   As of Feb 2005, NONT_CLASSES can only be used with
   a setting of 1. With a setting of 0 there are compile time errors in
   rtsagfl.c and rtslex.c (and many more I suspect).
   Just a few too many for the moment.
   NONT_CLASSES selects between the two variants of Transition, StateNode
   and Trellis declared further down in this header.
 */
#define NONT_CLASSES 1

/* Regular-expression matching options.
   Set the two below to 0 for the original (i.e. up to May 2002) RegExp
   behaviour. */
#define RE_ALSO_PART 1
#define ALSO_SHORTER_REGEXPS 1 /* if 0, only returns greediest match */

/* Special memo values. MEMO_BLOCKED relies on LONG_MAX from <limits.h>,
   which this header includes. Presumably these are the values stored in a
   NegMemo (a long) -- confirm in rtslex.c. */
#define MEMO_BLOCKED	(LONG_MAX)
#define MEMO_OPEN	(0)
#define	MEMO_UNKNOWN	(-1)

/*
   Lexer configuration; a pointer to one of these is the argument of
   init_lexer(). The nr_* counters presumably give the number of entries of
   the like-named arrays (terminals, matches, skips) -- confirm in rtslex.c.
*/
typedef struct
{ const LEXICON* lexicon;
  const lxcn_Trie*	trie;
  const char**	terminals;
  const char**	matches;
  const char**	skips;
#if NONT_CLASSES
  unsigned	nr_nont_classes;	/* only present when NONT_CLASSES is non-zero */
#endif
  unsigned	nr_terminals;
  unsigned	nr_matches;
  unsigned	nr_skips;
  unsigned	nr_neg_memos;
  unsigned	nr_syntax_nonterminals;
  Terminal	eos_terminal;
  /* Character sets / tables; their exact format is not visible in this header. */
  unsigned char const *	blanks;
  unsigned char const *	terminators;
  unsigned char const *	invisibles;
  unsigned char const *	delimiters;
  unsigned char const *	translate_src;
  unsigned char const *	translate_dst;
  /* Callback receiving a position, a token pointer and a length; when it is
     invoked is decided by the implementation (name suggests unknown tokens). */
  void		(*log_unknown)(unsigned pos, const char* token, unsigned len);
} LexInfo;

/*---------------------------------------------------------------------------
// AggregatePartStates contains 1 bit for each of the 16
// possible partstates (0..15) at some position within token.
// At the position after the last valid part, it contains an extra flag
// NextTokenStartsHere, meaning we couldn't find a parts_transition but
// we did find a next token there.
//
// MaxPartState expands to MultiTokenBit, which is #defined further down;
// that is fine because macros are only expanded where they are used.
// With the current values NextTokenStartsHere is (1 << 17) == 0x20000.
// NOTE(review): 0x20000 is also the value of FinalPartBit; harmless as long
// as the two are never stored in the same word -- confirm.
//-------------------------------------------------------------------------*/
typedef long	AggregatePartStates;
# define PartBitsMask	0x0001F		/* translate part_state into number */
# define MaxPartState	MultiTokenBit /* partState now only has latest kind */
  /* since 2000aug07, a partstate only has one bit (the last part type
  ** so far), so we introduced MaxPartState, which is no longer PartBitsMask
  ** ("PartBitsMask + 1" would push the bit over the size of a long)
  */
#define NextTokenStartsHere	(1 << (MaxPartState + 1))

/*----------------------------------------------------------------------------
// Bit-fields of StateBits, also used for partState:
// (0..MaxPartState is also used for looping over all 16 possible part_states,
// so it must be the lower bits of a word)
//
// Each mask below is a distinct single bit; 0x00020 is left free because
// BlankBit is disabled.
//--------------------------------------------------------------------------*/
#define PrefixBit	0x00001		/* prefix token */
#define InfixBit	0x00002		/* infix token */
#define SuffixBit	0x00004		/* suffix token */
#define SingleTokenBit	0x00008		/* single token */
#define MultiTokenBit	0x00010		/* multi token */
#if 0
//#define BlankBit	0x00020		/* token followed by blank */
#endif
#define UnusedBit	0x00040		/* for future expansion ? */
#define TermBit		0x00080		/* terminal is grammar lexeme */
#define LexBit		0x00100		/* terminal is lexicon lexeme */
#define MatchBit	0x00200		/* terminal is regexp match */
#define SkipBit		0x00400		/* terminal is regexp skip */
#define TransPartsBit	0x00800		/* has transition to next part */
#define TransTokenBit	0x01000		/* has transition to next token */
#define EosBit		0x02000		/* EOS */
#define TxtFreeBit	0x04000		/* transition->text is malloced */
#define TokenStartBit	0x08000	  /* =0 at start of 2nd and following parts */
#define TokenPartBit	0x10000		/* terminal is (live) part of token */
			/* Caution: Final and Nonfinal may occur together */
#define FinalPartBit	0x20000		/* part has transition to next token */
#define NonfinalPartBit	0x40000		/* part has transition to next part */

#if 0
/*
// Disabled: nothing in this #if 0 section is compiled.
//
// Bit-fields of MemoBits
// Each input position has a MemoBits field (Trellis.memo_row).
// There is one success bit and one failure bit per token kind.
*/

#define PrefixSuccBit	0x0001
#define PrefixFailBit	0x0002
#define InfixSuccBit	0x0004
#define InfixFailBit	0x0008
#define SuffixSuccBit	0x0010
#define SuffixFailBit	0x0020
#define SingleSuccBit	0x0040
#define SingleFailBit	0x0080
#define MultiSuccBit	0x0100
#define MultiFailBit	0x0200

#endif

/*----------------------------------------------------------------------------
// A Transition contains the info associated with a terminal starting
// at a certain character position.
// The next field points to the next possible Transition at the same position,
// the trans_dest_state field (trans_dest_pos when TRANS_BY_POS is set)
// indicates (by ptr or by index, respectively) the StateNode
// at the first terminal position after the token.
//--------------------------------------------------------------------------*/
#if NONT_CLASSES

/* Transition as used when NONT_CLASSES is non-zero: one node of a singly
   linked list (via `next`) of transitions. */
typedef struct Transition
{ Terminal		terminal;		/* Bitwise encoding */
  LXCN_PARAM const *	params;
  char const *		text;			/* malloced iff TxtFreeBit is set in type */
  LXCN_PENALTY           penalty;
  StateBits		type;			/* combination of the *Bit masks */
  struct StateNode*	trans_dest_state;	/* state reached after this transition */
  struct Transition*	next;			/* next transition in the same list */
} Transition;

/*--------------------------------------------------------------------------
// A StateNode contains info associated with a certain position in the text.
// The nr of negative memos allocated with this node is given by agfl-coder.
// The nr of positive memos is equal to the number of syntax rules
//
// The pos_memos member exists only when PMRTS is defined, so every
// translation unit including this header must agree on PMRTS to see the
// same struct layout.
//------------------------------------------------------------------------*/
typedef struct StateNode
{ Transition** trans_lists;	/* The array of transition lists (indexed by class, see GET_STATE_TRANSLIST) */
  NegMemo* neg_memos;		/* nr is given by agfl-coder */

#ifdef PMRTS
  PosMemo* pos_memos;		/* Positive memo array (per rule) */
#endif /* PMRTS */

  Position pos;
} StateNode;

#else /* no NONT_CLASSES */

/* Variant used when NONT_CLASSES is 0: state and transition are one and the
   same struct, and Transition / State are aliases for it.
   NOTE(review): with PMRTS defined this variant declares two members named
   `memos` (NegMemo* and PosMemo*) and uses SucAlt, which this header does
   not declare -- it cannot compile as written in that configuration. */
typedef struct StateNode
{
  Terminal		terminal;
  LXCN_PARAM*		params;
  char*			text;
  LXCN_PENALTY           penalty;
  StateBits		type;
  NegMemo*		memos;
#ifdef PMRTS
  PosMemo*		memos;		/* Positive memo array (per rule) */
  SucAlt*		alt_list;	/* Succeeded alts are in this list */
#endif /* PMRTS */
  Position		pos;
  struct StateNode*	trans_dest_state;
  struct StateNode*	next;
} StateNode;
typedef StateNode Transition;
typedef StateNode State;
#endif /* NONT_CLASSES */

/*---------------------------------------------------------------------------
// coding of State.terminal (Transition.terminal when NONT_CLASSES is set):
// for nonterminals: 2bits type, 22 bits number, 8 bits arity
// for terminals, regexp_match, and regexp_skip: 2bits type, 30 bits number
// typebits 00	nonterminal	(so no bitwise AND/OR needed for type bits)
//	    01	terminal
//	    10	regexp_match
//	    11	regexp_skip
//
// All macros work on the low 32 bits of the value. Every macro argument is
// parenthesized in the expansion, so any expression may be passed; each
// argument is evaluated exactly once.
// DECODE_TERM_OR_RE_CLASS additionally needs an identifier named
// nr_lexicon_nonterminals to be in scope where it is used.
//-------------------------------------------------------------------------*/
#define DECODE_TERM_TYPE(term)		((term) >> 30)
#define TERM_IS_NONT(term)		(((term) & 0xC0000000) == 0x00000000)
#define DECODE_NONT_NUMBER(nont)	((nont) >> 8)
#define DECODE_NONT_ARITY(nont)		((nont) & 0xff)
#define TERM_IS_TERM(term)		(((term) & 0xC0000000) == 0x40000000)
#define DECODE_TERM_NUMBER(term)	((term) & 0x3fFFffFF)
#define TERM_IS_MATCH(term)		(((term) & 0xC0000000) == 0x80000000)
#define TERM_IS_SKIP(term)		(((term) & 0xC0000000) == 0xC0000000)
#define DECODE_REGEXP_NUMBER(term)	((term) & 0x3fFFffFF)
/* `arity` is parenthesized too: without that, an argument such as
   `c ? 1 : 2` would be absorbed into the `|` expression. */
#define ENCODE_NONT(nr,arity)		(((nr) << 8) | (arity))
#define ENCODE_TERM(nr)			((nr) | 0x40000000)
#define ENCODE_MATCH(nr)		((nr) | 0x80000000)
#define ENCODE_SKIP(nr)			((nr) | 0xC0000000)
#define DECODE_NONT_CLASS(nont)		((nont) >> 8)
#define DECODE_TERM_OR_RE_CLASS(term)	((((term) >> 30) & 0x3) - 1 + nr_lexicon_nonterminals)

/*----------------------------------------------------------------------------
// A trellis contains an array of pointers to statenodes. At each position, we
// have a possibly empty list of states, representing the tokens that have
// been matched at that position (if any).
//--------------------------------------------------------------------------*/
#if NONT_CLASSES
/* Trellis as used when NONT_CLASSES is non-zero. */
typedef struct
{
    unsigned length;
    StateNode* first_state;	/* set from states_row[pos] by SET_FIRST_POS */
    StateNode** states_row;	/* allocated separately */
    StateNode** pstates_row;	/* allocated separately, for parts */
    unsigned* last_part_end_from;
    LEXICON* lexicon;
} Trellis;

#else /* no NONT_CLASSES */

/* Trellis as used when NONT_CLASSES is 0; note the different member names
   (state_row / parts_row) and the absence of a lexicon member. */
typedef struct
{
  unsigned		length;
  State*		first_state;	/* set from state_row[pos] by SET_FIRST_POS */
  State**		state_row;	/* allocated separately */
  State**		parts_row;	/* allocated separately */
  unsigned*		last_part_end_from;
} Trellis;

#endif /* no NONT_CLASSES */

/* Handle designating a trellis state; in this header it is simply a pointer
   to the StateNode. */
typedef StateNode* StateIndicator;

#if USE_RTSLEX_DEFINES

/*
 * Macro versions of the accessors (only seen when USE_RTSLEX_DEFINES is 1).
 *
 * Every macro parameter is parenthesized in the expansion, so callers may
 * pass arbitrary expressions such as `row + 1` or `&node`. SET_FIRST_POS
 * expands to a single parenthesized assignment expression and evaluates
 * `trel` twice; all other macros evaluate each argument at most once.
 */
#define STATE_POS(ind)	((ind)->pos)
#if NONT_CLASSES
#  define SET_FIRST_POS(trel,pos)  ((trel)->first_state = (trel)->states_row[(pos)])
#else
#  define SET_FIRST_POS(trel,pos)  ((trel)->first_state = (trel)->state_row[(pos)])
#endif
#define GET_FIRST_STATE_INDICATOR(trel)  ((trel)->first_state)
#define GET_FIRST_STATE_PTR(trel)	   ((trel)->first_state)

/* used by rtsagfl to determine if a space should be printed after terminal */
/* for "+" operator, see also has_*transition / add_*transition in rtslex.c */
#define IS_LASTPART(transition)		 ((transition)->type & TransTokenBit)
#define HAS_PARTS_TRANSITION(transition) ((transition)->type & TransPartsBit)
#define TRANSITION_TERMINAL(tra)	((tra)->terminal)
#define TRANSITION_PARAMS(tra)		((tra)->params)
#define TRANSITION_TEXT(tra)		((tra)->text)
#define TRANSITION_NEXT_TRANS(tra)	((tra)->next)
#define TRANSITION_PENALTY(tra)         ((tra)->penalty)

#if NONT_CLASSES

# define GET_TRELLIS_STATE_ROW(trel)	((trel)->states_row)
# define GET_TRELLIS_PARTS_ROW(trel)	((trel)->pstates_row)

/* TRANS_BY_POS is not defined in this header, so unless it is defined
   elsewhere the #else branch is taken. NOTE(review): the Transition struct
   declared in this header has no trans_dest_pos member. */
# if TRANS_BY_POS /* TODO test for parts_trans */
#  define TRANSITION_DEST_STATE_INDICATOR(tra,trel)	((tra)->trans_dest_pos)
#  define GET_STATE_TRANSLIST(trel,i_st,cls)	\
      (	((trel)->states_row[(i_st)])->trans_lists[(cls)] )
# else
#  define TRANSITION_DEST_STATE_INDICATOR(tra,trel)	((tra)->trans_dest_state)
#  define GET_STATE_TRANSLIST(trel,i_st,cls)	((i_st)->trans_lists[(cls)])
# endif

#else /* NONT_CLASSES */
# define TRANSITION_DEST_STATE_INDICATOR(tra,trel)	((tra)->trans_dest_state)
# define GET_TRELLIS_STATE_ROW(trel)	((trel)->state_row)
# define GET_TRELLIS_PARTS_ROW(trel)	((trel)->parts_row)
# define GET_STATE_TRANSLIST(trel,i_st,cls)	(i_st)
#endif /* NONT_CLASSES */

#else /* !USE_RTSLEX_DEFINES */

/*
 * Function versions of the accessors (seen when USE_RTSLEX_DEFINES is 0);
 * per the note at USE_RTSLEX_DEFINES the definitions are in rtslex.c.
 *
 * The prototypes deliberately carry no `inline` specifier. Since C99, a
 * function declared `inline` with external linkage must be defined in the
 * same translation unit, and a definition whose file-scope declarations are
 * all `inline` provides no external symbol. Declaring the functions as
 * ordinary external functions here keeps every includer well-formed and
 * makes the definition in the implementation file an external definition,
 * whether or not that definition itself says `inline`.
 */
Position STATE_POS(StateNode *ind);
#if NONT_CLASSES
void SET_FIRST_POS(Trellis *trel, Position pos);
StateNode* GET_FIRST_STATE_INDICATOR(Trellis *trel);
StateNode* GET_FIRST_STATE_PTR(const Trellis *trel);
#else
void SET_FIRST_POS(Trellis *trel, Position pos);
State* GET_FIRST_STATE_INDICATOR(Trellis *trel);
State* GET_FIRST_STATE_PTR(const Trellis *trel);
#endif


/* Per-transition accessors; one per field-access macro of the
   USE_RTSLEX_DEFINES == 1 variant. */
int IS_LASTPART (Transition *t);
int HAS_PARTS_TRANSITION (Transition *t);
Terminal TRANSITION_TERMINAL(Transition *tra);
const LXCN_PARAM* TRANSITION_PARAMS(Transition* tra);
const char* TRANSITION_TEXT(Transition* tra);
struct Transition* TRANSITION_NEXT_TRANS(Transition *tra);
LXCN_PENALTY TRANSITION_PENALTY(Transition *tra);

#if NONT_CLASSES

 StateNode** GET_TRELLIS_STATE_ROW(Trellis *trel);
 StateNode** GET_TRELLIS_PARTS_ROW(Trellis *trel);
 Transition* GET_STATE_TRANSLIST(Trellis *trel, StateIndicator i_st, ARG cls);

 struct StateNode *TRANSITION_DEST_STATE_INDICATOR(Transition *tra, Trellis* const trel);

#else /* NONT_CLASSES */
struct StateNode *TRANSITION_DEST_STATE_INDICATOR(Transition *tra, Trellis* const trel);
State** GET_TRELLIS_STATE_ROW(Trellis *trel);
State** GET_TRELLIS_PARTS_ROW(Trellis *trel);
StateIndicator GET_STATE_TRANSLIST(Trellis *trel, StateIndicator i_st, ARG cls);
#endif /* NONT_CLASSES */

#endif /* USE_RTSLEX_DEFINES */

/*
 * Lexer life cycle and trellis-level operations. Only the signatures are
 * visible here; the behaviour notes below are derived from them and from the
 * names, and should be confirmed against the implementation.
 */
abs_LexemeType get_transition_lex_type(Transition* transition);
void init_lexer(const LexInfo* info);		/* takes the lexer configuration; info is not modified through this pointer */
void end_lexer(void);
StateNode* get_shortest_transition(Trellis const * trellis, StateNode const * istate);
/* Returns a Trellis for `input` using lexicon `the_lex`; presumably the
   result is released with delete_trellis() -- confirm. */
Trellis* make_trellis_by_word_lexing(const char* input, LEXICON* the_lex);
void delete_trellis(Trellis* trellis);
void print_trellis(const Trellis* trellis);
void show_neg_memo_blocks(const Trellis* trellis);
int state_has_eos_transition(StateNode const * the_state);
int is_empty_trellis(const Trellis* trellis);

/*
 * Getters for lexer-wide settings; presumably they report the like-named
 * LexInfo members handed to init_lexer() -- confirm in the implementation.
 *
 * Declared without `inline`: since C99 an `inline` declaration with external
 * linkage requires a definition in the same translation unit, which a
 * header-only prototype cannot provide to its includers.
 */
unsigned get_nr_neg_memos(void);
unsigned long get_nr_syntax_nonterminals(void);
Terminal get_eos_terminal(void);

/* for use in rtstrelinp: */
/* Class numbers, declared here and defined elsewhere; they exist only when
   NONT_CLASSES is non-zero. Judging by the names they are the classes of
   grammar terminals, regexp matches and regexp skips -- confirm. */
#if NONT_CLASSES
extern int gr_term_class;
extern int re_match_class;
extern int re_skip_class;
#endif

/* Kind of regular expression: RegMatch == 0, RegSkip == 1. */
typedef enum { RegMatch, RegSkip }	RegType;
unsigned get_nr_regexps(RegType reg_type);
const unsigned char* get_terminal(unsigned i);
/* Allocation helpers; presumably alloc_transition() pairs with
   free_transition() and alloc_trellis() with delete_trellis() -- confirm. */
Transition* alloc_transition(void);
void free_transition(Transition* transition);
Trellis* alloc_trellis(unsigned len);

/* Takes the begin and end text pointers by address, so it can adjust both;
   returns the lexeme type. Details are in the implementation. */
abs_LexemeType derive_lex_type_and_strip_hyphens(char const **p_txtbeg, char const **p_txtend);
const char* get_terminal_text(unsigned id, abs_LexemeType lex_type);
/* Returns a non-const char*; presumably a newly allocated copy of `len`
   characters of `src` that the caller owns -- confirm. */
char* copy_string(const char* src, unsigned len);
/*
 * Initialisers for the different kinds of transition. All of them fill in a
 * caller-supplied Transition for input position `pos`; the variants with
 * `from`/`to` also receive the text range of the token (whether `to` is
 * inclusive is not visible here).
 */
void init_eos_transition(Transition* transition, unsigned pos);
void init_terminal_transition(Transition* transition, unsigned pos, unsigned id,
		    const unsigned char* from, const unsigned char* to,
		    abs_LexemeType lex_type);
void init_lexicon_transition(Transition* transition, unsigned pos, long info,
		   const unsigned char* from, const unsigned char* to,
		   abs_LexemeType lex_type);
void init_regexp_transition(Transition* transition, unsigned pos, unsigned id,
		  const unsigned char* from, const unsigned char* to,
		  RegType reg_type);

/* add_transition() is a thin wrapper around ADD_transition(): its third
   argument (dpos) is accepted but discarded and never evaluated. */
#define add_transition(src,dest,dpos)	ADD_transition(src,dest)
void ADD_transition(Transition* state, StateIndicator tdest);
unsigned get_length(const Transition* state);
void insert_transition(StateNode** states_row, unsigned pos, int nont_class,
					Transition* transition, unsigned len);
void delete_transition(Transition *transition);
void init_trellis(Trellis* trellis, unsigned len);
void add_trellis_neg_memos(Trellis *trellis);
/* Positive-memo support is only declared when PMRTS is defined. */
#ifdef PMRTS
void add_trellis_pos_memos(Trellis *trellis);
void reset_trellis_pos_memos (Trellis *trellis);
#endif

unsigned get_nr_terminals(void);
/* Returns a Terminal built from a nonterminal id and arity; presumably the
   encoding performed by ENCODE_NONT -- confirm. */
Terminal code_nonterminal(unsigned id, int arity);
/* Globals declared here and defined elsewhere. */
extern const char*	eos_text;
extern const LXCN_PENALTY	penalty_unknown;
extern int NR_classes;

#endif /* rtslex_h */
