/*
   File: lxcn_search.c
   Defines the routines to search in the lexicon or to match grammar terminals

   Copyright 2007 Radboud University of Nijmegen

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of   
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU Library General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 
   CVS ID: "$Id: lxcn_search.c,v 1.7 2009/02/25 14:12:17 olafs Exp $"
*/

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */

/* System includes */
#include <stdio.h>
#include <stdlib.h>

/* includes from the abase lib */
#include <abase_memalloc.h>
#include <abase_error.h>

/* local includes */
#include "lxcn_vocabulary.h"
#include "lxcn_lexicon.h"
#include "lxcn_vocabulary_impl.h"
#include "lxcn_lexicon_impl.h"
#include "lxcn_search.h"

/*
   Temporary public definition of the max_edit_distance variable,
   initialised to 0: it should become hidden.
   It is not referenced elsewhere in this file.
*/
int max_edit_distance = 0;

/*
   Until grammar terminals are passed to the lexicon generator
   and added to the lexicon, the runtime system should match
   grammar terminals, using the following code.
*/
/*
   Match the text of a lexeme against the start of the input:
   - a space in the lexeme matches one or more white space characters;
   - a soft hyphen in the lexeme matches a run of white space,
     a single '-' or nothing at all;
   - any other lexeme character must be equal to the input character
     or to its translation; in the latter case the translation penalty
     of the input character is added to *penalty (if penalty != NULL).
   Returns the input position just behind the matched text,
   or NULL if the lexeme does not match.
*/
static char *match_lexeme_tail (char *input, char *lexeme, int *penalty)
{ char *lex_ptr;

  for (lex_ptr = lexeme; !lxcn_is_eos (*lex_ptr); lex_ptr++)
    { char lex_ch = *lex_ptr;
      char inp_ch = *input;

      if (lex_ch == ' ')
        { /* At least one white space character is required */
          if (!lxcn_is_white_space (inp_ch)) return (NULL);

          /* Consume the whole run of white space */
          input++;
          while (lxcn_is_white_space (*input)) input++;
          continue;
        }

      if ((unsigned char) lex_ch == SoftHyphenChar)
        { /* Optional separator: white space, a hyphen or nothing */
          /* It may be that a hyphen followed by white space must be allowed too */
          if (lxcn_is_white_space (inp_ch))
            { input++;
              while (lxcn_is_white_space (*input)) input++;
            }
          else if (inp_ch == '-') input++;
          /* Otherwise the input stays put; continue with the next lexeme character */
          continue;
        }

      /* Ordinary character: accept it literally or through its translation */
      if (lex_ch != inp_ch)
        { if (lex_ch != lxcn_translate (inp_ch)) return (NULL);
          if (penalty != NULL) *penalty += lxcn_translate_penalty (inp_ch);
        }
      input++;
    }

  /* The whole lexeme has been matched */
  return (input);
}

/*
   For grammar terminals, the lexeme marker may still be encoded as
   the first character of the lexeme
*/
/*
   Match a grammar terminal against the input.

   input:    the text to be matched
   lexeme:   the terminal text; when its first character is recognized
             by is_an_old_lex_marker (which is also handed lex_type,
             presumably to store the corresponding lexeme type)
             that character is skipped
   lex_type: set to SingleToken when no old style marker is present
   penalty:  if not NULL, translation penalties are added to *penalty

   Returns the input position just behind the matched text, or NULL
   when the terminal does not match or only matches the empty string
   (e.g. a lone soft hyphen).
*/
char *lxcn_match_lexeme (char *input, char *lexeme, LexemeType *lex_type, int *penalty)
{ char *matched;

  if (is_an_old_lex_marker (lexeme[0], lex_type)) lexeme++;
  else *lex_type = SingleToken;
  matched = match_lexeme_tail (input, lexeme, penalty);

  /*
     A failed match yields NULL: test for it explicitly, because the
     relational comparison of a null pointer with input is undefined
  */
  if (matched == NULL) return (NULL);

  /* Return NULL if the empty string was matched (e.g. soft hyphen) */
  return (matched > input) ? matched : NULL;
}

/*
  Matching of input against the lexicon:

  Note that part of this matching reuses the code that matches lexeme tails
*/

/*
   Impose a maximum match length: lxcn_init_lexicon_match allocates
   an iterator stack of MAX_MATCH_LEN + 1 frames
*/
#define MAX_MATCH_LEN 1024

/* Define the structure of the lexicon iterator */
/*
   State of a stack frame of the iterator: it tells which alternative
   lxcn_next_lexicon_match will try next for the trie node of the frame
*/
typedef enum
{ s_marker,		/* Advance to the next vocabulary and its marker */
  s_entry,		/* Match the remaining search key stored at the trie node */
  s_match_key,		/* Descend into the sub trie of the input character */
  s_match_trans,	/* Descend into the sub trie of the translated input character */
  s_match_soft_hyphen,	/* Descend into the sub trie of the soft hyphen */
  s_pop_frame,		/* Nothing left to try: pop the frame */
} iter_state;

/*
   A frame of the match stack: it couples a trie node with the input
   position reached and the alternative to try next;
   match_stack is a pointer to such a frame
*/
typedef struct stack_item_rec *match_stack;
struct stack_item_rec
{ iter_state curr_state;	/* Iterator State */
  VocTrie curr_trie;		/* Pointer into trie */
  char *input;			/* Pointer to input to be matched */
  int penalty;			/* Penalty accrued due to char translation */
};

/*
   The iterator proper: created by lxcn_init_lexicon_match, advanced by
   lxcn_next_lexicon_match and released by lxcn_finish_lexicon_match
*/
struct lexicon_iterator_rec
{ Lexicon lexicon;		/* Pointer to lexicon */
  int curr_voc;			/* Number of current vocabulary (-1 before the first) */
  int curr_mark;		/* Current marker tried */
  char *start_input;		/* Pointer to start of input to be matched */
  match_stack sp;		/* Always pointing to the last used frame */
  match_stack stack;		/* Stack of entries to iterate (bottom frame) */
};

/*
   Create an iterator that enumerates the matches of the given input
   against the given lexicon.
   A stack of MAX_MATCH_LEN + 1 frames is allocated; its bottom frame
   is put in state s_marker at the start of the input with penalty 0,
   so that the first step of the iterator selects the first vocabulary.
   The iterator is to be released with lxcn_finish_lexicon_match.
*/
LexiconIterator lxcn_init_lexicon_match (char *input, Lexicon lexicon)
{ match_stack frames;
  match_stack bottom;
  LexiconIterator iter;

  frames = abs_calloc (MAX_MATCH_LEN + 1, sizeof (struct stack_item_rec),
                       "lxcn_init_lexicon_match");
  iter = abs_malloc (sizeof (struct lexicon_iterator_rec),
                     "lxcn_init_lexicon_match");

  /* Prepare the bottom frame */
  bottom = frames;
  bottom -> curr_state = s_marker;
  bottom -> curr_trie = voc_trie_nil;
  bottom -> input = input;
  bottom -> penalty = 0;

  /* No vocabulary has been selected yet */
  iter -> lexicon = lexicon;
  iter -> curr_voc = -1;
  iter -> start_input = input;
  iter -> stack = frames;
  iter -> sp = bottom;
  return (iter);
}

/*
   Look up the sub trie for the given key character in the binary
   search tree of tails of the trie node. Keys are compared as
   unsigned characters. Returns voc_trie_nil if the key is absent.
*/
static VocTrie search_subtrie (VocTrie trie, char key)
{ unsigned char wanted = (unsigned char) key;
  VocIndexTree node;

  /* Walk down the tree until the key is found or we drop off */
  for (node = trie -> tails; node != voc_index_tree_nil; )
    { if (wanted < node -> key) node = node -> left;
      else if (wanted > node -> key) node = node -> right;
      else return (node -> sub_trie);
    }
  return (voc_trie_nil);
}

char *lxcn_next_lexicon_match (LexiconIterator iter)
{ while (1)
    { iter_state state = iter -> sp -> curr_state;
      VocTrie curr_trie = iter -> sp -> curr_trie;
      char *input = iter -> sp -> input;

      /* Rare case of empty lexicon */
      switch (state)
	{ case s_marker:
	    { /* In this state we advance to the next vocabulary and marker */
	      iter -> curr_voc++;
	      if (iter -> curr_voc >= iter -> lexicon -> nr_vocabularies)
	        return (NULL);	/* Done matching */

	      /* New vocabulary to try */
	      iter -> curr_mark = iter -> lexicon -> all_lexeme_markers[iter -> curr_voc];
	      iter -> sp++;
	      iter -> sp -> curr_state = s_entry;
	      iter -> sp -> curr_trie = iter -> lexicon -> all_vocabularies[iter -> curr_voc];
	      iter -> sp -> input = input;
	    }; break;
	  case s_entry:
	    { /* Remember to continue with the next character as search key */
	      iter -> sp -> curr_state = s_match_key;
	      if (curr_trie -> search_key == NULL) break;
	      input = match_lexeme_tail (input, curr_trie -> search_key + curr_trie -> rem_offset, &iter -> sp -> penalty);
	      /* Return a result if some non-empty string was matched */
	      if (input != NULL && input > iter -> start_input)
		return (input);
	    }; break;
	  case s_match_key:
	    { char inp_ch = *input;
	      char lex_ch = inp_ch;
	      VocTrie sub_trie;
	      if (lxcn_is_white_space (inp_ch)) lex_ch = ' ';
	      sub_trie = search_subtrie (curr_trie, lex_ch);

	      /* Remember we have to check the translated character */
	      iter -> sp -> curr_state = s_match_trans;
	      if (sub_trie != voc_trie_nil)
		{ /* We have longer lexemes to match */
		  input++;
		  if (lex_ch == ' ')
		    while (lxcn_is_white_space (*input)) input++;
		  iter -> sp++;
		  iter -> sp -> curr_state = s_entry;
		  iter -> sp -> curr_trie = sub_trie;
		  iter -> sp -> input = input;
		  break;
		};
	    }; /* fall thru */
	  case s_match_trans:
	    { char inp_ch = *input;
	      char trans_ch = lxcn_translate (inp_ch);
	      char lex_ch = trans_ch;
	      VocTrie sub_trie;

	      /* Remember we have to pop the current frame after continuation */
	      iter -> sp -> curr_state = s_match_soft_hyphen;

	      /*
		 If translation is identical or input is a white space,
		 we already tried the character
	      */
	      if ((inp_ch == trans_ch) || lxcn_is_white_space (inp_ch))
	        break;
	        
	      sub_trie = search_subtrie (curr_trie, lex_ch);
	      if (sub_trie != voc_trie_nil)
		{ /* We have longer lexemes to match */
	          input++;
		  iter -> sp++;
	          iter -> sp -> curr_state = s_entry;
	          iter -> sp -> curr_trie = sub_trie;
	          iter -> sp -> input = input;
		  iter -> sp -> penalty = iter -> sp[-1].penalty + lxcn_translate_penalty (inp_ch);
	          break;
	        };
	    }; /* fall thru */
	  case s_match_soft_hyphen:
            { /* Remember we have to pop the current frame after continuation */
	      VocTrie sub_trie = search_subtrie (curr_trie, SoftHyphenChar);
	      iter -> sp -> curr_state = s_pop_frame;
	      if (sub_trie != voc_trie_nil)
	        { /* Three possibilities */
		  char inp_ch = *input;
		  if (inp_ch == '-') input++;
		  else if (lxcn_is_white_space (inp_ch))
		    do { input++; }
		    while (lxcn_is_white_space (*input));
		  /* else do not move the input pointer */

		  /* Match the current input pointer against the SoftHyphen subtrie */
		  iter -> sp++;
                  iter -> sp -> curr_state = s_entry;
                  iter -> sp -> curr_trie = sub_trie;
                  iter -> sp -> input = input;
		  break;
		};
	      /* fall thru in most cases */
            };
	  case s_pop_frame:
	    { /* If possible, pop the top frame */
	      if (iter -> sp == iter -> stack)
		return (NULL);			/* Done matching */
	      iter -> sp--;
	    };
	  default: break;
	}; 
    };
  return (0);
}

/*
   Release an iterator: first its stack of frames,
   then the iterator record itself
*/
void lxcn_finish_lexicon_match (LexiconIterator iter)
{ match_stack frames = iter -> stack;

  abs_free ((void *) frames, "lxcn_finish_lexicon_match");
  abs_free ((void *) iter, "lxcn_finish_lexicon_match");
}

/*
   Access routines for matches
*/
/*
   Deliver the details of the trie node in the top frame of the iterator:
   entry_nr:       receives the info field of the node
   matched_lexeme: receives the search key of the node
   matched_marker: handed to is_an_old_lex_marker together with the low
                   three bits of the current marker; set to SingleToken
                   if those bits are not recognized
   penalty:        if not NULL, receives the penalty of the top frame
*/
void lxcn_get_lexicon_match_info (LexiconIterator iter, int *entry_nr,
                                  char **matched_lexeme, LexemeType *matched_marker,
                                  int *penalty)
{ match_stack top = iter -> sp;
  VocTrie entry = top -> curr_trie;
  int marker_bits;

  *entry_nr = entry -> info;
  *matched_lexeme = entry -> search_key;

  marker_bits = iter -> curr_mark & 7;
  if (!is_an_old_lex_marker (marker_bits, matched_marker))
    *matched_marker = SingleToken;

  if (penalty == NULL) return;
  *penalty = top -> penalty;
}
