/*
 * edit distance - calculate the edit distance between two strings
 *
 * Copyright 2005 KUN.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Library General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 * 
 *  $Id: edit_distance.c,v 1.8 2005/04/03 10:07:31 andres Exp $ 
 */

#define UKKONEN_OPTIMIZED 1
#define FAIL_ON_NO_HANDLE 0
#define USE_STATIC_MEMORY 1
#define USE_STRING_COPY   0

/*
 *  The code breaks on long lists of compares: consecutive compares
 *  in the order of 10^8 will cause segmentation faults on a FreeBSD 4.3
 *  system (twoquid) and a Linux system (pinguin), reason unknown.
 *  Some targets will need 2*10^10 compares, so this behaviour is not
 *  acceptable.
 */

#if USE_STATIC_MEMORY
/*
 *  Use of memory is kept as static as possible. This includes a fixed
 *  buffer for strings to compare. It forces an upper limit on the
 *  maximum length of the strings that can be compared, a reason to
 *  keep it as big as possible. It also takes time to initialise the buffer
 *  (which is O(l^2) with l as the length of the buffer), a reason to
 *  keep it as small as possible. Whichever choice you make, it is
 *  always the wrong one. This last limitation is avoidable by initialising
 *  only that bit of the buffer which is really needed.
 *
 *  Possible options to look at.
 *  - Use memory management for the buffers and increase the size when needed. 
 *    This will make the upper limit `elastic' in a way.
 */

#define BUFFER_SIZE 70
#endif

#include <assert.h>
#include <string.h>
#include <limits.h>

#include <abase_memalloc.h>
#if DEBUG_LEX
#include <abase_error.h>
#define DB_LEX(x) x
#else
#define DB_LEX(x)
#endif

/*
 * Edit Distance Environment: all state needed to compare string A with
 * string B.  With USE_STATIC_MEMORY the diagonal table and the path buffer
 * are embedded in the structure; otherwise row is a table of separately
 * allocated diagonals.
 */
typedef struct
{
        const char *A, *B;  /* strings to compare */
        int m, n;           /* length of the two strings: m = strlen (A), n = strlen (B) */
#if USE_STATIC_MEMORY
        /* diagonal table, addressed as row[BUFFER_SIZE + k][d] by get_row ()/set_row () */
        int row[2*BUFFER_SIZE][BUFFER_SIZE];
        /* backing store for the path string; p points into it */
        char pb[2*BUFFER_SIZE];
#else
        int **row;          /* diagonals */
#endif
        char *p;            /* actions to get from A to B */
        int d;              /* edit distance */
        int d_max;          /* maximum acceptable edit distance */
/*
 * NOTE(review): this test uses #ifdef while the functions below test
 * UKKONEN_OPTIMIZED with #if; with the macro defined as 0 the two fields
 * below are still declared (harmless, but inconsistent).
 */
#ifdef UKKONEN_OPTIMIZED
        /* bounds on the diagonals still to be visited; updated by compute_row () */
        int lower, upper;
#endif
} EDE; /* Edit Distance Environment. */

#define edit_distance_c
#include "edit_distance.h"

/* Sentinel values: "no environment", "not computed yet", string terminator. */
#define EMPTY NULL
#define UNKNOWN -1
#define EOS '\0'

/*
 * Small arithmetic helpers.  They are plain macros, so an argument may be
 * evaluated more than once; do not pass expressions with side effects.
 */
#define min(x,y) (((y) > (x)) ? (x) : (y))
#define max(x,y) (((y) < (x)) ? (x) : (y))
#define abs(x)   (((x) <= 0) ? -(x) : (x))

/*
 * Ukkonen's Algorithm to find the edit distance d between 
 * two strings A (of length m) and B (of length n).
 * Optimised with border cutoff and diagonal length rules.
 *
 * O(nd) in time, O(d^2 + n) in space.
 *
 * Cost function (Insert, Delete, Substitute) = (1, 1, 1)
 * 
 * The actual algorithm is captured in calculate_edit_distance (),
 * compute_row () and extract_path (). The rest are supporting
 * routines.
 */

/*
   row_length (h, k), row_offset (h, k),
   add_row (h, k), get_row (h, k), set_row (h, k, v)

   row_length:
             calculate the actual length of a diagonal.
             some diagonals have a length of min(m,n) but on the
             outer sides it will decrement to a length of zero.

   row_offset:
             calculate the actual offset of a diagonal.
             Only diagonal 0 will need to store a value at position 0,
             all others will get their first value at |k|, if any.

   add_row:  add a diagonal to the search space of (partially) visited
             diagonals. (Not used with USE_STATIC_MEMORY)
   get_row:  right hand side of an expression with rows. (value
             is returned.)
   set_row:  left hand side of an expression with rows. (value
             v is stored as a side effect).

   h is an edit distance environment which contains an array row of
   diagonals and distances,  k is the diagonal number to add.

   row is a two dimensional array row[diagonal][distance].
   Distance runs from 0 up to m+n (just delete all from A, then
   insert all from B.) For each diagonal m+n-1 cells of type int
   are allocated.

   The space bound of O(d^2 + n) can only be gained when allocating
   space on a cell basis, not on a row basis. By violating this
   we get a bound of O(dn), which is still not bad. In time the
   bound was already O(dn) so that should not differ.
   Some optimisation could be made here.
   And the USE_STATIC_MEMORY flag violates it in every way.

   Ukkonen's algorithm should work for any cost function (Insert,
   Delete, Replace) but this way of handling the row array limits
   it to cost functions with a maximum of 1 for each part. This includes
   most if not all of the interesting cases.
 */

/* Index of the first cell used on diagonal k: |k|.  The h argument is not used. */
#define row_offset(h,k) abs(k)

/*
 * row_length: number of character positions on diagonal k of the m x n
 * comparison grid held in h.
 *
 * A diagonal with k < 0 starts |k| characters into A, one with k > 0 starts
 * k characters into B; its length is whatever is left of the shorter
 * remainder:
 *      k <  0 :  min (m + k, n)
 *      k >= 0 :  min (m, n - k)
 * The result is negative when k lies outside the grid (k < -m or k > n).
 *
 * The previous if-chain lacked an `else' before its second test, so the
 * value chosen for m > n with n - m <= k <= 0 was always overwritten
 * (m + k was returned where n was intended).
 */
static int
row_length (EDE *h, int k)
{
	int left_in_a = (k < 0) ? (h->m + k) : h->m;
	int left_in_b = (k > 0) ? (h->n - k) : h->n;

	return ((left_in_a < left_in_b) ? left_in_a : left_in_b);
}


#if !USE_STATIC_MEMORY
/*
 * add_row: allocate the storage for diagonal k and mark every cell UNKNOWN.
 *
 * get_row () and set_row () accept distances d with o <= d <= o + l
 * (o = row_offset, l = row_length), which is l + 1 cells, so l + 1 cells
 * are allocated; allocating only l let a store at d == o + l run past
 * the end of the block.  The stored pointer is shifted by -o so the
 * diagonal can be indexed directly with d; release_row () undoes the shift
 * before freeing.
 */
static void
add_row (EDE *h, int k)
{
	int i, l, o;

	DB_LEX (assert ((-h->m < k) && (k < h->n) && "row overflow");)
	l = row_length (h, k);
	o = row_offset (h, k);
	h->row[k] = (int *) abs_calloc (l + 1, sizeof (int), "add_row") - o;
	for (i = o; i <= o + l; i++) h->row[k][i] = UNKNOWN;
}
#endif

/*
 * get_row: read the cell for diagonal k at edit cost d.
 *
 * Returns the stored value, or UNKNOWN when d lies outside the range
 * [row_offset, row_offset + row_length] of the diagonal.  In the dynamic
 * memory build UNKNOWN is also returned when k lies outside the pointer
 * table or the diagonal has not been allocated yet.
 *
 * The pointer table built by renew_environment () has m + n entries with
 * valid indices -m .. n - 1, so k == n must be rejected as well; the old
 * test (k > n) let it through.
 */
static int
get_row (EDE *h, int k, int d)
{
	int o, l;

	o = row_offset (h, k);
	l = row_length (h, k);
	if ((d < o) || (d > (o+l))) return (UNKNOWN);
#if DEBUG_LEX
	assert ((-h->m < k) && (k < h->n) && "row buffer overflow");
#endif
#if USE_STATIC_MEMORY
	/* diagonals are stored shifted by BUFFER_SIZE so negative k is addressable */
	return (h->row[BUFFER_SIZE+k][d]);
#else
	if ((k < (-h->m)) || (k >= (h->n))) return (UNKNOWN);
	if (h->row[k] == NULL) return (UNKNOWN);
	return (h->row[k][d]);
#endif
}

/*
 * set_row: store value v in the cell for diagonal k at edit cost d.
 *
 * With USE_STATIC_MEMORY the store is unconditional (only the DEBUG_LEX
 * asserts check the indices).  In the dynamic memory build a store outside
 * the diagonal or outside the pointer table is silently dropped, and the
 * diagonal is allocated on first use.
 *
 * The pointer table built by renew_environment () has valid indices
 * -m .. n - 1, so k == n must be rejected as well; the old test (k > n)
 * let it through.
 */
static void
set_row (EDE *h, int k, int d, int v)
{
	int o, l;

	o = row_offset (h, k);
	l = row_length (h, k);
#if DEBUG_LEX
	assert ((-h->m <= k) && (k <= h->n) && "row buffer overflow");
	assert ((o <= d) && (d <= (o+l)) && "row overflow");
	assert ((0 <= v) && (v <= (h->m+h->n)) && "edit distance corrupted");
#endif
#if USE_STATIC_MEMORY
	/* diagonals are stored shifted by BUFFER_SIZE so negative k is addressable */
	h->row[BUFFER_SIZE+k][d] = v;
#else
	if ((d < o) || (d > (o+l)) || (k < (-h->m)) || (k >= (h->n))) return;
	if (h->row[k] == NULL) add_row (h, k);
	h->row[k][d] = v;
#endif
}

/*
   Ukkonen's algorithm.
   For a better documentation of Ukkonen's algorithm see 
   the rice-disertation.pdf in the docs department.

   cite details: 
   [Rice96]
   Stephen V. Rice. Measuring the accuracy of page-reading systems.
   Phd thesis, University of Nevada, Las Vegas, 1996.
   http://www.isri.unlv.edu/ISRI/OCRtk/rice-disertation.pdf,

   [Ukkonen85]
   Ukkonen, E. (1985). Algorithms for approximate string matching.
   Information and Control, 64, 100-118
   http://www.cs.helsinki.fi/u/ukkonen

   UKKONEN
   Proceed on a diagonal as far as possible for costs d.
   Start for costs d=0 at vertex (0, 0), for diagonal k=0 only, then
   iterate by taking neighbouring diagonals into account on
   increasing values of d until vertex (m, n) has been reached.

   UKKONEN_OPTIMISED
   Use optimalisation for the restriction that the distance from
   vertex (0,0) to vertex (i, i+k) + the distance from vertex (i, i+k)
   to vertex (m, n) is greater or equal to |k| + |k - (n - m)|

   Use optimalisation for the fact that vertex (m, m+k) (or vertex (n-k, n))
   is the furthest point and only diagonals greater (or lesser) than 
   k need to be considered when it is reached.

   internals.
   calculate_edit_distance (h)
      Calculate the edit distance between two strings A and B (given as
      part of the edit distance environment h). The result is the
      edit distance d (stored in h) and an array row (in h) from
      which the shortest edit path can be recovered.

   compute_row (h, k, d)
      Compute the max distance one can reach in diagonal k for distance d.
      Supporting part for calculate_edit_distance ().

   p = extract_path (h)
      Extract (one of) the shortest path from row, which is kept in h.
      p is a reference to a string kept in h for which is taken care of
      at the end of the lifetime of h.

   h, edit distance environment.
      h is a structure that takes care of all information needed to
      find the shortest edit distance d between two strings A and B
      (of length m and n) and to retrieve (one of) the shortest paths.

   Interface: (with the exception of some house keeping details.)
   h = edit_distance_new_environment (A, B)
      Create a new edit distance environment h with strings A and B.
      The lifetime of A and B does not interfere with the lifetime of h.
      (For large sizes of A and B this might be a problem.)

   edit_distance_release_environment (h)
      Ends the lifetime of h, releases its resources and forces h to be
      empty. (No mistakes by using deceased environments.)

   d = edit_distance_get_distance (h)
   d = edit_distance_get_limited_distance (h, n)
      safety wrap around calculate_edit_distance ().
      the first is defined as edit_distance_get_limited_distance (h, INT_MAX).

   p = edit_distance_get_path (h)
      safety wrap around extract_path ()

   Safety wrapping means checking for valid edit distance environments,
   but also calculating the edit distance and the edit path only once
   and making the order of calling for distance and path not important.
 */

#if DEBUG_LEX
/*
 * Debug dump hook.  Its only use in this file is the commented-out call at
 * the end of calculate_edit_distance (); no definition of display () is
 * visible here (the dump routine defined in this file is called
 * edit_distance_debug_show_how ()).
 */
void display (EDE *h);
#endif

/*
 * compute_row: fill in the cell for diagonal k at edit cost d.
 *
 * The cell receives the furthest index i into A that is reached on
 * diagonal k (where j = i + k is the matching index into B) when exactly
 * d edits are spent.  With UKKONEN_OPTIMIZED the bounds h->lower and
 * h->upper on the diagonals still to be visited are tightened as well.
 */
static void
compute_row (EDE *h, int k, int d)
{
	int i, iI, iD, iS;
	int j;
#if UKKONEN_OPTIMIZED
	int b, c;
#endif /* UKKONEN_OPTIMIZED */

	/*
	 * Candidates coming from the three neighbouring cells at cost d - 1.
	 * get_row () may return UNKNOWN (-1); the two `+ 1' candidates are
	 * then 0.  Note that extract_path () labels the step from diagonal
	 * k - 1 as 'I' and the step from diagonal k + 1 as 'D', the reverse
	 * of the iD/iI names used here; only the maximum is used below.
	 */
	iD = get_row (h, k - 1, d - 1);
	iI = get_row (h, k + 1, d - 1) + 1;
	iS = get_row (h, k,     d - 1) + 1;
	i = max (iI, max (iD, iS));
	j = i + k;

	/* slide along the diagonal for free while the characters are equal */
	while ((i < h->m) && (j < h->n) && (h->A[i] == h->B[j]))
	{
		i++;
		j++;
	}
	set_row (h, k, d, i);

#if UKKONEN_OPTIMIZED
	/* the end of A (or of B) was reached on this diagonal: move the bound past it */
	if (i >= h->m) h->lower = k + 1;
	if (j >= h->n) h->upper = k - 1;

	/*
	 * Further narrowing of [lower, upper] from the remaining budget b.
	 * NOTE(review): presumably this implements the |k| + |k - (n - m)|
	 * restriction described in the comment above -- confirm against
	 * [Ukkonen85].  c is pushed away from zero to the next even value
	 * before it is halved.
	 */
	b = d + max (h->m - 1, h->n - 1 - k) - i;
	c = h->n - h->m - b;
	if (c%2) c += (c>0) ? 1 : -1;

	h->lower = max (h->lower, c/2);
	h->upper = min (h->upper, (h->n - h->m + b)/2);
#endif /* UKKONEN_OPTIMIZED */
}

/*
 * calculate_edit_distance: run the search for environment h.
 *
 * h->d starts at -1 and is incremented before every sweep, so the first
 * sweep runs with d = 0.  Each sweep calls compute_row () for the diagonals
 * that are of interest at that cost.  The loop stops as soon as the
 * termination test of the selected variant holds or h->d has grown beyond
 * h->d_max; the last cost tried is left in h->d.  Callers compare h->d
 * with the limit to tell "found" from "gave up".
 */
void
calculate_edit_distance (EDE *h)
{
	h->d = -1;

#if UKKONEN_OPTIMIZED
	/* start with every diagonal of the grid allowed: 1 - m .. n - 1 */
	h->lower = 1 - h->m;
	h->upper = h->n - 1;
	/* n - m is the diagonal through (m, n); stop once lower has moved past it */
	while ((h->lower <= (h->n - h->m)) && (h->d <= h->d_max))
	{
		int k;


		DB_LEX(assert ((h->d < h->m + h->n - 1) && "d too big.");)
		h->d++;
		if (h->m <= h->n)
		{
			/* from the target diagonal (or d, if smaller) downwards ... */
			for (k = min (h->n - h->m, h->d); 
			     k >= max (h->lower, -h->d); 
			     k--) 
				compute_row (h, k, h->d);
			/* ... then the diagonals above the target, upwards */
			for (k= h->n - h->m + 1; 
			     k <= min (h->upper, h->d); 
			     k++) 
				compute_row (h, k, h->d);
		}
		else
		{
			/* from the target diagonal (or -d, if larger) upwards ... */
			for (k = max (h->n - h->m, -h->d); 
			     k <= min (h->upper, h->d); 
			     k++) 
				compute_row (h, k, h->d);
			/* ... then the diagonals below the target, downwards */
			for (k = h->n - h->m - 1; 
			     k >= max (h->lower, -h->d); 
			     k--) 
				compute_row (h, k, h->d);
		}
	}
#else /* !UKKONEN_OPTIMIZED */
	/* stop once the cell on the target diagonal n - m has reached the end of A */
	while ((h->d <= h->d_max) && (get_row (h, h->n - h->m, h->d) < h->m))
	{
		int k;
		int r;

		DB_LEX(assert ((h->d < h->m + h->n - 1) && "d too big.");)
		h->d++;
		r = h->d - min (h->m, h->n);
		/* negative diagonals first, then the non-negative ones */
		for (k=max (1-h->m, -h->d); k <= min (-1, -r); k++) 
			compute_row (h, k, h->d);
		for (k=max (0, r); k <= min (h->n-1, h->d); k++) 
			compute_row (h, k, h->d);
	}
#endif /* !UKKONEN_OPTIMIZED */

	// DB_LEX(display(h);)
}

/*
 * edit_distance_get_limited_distance: edit distance of the strings in h,
 * provided it does not exceed n.
 *
 * Returns the distance when it is <= n and UNKNOWN otherwise.  A NULL
 * handle yields -1, or trips an assert when FAIL_ON_NO_HANDLE is set.
 *
 * The distance is computed on the first call and cached in h->d.  A search
 * that was cut off by an earlier, smaller limit leaves h->d above
 * h->d_max; that value only says where the search stopped and is not
 * necessarily the distance.  It is therefore not reused when the caller
 * now allows more than the old limit; the search is run again with the
 * new limit.  (Previously the stale value was returned as if it were the
 * distance.)
 */
int 
edit_distance_get_limited_distance (EDE *h, int n)
{
#if FAIL_ON_NO_HANDLE
	assert (h != NULL && "handle = NULL");
#else
	if (h == NULL) return (-1);
#endif
	if ((h->d == UNKNOWN) || ((h->d > h->d_max) && (n > h->d_max)))
	{
		h->d_max = n;
		calculate_edit_distance (h);
	}
	return ((h->d <= n) ? h->d : UNKNOWN);
}

/*
 * extract_path: rebuild one shortest edit script from the diagonal table.
 *
 * The table is walked backwards from vertex (m, n) to (0, 0) and one
 * letter is emitted per step, so the string reads front to back:
 *      'I' insert, 'D' delete, 'R' replace, 'a' matching characters.
 * The result is stored in h->p and also returned.  With USE_STATIC_MEMORY
 * it lives in h->pb, otherwise in a freshly allocated string.
 *
 * Must only be called after calculate_edit_distance () has found the
 * distance (h->d valid).
 *
 * Fixes relative to the previous version:
 *  - dynamic build: the terminator was written to h->p[l], one byte past
 *    the l bytes that had just been allocated;
 *  - static build: only the first BUFFER_SIZE bytes of the
 *    2 * BUFFER_SIZE byte buffer were cleared although a path can be
 *    m + n characters long; the whole buffer is cleared now, so the
 *    terminator no longer depends on what renew_environment () cleared.
 */
static char *
extract_path (EDE *h)
{
#if !USE_STATIC_MEMORY
	char *p;
#endif
	int i, j, k, l, d;

	/* room for at most m + n letters plus the terminator */
	l = h->m + h->n + 1;
#if USE_STATIC_MEMORY
	for (i = 0; i < (int) sizeof (h->pb); i++) h->pb[i] = EOS;
	h->p = h->pb;
#else
	h->p = (char *)abs_calloc (l, sizeof (char), "extract_path");
	h->p[l - 1] = EOS;
#endif
	l--;

	/* the buffer is filled from the back; l is the index of the first letter */
	i = h->m;
	j = h->n;
	d = h->d;
	while ((i > 0) || (j > 0))
	{
		DB_LEX(assert (l >= 0 && "buffer overflow");)
		k = j - i;
		if (i == get_row (h, k - 1, d - 1))
		{
			/* arc (v(i,j-1), v(i,j)) is in the path */
			j--;
			d--;
			h->p[--l] = 'I';
		}
		else if (i == get_row (h, k + 1, d - 1) + 1)
		{
			/* arc (v(i-1,j), v(i,j)) is in the path */
			i--;
			d--;
			h->p[--l] = 'D';
		}
		else if (i == get_row (h, k, d - 1) + 1)
		{
			/* non-matching arc (v(i-1,j-1), v(i,j)) is in the path */
			i--;
			j--;
			d--;
			h->p[--l] = 'R';
		}
		else
		{
			/* matching arc (v(i-1,j-1), v(i,j)) is in the path */
			i--;
			j--;
			h->p[--l] = 'a';
		}
	}

#if USE_STATIC_MEMORY
	h->p = h->pb+l;
#else
	/* keep only the used tail of the work buffer */
	p = abs_new_string (h->p+l, "extract_path");
	abs_free (h->p, "extract_path");
	h->p = p;
#endif
	return (h->p);
}

/*
 * edit_distance_get_path: the edit script that turns A into B.
 *
 * Computes the distance first when that has not happened yet and builds
 * the path on first request; later calls return the stored string.
 * Returns NULL for a NULL handle (or asserts with FAIL_ON_NO_HANDLE) and
 * when the distance found exceeds the limit stored in h.
 */
char *
edit_distance_get_path (EDE *h)
{
#if FAIL_ON_NO_HANDLE
	assert (h != NULL && "handle = NULL");
#else
	if (h == NULL)
	{
		return (NULL);
	}
#endif
	if (h->d == UNKNOWN)
	{
		calculate_edit_distance (h);
	}
	if (h->d <= h->d_max)
	{
		return ((h->p != NULL) ? h->p : extract_path (h));
	}
	return (NULL);
}

#if !USE_STATIC_MEMORY
/*
 * release_row: free every allocated diagonal of h and the pointer table
 * itself, then set h->row to NULL.  Does nothing when there is no table.
 *
 * h->m and h->n must still be the lengths the table was built with.
 * The table has entries for k = -m .. n - 1 (see renew_environment ());
 * the previous loop stopped at n - 2 and so never freed diagonal n - 1.
 *
 * place is passed on to abs_free ().
 */
void
release_row (EDE *h, const char *place)
{
	int i, o;

	if (h->row == NULL) return;
	for (i = -h->m; i < h->n; i++) 
	{
		if (h->row[i] != NULL) 
		{
			/* add_row () stored the pointer shifted by -o; undo that */
			o = row_offset (h, i);
			abs_free (h->row[i] + o, (char *)place);
		}
	}
	/* the table pointer itself was shifted by +m when it was allocated */
	abs_free (h->row - h->m, (char *)place);
	h->row = NULL;
}
#endif

/*
 * renew_environment: (re)initialise environment h for comparing A with B.
 *
 * Returns h on success.  h is released through
 * edit_distance_release_environment () and NULL is returned when
 *  - A or B is NULL or empty (only checked when FAIL_ON_NO_HANDLE is 0), or
 *  - USE_STATIC_MEMORY is set and a string has BUFFER_SIZE or more
 *    characters.
 *
 * On success the cached results are discarded: d = UNKNOWN, p = NULL and
 * d_max = INT_MAX.
 *
 * Fixes relative to the previous version:
 *  - static build: h->p was left untouched, so after re-using an
 *    environment edit_distance_get_path () returned the path of the
 *    previous comparison;
 *  - static build: cells were cleared for d = 0 .. min (m, n) only, but
 *    get_row () can be asked for any d up to max (m, n) (offset |k| plus
 *    the length of diagonal k never exceeds that), so values left by an
 *    earlier comparison could be read;
 *  - dynamic build: the old table is released before m and n are
 *    overwritten, because release_row () needs the old lengths.
 */
static EDE *
renew_environment (EDE *h, const char *A, const char *B)
{
	int i;
#if USE_STATIC_MEMORY
	int j;
	int d_last;
#endif

#if FAIL_ON_NO_HANDLE
	assert ((A != NULL) && (B != NULL) && "no NULL or empty strings allowed.");
#else
	if ((A == NULL) || (B == NULL) || (A[0] == EOS) || (B[0] == EOS))
	{ 
		DB_LEX(abs_message ("empty string(s), no EDE constructed");)
		edit_distance_release_environment (&h, "renew_environment");
		return (NULL);
	}
#endif
#if !USE_STATIC_MEMORY
	/* must happen while h->m and h->n still describe the old table */
	release_row (h, "renew_environment");
#endif
#if USE_STATIC_MEMORY || (!USE_STATIC_MEMORY && !USE_STRING_COPY)
	/* the strings are referenced, not copied: they must outlive h */
	h->A = A;
	h->B = B;
#else
	abs_free ((char *)h->A, "edit_distance_renew_environment");
	abs_free ((char *)h->B, "edit_distance_renew_environment");
	h->A = abs_new_string (A, "edit_distance_renew_environment");
	h->B = abs_new_string (B, "edit_distance_renew_environment");
#endif
	h->m = (int) strlen (A);
	h->n = (int) strlen (B);
#if USE_STATIC_MEMORY
	if ((h->m >= BUFFER_SIZE) || (h->n >= BUFFER_SIZE))
	{
		DB_LEX(abs_message ("string(s) too long, no EDE constructed");)
		edit_distance_release_environment (&h, "renew_environment");
		return (NULL);
	}

	/* largest distance index that can be read for these two strings */
	d_last = (h->m > h->n) ? h->m : h->n;
	/* diagonal k is stored at row[BUFFER_SIZE + k]; clear k = -m .. n */
	for (i = BUFFER_SIZE - h->m; i <= BUFFER_SIZE + h->n; i++)
	{
		h->pb[i] = EOS;
		for (j = 0; j <= d_last; j++) h->row[i][j] = UNKNOWN;
	}
#else
	/* pointer table for the diagonals k = -m .. n - 1, indexed directly with k */
	h->row = (int **) abs_calloc (h->m + h->n, sizeof (int *), "renew_environment") + h->m;
	for (i = -h->m; i < h->n; i++) h->row[i] = NULL; 
#endif
	h->p = NULL;
	h->d = UNKNOWN;
	h->d_max = INT_MAX;
	return (h);
}

/*
 * edit_distance_new_environment: allocate an environment and set it up for
 * comparing A with B.
 *
 * place is handed to abs_malloc ().  The return value is whatever
 * renew_environment () returns for the fresh environment: the environment
 * itself, or NULL when the strings are rejected.
 */
EDE *
edit_distance_new_environment (const char *A, const char *B, const char *place)
{
	EDE *env = (EDE *) abs_malloc (sizeof (EDE), (char *)place);

	/* start from a clean slate so renew_environment () sees no stale pointers */
	env->p = NULL;
	env->B = NULL;
	env->A = NULL;
#if !USE_STATIC_MEMORY
	env->row = NULL;
#endif
	return (renew_environment (env, A, B));
}

/*
 * edit_distance_renew_environment: get an environment for comparing A with
 * B, re-using h where the build allows it.
 *
 * With USE_STATIC_MEMORY an existing h is re-initialised in place and only
 * a NULL h leads to a new allocation.  Without it h is always released and
 * a new environment is created.
 */
EDE *
edit_distance_renew_environment (EDE *h, const char *A, const char *B)
{
#if USE_STATIC_MEMORY
	if (h != NULL)
	{
		return (renew_environment (h, A, B));
	}
#else
	edit_distance_release_environment (&h, "edit_distance_renew_environment");
#endif
	return (edit_distance_new_environment (A, B, "edit_distance_renew_environment"));
}

/*
 * edit_distance_release_environment: free the environment *h and
 * everything it owns, then set *h to EMPTY.  An environment that is
 * already EMPTY is left alone.
 *
 * place is handed to the abs_free () calls.
 */
void 
edit_distance_release_environment (EDE **h, const char *place)
{
	EDE *env = *h;

	if (env == EMPTY)
	{
		return;
	}
#if !USE_STATIC_MEMORY
#if USE_STRING_COPY
	/* the strings are private copies only in this configuration */
	abs_free ((char *)env->A, (char *)place);
	abs_free ((char *)env->B, (char *)place);
#endif
	release_row (env, place);
	abs_free (env->p, (char *)place);
#endif
	abs_free (env, (char *)place);
	*h = EMPTY;
}

#if DEBUG_LEX
/*
 * edit_distance_debug_show_how: dump the state of h for debugging.
 *
 * Prints both strings with their lengths, followed by a table with one
 * column per diagonal k = 1 - m .. n - 1 and one line per cost
 * d = 0 .. h->d.  A cell shows "." when |k| > d, "?" when the stored
 * value is negative, and the stored value otherwise.
 *
 * The header line used to run over k = 1 - n .. m - 1 while the table
 * body runs over k = 1 - m .. n - 1, so the column labels were wrong
 * whenever m != n; both use the same range now.
 */
void
edit_distance_debug_show_how (EDE *h)
{
	int k, d;

	abs_printf ("\nEdit distance environment at d=%d\n", h->d);
	abs_printf ("A: '%s' m=%d\n", h->A, h->m);
	abs_printf ("B: '%s' n=%d\n", h->B, h->n);

	abs_message ("\ndiagonal range array:\n  d");
	for (k=-h->m+1; k<h->n; k++) abs_printf (" %2d", k);
	abs_printf ("\n");
	for (d=0; d<=h->d; d++)
	{
		abs_printf (" %2d", d);
		for (k=-h->m+1; k<h->n; k++)
		{
			int v;

			v = get_row(h, k, d);
			if (abs(k)>d) abs_printf ("  .");
			else if (v<0) abs_printf ("  ?");
			else abs_printf (" %2d", v);
		}
		abs_printf ("\n");
	}
	abs_printf ("\n");
}
#endif /* DEBUG_LEX */
