/*
   File: untype.c

*/

/* global includes */
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <unistd.h>

/*
 * getopt(3) interface variables.  Only optind is referenced in this file;
 * the others are declared for completeness.
 */
extern int optind;
extern int opterr;
extern int optopt;
extern int optreset;
extern char *optarg;

/*
 * Run-time switches.  Each one is flipped from its default by a
 * command-line option in main().
 */

/* 1: copy '#' comment lines through untouched (-c clears it) */
int ignore_comments = 1;

/* 1: drop the leading "123-456:" position prefix (-n clears it) */
int strip_number = 1;

/* 1: drop the part-of-speech prefix such as "NP:N:A:R:" (-p clears it) */
int strip_pos = 1;

/* 1: when stripping part of speech, keep the first type but strip the rest (-w sets it) */
int weak = 0;

/*
 * Print the command-line synopsis on stderr and terminate the program
 * with exit status 1.  Never returns.
 */
static void usage()
{
    static const char *const help[] = {
	"Usage: untype [-cnpw]        input: 123-456:NP:N:A:R:man\n",
	"       -c: don't pass through comments (#) unchanged\n",
	"       -n: don't strip number (position) (123-346:)\n",
	"       -p: don't strip Part Of Speech (NP:N:A:R:)\n",
	"       -w: strip Part Of Speech weakly (NP:)\n",
    };
    size_t line;

    for (line = 0; line < sizeof help / sizeof help[0]; line++) {
	fputs(help[line], stderr);
    }
    exit(1);
}

/* Byte value that triggers a flush of stdout (presumably "end of data"). */
#define EOD 1

/*
 * Write one character to stdout.  Writing the EOD byte additionally
 * flushes stdout, after the byte itself has been written.
 */
static void outchar(int c)
{
    putc(c, stdout);

    if (c != EOD) {
	return;
    }
    fflush(stdout);
}

/*
 * Write the characters in the half-open range [from, to) to stdout,
 * one at a time through outchar().  Nothing is written if from >= to.
 */
static void outstr(char *from, char *to)
{
    char *p;

    for (p = from; p < to; p++) {
	outchar(*p);
    }
}

/*
 * Copy a comment through unchanged: write c (the character that started
 * the comment), then echo stdin up to and including the next newline.
 * Stops silently at end of input.
 *
 * Currently we don't check if a comment starts at the beginning of the line.
 */
static void comment(int c)
{
    int ch;

    outchar(c);

    while ((ch = getchar()) != EOF) {
	outchar(ch);
	if (ch == '\n') {
	    break;
	}
    }
}

#define MAXWORD	1024

/*
 * Print one collected word to stdout with its annotations removed.
 *
 * Example of a typical input word: 123-456:NP:N:A:R:man
 *
 *   word     - NUL-terminated token, possibly with extra punctuation
 *              at the end.
 *   nextchar - the delimiter that ended the word; it is written after
 *              the word.  0 means "no delimiter" and nothing is written.
 *
 * Processing, left to right:
 *   1. A word without any ':' is printed unchanged.
 *   2. If the word starts with a digit, the text up to the first ':'
 *      must match /[0-9]+-[0-9]+:/; otherwise the whole word is printed
 *      unchanged (so that a time such as 9:00 is left alone).  A valid
 *      position prefix is printed only when strip_number is 0.
 *   3. Everything in the remainder up to its last ':' is taken to be
 *      part-of-speech information.  With strip_pos == 0 it is printed
 *      in full; with strip_pos set and weak set only the first type
 *      (through the first ':') is printed; otherwise it is dropped.
 *   4. The bare word that follows is printed.
 */
static void untype_word(char *word, char nextchar)
{
    char *colon, *s;

    colon = strchr(word, ':');
    if (!colon)
	goto fail;

    s = word;
    /* <ctype.h> functions require an unsigned char value (or EOF). */
    if (isdigit((unsigned char)s[0])) {
	/*
	 * Numeric position information seems to be present.
	 * Check it thoroughly; we don't want to be confused by a
	 * time, such as 9:00.
	 * To pass, it must match /[0-9]+-[0-9]+:/.
	 */
	char *c = s;

	while (isdigit((unsigned char)*c))	/* [0-9]+ (first digit seen) */
	    c++;
	if (*c++ != '-')			/* '-' */
	    goto fail;
	if (!isdigit((unsigned char)*c))	/* need at least one digit */
	    goto fail;
	while (isdigit((unsigned char)*c))	/* [0-9]+ */
	    c++;
	if (c != colon)				/* ':' */
	    goto fail;

	if (!strip_number)
	    outstr(s, colon + 1);
	s = colon + 1;
    }

    /* Find colon just before bare word */
    colon = strrchr(s, ':');
    if (colon) {
	/* Part-of-speech information seems to be present */
	if (!strip_pos) {
	    outstr(s, colon + 1);
	} else if (weak) {
	    /*
	     * Print up to and including the first colon: just the
	     * first type.  strchr() cannot fail here because
	     * strrchr() found a colon in the same string.
	     */
	    outstr(s, strchr(s, ':') + 1);
	}
	s = colon + 1;
    }

    /* Output the word + trailing stuff */
    fputs(s, stdout);
    if (nextchar)
	outchar(nextchar);
    return;

fail:
    /* If it doesn't appear to conform, just print it. */
    fputs(word, stdout);
    if (nextchar)		/* don't emit a NUL when there is no delimiter */
	outchar(nextchar);
}

/*
 * Collect an unquoted word from stdin and hand it to untype_word().
 *
 *   c - the first character of the word, already read by the caller.
 *
 * Reading stops at the first delimiter (one of { } [ ] < > | , space,
 * newline, tab), at end of input, or when the buffer is full.  The
 * delimiter, if one was read, is passed on as the "next character";
 * otherwise 0 is passed.  When the buffer fills up, the unread rest of
 * the word stays on stdin for the caller to process.
 */
static void collect(int c)
{
    char word[MAXWORD];
    int i = 0;
    int nextchar = 0;		/* stays 0 when no delimiter is read */
    int done = 0;

    word[i++] = c;

    /* Stop at MAXWORD-1 so there is always room for the terminator. */
    while (!done && i < MAXWORD - 1 && (c = getchar()) != EOF) {
	switch (c) {
	case '{': case '}':
	case '[': case ']':
	case '<': case '>':
	case '|':
	case ',': case ' ':
	case '\n': case '\t':
	    nextchar = c;
	    done = 1;
	    break;
	default:
	    word[i++] = c;
	    break;
	}
    }

    /* Terminate on every exit path: delimiter, EOF and full buffer. */
    word[i] = '\0';

    untype_word(word, nextchar);
}

/*
 * Collect a double-quoted word from stdin (the opening quote has already
 * been read by the caller), write the opening quote, and hand the word
 * to untype_word().
 *
 * A backslash and the character after it are stored verbatim, so an
 * escaped quote does not end the word.  Reading stops at an unescaped
 * '"', newline or tab, at end of input, or when the buffer is full.
 * The terminating character, if one was read, is passed on as the
 * "next character"; otherwise 0 is passed.
 */
static void collect_quoted()
{
    char word[MAXWORD];
    int i = 0;
    int c, nextchar = 0;
    int done = 0;

    /*
     * Stop at MAXWORD-2: one iteration may store two characters (an
     * escape pair) and the terminator still has to fit afterwards.
     */
    while (!done && i < MAXWORD - 2 && (c = getchar()) != EOF) {
	switch (c) {
	case '\\':
	    word[i++] = c;
	    c = getchar();
	    if (c == EOF) {
		/* Input ended right after the backslash. */
		done = 1;
	    } else {
		word[i++] = c;
	    }
	    break;
	case '"':
	case '\n': case '\t':
	    nextchar = c;
	    done = 1;
	    break;
	default:
	    word[i++] = c;
	    break;
	}
    }

    /* Terminate on every exit path: delimiter, EOF and full buffer. */
    word[i] = '\0';

    outchar('"');
    untype_word(word, nextchar);
}

/*
 * Main filter loop: read stdin one character at a time until end of
 * input and dispatch on each character.
 *
 *   - delimiters are echoed as they are;
 *   - a double quote starts a quoted word;
 *   - '#' starts a comment when ignore_comments is set;
 *   - any other printable character starts an ordinary word;
 *   - everything else is echoed as it is.
 */
static void untype()
{
    int ch;

    for (;;) {
	ch = getchar();
	if (ch == EOF) {
	    break;
	}

	if (ch == '"') {
	    collect_quoted();
	    continue;
	}

	switch (ch) {
	case ' ': case ',':
	case '\t': case '\n':
	case '|':
	case '<': case '>':
	case '[': case ']':
	case '{': case '}':
	    /* Delimiter: pass it straight through. */
	    outchar(ch);
	    continue;
	default:
	    break;
	}

	if (ignore_comments && ch == '#') {
	    comment(ch);
	} else if (isprint(ch)) {
	    collect(ch);
	} else {
	    outchar(ch);
	}
    }
}

/*
 * Entry point: parse the -c, -n, -p and -w options into the global
 * switches, then filter stdin to stdout.
 *
 * Returns 0 on success and 1 if writing to stdout failed.  An unknown
 * option prints the usage text and exits with status 1 via usage().
 */
int main(int argc, char **argv)
{
    int ch;

    /* loop over option-arguments */
    while ((ch = getopt(argc, argv, "cnpw")) != -1) {
	switch (ch) {
	    case 'c':
		ignore_comments = 0;
		break;
	    case 'n':
		strip_number = 0;
		break;
	    case 'p':
		strip_pos = 0;
		break;
	    case 'w':
		weak = 1;
		break;
	    case '?':
	    default:
		usage();
	}
    }
    /* Skip past the options; any remaining operands are not used. */
    argc -= optind;
    argv += optind;

    untype();

    /* Report write failures (full disk, closed pipe, ...) to the caller. */
    if (fflush(stdout) == EOF || ferror(stdout)) {
	fputs("untype: error writing output\n", stderr);
	return (1);
    }

    return (0);
}
