#!/usr/local/bin/perl
use strict;
use FileHandle;

#-------------------------------------------------------------------------------
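# Synopsis:
#	analyse-keyword.pl [ -v ] [ -sl <stoplist-file> | -kl <keeplist-file> ]
#
# Description:
#	Dummy analyser. Document file names are read from standard input,
#	one per line. Each document is stripped of garbage, filtered against
#	the stop list or keep list (if one is given) and written to standard
#	output, followed by an end-of-output marker line (\001).
#
# Example:
#	echo analyse/in/doc1 | analyse-keyword.pl -sl stoplist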
#-------------------------------------------------------------------------------

# Parse the command line.
#
#   -v          switch on verbose output
#   -sl FILE    FILE holds the stop list
#   -kl FILE    FILE holds the keep list
#
# Any other argument, or a -sl/-kl without a following file name, ends the
# program through usage().

my $verbose   = 0;
my $in_dir    = "";	# not referenced by the code visible in this file
my $out_dir   = "";	# not referenced by the code visible in this file
my $stop_list = "";
my $keep_list = "";

while (@ARGV) {
    my $option = shift @ARGV;

    if ($option eq "-v") {
        $verbose = 1;
    }
    elsif ($option eq "-sl" || $option eq "-kl") {
        # Both options take exactly one file name.
        usage("file name expected") unless @ARGV;
        my $file_name = shift @ARGV;
        if ($option eq "-sl") { $stop_list = $file_name; }
        else                  { $keep_list = $file_name; }
    }
    else {
        usage("unrecognized parameter");
    }
}

# Load the optional stop list and keep list.  Both hashes map an entry to the
# number of times it occurs in its file; they stay empty when the matching
# option was not given.
my %stplist = load_word_list($stop_list);
my %kplist  = load_word_list($keep_list);

# load_word_list($list_file)
#
# Reads a word list, one entry per line, and returns it as a list of
# (entry => occurrence count) pairs suitable for assignment to a hash.
# Every line that contains a '#' anywhere is skipped.  The rest of the line,
# minus the trailing newline, is used verbatim as the entry.
#
# Returns an empty list when $list_file is the empty string.
# Dies (through check_file_exists) when the file does not exist, and dies
# when it cannot be opened, so that an unreadable list is never mistaken
# for an empty one.
sub load_word_list
{
    my ($list_file) = @_;
    my %count_of;
    return %count_of if $list_file eq "";

    check_file_exists($list_file);
    open(my $list_fh, '<', $list_file)
        or die "analyse: cannot open `${list_file}': $!\n";
    while (my $entry = <$list_fh>) {
        next if $entry =~ /#/;
        chomp $entry;
        $count_of{$entry}++;
    }
    close $list_fh;

    return %count_of;
}
# Main loop: every input line names one document, which is handed to
# analyse_document().  The argument loop above leaves @ARGV empty, so <>
# reads the names from standard input.

my (@in, @files, @in_dirs, @out_dirs);	# not referenced by the code in view

# STDOUT is not put into autoflush mode here; analyse_document() flushes it
# once per document.
while (defined(my $docname = <>)) {
    chomp $docname;
    analyse_document($docname);
}

# Sub-routines

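# YOU CAN REPLACE THE FOLLOWING FUNCTION BY A CALL TO AN EXTERNAL PROGRAM.
# The function is called with a single argument, the document file name, and
# writes its result to standard output (the current implementation also ends
# each document's output with a \001 line).
#
# sub analyse_document
# { my($in) = @_;
#   system "analyse $in";
#or system "analyse <$in";
# }
#
#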

# analyse_document($docname)
#
# Reads the document $docname line by line.  For every input line one output
# line is printed to STDOUT: the selected keywords, each followed by a single
# space, then a newline.  After the last line the end-of-output marker
# "\1\n" is printed and STDOUT is flushed.
#
# File-level variables read: $verbose, $keep_list, %kplist, %stplist.
# Dies with "$docname: $!" when the document cannot be opened.
sub analyse_document
{
    my ($docname) = @_;

    # The former two-argument open() ignored whitespace around the name but
    # also interpreted special forms such as "&..." or "-".  Trim explicitly
    # and use the three-argument form so the name is always taken literally.
    (my $path = $docname) =~ s/^\s+|\s+$//g;
    open(my $infile, '<', $path) || die "$docname: $!";

    while (my $line = <$infile>) {
        my @keywords = _select_keywords(_normalise_line($line));
        print "$_ " for @keywords;
        print "\n";
    }
    close $infile;

    print "\1\n";
    STDOUT->flush();	# push the marker out to the reader immediately
}

# _normalise_line($line)
#
# Returns $line reduced to lower-case ASCII letters separated by single
# spaces.  In verbose mode the intermediate states are printed to STDOUT.
sub _normalise_line
{
    my ($line) = @_;

    # Alias $_ to the private copy: the substitutions stay short and the
    # caller's $_ is left untouched.
    for ($line) {
        print if $verbose;

        # Remove the line if it contains the class name.
        # NOTE(review): the pattern needs exactly one character between the
        # final '>' and the end of the line (a carriage return, say); a bare
        # "<CLASS ...>\n" is not removed.  Presumably the input has such a
        # character -- confirm against real documents.
        s/^<CLASS.*>.$//;
        # remove end-of-output markers from input file
        s/\001//g;

        # remove html encoded orthographic marks (warning: incomplete)
        s/&([a-z])acute;/$1/g;
        s/&([a-z])cedil;/$1/g;
        s/&([a-z])circ;/$1/g;
        s/&([a-z])grave;/$1/g;
        s/&([a-z])tilde;/$1/g;
        s/&([a-z])uml;/$1/g;

        print if $verbose;

        # suppress accents in extended ascii
        s/[\300-\305\340-\345]/a/g;
        s/[\306\346]/ae/g;
        s/[\307\347]/c/g;
        s/[\310-\313\350-\353]/e/g;
        if ($verbose) {
            print;
            # Looks like leftover debugging output; kept so that verbose
            # output stays unchanged.
            print "\350\351\352\353\n";
        }
        s/[\314-\317\354-\357]/i/g;
        s/[\320\360]/dh/g;
        s/[\321\361]/n/g;
        s/[\327\367]/ /g;
        s/[\322-\326\330\362-\366\370]/o/g;
        s/[\331-\334\371-\374]/u/g;
        s/[\335\375\377]/y/g;
        s/[\336\376]/th/g;
        s/\337/ss/g;

        tr/A-Z/a-z/;		# convert upper to lower case
        tr/a-z / /cs;		# convert non alphas to single space
        s/  */ /g;
    }

    return $line;
}

# _select_keywords($line)
#
# Returns the words of $line, in order, that pass the active filter: with a
# keep list only words found in %kplist, otherwise every word that is not in
# %stplist.
sub _select_keywords
{
    my ($line) = @_;

    # One pass over the line instead of repeatedly cutting off its first word.
    my @words = $line =~ /\w+/g;

    return grep { $kplist{$_} } @words if $keep_list ne "";
    return grep { !$stplist{$_} } @words;
}

# check_file_exists($file)
#
# Dies with an "analyse: file ... does not exist" message unless $file is a
# plain file (-f test); otherwise simply returns.
sub check_file_exists
{
    my $path = shift;
    return if -f $path;
    die "analyse: file `$path' does not exist\n";
}

# check_dir_exists($dir)
#
# Dies with an "analyse: directory ... does not exist" message unless $dir is
# a directory (-d test); otherwise simply returns.
sub check_dir_exists
{
    my $path = shift;
    die "analyse: directory `$path' does not exist\n" unless -d $path;
}

# usage(@reason)
#
# Prints "error: <reason>" followed by the command-line synopsis to STDERR
# and terminates the program with exit status 1.  Does not return.
sub usage
{
    my @reason = @_;
    print STDERR "error: @reason\n",
                 "Usage: analyse-keyword.pl [ -v ] \n",
                 "              [ -sl stoplist_file | -kl keeplist_file ]\n";
    exit 1;
}
