#!/bin/csh
#
#
#  Usage: Stats input-file output-file-prefix
#
#  Code for identifying word-word collocations
#  based on Ken Church's 1995 'NGRAMS' tutorial
#  and extended to use the likelihood ratio as
#  an alternative (and generally better) measure
#  of association than mutual information.
#
#  Author:  Philip Resnik
#
# changes made by Steffi Bruninghaus, 10/2001 to make this code 
# run under Solaris 2.6 


# ---------------------------------------------------------------------
# Main driver.
#
# Arguments:
#   $1  input-file          file handed to count-unigrams.pl
#   $2  output-file-prefix  prefix for every file this script writes
#
# Files written by this script itself:
#   <prefix>.mi         output of ./mutual_info
#   <prefix>.lr.values  output of ./lr_simple (fed six arguments per call)
#   <prefix>.lr         .lr.values pasted next to .mi, sorted numerically
#                       in descending order
#
# Exit status: 1 on a usage error; otherwise the status of the last
# pipeline.  The helper programs' exit codes are intentionally not checked,
# since their conventions are not visible from this script.
# ---------------------------------------------------------------------

# Require a non-empty second argument and no third one.  The arguments are
# double-quoted so that values containing blanks or glob characters still
# form a valid csh expression.
if ("$2" == "" || "$3" != "") then
  echo "Usage: $0 input-file output-file-prefix"
  # Report the bad invocation to the caller instead of exiting with 0.
  exit 1
endif

# Quote on assignment so each value stays a single word.
set INFILE = "$1"
set OUTPREFIX = "$2"

echo "Processing file $INFILE"

# Earlier counting steps, kept for reference; count-unigrams.pl below is
# run in their place.
#  echo "Getting unigram counts: see $OUTPREFIX.unigrams"
#  /bin/rm -f $OUTPREFIX.unigrams
#  ./count_words < $INFILE > $OUTPREFIX.unigrams

#  echo "Getting bigram counts: see $OUTPREFIX.bigrams"
#  /bin/rm -f $OUTPREFIX.bigrams
#  ./count_bigrams $INFILE $OUTPREFIX > $OUTPREFIX.bigrams

# Remove stale count files, then regenerate them.
# NOTE(review): count-unigrams.pl presumably writes the .words, .bigrams
# and .unigrams files for this prefix -- confirm against that script.
echo "Steffi is computing bigram and unigram counts"
/bin/rm -f "${OUTPREFIX}.words"
/bin/rm -f "${OUTPREFIX}.bigrams"
/bin/rm -f "${OUTPREFIX}.unigrams"
./count-unigrams.pl "$INFILE" "$OUTPREFIX"
echo "Returning to the original code"

# Mutual information.  The old result is removed first so the ">" redirect
# below cannot be refused when the user's csh has noclobber set.
echo "Computing bigram mutual information: see $OUTPREFIX.mi"
/bin/rm -f "${OUTPREFIX}.mi"
./mutual_info "$OUTPREFIX" > "${OUTPREFIX}.mi"

# Likelihood ratio.  lr_filter.pl receives the line count of the .words
# file as its only argument and reads the .mi file on stdin; xargs then
# hands its output to lr_simple six arguments at a time.  The backquoted
# count is left unquoted on purpose so any padding around wc's number is
# dropped by word splitting.
echo "Computing likelihood ratio for bigrams: see $OUTPREFIX.lr"
/bin/rm -f "${OUTPREFIX}.lr.values" "${OUTPREFIX}.lr"
./lr_filter.pl `cat "${OUTPREFIX}.words" | wc -l` < "${OUTPREFIX}.mi" \
  | xargs -n 6 ./lr_simple > "${OUTPREFIX}.lr.values"

# Put each likelihood-ratio value in front of its .mi line and rank the
# result with the largest values first.
paste "${OUTPREFIX}.lr.values" "${OUTPREFIX}.mi" | sort -nr > "${OUTPREFIX}.lr"