#!/bin/csh
#
# Usage: Stats input-file output-file-prefix
#
# Code for identifying word-word collocations
# based on Ken Church's 1995 'NGRAMS' tutorial
# and extended to use the likelihood ratio as
# an alternative (and generally better) measure
# of association than mutual information.
#
# Author: Philip Resnik
#
# Changes made by Steffi Bruninghaus, 10/2001, to make this code
# run under Solaris 2.6.
#
# Files written (all named after the output-file-prefix argument):
#   <prefix>.words, <prefix>.unigrams, <prefix>.bigrams
#                        removed here, then left to ./count-unigrams.pl
#                        (presumably it recreates them -- confirm against
#                        that script)
#   <prefix>.mi          standard output of ./mutual_info
#   <prefix>.lr.values   standard output of the ./lr_simple invocations
#   <prefix>.lr          <prefix>.lr.values pasted next to <prefix>.mi and
#                        sorted numerically in descending order
#
# The helper programs are invoked relative to the current directory (./),
# so the script has to be started from the directory that contains them.

# Exactly two arguments are required.  Counting them with $#argv avoids
# comparing unquoted (possibly empty) positional parameters inside a csh
# expression.
if ($#argv != 2) then
    echo "Usage: $0 input-file output-file-prefix"
    exit 1
endif

set INFILE = "$1"
set OUTPREFIX = "$2"

echo "Processing file $INFILE"

# Original counting steps, superseded by count-unigrams.pl below; kept for
# reference.
#  echo "Getting unigram counts: see $OUTPREFIX.unigrams"
#  /bin/rm -f $OUTPREFIX.unigrams
#  ./count_words < $INFILE > $OUTPREFIX.unigrams
#  echo "Getting bigram counts: see $OUTPREFIX.bigrams"
#  /bin/rm -f $OUTPREFIX.bigrams
#  ./count_bigrams $INFILE $OUTPREFIX > $OUTPREFIX.bigrams

echo "Steffi is computing bigram and unigram counts"
# Clear results of any previous run before recounting.
/bin/rm -f "${OUTPREFIX}.words" "${OUTPREFIX}.bigrams" "${OUTPREFIX}.unigrams"
./count-unigrams.pl "$INFILE" "$OUTPREFIX"
echo "Returning to the original code"

echo "Computing bigram mutual information: see $OUTPREFIX.mi"
/bin/rm -f "${OUTPREFIX}.mi"
./mutual_info "$OUTPREFIX" > "${OUTPREFIX}.mi"

echo "Computing likelihood ratio for bigrams: see $OUTPREFIX.lr"
/bin/rm -f "${OUTPREFIX}.lr.values" "${OUTPREFIX}.lr"

# The number of lines in <prefix>.words is handed to lr_filter.pl as its only
# argument.  Reading the file through a redirection makes wc print the bare
# count without a file name.
set NWORDS = `wc -l < "${OUTPREFIX}.words"`

# lr_filter.pl reads the mutual-information table on standard input; xargs
# then starts ./lr_simple with at most six of the emitted tokens per call.
./lr_filter.pl $NWORDS < "${OUTPREFIX}.mi" \
    | xargs -n 6 ./lr_simple > "${OUTPREFIX}.lr.values"

# Put each likelihood-ratio value in front of its mutual-information line and
# order the result by that leading value, largest first.
paste "${OUTPREFIX}.lr.values" "${OUTPREFIX}.mi" | sort -nr > "${OUTPREFIX}.lr"