#!/usr/local/bin/perl


# Stefanie Bruninghaus 10/2001 

# usage: see Stats program, follows the convention of the embedding program 
# first parameter: infile, second parameter: outfile prefix string

$infile = $ARGV[0];
$outstring = $ARGV[1];

$unigram_outfile = ">>$outstring.unigrams";
$bigram_outfile = ">>$outstring.bigrams";
$words_outfile = ">>$outstring.words";

open BIGRAMOUTFILE, $bigram_outfile 
  or die "Can't open bigrams $bigram_outfile: $!\n";
open UNIGRAMOUTFILE, $unigram_outfile 
  or die "Can't open unigrams $unigram_outfile: $!\n";
open WORDSOUTFILE, $words_outfile 
  or die "Can't open words $words_outfile: $!\n";
open INFILE, $infile 
  or die "Can't open infile $infile: $!\n";

undef $\; # slurp in file at once - not pretty, but makes life easier


while (<INFILE>) {
    $text .= $_; 
}
close INFILE; 


$text =~ s/[()!<>=:;.,?0123456789]/ /g; # remove all non-letter characters, 
                                        # the next few lines are Emacs-perl-mode
                                        # artifacts
                                        # Notice that \w and \W aren't perfect,
                                        # since we also want to delete numbers, 
                                        # but keep whitespaces 
                       
$text =~ s/\-/ /g;                  
$text =~ s/\'/ /g;                      # remove the ' in special line 
                                        # because otherwise, formatting 
                                        # in Emacs is confused 
$text =~ s/\"//g;                       # dito 
$text =~ s:/: :g;                       # funny construct to remove Perl-
	                                # control character \

$text =~ s/\s/ /g;                      # replace whitespaces 
$text =~ s/ +/ /g;                      # collapse duplicate whitespace 
$text =~ s/^ +//;                       # remove any leading whitespace 
$text =~ s/ +$//;                       # remove any trailing whitespace 

#$text = lc($text);                     # uncomment this line to get everything
                                        # in lower case if wanted 

@words = split / /, $text;              # finally - split text into words! 

foreach $word (@words) {                # double function loop - get words file
                                        # and do unigram count in one go - not
                                        # good programming, but efficient. 
    unless ($word =~ m/^ +$/) {         # good habit - ditch blanks (not 
                                        # really needed, I guess)
	print WORDSOUTFILE "$word\n";
	$count{$word}++;
    }
}

@keys = sort { $count{$b} <=> $count{$a} } (keys %count);     
                                        # sort unigrams by count
# @keys = sort { $count{$a} cmp $count{$b} } (keys %count);   
                                        # sort unigrams alphabetically 

foreach $word (@keys) {
    print UNIGRAMOUTFILE "$count{$word} $word\n";
}


# Less elegant solution start: with first word alone as bigram 
#$prev = "";
#foreach $word (@words) {
#    $bigram = "$prev\t$word"; 
#    $bcount{$bigram}++;
#    $prev = $word; 
#}

# Much more beautiful
for ($i = 0; $i < $#words; $i++) {
    $bigram = "$words[$i]\t$words[$i+1]";   # following the original code, we 
                                            # define a bigram as the two words, 
                                            # glued together with a tab. 
    $bcount{$bigram}++; 
}
 

@bkeys = sort { $bcount{$b} <=> $bcount{$a} } (keys %bcount); 
                                            # sort bigrams by count
# @bkeys = sort { $bcount{$a} cmp $bcount{$b} } (keys %bcount); 
                                            # sort bigrmas alphabetically

foreach $bigram (@bkeys) {
    print BIGRAMOUTFILE "$bcount{$bigram} $bigram\n";
}
    
# good housekeeping ;-) 
close BIGRAMOUTFILE; 
close UNIGRAMOUTFILE; 
close WORDSOUTFILE;