#!/usr/local/bin/perl # Stefanie Bruninghaus 10/2001 # usage: see Stats program, follows the convention of the embedding program # first parameter: infile, second parameter: outfile prefix string $infile = $ARGV[0]; $outstring = $ARGV[1]; $unigram_outfile = ">>$outstring.unigrams"; $bigram_outfile = ">>$outstring.bigrams"; $words_outfile = ">>$outstring.words"; open BIGRAMOUTFILE, $bigram_outfile or die "Can't open bigrams $bigram_outfile: $!\n"; open UNIGRAMOUTFILE, $unigram_outfile or die "Can't open unigrams $unigram_outfile: $!\n"; open WORDSOUTFILE, $words_outfile or die "Can't open words $words_outfile: $!\n"; open INFILE, $infile or die "Can't open infile $infile: $!\n"; undef $\; # slurp in file at once - not pretty, but makes life easier while () { $text .= $_; } close INFILE; $text =~ s/[()!<>=:;.,?0123456789]/ /g; # remove all non-letter characters, # the next few lines are Emacs-perl-mode # artifacts # Notice that \w and \W aren't perfect, # since we also want to delete numbers, # but keep whitespaces $text =~ s/\-/ /g; $text =~ s/\'/ /g; # remove the ' in special line # because otherwise, formatting # in Emacs is confused $text =~ s/\"//g; # dito $text =~ s:/: :g; # funny construct to remove Perl- # control character \ $text =~ s/\s/ /g; # replace whitespaces $text =~ s/ +/ /g; # collapse duplicate whitespace $text =~ s/^ +//; # remove any leading whitespace $text =~ s/ +$//; # remove any trailing whitespace #$text = lc($text); # uncomment this line to get everything # in lower case if wanted @words = split / /, $text; # finally - split text into words! foreach $word (@words) { # double function loop - get words file # and do unigram count in one go - not # good programming, but efficient. unless ($word =~ m/^ +$/) { # good habit - ditch blanks (not # really needed, I guess) print WORDSOUTFILE "$word\n"; $count{$word}++; } } @keys = sort { $count{$b} <=> $count{$a} } (keys %count); # sort unigrams by count # @keys = sort { $count{$a} cmp $count{$b} } (keys %count); # sort unigrams alphabetically foreach $word (@keys) { print UNIGRAMOUTFILE "$count{$word} $word\n"; } # Less elegant solution start: with first word alone as bigram #$prev = ""; #foreach $word (@words) { # $bigram = "$prev\t$word"; # $bcount{$bigram}++; # $prev = $word; #} # Much more beautiful for ($i = 0; $i < $#words; $i++) { $bigram = "$words[$i]\t$words[$i+1]"; # following the original code, we # define a bigram as the two words, # glued together with a tab. $bcount{$bigram}++; } @bkeys = sort { $bcount{$b} <=> $bcount{$a} } (keys %bcount); # sort bigrams by count # @bkeys = sort { $bcount{$a} cmp $bcount{$b} } (keys %bcount); # sort bigrmas alphabetically foreach $bigram (@bkeys) { print BIGRAMOUTFILE "$bcount{$bigram} $bigram\n"; } # good housekeeping ;-) close BIGRAMOUTFILE; close UNIGRAMOUTFILE; close WORDSOUTFILE;