#!/usr/bin/env perl # good way to call this program: # perl collocations.pl | sort -nr | more # input format is one word per line $file_name = "/Volumes/Corpora/en/brown-10percent/tnt-format/TNT-brown-1col_noPOS.txt"; # open file open FILE, '<', $file_name or die "Couldn't open input file"; sub pmi { # pmi function calculates the pointwise mutual information # unpack the argument list: $w1_count = $_[0]; $w2_count = $_[1]; $bigram_count = $_[2]; $n = $_[3]; # calculate the pmi: return log $n*($bigram_count/($w1_count*$w2_count)); } # initialize dictionaries to store unigrams & bigrams %Unigrams = {}; %Bigrams = {}; # read each line in the file # declare a dummy start token and define $n, which will store the # corpus length $previous = 'START'; $n = 0; while () { # remove trailing newline chomp; $word = $_; ++$n; # add word and bigram (=previous+word) to appropriate dictionaries $Unigrams{$word}++; if ($previous ne 'START') { $Bigrams{$previous.' '.$word}++; } # re-set $previous variable $previous = $word; } # iterate over each bigram in Bigrams foreach $bigram (keys %Bigrams) { # get appropriate counts and individual words $bigram_count = "$Bigrams{$bigram}"; @wordlist = split(/\s+/,$bigram); $word1 = $wordlist[0]; $word2 = $wordlist[1]; # get the pmi score $w1_count = $Unigrams{$word1}; $w2_count = $Unigrams{$word2}; # check to make sure we won't have division by zero: if ($w1_count != 0 && $w2_count != 0) { $score = &pmi($w1_count,$w2_count,$bigram_count,$n); # can remove the 'sweet' condition to check *all* words ... # could also change condition to check that bigram occurs, e.g., # more than 5 times if ($word1 eq 'sweet') { # print out all values print "$score\t$bigram\t$bigram_count\t$w1_count\t$w2_count\n"; } } }