#!/usr/bin/perl
#
# Extracts aligned English/German text units from the Europarl en-de corpus.
#
# A "unit" is the run of lines that starts at a structural mark-up line
# (<CHAPTER ...>, <SPEAKER ...> or <P>) and ends just before the next one.
# Units are numbered from 1 within each file; unit 1 is whatever precedes the
# first mark-up line (possibly empty).  Each unit is identified by the key
# "FILENAME=UNITNUM".
#
# 1) Every English unit containing the target word is stored, with the word
#    highlighted.
# 2) The German unit with the same key is stored for each stored English unit.
# 3) Pairs present in both languages are written to en_output.txt and
#    de_output.txt (in the directory the script is started from), each entry
#    as "KEY\nTEXT\n".
#
# Usage: perl this_script.pl [CORPUS_BASE_DIR]

use strict;
use warnings;

##########
# Configuration

# Directory holding the "en" and "de" sub-directories.
# If you've mapped a network drive, the default below applies.
# If you've logged in to jones, pass this path as the first argument instead:
#   /Volumes/Data/Corpora/multilingual/europarl/aligned/en-de
my $DEFAULT_CORPUS_DIR = '/Volumes/Corpora/multilingual/europarl/aligned/en-de';

# For time reasons, only a subset of the corpus files is examined.
my $FILE_FILTER = qr/ep-00/;

# The word we are looking for (whole word, case-insensitive).
my $TARGET_WORD = 'one';
my $TARGET_RE   = qr/\b(\Q$TARGET_WORD\E)\b/i;

# Markers wrapped around each occurrence of the target word in the English
# output.  NOTE(review): the original replacement text was just "$1", i.e. a
# no-op that contradicted its own comment; the exact mark-up that was
# intended is unknown, so these markers are a deliberate, adjustable choice.
my $HIGHLIGHT_OPEN  = '**';
my $HIGHLIGHT_CLOSE = '**';

my $EN_OUTPUT = 'en_output.txt';
my $DE_OUTPUT = 'de_output.txt';

##########
# new_unit($line)
#
# Returns 1 if $line contains a structural mark-up tag that starts a new unit
# of text (<CHAPTER, <SPEAKER or <P>), 0 otherwise.  When called without an
# argument it inspects $_, which is what older call sites relied on.
sub new_unit {
    my $line = @_ ? $_[0] : $_;
    return 0 unless defined $line;
    return $line =~ /<(?:CHAPTER|SPEAKER|P>)/ ? 1 : 0;
}

##########
# unit_key($filename, $unitnum)
#
# Builds the hash key that identifies one unit of one file.
sub unit_key {
    my ($filename, $unitnum) = @_;
    return $filename . '=' . $unitnum;
}

##########
# highlight_target($text)
#
# Returns a copy of $text in which every whole-word occurrence of the target
# word is wrapped in the highlight markers.
sub highlight_target {
    my ($text) = @_;
    $text =~ s/$TARGET_RE/$HIGHLIGHT_OPEN$1$HIGHLIGHT_CLOSE/g;
    return $text;
}

##########
# read_units($fh)
#
# Reads all lines from the open filehandle $fh and returns the list of units.
# Element 0 is unit number 1, element 1 is unit number 2, and so on.  The
# mark-up line that opens a unit is kept as the first line of that unit.
# Because units are collected into a list, the final unit of the input is
# returned too (the original loop only stored a unit when the *next* mark-up
# line appeared, so the last unit of each file was lost).
sub read_units {
    my ($fh) = @_;

    my @units = ('');    # unit 1: text before the first mark-up line
    while (my $line = <$fh>) {
        if (new_unit($line)) {
            push @units, $line;
        }
        else {
            $units[-1] .= $line;
        }
    }
    return @units;
}

##########
# read_units_from_file($path)
#
# Opens $path for reading and returns its units (see read_units).
# Dies if the file cannot be opened.
sub read_units_from_file {
    my ($path) = @_;

    open my $fh, '<', $path or die "Cannot open '$path' for reading: $!\n";
    my @units = read_units($fh);
    close $fh;
    return @units;
}

##########
# corpus_files($dir)
#
# Returns the sorted names (without directory) of the plain files in $dir
# that match the file filter.  Hidden files are skipped.
# Dies if the directory cannot be read.
sub corpus_files {
    my ($dir) = @_;

    opendir my $dh, $dir or die "Cannot open directory '$dir': $!\n";
    my @names = sort
                grep { !/^\./ && /$FILE_FILTER/ && -f "$dir/$_" }
                readdir $dh;
    closedir $dh;
    return @names;
}

##########
# collect_english_units($dir)
#
# 1) Processing the English corpus.
# Returns a hash reference mapping unit key => highlighted unit text for
# every unit in $dir that contains the target word.
sub collect_english_units {
    my ($dir) = @_;

    my %text_for;
    for my $filename (corpus_files($dir)) {
        print "Now processing: $filename (English)\n";

        my @units = read_units_from_file("$dir/$filename");
        for my $index (0 .. $#units) {
            # The criterion for storing a unit is that it contains the word
            # we're looking for.
            next unless $units[$index] =~ $TARGET_RE;
            $text_for{ unit_key($filename, $index + 1) }
                = highlight_target($units[$index]);
        }
    }
    return \%text_for;
}

##########
# collect_german_units($dir, $english_for)
#
# 2) Processing the German corpus.
# Returns a hash reference mapping unit key => unit text for every unit in
# $dir whose key is present in the hash referenced by $english_for.
sub collect_german_units {
    my ($dir, $english_for) = @_;

    my %text_for;
    for my $filename (corpus_files($dir)) {
        print "Now processing: $filename (German)\n";

        my @units = read_units_from_file("$dir/$filename");
        for my $index (0 .. $#units) {
            my $key = unit_key($filename, $index + 1);
            # The criterion for storing a unit is that the corresponding
            # English unit had the word we're looking for.
            $text_for{$key} = $units[$index] if exists $english_for->{$key};
        }
    }
    return \%text_for;
}

##########
# write_aligned_output($english_for, $german_for)
#
# Writes every key present in both hashes to the two output files, in the
# same order in each: sorted by file name, then numerically by unit number,
# so that repeated runs produce identical files.
sub write_aligned_output {
    my ($english_for, $german_for) = @_;

    my @keys = map  { $_->[0] }
               sort { $a->[1] cmp $b->[1] or $a->[2] <=> $b->[2] }
               map  { my ($file, $num) = /\A(.*)=(\d+)\z/s; [ $_, $file, $num ] }
               grep { exists $german_for->{$_} }
               keys %{$english_for};

    open my $en_out, '>', $EN_OUTPUT
        or die "Cannot open '$EN_OUTPUT' for writing: $!\n";
    open my $de_out, '>', $DE_OUTPUT
        or die "Cannot open '$DE_OUTPUT' for writing: $!\n";

    for my $key (@keys) {
        print {$en_out} "$key\n$english_for->{$key}\n";
        print {$de_out} "$key\n$german_for->{$key}\n";
    }

    # Buffered write errors only surface at close, so check it.
    close $en_out or die "Error closing '$EN_OUTPUT': $!\n";
    close $de_out or die "Error closing '$DE_OUTPUT': $!\n";
    return;
}

##########
# main([$corpus_dir])
#
# Runs the three passes.  $corpus_dir defaults to $DEFAULT_CORPUS_DIR.
sub main {
    my ($corpus_dir) = @_;
    $corpus_dir = $DEFAULT_CORPUS_DIR unless defined $corpus_dir;

    my $english_for = collect_english_units("$corpus_dir/en");
    my $german_for  = collect_german_units("$corpus_dir/de", $english_for);
    write_aligned_output($english_for, $german_for);
    return;
}

# Run only when executed as a script, so the helpers can be loaded on their
# own (e.g. with "require") without touching the corpus.
main(@ARGV) unless caller;

1;