# Call the program like so:
#   cat inputfile | perl transform.pl > outputfile
#
# Rewrites part-of-speech tags in the input (the source tag names match the
# Penn Treebank tagset) into morpheme labels:
#   DT -> ART, VBZ -> 3PR, NNS -> PL, VBG -> PROG,
#   and the tag on a form of "be" -> COP or AUXBE depending on the next line.
#
# NOTE(review): the rules assume one token per line with the word as the
# first whitespace-separated field and its tag after it -- confirm against
# the real input files.

use strict;
use warnings;

# Split a line on whitespace and return the resulting fields.
# In scalar context the number of fields is returned.
# Not called anywhere else in this file; kept for compatibility.
sub get_word_tag {
    my ($line) = @_;
    my @fields = split(/\s+/, $line);
    return @fields;
}

# Return 1 if the first whitespace-separated field of the given line contains
# a form of the verb "be" (case-sensitive), otherwise 0.
# An undefined or empty line yields 0.
sub is_be {
    my ($line) = @_;
    return 0 unless defined $line;

    my ($word) = split(/\s+/, $line);
    return 0 unless defined $word;

    return $word =~ /\b(be|is|am|are|was|were|being|been)\b/ ? 1 : 0;
}

# Return the rewritten form of $line.
#
#   $line      - the raw (not yet rewritten) input line to convert
#   $next_line - the raw input line that follows it, or undef at end of input
#
# The copula/auxiliary decision has to be made on the raw text of both lines:
# it needs the original VB* tag on the "be" line (before VBZ becomes 3PR) and
# the original tag on the following line (before NNS becomes PL).
sub retag_line {
    my ($line, $next_line) = @_;

    # 6. Auxiliary becomes AUXBE & 7. Copula becomes COP
    if (defined $next_line && is_be($line)) {
        if ($next_line =~ /\b(?:NN|NNS|NNP|NNPS|JJ|JJR|JJS)\b/) {
            # "be" followed by a noun or adjective: copula.
            $line =~ s/\bVB[DGNPZ]?\b/COP/g;
        }
        elsif ($next_line =~ /\b(?:VBG|VBN)\b/) {
            # "be" followed by a participle: auxiliary.
            $line =~ s/\bVB[DGNPZ]?\b/AUXBE/g;
        }
    }

    # 1. DT becomes ART
    $line =~ s/\bDT\b/ART/g;

    # 2. POS stays as POS

    # 3. VBZ becomes 3PR
    $line =~ s/\bVBZ\b/3PR/g;

    # 4. irregular past tense (IRPST) needs to be hand-checked

    # 5. NNS becomes PL
    $line =~ s/\bNNS\b/PL/g;

    # 8. VBG becomes PROG
    $line =~ s/\bVBG\b/PROG/g;

    return $line;
}

# Read lines from the files named on the command line (or STDIN), rewrite
# them, and print them to STDOUT in their original order.
#
# Output runs one line behind the input: a line can only be finalised once
# the line after it has been read, because the COP/AUXBE choice looks ahead.
sub transform_stream {
    my $pending;

    while (my $line = <>) {
        print retag_line($pending, $line) if defined $pending;
        $pending = $line;
    }

    # Flush the last line; there is nothing after it to look ahead to.
    print retag_line($pending, undef) if defined $pending;

    return;
}

# Run the filter when executed as a script; stay quiet when loaded via
# require/do so the subs can be exercised on their own.
transform_stream() unless caller;