Description later. Check svn for current version. Should be refactored to multiple files.

   1 
   2 #!/usr/bin/env python2.5
   3 from copy import deepcopy
   4 ## utilities ##
   5 def partition(f, l):
   6     good = []; bad = []
   7     for x in l:
   8         if f(x):
   9             good.append(x)
  10         else:
  11             bad.append(x)
  12     return good, bad
  13 ## grammars ##
  14 def parse_grammar(g):
  15     """Ironically, it would be a good idea to have a Real Parser here"""
  16     g2 = {}
  17     for sym,rule in g.items():
  18         alternatives = map(str.strip, rule.split("|"))
  19         g2[sym] = map(str.split, alternatives)
  20     return g2
  21 def issymbol(s):
  22     return s[0].isupper()
  23 def isliteral(s):
  24     return not issymbol(s)
  25 def isempty(s): #Some better representation is needed.
  26     return s==["e"]
  27 ## Convert to CNF (should probably be in cnf.py) ###
  28 def cnf(g, start="S"):
  29     remove_empty(g)
  30     remove_unit(g)
  31     remove_non_productive(g)
  32     remove_non_reachable(g, start)
  33     normalise(g)
  34 
  35 def remove_empty(g):
  36     """Right now, this only works if
  37     (1) the e production is the only thing in its alternate
  38     (2) None of the symbols are the empty string.
  39       But you'd never do a thing like that."""
  40     while True:
  41         empty_sym = find_empty(g)
  42         if empty_sym:
  43             inline_empty(g, empty_sym)
  44         else:
  45             break
  46     return g # only for debugging, this is a destructive operation
  47 def find_empty(g):
  48     for sym,rule in g.items():
  49         for alt in rule:
  50             if isempty(alt):
  51                 return sym
  52     return None
  53 def contains_symbol(sym):
  54     "Check to see if some alternate contains a symbol"
  55     def check_alt(alt):
  56         return sym in alt
  57     return check_alt
  58 def inline_empty(g, empty_sym):
  59     g[empty_sym].remove(["e"])
  60     for sym,rule in g.items():
  61         needs_fixed, ok = partition(contains_symbol(empty_sym), rule)
  62         g[sym] = ok + inline_rule(needs_fixed, empty_sym)
  63     remove_empty_rule(g, empty_sym)
  64 def inline_rule(needs_fixed, empty_sym):
  65     acc = []
  66     for alt in needs_fixed:
  67         for i,sym in enumerate(alt):
  68             if sym==empty_sym:
  69                 if alt not in acc: acc.append(alt)
  70                 copy = list(alt)
  71                 del copy[i]
  72                 if copy != [] and copy not in acc: acc.append(copy)
  73     return acc
  74 def remove_empty_rule(g, empty_sym):
  75     if g[empty_sym]==[]: # truly empty
  76         del g[empty_sym]
  77         for sym,rule in g.items():
  78             acc = []
  79             for alt in rule:
  80                 # propogate true empties upward
  81                 if alt==[empty_sym]:
  82                     acc.append(["e"])
  83                 # otherwise only keep alts that don't use the empty symbol
  84                 elif empty_sym not in alt:
  85                     acc.append(alt)
  86             g[sym] = acc
  87 
  88 def remove_unit(g):
  89     # TODO: Remember to check for rules like A -> A
  90     while True:
  91         unit_symbol = find_unit(g)
  92         if unit_symbol:
  93             inline_unit(g, *unit_symbol)
  94         else:
  95             break
  96     return g # only for debugging, this is a destructive operation
  97 def find_unit(g):
  98     for sym,rule in g.items():
  99         for alt in rule:
 100             if isunit(alt):
 101                 return sym, alt[0]
 102     return None
 103 def inline_unit(g, unit_sym, replacement_sym):
 104     g[unit_sym].remove([replacement_sym])
 105     g[unit_sym].extend(g[replacement_sym])
 106 def isunit(alt):
 107     return len(alt)==1 and issymbol(alt[0])
 108 
 109 def remove_non_productive(g):
 110     productive = set()
 111     previous_size = -1
 112     while len(productive) > previous_size:
 113         previous_size = len(productive)
 114         mark_productives(g, productive)
 115     subset_grammar(g, productive)
 116 def subset_grammar(g, subset):
 117     for remove in set(g) - subset:
 118         del g[remove]
 119 def mark_productives(g, productive):
 120     for sym,rule in g.items():
 121         for alt in rule:
 122             if any(isliteral(term) or term in productive for term in alt):
 123                 productive.add(sym)
 124                 break
 125 
 126 def remove_non_reachable(g, start):
 127     reachable = set([start])
 128     def traverse(sym):
 129         for alt in g[sym]:
 130             for term in alt:
 131                 if term not in reachable and issymbol(term):
 132                     reachable.add(term)
 133                     traverse(sym)
 134     traverse(start)
 135     subset_grammar(g, reachable)
 136 
 137 def normalise(g):
 138     #newrules = []
 139     for sym,rule in g.items():
 140         for alt in rule:
 141             if len(alt) > 2:
 142                 new = make_new_rules(sym, alt)
 143                 g[sym].remove(alt)
 144                 g[sym] += new[0][1] # the first alternate
 145                 g.update(new[1:])
 146                 #newrules.extend(new[1:])
 147     #print newrules
 148     #g.update(newrules)
 149     #print newrules[0][0] in g
 150     #return g
 151 def make_new_rules(sym, alt):
 152     newnames = [new_rule_name(sym, alt[0], i) for i in range(len(alt) - 2)]
 153     return [(sym,[[term,next]]) for sym,term,next # sym -> term next
 154             in zip([sym] + newnames, alt, newnames + [alt[-1]])]
 155 def new_rule_name(original, *extras):
 156     return original + '.' + '.'.join(map(str, extras))
 157 ## test grammars ##
 158 math = parse_grammar(dict(S = "Expr",
 159                          Expr = "Expr + Term | Term",
 160                          Term = "Term x Factor | Factor",
 161                          Factor = "( Expr ) | i"))
 162 numbers = parse_grammar(dict(Number = "Integer | Real",
 163                              Integer = "Digit | Integer Digit",
 164                              Real = "Integer Fraction Scale",
 165                              Fraction = ". Integer",
 166                              Scale = "x Sign Integer | Empty",
 167                              Digit = "0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9",
 168                              Empty = "e",
 169                              Sign = "+ | -"))
 170 tricky = parse_grammar(dict(S = "L a M",
 171                             L = "L M | e",
 172                             M = "M M | e"))
 173 cyclic = parse_grammar(dict(A = "B",
 174                             B = "C | b",
 175                             C = "A"))
 176 cyclic_only = parse_grammar(dict(A = "B", # this grammar has no terminals
 177                                  B = "C",
 178                                  C = "A"))
 179 useless = parse_grammar(dict(A = "B | foo",
 180                              B = "bar",
 181                              C = "baz",
 182                              Z = "qux"))
 183 pretty_flat = parse_grammar(dict(A = "B | C | a",
 184                                  B = "A A B C",
 185                                  C = "B B B"))
 186 tricky_no_empty = deepcopy(tricky)
 187 remove_empty(tricky_no_empty)
 188 num = deepcopy(numbers)
 189 # cnf(num, 'Number')

ChomskyNormaliser (last edited 2009-02-18 16:47:02 by NathanSanders)