import re

# Run of non-word characters at the very start of a string.
_LEADING_NONWORD = re.compile(r"\W+")
# Run of non-word characters at the very end of a string.
_TRAILING_NONWORD = re.compile(r"\W+$")


def tokenize(line):
    """Split a line of text into word, punctuation and clitic tokens.

    The line is first split on whitespace.  For each resulting item:

    * every leading non-word character (``\\W``) becomes its own token;
    * every trailing non-word character becomes its own token, emitted
      after the word and in the same left-to-right order as in the input;
    * the remaining core is split on contractions:
        - ``can't`` (any casing)      -> ``can``, ``n't``
        - other words containing n't  -> everything but the last three
          characters, then the last three characters (``don't`` ->
          ``do``, ``n't``)
        - any other apostrophe        -> the text before the first
          apostrophe, then the apostrophe plus everything after it
          (``she's`` -> ``she``, ``'s``)
        - otherwise the core is a single token.

    Items that consist solely of non-word characters produce only
    punctuation tokens (no empty-string token).

    Args:
        line: The text to tokenize.

    Returns:
        A list of token strings in input order.
    """
    tokens = []
    for item in line.split():
        # Peel punctuation off the front; each character is its own token.
        leading = _LEADING_NONWORD.match(item)
        if leading:
            tokens.extend(leading.group())
            item = item[leading.end():]

        # Peel punctuation off the back, but hold it until the word itself
        # has been emitted so the overall token order matches the input.
        trailing = []
        tail = _TRAILING_NONWORD.search(item)
        if tail:
            trailing = list(tail.group())
            item = item[:tail.start()]

        if not item:
            # The item was pure punctuation; everything was already emitted
            # by the leading pass, so there is no word to add.
            continue

        if item.lower() == "can't":
            # Special case: the generic rule would yield "ca" + "n't".
            # The "n" is intentionally shared by both tokens.
            tokens.append(item[:3])
            tokens.append(item[2:])
        elif "n't" in item:
            # Split three characters from the end ("n't" in the common case).
            tokens.append(item[:-3])
            tokens.append(item[-3:])
        elif "'" in item:
            # 's, 'll, 're, ...: split at the first apostrophe and keep the
            # whole remainder so no text is dropped when there are several.
            head, _, rest = item.partition("'")
            tokens.append(head)
            tokens.append("'" + rest)
        else:
            tokens.append(item)

        tokens.extend(trailing)
    return tokens