"""Build a vocabulary file from bracketed parse trees.

Every ``NNNN.mrg`` file under the ``trn`` directory is parsed, and for each
token the set of file indices it occurs in is recorded.  The tokens occurring
in the most files are then written to ``ptb-vocabulary.dat`` as
``<file count>\\t<token>`` lines, with ties between equally frequent tokens
broken at random.
"""

import nltk.corpus
import numpy as np

# Upper bound on the number of tokens written; lowered further down if the
# corpus contains fewer distinct tokens than this.
vocabularySize = 9996

# token -> set of file indices in which the token was seen.
mapping = dict()


def getTokens(i, parent, isNumber):
    """Record in ``mapping`` every leaf token found below ``parent``.

    Args:
        i: Index of the file being processed; it is added to the set stored
            for each token encountered.
        parent: Iterable of nodes, each either an ``nltk.Tree`` (recursed
            into) or a leaf (treated as a token).
        isNumber: When true, leaves directly in ``parent`` are recorded under
            the single key ``"NUMBER"`` instead of their own text.
    """
    for node in parent:
        if isinstance(node, nltk.Tree):
            label = node.label()
            # Subtrees labelled "-NONE-" are skipped entirely (presumably the
            # treebank's empty elements -- confirm against the corpus docs).
            if label != "-NONE-":
                # Leaves directly under a "CD" node are collapsed into NUMBER;
                # the flag is recomputed at every level, not inherited.
                getTokens(i, node, label == "CD")
            continue

        key = "NUMBER" if isNumber else str(node).lower()
        mapping.setdefault(key, set()).add(i)


# Scan the files one by one so that each token can be tagged with the index
# of the file it came from.  (1850 is presumably the number of training
# files -- TODO confirm.)
for i in range(1850):
    # The second argument is a regular expression over file ids, hence the
    # escaped dot; a raw string avoids the invalid "\." escape warning.
    reader = nltk.corpus.BracketParseCorpusReader(
        "trn", str(i).zfill(4) + r"\.mrg"
    )
    for tree in reader.parsed_sents():
        getTokens(i, tree, False)

# Group tokens by the number of files they occur in.  Tokens are dict keys
# and therefore already unique, so no duplicate check is needed.
members = dict()
for k, v in mapping.items():
    members.setdefault(len(v), []).append(k)

# (file count, [tokens]) pairs, highest file count first.
tuples = sorted(members.items(), reverse=True)

vocabularySize = min(vocabularySize, len(mapping))

selected = []
needed = vocabularySize
for _, candidates in tuples:
    if needed <= 0:
        break
    # Shuffle in place so that, when a group has to be cut off, the tokens
    # kept from it are chosen at random (and written in random order).
    np.random.shuffle(candidates)
    chosen = candidates[:needed]
    selected.extend(chosen)
    needed -= len(chosen)

# The context manager guarantees the file is closed even if a write fails.
with open("ptb-vocabulary.dat", "w") as output:
    for token in selected:
        output.write(f"{len(mapping[token])}\t{token}\n")