# Convert bracketed parse files (*.mrg) into files of vocabulary ids.
#
# For each partition directory ("trn", "val", "tst") the numbered files
# 0000.mrg, 0001.mrg, ... are parsed with NLTK, every leaf token is mapped to
# its id from "ptb-vocabulary.dat", and one line of space-separated ids per
# sentence is written to "<partition>.dat".
import nltk.corpus
import numpy as np

# Id written for any token that is not present in the vocabulary file.
UNKNOWN_ID = 3

# Vocabulary: token (second tab-separated column of each line) -> integer id.
# Ids are assigned in file order starting at 4; smaller ids are never assigned
# to vocabulary tokens (presumably reserved for special symbols -- TODO confirm
# against the consumer of the .dat files).
token2index = dict()
index = 4
with open("ptb-vocabulary.dat", "r") as vocabulary:
    for line in vocabulary:
        value = line.strip("\r\n").split("\t")
        token2index[value[1]] = index
        index += 1

# Sentence index -> list of id strings for the partition currently being
# processed.  Rebound to a fresh dict for every partition below; defined here
# so that getTokens never refers to a missing global.
sentences = dict()


def getTokens(i, parent, isNumber):
    """Append the vocabulary ids of all leaf tokens under *parent* to sentences[i].

    Args:
        i: Sentence index used as the key into the global ``sentences`` dict.
        parent: An ``nltk.Tree`` (or any iterable of subtrees and leaves).
        isNumber: When true, leaves found directly under *parent* are looked
            up as the placeholder token "NUMBER" instead of their own text.

    Subtrees labelled "-NONE-" are skipped entirely.  Leaves are lower-cased
    before lookup; tokens missing from ``token2index`` get ``UNKNOWN_ID``.
    Ids are stored as strings so they can be joined directly on output.
    """
    for node in parent:
        # isinstance (rather than an exact type comparison) so that Tree
        # subclasses are descended into instead of being treated as leaves.
        if isinstance(node, nltk.Tree):
            label = node.label()
            if label != "-NONE-":
                # Only the direct leaves of a "CD" subtree become "NUMBER".
                getTokens(i, node, label == "CD")
        else:
            key = "NUMBER" if isNumber else str(node).lower()
            identifier = token2index.get(key, UNKNOWN_ID)
            sentences.setdefault(i, []).append(str(identifier))


# Number of .mrg files expected in each partition directory.
count = {
    "trn": 1850,
    "val": 231,
    "tst": 231,
}

for partition in ["trn", "val", "tst"]:
    sentences = dict()
    sentenceIndex = 0
    for i in range(count[partition]):
        # The second argument is a regular expression over file ids, hence
        # the escaped dot; a raw string avoids an invalid "\." escape in the
        # Python literal while producing exactly the same pattern.
        reader = nltk.corpus.BracketParseCorpusReader(
            partition, str(i).zfill(4) + r"\.mrg"
        )
        for tree in reader.parsed_sents():
            getTokens(sentenceIndex, tree, False)
            sentenceIndex += 1

    # Sentences that produced no tokens have no entry and yield no line.
    # Iterating the sorted indices keeps corpus order on every interpreter.
    with open(partition + ".dat", "w") as outfile:
        for key in sorted(sentences):
            outfile.write(" ".join(sentences[key]) + "\n")