import numpy as np
import re
import spacy
from spacy.lang.en import English
from zipfile import ZipFile

# Number of token slots per document row; the encoding loop further down keeps
# at most this many tokens per document and leaves any remaining slots at 0.
maxSeqLen = 948

# Map each vocabulary token to a 1-based id. Id 0 is never assigned here, so
# it stays available for padding and for tokens missing from the vocabulary.
tokenIndex = dict()
index = 1
# vocabulary.dat is read as tab-separated lines with the token in the second
# field; the first field is not used here. The context manager closes the
# file deterministically instead of relying on garbage collection.
with open("vocabulary.dat", "r", encoding="utf-8") as vocabularyFile:
    for line in vocabularyFile:
        value = line.strip("\r\n").split("\t")
        tokenIndex[value[1]] = index
        index = index + 1
# One more than the number of lines read, i.e. the id range including 0.
vocabularySize = index

# spaCy English language object; in the visible code only its tokenizer is
# used, to split each document's text into tokens.
nlp = English()
tokenizer = nlp.tokenizer

# Number of documents to read for each partition; the keys are the partition
# names that appear in the archive member paths and output file names.
size = {
    "trn": 15062,
    "val": 1883,
    "tst": 1883,
}

# Encode every document of every partition as a fixed-width row of token ids
# and save one matrix per partition as "newsgroups_<partition>X.npy".
with ZipFile("ml530-2022-fall-newsgroups.zip", "r") as archive:
    for partition in [ "trn", "val", "tst" ]:
        docCount = size[partition]
        # Allocate as int32 directly rather than building a float64 matrix
        # and converting it afterwards; slots that are never written stay 0.
        docTokenMatrix = np.zeros((docCount, maxSeqLen), dtype="int32")
        for i in range(docCount):
            # Archive members are numbered from 0 with five zero-padded digits.
            memberName = "newsgroups_" + partition + "/newsgroups_" + partition + "_" + str(i).zfill(5) + ".txt"
            with archive.open(memberName) as file:
                text = file.read().decode("utf-8")
            # split() without arguments splits on every run of whitespace
            # (tabs, carriage returns and newlines included), so this alone
            # collapses the text to single-space-separated words.
            text = " ".join(text.split())
            # Pairing with range(maxSeqLen) stops after maxSeqLen tokens, so
            # longer documents are truncated without scanning the remainder.
            for seqLen, token in zip(range(maxSeqLen), tokenizer(text)):
                # Lookup is on the lowercased token text; tokens missing from
                # the vocabulary are stored as 0.
                docTokenMatrix[i, seqLen] = tokenIndex.get(token.text.lower(), 0)
        np.save("newsgroups_" + partition + "X.npy", docTokenMatrix)