import re

import numpy as np
from bs4 import BeautifulSoup
from spacy.lang.en import English
from zipfile import ZipFile

# Build a token -> column-index lookup from the vocabulary file
# (one tab-separated entry per line; the token is in the second field).
tokenIndex = dict()
index = 0
with open("vocabulary.dat", "r") as vocabFile:
    for line in vocabFile:
        value = line.strip("\r\n").split("\t")
        tokenIndex[value[1]] = index
        index = index + 1
vocabularySize = index

# Rule-based English tokenizer (no statistical model required).
nlp = English()
tokenizer = nlp.tokenizer

# Number of documents in each partition.
size = dict()
size["trn"] = 10322
size["val"] = 1290
size["tst"] = 1290

with ZipFile("ml530-2022-fall-reuters.zip", "r") as archive:
    for partition in ["trn", "val", "tst"]:
        docCount = size[partition]
        # Multi-hot bag-of-words matrix: one row per document, one column per vocabulary token.
        docTokenMatrix = np.zeros((docCount, vocabularySize)).astype("float32")
        if partition != "tst":
            # Multi-label targets for the three topics of interest: earn, acq, money-fx.
            docLabelMatrix = np.zeros((docCount, 3)).astype("float32")
        index = 0
        for i in range(docCount):
            with archive.open("reuters_" + partition + "/reuters_" + partition + "_" + str(i).zfill(5) + ".sgm") as file:
                text = file.read().decode("utf-8")
            soup = BeautifulSoup(text, "html.parser")
            # Mark the presence of each vocabulary token found in the title or body.
            for element in [soup.title, soup.body]:
                if element is not None:
                    temp = element.get_text()
                    temp = re.sub("[\t\r\n]", " ", temp)
                    temp = " ".join(temp.split())
                    for token in [token.text for token in tokenizer(temp)]:
                        key = token.lower()
                        if key in tokenIndex:
                            docTokenMatrix[index, tokenIndex[key]] = 1
            # The test partition ships without labels; otherwise record the topic indicators.
            if partition != "tst":
                if soup.topics is not None:
                    labels = [topic.get_text() for topic in soup.topics.find_all("d")]
                    if "earn" in labels:
                        docLabelMatrix[index, 0] = 1
                    if "acq" in labels:
                        docLabelMatrix[index, 1] = 1
                    if "money-fx" in labels:
                        docLabelMatrix[index, 2] = 1
            index = index + 1
        # Persist the features (and, for trn/val, the labels) as NumPy arrays.
        np.save("reuters_" + partition + "X.npy", docTokenMatrix)
        if partition != "tst":
            np.save("reuters_" + partition + "Y.npy", docLabelMatrix)
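
# Optional sanity check (a minimal sketch, not part of the original pipeline):
# reload the arrays written above and confirm the expected shapes, i.e.
# reuters_trnX.npy -> (10322, vocabularySize) and reuters_trnY.npy -> (10322, 3),
# both float32.
trnX = np.load("reuters_trnX.npy")
trnY = np.load("reuters_trnY.npy")
print(trnX.shape, trnX.dtype)  # expected: (10322, vocabularySize) float32
print(trnY.shape, trnY.dtype)  # expected: (10322, 3) float32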