# This script computes, for each allowed guess word, the expected number of
#   candidate answers that remain in Wordle after playing that guess.
#   Choose the guess with the smallest value for greedy optimization.
#   Set process_count to the number of processors you'd like to allocate.
#
# usage:
#   python expectations.py
#   sort -n output??.dat | head
#
# input:
#   see https://www.nytimes.com/games-assets/v2/wordle.a0b43bff9849d49cf1b2.js [find "first"]
#   latest.txt: current set of possible answer words
#   pool.txt: set of all possible answer words
#   extra.txt: set of additional words allowed for guess
#
# output:
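#   output##.dat (## is the zero-padded worker index), one line per guess
#   with tab-separated fields:
#     expected number of remaining candidates if this guess word is used
#     guess word
#     asterisk if the guess is one of the current answer words (else empty)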

import numpy as np
from datetime import datetime
from multiprocessing import Pool

# Number of worker processes; get_expectation also uses it as the stride
# with which guess_list is divided among the workers.
process_count = 24


def _read_words(path):
    """Return the non-empty, whitespace-stripped lines of the text file *path*."""
    # The context manager closes the file even if reading fails part-way.
    with open(path, 'r') as handle:
        stripped = (line.strip() for line in handle)
        # Blank lines would otherwise become '' "words" and break the
        # fixed-width (5 letter) feedback routines with an IndexError.
        return [word for word in stripped if word]


# Current possible answers: a list for iteration and a set for fast
# "is this guess also a possible answer?" lookups.
candidate_list = _read_words('latest.txt')
candidate_set = set(candidate_list)
np.random.shuffle(candidate_list)
candidate_set_size = len(candidate_list)

# Every word that may be played as a guess.
guess_list = []
for file in ['pool.txt', 'extra.txt']:
    guess_list.extend(_read_words(file))
# Shuffle the guesses (previously candidate_list was shuffled a second time
# here instead).  A fixed seed keeps the order identical in every process
# that imports this module, which matters because get_expectation splits
# guess_list between workers purely by index.
np.random.RandomState(0).shuffle(guess_list)
guess_set_size = len(guess_list)

def retrieve_feedback(answer, guess):
    """Score *guess* against *answer* and return a 5-character feedback string.

    Each position is marked with one of:
      'h' - the guess letter equals the answer letter at that position
      'o' - the guess letter is elsewhere in the answer and a copy of it
            has not yet been accounted for
      'm' - neither of the above

    Only the first five characters of each word are examined.
    """
    positions = range(5)

    # Copies of each answer letter that have not been matched yet.
    unused = {}
    for pos in positions:
        letter = answer[pos]
        unused[letter] = unused.get(letter, 0) + 1

    # Exact hits are resolved first so they take priority over 'o' marks.
    marks = []
    for pos in positions:
        letter = guess[pos]
        if letter == answer[pos]:
            unused[letter] -= 1
            marks.append('h')
        else:
            marks.append('m')

    # Remaining positions claim leftover copies from left to right.
    for pos in positions:
        if marks[pos] == 'h':
            continue
        letter = guess[pos]
        if unused.get(letter, 0) > 0:
            unused[letter] -= 1
            marks[pos] = 'o'

    return ''.join(marks)

def is_match(candidate, guess, feedback):
    """Return True if *candidate* is consistent with *feedback* for *guess*.

    feedback is read position by position over the first five characters:
      'h' - candidate must have the guess letter at this position
      'o' - candidate must have a different letter here, but still hold an
            unclaimed copy of the guess letter
      'm' - candidate must have a different letter here and no unclaimed
            copy of the guess letter left

    Returns False as soon as any position contradicts the feedback.
    """
    # Copies of each candidate letter not yet claimed by an 'h' or 'o'.
    freq = {}
    for i in range(5):
        freq[candidate[i]] = freq.get(candidate[i], 0) + 1
    # Exact hits claim their copy first.
    for i in range(5):
        if (feedback[i] == 'h'):
            if (guess[i] == candidate[i]):
                freq[guess[i]] -= 1
            else:
                return False
    # 'o' positions then claim leftover copies.
    for i in range(5):
        if (feedback[i] == 'o'):
            if (guess[i] != candidate[i]) and (freq.get(guess[i], 0) > 0):
                freq[guess[i]] -= 1
            else:
                return False
    for i in range(5):
        if (feedback[i] == 'm'):
            # An equal letter at this position would have been reported as
            # 'h', not 'm'.  Without this check a candidate slips through
            # when an 'o' elsewhere already consumed the letter's count
            # (e.g. candidate 'lapse', guess 'aalii', feedback 'omomm').
            if (guess[i] == candidate[i]) or (freq.get(guess[i], 0) > 0):
                return False
    return True

def apply_feedback(candidate_list, guess, feedback):
    """Count the words in *candidate_list* that is_match accepts.

    Each candidate is tested against the given guess/feedback pair; the
    count is returned as the numpy sum of one 1 per accepted candidate.
    """
    ones = []
    for candidate in candidate_list:
        if is_match(candidate, guess, feedback):
            ones.append(1)
    return np.sum(ones)

def get_expectation(index):
    """Score this worker's share of guess_list and write it to output<index>.dat.

    The worker handles guesses index, index + process_count,
    index + 2 * process_count, ...  For each guess it averages, over every
    answer in candidate_list, the number of candidates that remain after
    the feedback for that answer is applied, and writes one line:

        <mean>\\t<guess>\\t<'*' if the guess is in candidate_set, else ''>

    Worker 0 additionally prints a timestamp and progress fraction after
    every ten guesses it completes.

    index: worker number in range(process_count); zero-padded to two
        digits in the output file name.
    """
    # 'with' guarantees the file is flushed and closed even if scoring raises.
    with open('output' + str(index).zfill(2) + '.dat', 'w') as output:
        next_index = index
        while (next_index < guess_set_size):
            guess = guess_list[next_index]
            # apply_feedback depends only on (guess, feedback), so compute it
            # once per distinct feedback pattern rather than once per answer.
            remaining_by_feedback = {}
            remaining = []
            for answer in candidate_list:
                feedback = retrieve_feedback(answer, guess)
                if feedback not in remaining_by_feedback:
                    remaining_by_feedback[feedback] = apply_feedback(candidate_list, guess, feedback)
                remaining.append(remaining_by_feedback[feedback])
            candidate_set_size_mean = np.mean(remaining)
            if (guess in candidate_set):
                output.write(str(candidate_set_size_mean) + '\t' + guess + '\t*\n')
            else:
                output.write(str(candidate_set_size_mean) + '\t' + guess + '\t\n')
            next_index += process_count
            if (index == 0) and ((next_index % (10 * process_count)) == 0):
                print(str(datetime.now()), next_index / guess_set_size, sep = '\t')

# Guard the launch so that worker processes which import this module (as the
# spawn and forkserver start methods do) do not try to start pools of their own.
if __name__ == '__main__':
    # One task per worker; each writes its own output file.  The context
    # manager shuts the pool down once the blocking map() has returned.
    with Pool(processes = process_count) as pool:
        pool.map(get_expectation, range(process_count))
    print('done.')