"""This script can be used to conduct the "Zufallsexperiment" described in my thesis (Chapter 4.3). It is used to make
the lexical diversity of corpora of different sizes comparable.
The input format is a tab separated CSV file that contains a column "Token" containing the tokens of a sub vocabulary
line by line. The result of the experiment is output on the console when running the script."""

import os
import random
from collections import Counter
from random import randint

import numpy as np
import pandas as pd

import calculate_lexical_diversity

# Change the values of the following variables if required
# a random seed is set so the results are reproducible
random.seed(42)
# the file path to the input files (tab separated CSV files) for which you want to run the experiment
# The path is built with os.path.join so that it uses the separator of the current operating system
# (on Windows the value is still "..\\input").
input_dir = os.path.join("..", "input")
# The working directory is switched to the input directory. input_dir is a relative path, so after the switch it
# resolves to the same directory only because that directory is itself named "input".
os.chdir(input_dir)
# the number of repetitions of the experiment
repetition = 10000


# A function that determines the size of the smallest corpus from the input files.
# return reduce_size: the size of the smallest corpus
def get_reduce_size():
    """Return the number of tokens of the smallest corpus among the input files.

    Every file below ``input_dir`` whose name ends with ".csv" is read as a tab separated, UTF-16 encoded CSV file
    and the length of its "Token" column is taken as the corpus size.

    Returns:
        The smallest corpus size found.

    Raises:
        ValueError: If no ".csv" file is found below ``input_dir``.
    """
    token_amount_list = []
    for root, _dirs, files in os.walk(input_dir, topdown=False):
        for filename in files:
            if not filename.endswith(".csv"):
                continue
            # os.walk also descends into subdirectories, so the file has to be opened relative to the directory
            # it was found in and not by its bare name.
            file_path = os.path.join(root, filename)
            token_column = pd.read_csv(file_path, sep='\t', encoding='utf-16')["Token"]
            token_amount_list.append(len(token_column))
    if not token_amount_list:
        raise ValueError("No .csv input files found in " + repr(input_dir))
    return min(token_amount_list)


# A function that conducts the experiment, i.e., first reduces the size of the larger corpora from the input files to
# the size of the smallest corpus (the size is determined using get_reduce_size()). The corpus size-dependent lexical
# diversity measures are then calculated for the size-reduced corpora.
# return reduced_values_dict: A dictionary that has the filename as key and a list with the calculated results
# for the corpus size-dependent lexical diversity measures (i.e. TTR by Johnson (1944), PP by Baayen (2009), Entropy by
# Shannon (1948)) as value.
def _count_types_and_hapaxes(tokens):
    """Return the frequency of each type in ``tokens`` and the number of types that occur exactly once."""
    token_freq = Counter(tokens)
    hapax_legomena_amount = sum(1 for frequency in token_freq.values() if frequency == 1)
    return token_freq, hapax_legomena_amount


def reduce_corpora():
    """Calculate the corpus size-dependent lexical diversity measures on size-reduced corpora.

    The size of the smallest corpus is determined with get_reduce_size(). For every larger corpus, ``repetition``
    random samples of that size are drawn without replacement from the WHOLE corpus; the number of types, the
    number of hapax legomena and the entropy are calculated for each sample and averaged over all samples.
    A corpus that already has the smallest size is evaluated once on all of its tokens.

    Returns:
        A dictionary that has the file name as key and the list ``[TTR, PP, entropy]`` as value (TTR by
        Johnson (1944), PP by Baayen (2009), Entropy by Shannon (1948)), each rounded to the second decimal place.
        Files with the same name in different subdirectories share one key; the one visited last wins.
    """
    reduced_values_dict = {}
    reduced_size = get_reduce_size()
    for root, _dirs, files in os.walk(input_dir, topdown=False):
        for filename in files:
            if not filename.endswith(".csv"):
                continue

            # Read the file once, relative to the directory os.walk found it in.
            # NOTE(review): with its default settings pandas.read_csv parses cells such as "NA", "null" or "nan"
            # as missing values; if such tokens can occur in the corpora, consider keep_default_na=False.
            file_path = os.path.join(root, filename)
            token_list = pd.read_csv(file_path, sep='\t', encoding='utf-16')["Token"].tolist()

            if len(token_list) > reduced_size:
                type_list = []
                hapax_legomena_list = []
                entropy_list = []
                for _ in range(repetition):
                    # random.sample draws distinct positions from the whole corpus, so every token has the same
                    # chance of being selected and no position is selected twice within one sample.
                    reduced_list = random.sample(token_list, reduced_size)
                    reduced_token_freq, hapax_legomena_amount = _count_types_and_hapaxes(reduced_list)

                    entropy_list.append(calculate_lexical_diversity.calc_entropy(reduced_token_freq.values(),
                                                                                 reduced_size))
                    type_list.append(len(reduced_token_freq))
                    hapax_legomena_list.append(hapax_legomena_amount)

                # the measures are calculated from the averages over all repetitions
                ttr = calculate_lexical_diversity.calc_ttr(np.mean(type_list), reduced_size)
                pp = calculate_lexical_diversity.calc_pp(np.mean(hapax_legomena_list), reduced_size)
                entr = np.mean(entropy_list)
            else:
                # the corpus already has the smallest size, so there is nothing to sample
                token_freq, hapax_legomena_amount = _count_types_and_hapaxes(token_list)
                ttr = calculate_lexical_diversity.calc_ttr(len(token_freq), len(token_list))
                pp = calculate_lexical_diversity.calc_pp(hapax_legomena_amount, len(token_list))
                entr = calculate_lexical_diversity.calc_entropy(token_freq.values(), len(token_list))

            # change if required
            # round the result to the second decimal place
            reduced_values_dict[filename] = [round(ttr, 2), round(pp, 2), round(entr, 2)]
    return reduced_values_dict


# This loop iterates through the dictionary returned by the function reduce_corpora() and prints the file name and
# the list with the calculated values of the corpus size-dependent lexical diversity measures (TTR, PP, entropy)
# on the console.
# Report the result for every input file: the file name followed by the three corpus size-dependent measures
# (TTR, PP, entropy), each label and each value on a line of its own, and an empty block after every file.
for file_name, measures in reduce_corpora().items():
    print(
        "File name:",
        file_name,
        "TTR:",
        measures[0],
        "PP:",
        measures[1],
        "Entropy",
        measures[2],
        "\n",
        sep="\n",
    )

