"""This script implements the computation of various lexical diversity measures.
The input format is a tab-separated CSV file with a column "Token" that contains the tokens of a sub vocabulary,
one token per line. Uncomment the function calls at the end of this script as required."""

import math
import pandas as pd
import numpy as np
import os


# Configuration: change the values of the following variables if required.
# input_file is the path to the input file (tab-separated CSV file) for which you want to calculate the TTR, the PP
# and the entropy. The path is relative to the current working directory and uses Windows path separators.
input_file = "..\\input\\Anhang_F.csv"
# NOTE: the file is read here, at module level, so it has to exist whenever this script is run or imported.
# token_list holds the values of the "Token" column as a plain Python list; the file is decoded as UTF-16.
token_list = pd.read_csv(input_file, sep='\t', encoding='utf-16')["Token"].values.tolist()
# The file path to the input directory containing the input files for calculating the MSTTR and the MTLD.
# Uncomment the following two lines before calling the functions that take input_dir.
# os.chdir("..\\input")
# input_dir = "..\\input"


# Count how often each type occurs in a corpus.
# param token_list: a list containing the tokens of a corpus
# return: a dictionary with the type as key and its frequency as value, in order of first occurrence
def get_type_freq(token_list):
    frequencies = {}
    for tok in token_list:
        # dict.get supplies 0 for a type that has not been seen before
        frequencies[tok] = frequencies.get(tok, 0) + 1
    return frequencies


# Return the number of types in a corpus.
# param type_list: a sized collection with one entry per type in a corpus
# return: the number of entries in type_list, i.e. the number of types
def get_number_of_types(type_list):
    return len(type_list)


# Count the hapax legomena of a corpus, i.e. the types that occur exactly once.
# param type_dict: a dictionary containing the type as key and its frequency as value
# return: the number of types whose frequency equals 1
def get_number_of_hapax_leg(type_dict):
    return sum(1 for occurrences in type_dict.values() if occurrences == 1)


# Calculate the Type-Token-Ratio (TTR) by Johnson (1944): the number of types divided by the number of tokens.
# param types_total: the total number of types in a corpus
# param token_total: the total number of tokens in a corpus (must not be 0)
# return: the calculated TTR
def calc_ttr(types_total, token_total):
    return types_total / token_total


# Calculate the Potential Productivity (PP) by Baayen (2009): the number of hapax legomena divided by the number
# of tokens.
# param hapax_legomena_total: the total number of hapax legomena in a corpus
# param token_total: the total number of tokens in a corpus (must not be 0)
# return: the calculated PP
def calc_pp(hapax_legomena_total, token_total):
    return hapax_legomena_total / token_total


# Calculate the entropy by Shannon (1948) in bits (base-2 logarithm).
# param token_type_freq_list: an iterable containing the frequency of each type in a corpus
# param token_total: the total number of tokens in a corpus
# return: the calculated entropy; 0 if no frequencies are given
def calc_entropy(token_type_freq_list, token_total):
    entropy = 0
    for type_freq in token_type_freq_list:
        # relative frequency of the type, used as its probability
        probability = type_freq / token_total
        entropy -= probability * math.log2(probability)
    return entropy


# Helper: compute the TTR of every complete segment of seg_size consecutive tokens.
# Tokens that remain after the last complete segment are ignored.
def _segment_ttrs(tokens, seg_size):
    ttrs = []
    seen_types = set()
    for position, token in enumerate(tokens, start=1):
        seen_types.add(token)
        # Close the segment as soon as it is full, so that a segment ending exactly on the last token is
        # counted as well.
        if position % seg_size == 0:
            ttrs.append(len(seen_types) / seg_size)
            seen_types = set()
    return ttrs


# A function that calculates the Mean-Segmental-Type-Token-Ratio (MSTTR) by Johnson (1944), as recommended in my thesis.
# Every ".csv" file below input_dir (sub directories included) is read as a tab-separated, UTF-16 encoded table, and
# its "Token" column is cut into consecutive segments of equal size. The MSTTR of a file is the mean TTR of its
# complete segments; a file that is shorter than the segment size contributes nothing for that size.
# param input_dir: file path to the folder containing the files in which the tokens of a sub vocabulary are randomly
# sorted by text from which they were extracted (for that purpose you can use the script create_random_sorted_files.py)
# param seg_size_min: the smallest segment size (must be positive)
# param seg_size_max: the largest segment size
# param incr_steps: the increment steps from the smallest segment size to the largest (must be positive)
# return avg_msttr_dict: a dictionary that has the segment size as key and the average MSTTR value of all input files
# of the segment size, rounded to two decimals, as value (NaN if no file has a complete segment of that size)
# raises ValueError: if seg_size_min or incr_steps is not positive
def calc_msttr(input_dir, seg_size_min, seg_size_max, incr_steps):
    if seg_size_min <= 0 or incr_steps <= 0:
        # a non-positive increment would never reach seg_size_max, a non-positive size is not a segment
        raise ValueError("seg_size_min and incr_steps must be positive")

    msttr_dict = {}
    seg_size = seg_size_min
    while seg_size <= seg_size_max:
        msttr_dict[seg_size] = []
        seg_size += incr_steps

    for root, _dirs, files in os.walk(input_dir, topdown=False):
        for filename in files:
            if not filename.endswith(".csv"):
                continue
            # os.walk yields bare file names, so the directory has to be prepended
            table = pd.read_csv(os.path.join(root, filename), sep='\t', encoding='utf-16')
            tokens = table["Token"].tolist()
            for size, file_msttr_values in msttr_dict.items():
                ttr_list = _segment_ttrs(tokens, size)
                if ttr_list:
                    file_msttr_values.append(np.mean(ttr_list))

    avg_msttr_dict = {}
    for size, file_msttr_values in msttr_dict.items():
        avg_msttr_dict[size] = round(np.mean(file_msttr_values), 2) if file_msttr_values else np.nan
    return avg_msttr_dict


# Helper: count the MTLD factors of a token sequence that is read in the given order.
# A factor is complete as soon as the running TTR has dropped to the threshold or below; the tokens that remain
# after the last complete factor contribute a partial factor of (1 - TTR) / (1 - threshold).
def _count_mtld_factors(tokens, threshold):
    factor_count = 0
    token_count = 0
    seen_types = set()
    ttr = 1.0
    for token in tokens:
        token_count += 1
        seen_types.add(token)
        ttr = len(seen_types) / token_count
        # Check right after the update, so that a factor completed by the very last token is counted too.
        if ttr <= threshold:
            factor_count += 1
            token_count = 0
            seen_types = set()
    if token_count > 0:
        factor_count += (1 - ttr) / (1 - threshold)
    return factor_count


# A function that calculates the Measurement of Textual Lexical Diversity (MTLD) by McCarthy/Jarvis (2010), as
# recommended in my thesis.
# Every ".csv" file below input_dir (sub directories included) is read as a tab-separated, UTF-16 encoded table.
# The MTLD of a file is the mean of the forward and the backward pass over its "Token" column, where each pass
# is the number of tokens divided by the number of factors. Both passes use the same threshold of 0.720.
# A pass without any factor (e.g. every token is a new type) is valued 0, the same convention that
# calc_mtld_permutation_test uses.
# param input_dir: file path to the folder containing the files in which the tokens of a sub vocabulary are randomly
# sorted by text from which they were extracted (for that purpose you can use the script create_random_sorted_files.py)
# return avg_mtld: the average MTLD of all input files, rounded to two decimals (NaN if there is no input file)
def calc_mtld(input_dir):
    threshold = 0.720
    mtld_list = []

    for root, _dirs, files in os.walk(input_dir, topdown=False):
        for filename in files:
            # files that are not CSV files must neither be read nor counted
            if not filename.endswith(".csv"):
                continue
            # os.walk yields bare file names, so the directory has to be prepended
            table = pd.read_csv(os.path.join(root, filename), sep='\t', encoding='utf-16')
            tokens = table["Token"].tolist()

            directional_mtld = []
            for sequence in (tokens, tokens[::-1]):
                factor_count = _count_mtld_factors(sequence, threshold)
                directional_mtld.append(len(tokens) / factor_count if factor_count else 0)
            mtld_list.append(np.mean(directional_mtld))

    if not mtld_list:
        return np.nan
    avg_mtld = round(np.mean(mtld_list), 2)
    return avg_mtld


# This is a help function to calculate the permutation test for the values of the MSTTR.
# The token list is cut into consecutive segments of seg_size tokens. Every complete segment contributes its TTR
# (number of types in the segment divided by seg_size); tokens after the last complete segment are ignored.
# param tok_list: a list containing the tokens of a corpus
# param seg_size: the segment size (must be positive)
# return msttr: the calculated MSTTR, i.e. the mean TTR of all complete segments (NaN if there is no complete segment)
# raises ValueError: if seg_size is not positive
def calc_msttr_permutation_test(tok_list, seg_size):
    if seg_size <= 0:
        raise ValueError("seg_size must be positive")

    ttr_list = []
    seen_types = set()
    for position, token in enumerate(tok_list, start=1):
        seen_types.add(token)
        # Close the segment as soon as it is full, so that a segment ending exactly on the last token is
        # counted as well.
        if position % seg_size == 0:
            ttr_list.append(len(seen_types) / seg_size)
            seen_types = set()

    if not ttr_list:
        return np.nan
    msttr = np.mean(ttr_list)
    return msttr


# Helper: count the MTLD factors of a token sequence that is read in the given order.
# A factor is complete as soon as the running TTR has dropped to the threshold or below; the tokens that remain
# after the last complete factor contribute a partial factor of (1 - TTR) / (1 - threshold).
def _permutation_mtld_factors(tokens, threshold):
    factor_count = 0
    token_count = 0
    seen_types = set()
    ttr = 1.0
    for token in tokens:
        token_count += 1
        seen_types.add(token)
        ttr = len(seen_types) / token_count
        # Check right after the update, so that a factor completed by the very last token is counted too.
        if ttr <= threshold:
            factor_count += 1
            token_count = 0
            seen_types = set()
    if token_count > 0:
        factor_count += (1 - ttr) / (1 - threshold)
    return factor_count


# This is a help function to calculate the permutation test for the values of the MTLD.
# The MTLD is the mean of a forward and a backward pass over the tokens, where each pass is the number of tokens
# divided by the number of factors. Both passes use the same threshold of 0.720. A pass without any factor
# (empty input, or every token is a new type) is valued 0.
# param token_list: a list containing the tokens of a corpus
# return mtld: the calculated MTLD
def calc_mtld_permutation_test(token_list):
    threshold = 0.720
    tokens = list(token_list)

    directional_mtld = []
    for sequence in (tokens, tokens[::-1]):
        factor_count = _permutation_mtld_factors(sequence, threshold)
        directional_mtld.append(len(tokens) / factor_count if factor_count else 0)

    mtld = np.mean(directional_mtld)
    return mtld


"""The following measures were evaluated as unfit in my thesis, but are included here for completeness"""


# Helper: compute the TTR of every window of window_size consecutive tokens, moving the window one token at a time.
# The last window ends on the last token; a token list shorter than the window yields no value.
def _moving_window_ttrs(tokens, window_size):
    last_start = len(tokens) - window_size
    return [len(set(tokens[start:start + window_size])) / window_size for start in range(last_start + 1)]


# A function that calculates the Moving-Average-Type-Token-Ratio by Covington/McFall (2010).
# Every ".csv" file below input_dir (sub directories included) is read as a tab-separated, UTF-16 encoded table.
# The MATTR of a file is the mean TTR of all windows over its "Token" column; a file that is shorter than the
# window size contributes nothing for that size.
# param input_dir: file path to the folder containing the files in which the tokens of a sub vocabulary are randomly
# sorted by text from which they were extracted (for that purpose you can use the script create_random_sorted_files.py)
# param window_size_min: the smallest window size (must be positive)
# param window_size_max: the largest window size
# param incr_steps: the increment steps from the smallest window size to the largest (must be positive)
# return avg_mattr_dict: a dictionary that has the window size as key and the average MATTR value of all input files
# of the window size as value (NaN if no file is long enough for that size)
# raises ValueError: if window_size_min or incr_steps is not positive
def calc_mattr(input_dir, window_size_min, window_size_max, incr_steps):
    if window_size_min <= 0 or incr_steps <= 0:
        # a non-positive increment would never reach window_size_max, a non-positive size is not a window
        raise ValueError("window_size_min and incr_steps must be positive")

    mattr_dict = {}
    window_size = window_size_min
    while window_size <= window_size_max:
        mattr_dict[window_size] = []
        window_size += incr_steps

    for root, _dirs, files in os.walk(input_dir, topdown=False):
        for filename in files:
            if not filename.endswith(".csv"):
                continue
            # os.walk yields bare file names, so the directory has to be prepended
            table = pd.read_csv(os.path.join(root, filename), sep='\t', encoding='utf-16')
            tokens = table["Token"].tolist()
            for size, file_mattr_values in mattr_dict.items():
                ttr_list = _moving_window_ttrs(tokens, size)
                if ttr_list:
                    # collect one value per file, so that the result averages over all files
                    file_mattr_values.append(np.mean(ttr_list))

    avg_mattr_dict = {}
    for size, file_mattr_values in mattr_dict.items():
        avg_mattr_dict[size] = np.mean(file_mattr_values) if file_mattr_values else np.nan
    return avg_mattr_dict


# Calculate the Root-Type-Token-Ratio (RTTR) by Guiraud (1959): the number of types divided by the square root
# of the number of tokens.
# param types_total: the total number of types in a corpus
# param token_total: the total number of tokens in a corpus
# return: the calculated RTTR
def calc_rttr(types_total, token_total):
    root_of_tokens = np.sqrt(token_total)
    return types_total / root_of_tokens


# Calculate the Corrected-Type-Token-Ratio (CTTR) by Carroll (1938): the number of types divided by the square root
# of twice the number of tokens.
# param types_total: the total number of types in a corpus
# param token_total: the total number of tokens in a corpus
# return: the calculated CTTR
def calc_cttr(types_total, token_total):
    root_of_doubled_tokens = np.sqrt(2*token_total)
    return types_total / root_of_doubled_tokens


# Calculate the Log-Type-Token-Ratio (LTTR) by Herdan (1960): the natural logarithm of the number of types divided
# by the natural logarithm of the number of tokens.
# param types_total: the total number of types in a corpus
# param token_total: the total number of tokens in a corpus
# return: the calculated LTTR
def calc_lttr(types_total, token_total):
    log_types = np.log(types_total)
    log_tokens = np.log(token_total)
    return log_types / log_tokens


# Calculate S by Somers (1966): the double natural logarithm of the number of types divided by the double natural
# logarithm of the number of tokens.
# param types_total: the total number of types in a corpus
# param token_total: the total number of tokens in a corpus
# return: the calculated S
def calc_s(types_total, token_total):
    double_log_types = np.log(np.log(types_total))
    double_log_tokens = np.log(np.log(token_total))
    return double_log_types / double_log_tokens


# Calculate k by Dugast (1979): the natural logarithm of the number of types divided by the double natural
# logarithm of the number of tokens.
# param types_total: the total number of types in a corpus
# param token_total: the total number of tokens in a corpus
# return: the calculated k
def calc_k(types_total, token_total):
    log_types = np.log(types_total)
    double_log_tokens = np.log(np.log(token_total))
    return log_types / double_log_tokens


# A function that calculates a^2 by Maas (1972): a^2 = (log N - log V) / (log N)^2, with N the number of tokens
# and V the number of types (natural logarithms).
# param types_total: the total number of types in a corpus
# param token_total: the total number of tokens in a corpus
# return a_square: the calculated a^2
def calc_a_square(types_total, token_total):
    log_tokens = np.log(token_total)
    log_types = np.log(types_total)
    # the difference has to be parenthesised; division binds tighter than subtraction
    a_square = (log_tokens - log_types) / np.power(log_tokens, 2)
    return a_square


# A function that calculates Uber by Dugast (1978): U = (log N)^2 / (log N - log V), with N the number of tokens
# and V the number of types (natural logarithms).
# The value is not finite when the number of types equals the number of tokens, because the denominator is 0 then.
# param types_total: the total number of types in a corpus
# param token_total: the total number of tokens in a corpus
# return uber: the calculated Uber
def calc_uber(types_total, token_total):
    log_tokens = np.log(token_total)
    log_types = np.log(types_total)
    # the difference has to be parenthesised; division binds tighter than subtraction
    uber = np.power(log_tokens, 2) / (log_tokens - log_types)
    return uber


# Calculate T by Tuldava (1993) as implemented here: the squared natural logarithm of the number of tokens divided
# by the double natural logarithm of the token/type ratio, with a added to the quotient.
# NOTE(review): a is added after the division, not inside the denominator - confirm against the cited source.
# param types_total: the total number of types in a corpus
# param token_total: the total number of tokens in a corpus
# param a: a variable whose value is determined depending on the language of a text or
# its genre (see Malvern et al. 2004: 200)
# return: the calculated T
def calc_t(types_total, token_total, a):
    squared_log_tokens = np.power(np.log(token_total), 2)
    double_log_ratio = np.log(np.log(token_total / types_total))
    return squared_log_tokens / double_log_ratio + a


# Uncomment the function calls below as required
# print("Types and their frequency:")
# print(get_type_freq(token_list))
# print("Number of types:")
# print(get_number_of_types(get_type_freq(token_list).values()))
# print("Number of hapax legomena:")
# print(get_number_of_hapax_leg(get_type_freq(token_list)))
# print("TTR:")
# print(round(calc_ttr(get_number_of_types(get_type_freq(token_list).values()), len(token_list)), 2))
# print("PP:")
# print(round(calc_pp(get_number_of_hapax_leg(get_type_freq(token_list)), len(token_list)), 2))
# print("Entropy:")
# print(round(calc_entropy(get_type_freq(token_list).values(), len(token_list)), 2))
# print("MSTTR:")
# print(calc_msttr(input_dir, 25, 50, 25))
# print("MTLD:")
# print(calc_mtld(input_dir))

