"""This script implements a permutation test (Fisher 1936, Koplenig 2019) for the 'non-sequential' measures of
lexical diversity (TTR, PP, entropy).
Each of the two input files is a tab-separated CSV file with a column "Token" that contains the tokens of a
sub-vocabulary, one token per line. The result of the permutation test is printed to the console."""

import pandas as pd
import random
import calculate_lexical_diversity as cld
import os

# Change the values of the following variables if required.
# A random seed is set so that the shuffles, and therefore the printed p-values, are reproducible.
random.seed(42)
# The file paths to the input files (tab-separated CSV files) for which you want to run a permutation test.
# Info: a permutation test can only be conducted for two corpora.
# The paths are assembled with os.path.join so that the separator of the current platform is used;
# hard-coded backslashes would only resolve on Windows.
# NOTE: the working directory is changed first, so the file paths below are resolved relative to the
# input directory itself.
_input_dir = os.path.join("..", "input")
os.chdir(_input_dir)
input_file_0 = os.path.join(_input_dir, "Anhang_F.csv")
input_file_1 = os.path.join(_input_dir, "Anhang_G.csv")
# the number of repetitions (random permutations) of the test
repetition = 10000

# read the column "Token" of both files (tab-separated, UTF-16 encoded) into plain Python lists
token_list_0 = pd.read_csv(input_file_0, sep='\t', encoding='utf-16')["Token"].values.tolist()
token_list_1 = pd.read_csv(input_file_1, sep='\t', encoding='utf-16')["Token"].values.tolist()

# get the total number of tokens in both files
token_amount_0 = len(token_list_0)
token_amount_1 = len(token_list_1)


# Calculates the difference between the TTR values of the two input corpora.
# The tokens are taken from the module-level lists token_list_0 and token_list_1.
# return ttr_diff: the larger TTR value minus the smaller one
# return larger_flag: True if the TTR value of the first corpus is strictly larger than that of the second corpus,
# otherwise False; a permutation test uses it to determine which of the two corpora is the minuend and which is
# the subtrahend
def calculate_ttr_diff():
    # TTR of the first corpus
    type_count_first = cld.get_number_of_types(cld.get_type_freq(token_list_0).values())
    ttr_first = cld.calc_ttr(type_count_first, token_amount_0)

    # TTR of the second corpus
    type_count_second = cld.get_number_of_types(cld.get_type_freq(token_list_1).values())
    ttr_second = cld.calc_ttr(type_count_second, token_amount_1)

    # always subtract the smaller value from the larger one and report which corpus was the minuend
    if ttr_first > ttr_second:
        return ttr_first - ttr_second, True
    return ttr_second - ttr_first, False


# Calculates the difference between the PP values of the two input corpora.
# The tokens are taken from the module-level lists token_list_0 and token_list_1.
# return pp_diff: the larger PP value minus the smaller one
# return larger_flag: True if the PP value of the first corpus is strictly larger than that of the second corpus,
# otherwise False; a permutation test uses it to determine which of the two corpora is the minuend and which is
# the subtrahend
def calculate_pp_diff():
    # PP of the first corpus
    hapax_count_first = cld.get_number_of_hapax_leg(cld.get_type_freq(token_list_0))
    pp_first = cld.calc_pp(hapax_count_first, token_amount_0)

    # PP of the second corpus
    hapax_count_second = cld.get_number_of_hapax_leg(cld.get_type_freq(token_list_1))
    pp_second = cld.calc_pp(hapax_count_second, token_amount_1)

    # always subtract the smaller value from the larger one and report which corpus was the minuend
    if pp_first > pp_second:
        return pp_first - pp_second, True
    return pp_second - pp_first, False


# Calculates the difference between the entropy values of the two input corpora.
# The tokens are taken from the module-level lists token_list_0 and token_list_1.
# return entr_diff: the larger entropy value minus the smaller one
# return larger_flag: True if the entropy value of the first corpus is strictly larger than that of the second
# corpus, otherwise False; a permutation test uses it to determine which of the two corpora is the minuend and
# which is the subtrahend
def calculate_entr_diff():
    # entropy of the first corpus
    type_freqs_first = cld.get_type_freq(token_list_0).values()
    entr_first = cld.calc_entropy(type_freqs_first, token_amount_0)

    # entropy of the second corpus
    type_freqs_second = cld.get_type_freq(token_list_1).values()
    entr_second = cld.calc_entropy(type_freqs_second, token_amount_1)

    # always subtract the smaller value from the larger one and report which corpus was the minuend
    if entr_first > entr_second:
        return entr_first - entr_second, True
    return entr_second - entr_first, False


# A helper that calculates the three 'non-sequential' measures for one list of tokens.
# param tokens: a list containing the tokens of one (permuted) corpus
# return: a tuple (ttr, pp, entropy) with the values returned by the corresponding cld functions
def _calculate_measures(tokens):
    token_amount = len(tokens)
    # NOTE(review): the type frequencies are recalculated for every measure; a single shared result would
    # presumably suffice if the cld functions do not modify it - confirm before changing.
    ttr = cld.calc_ttr(cld.get_number_of_types(cld.get_type_freq(tokens).values()), token_amount)
    pp = cld.calc_pp(cld.get_number_of_hapax_leg(cld.get_type_freq(tokens)), token_amount)
    entr = cld.calc_entropy(cld.get_type_freq(tokens).values(), token_amount)
    return ttr, pp, entr


# A helper that calculates the p-value of one measure.
# param permuted_diffs: the differences obtained from the permuted corpora
# param observed_diff: the difference between the two original corpora
# return: the number of permuted differences that are at least as large as the observed difference, divided by
# the module-level number of repetitions
def _calculate_p_value(permuted_diffs, observed_diff):
    extreme_ctr = sum(1 for permuted_diff in permuted_diffs if permuted_diff >= observed_diff)
    return extreme_ctr / repetition


# A function that conducts a permutation test. The p-value of each measure is output to the console.
# The tokens of both corpora are pooled, shuffled and split again at the size of the first corpus; this is
# repeated as often as the module-level variable repetition specifies.
# NOTE: the observed differences come from calculate_ttr_diff(), calculate_pp_diff() and calculate_entr_diff(),
# which read the module-level token lists and not the parameters of this function.
# param token_list_0: a list containing the tokens of the first corpus
# param token_list_1: a list containing the tokens of the second corpus
def permutation_test(token_list_0, token_list_1):
    total_token_list = list(token_list_0) + list(token_list_1)
    split_index = len(token_list_0)

    # The observed differences and the direction of the subtraction do not change between repetitions,
    # so they are calculated only once.
    ttr_diff, ttr_larger_flag = calculate_ttr_diff()
    pp_diff, pp_larger_flag = calculate_pp_diff()
    entr_diff, entr_larger_flag = calculate_entr_diff()

    diff_ttr_list = []
    diff_pp_list = []
    diff_entr_list = []

    for _ in range(repetition):
        # Every repetition shuffles a fresh copy of the pooled list (not the previously shuffled one), so the
        # sequence of permutations produced for a given random seed stays the same.
        shuffle_list = list(total_token_list)
        random.shuffle(shuffle_list)
        ttr_0, pp_0, entr_0 = _calculate_measures(shuffle_list[:split_index])
        ttr_1, pp_1, entr_1 = _calculate_measures(shuffle_list[split_index:])

        # The permuted differences are subtracted in the same direction as the observed difference.
        diff_ttr_list.append(ttr_0 - ttr_1 if ttr_larger_flag else ttr_1 - ttr_0)
        diff_pp_list.append(pp_0 - pp_1 if pp_larger_flag else pp_1 - pp_0)
        # entropy: the minuend and the subtrahend must come from different corpora (entr_0 - entr_1);
        # subtracting entr_0 from itself would always yield 0
        diff_entr_list.append(entr_0 - entr_1 if entr_larger_flag else entr_1 - entr_0)

    print("TTR p-value")
    print(_calculate_p_value(diff_ttr_list, ttr_diff))

    print("PP p-value")
    print(_calculate_p_value(diff_pp_list, pp_diff))

    print("Entropy p-value")
    print(_calculate_p_value(diff_entr_list, entr_diff))


# function call: run the permutation test for the two token lists read from the input files;
# the p-values of the three measures are printed to the console
permutation_test(token_list_0, token_list_1)