"""This script generates files from an input file in which the tokens of a sub vocabulary are randomly sorted by text
from which they were extracted: in each generated file the texts appear in a random order, and the tokens of one text
stay together.

The input is a tab separated, UTF-16 encoded CSV file with a column "Token", which contains the tokens of the sub
vocabulary one per line, and a column "Dateiname", which contains the title of the text from which the respective token
was extracted."""

import os.path
import io
import pandas as pd
from random import sample

# Change the values of the following variables if required
# the file path to the input file (tab separated CSV file) for which you want to generate the randomly sorted files
# (built with os.path.join so that the path uses the separator of the operating system the script is running on)
input_file = os.path.join("..", "input", "Anhang_F.csv")
# the file path to the folder where the generated files are to be saved
output_dir = os.path.join("..", "output")
# the number of files to be generated
file_amount = 5
# set this boolean variable to True if it is a corpus formed with the RW corpus. If not, set it to False.
rw_corpus_bool = True

# Read the input table (tab separated, UTF-16 encoded) once when the module is loaded.
table = pd.read_csv(input_file, sep='\t', encoding='utf-16')
# The two columns as plain Python lists; entries at the same index belong to the same row of the input table.
# Note that file_df is a list of text titles, not a DataFrame.
file_df = table["Dateiname"].values.tolist()
token_list = table["Token"].values.tolist()


def clean_filename(all_files_list):
    """Return the distinct text names of the RW corpus with the sample numbering cut off.

    This is a help function for the names of the files in the RW corpus: tokens can be extracted from the same text,
    but from different samples, which can be determined by the numbering in the file names. Every file name is
    truncated at its first "-" (a name without "-" is kept unchanged) so that tokens from the same text are randomly
    sorted together in the generated output files.

    param all_files_list: a list containing all file names of the RW corpus (strings)
    return: a list containing every truncated file name exactly once, in the order of first appearance
    """
    # dict.fromkeys keeps only the first occurrence of each key and preserves insertion order, so the duplicates are
    # removed in a single pass instead of searching a growing list for every file name.
    return list(dict.fromkeys(file.split("-")[0] for file in all_files_list))


def get_filename(all_files_list):
    """Return all the titles of the texts in a corpus, each of them once.

    param all_files_list: a list of all the titles of the texts in a corpus (one entry per token, so titles repeat);
        the titles must be hashable, e.g. strings
    return: a list in which each title is included exactly once, in the order of first appearance
    """
    # dict.fromkeys keeps only the first occurrence of each key and preserves insertion order, so the duplicates are
    # removed in a single pass instead of searching a growing list for every title.
    return list(dict.fromkeys(all_files_list))


def sort_token_per_file_random():
    """Generate the randomly sorted files and save them in the specified output folder.

    The function reads the module level variables file_df, token_list, file_amount, rw_corpus_bool and output_dir.
    It writes file_amount files named "random_file_<n>.csv" (n counting from 0) into output_dir. Each file is a tab
    separated, UTF-16 encoded table with the header "Dateiname", "Token". For every file the texts are put into a
    newly drawn random order; the tokens of one text stay together and keep their order from the input file.

    If rw_corpus_bool is true, the file names are truncated at their first "-" (see clean_filename) and the truncated
    name is used both for grouping and in the "Dateiname" column. Otherwise the complete title is used, so titles that
    contain a "-" are grouped correctly as well.

    Every text name is printed when it is processed. If the input contains no rows, files consisting only of the
    header line are written.
    """
    # Determine the grouping key of every row and the list of distinct keys once; neither depends on the random order.
    if rw_corpus_bool:
        text_names = clean_filename(file_df)
        row_keys = [name.split("-")[0] for name in file_df]
    else:
        text_names = get_filename(file_df)
        row_keys = file_df

    # Collect the tokens of every text in a single pass over the rows (input order is kept within each text).
    tokens_by_text = {}
    for key, token in zip(row_keys, token_list):
        tokens_by_text.setdefault(key, []).append(token)

    for counter in range(file_amount):
        # sample() with the full length returns a shuffled copy and leaves text_names untouched for the next file.
        random_list = sample(text_names, len(text_names))

        lines = ["Dateiname\tToken\n"]
        for file in random_list:
            print(file)
            lines.extend("{}\t{}\n".format(file, token) for token in tokens_by_text[file])

        # Write the finished table once per generated file.
        output_path = os.path.join(output_dir, "random_file_" + str(counter) + ".csv")
        with io.open(output_path, "w", encoding="utf-16") as f:
            f.write("".join(lines))


# Generate the randomly sorted files. The call is at module level, so it runs whenever this file is executed or imported.
sort_token_per_file_random()
