import os
import csv
import difflib
import datetime


def _read_unique_names(paths):
    """Return the sorted, de-duplicated, lower-cased first column of each CSV in *paths*."""
    unique_names = set()
    for path in paths:
        # newline='' lets the csv module do its own line-ending handling.
        with open(path, 'r', encoding='utf-8', newline='') as csvfile:
            for row in csv.reader(csvfile, delimiter=','):
                # csv.reader yields [] for blank lines; there is no name to read.
                if row:
                    unique_names.add(row[0].lower())
    return sorted(unique_names)


def _cluster_tokens(tokens, cutoff):
    """Map every token to a cluster id (0 means no close match was found).

    Tokens are visited in order. A token that has not been assigned yet is
    compared against all tokens with difflib.get_close_matches; if anything
    besides itself reaches *cutoff*, the whole match list receives the next
    cluster id (this may re-assign tokens claimed by an earlier cluster).
    """
    merge_map = {}
    total = len(tokens)
    next_report = 4
    cluster_id = 1

    for position, token in enumerate(tokens, start=1):
        # Report progress roughly every 5 %.
        percent = round((position / total) * 100)
        if percent > next_report:
            print('Progress', percent, '%', token)
            next_report += 5

        if token in merge_map:
            continue

        similar_tokens = difflib.get_close_matches(token, tokens, total, cutoff)
        if len(similar_tokens) > 1:
            for similar in similar_tokens:
                merge_map[similar] = cluster_id
            cluster_id += 1
        else:
            # Only matched itself: leave it unclustered.
            merge_map[token] = 0

    return merge_map


def Cluster_My_Clients(Input_Files, Output_File, Match_Strength) :
    """Cluster client names by fuzzy-matching the first word of each name.

    Parameters
    ----------
    Input_Files : iterable of str
        CSV file names, looked up in the ``Data`` directory next to this
        script (an absolute path is used as-is). Only the first column of
        each row is read.
    Output_File : str
        Output name without extension; ``.csv`` is appended and the file is
        written next to this script (an absolute path is used as-is). Each
        row is ``<cluster id><TAB><client name>``; cluster id 0 marks names
        whose first word matched no other first word.
    Match_Strength : float
        Cutoff in [0, 1] passed to difflib.get_close_matches.

    Returns
    -------
    None. Progress and summary figures are printed to stdout.
    """
    script_dir = os.path.dirname(os.path.realpath(__file__))

    # 1. Scan the client names and remove duplicates
    print('Loading data.')
    names = _read_unique_names(
        os.path.join(script_dir, 'Data', file_name) for file_name in Input_Files
    )

    # 2. The clustering key of each (sorted, unique) name is its first word
    name_tokens = [name.split(' ')[0] for name in names]
    # Distinct tokens in first-seen order.
    tokens = list(dict.fromkeys(name_tokens))

    # 3. Use approximate string matching to cluster the names
    check_st = datetime.datetime.now()
    print('Creating clusters for', Input_Files, check_st)
    merge_map = _cluster_tokens(tokens, Match_Strength)

    # 4. Number the cluster list and output the results
    output_path = os.path.join(script_dir, Output_File + '.csv')
    with open(output_path, 'w', newline='', encoding='utf-8') as csv_file :
        writer = csv.writer(csv_file, delimiter='\t')
        writer.writerows(
            [merge_map[token], name] for token, name in zip(name_tokens, names)
        )

    print('\nClustering complete. Time:', datetime.datetime.now() - check_st)

    # Every real cluster is one client; every unclustered token (id 0) is a
    # client of its own, so those must be counted individually rather than
    # collapsed into a single "cluster 0".
    cluster_ids = {cluster for cluster in merge_map.values() if cluster != 0}
    unclustered = sum(1 for cluster in merge_map.values() if cluster == 0)

    print('\nNumber of distinct client name strings before clustering :', len(names))
    print('Estimated number of clients :', len(cluster_ids) + unclustered)


# Parameters - names of the files to process and the cutoff ratio for the matches.
# The difflib documentation says that match ratios over 0.6 indicate a good match.
# Only the first column of each csv is used; it is assumed to hold the client names.
# The guard keeps the (slow) clustering run from starting when this file is imported.
if __name__ == '__main__':
    Cluster_My_Clients(['PlanStepPayees.csv', 'Companies.csv', 'CaseClaims.csv'], 'Client Clusters', 0.7)