import csv
import datetime
import difflib
import os


def _load_unique_names(data_dir, input_files):
    """Return the sorted, lower-cased, de-duplicated first-column values.

    Each file name in *input_files* is opened inside *data_dir* and parsed as
    a comma-delimited CSV. Only the first column of each row is used.
    """
    unique_names = set()
    for file_name in input_files:
        data_path = os.path.join(data_dir, file_name)
        with open(data_path, 'r', encoding='utf-8', newline='') as csv_file:
            for row in csv.reader(csv_file, delimiter=','):
                # csv.reader yields [] for blank lines; indexing would crash.
                if row:
                    unique_names.add(row[0].lower())
    return sorted(unique_names)


def _cluster_tokens(tokens, match_strength):
    """Map every token to a cluster id using approximate string matching.

    Tokens with at least one close match (other than themselves) share a
    positive cluster id; tokens without any close match are mapped to 0.

    NOTE: the assignment is greedy. A token that already belongs to an
    earlier cluster is re-assigned when a later, still unassigned token
    lists it among its close matches.
    """
    merge_map = {}
    token_count = len(tokens)
    progress = 4
    next_cluster_id = 1
    for position, token in enumerate(tokens, start=1):
        percent = round((position / token_count) * 100)
        if percent > progress:
            print('Progress', percent, '%', token)
            progress = progress + 5
        if token in merge_map:
            continue
        similar_tokens = difflib.get_close_matches(
            token, tokens, token_count, match_strength)
        if len(similar_tokens) > 1:
            for similar in similar_tokens:
                merge_map[similar] = next_cluster_id
            next_cluster_id = next_cluster_id + 1
        else:
            merge_map[token] = 0
    return merge_map


def Cluster_My_Clients(Input_Files, Output_File, Match_Strength):
    """Cluster client names by fuzzy-matching their first word.

    Args:
        Input_Files: names of CSV files located in the ``Data`` directory
            next to this script. Only the first column of each file is read.
        Output_File: base name (without extension) of the tab-delimited
            ``.csv`` file written next to this script. Each row holds a
            cluster id and a client name; id 0 means "no close match found".
        Match_Strength: cutoff ratio in [0, 1] passed to
            ``difflib.get_close_matches``; higher values demand closer
            matches.

    Returns:
        None. Results are written to the output file and a summary is
        printed.
    """
    script_dir = os.path.dirname(os.path.realpath(__file__))

    # 1./2. Scan the client names, remove duplicates and sort them.
    print('Loading data.')
    names = _load_unique_names(os.path.join(script_dir, 'Data'), Input_Files)
    # The first space-separated word of each name is the clustering key.
    name_tokens = [name.split(' ')[0] for name in names]
    # dict.fromkeys keeps first-seen order while dropping repeated tokens.
    tokens = list(dict.fromkeys(name_tokens))

    # 3. Use approximate string matching to cluster the names.
    check_st = datetime.datetime.now()
    print('Creating clusters for', Input_Files, check_st)
    merge_map = _cluster_tokens(tokens, Match_Strength)

    # 4. Output one (cluster id, name) row per distinct client name.
    output_path = os.path.join(script_dir, Output_File + '.csv')
    with open(output_path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, delimiter='\t')
        for token, name in zip(name_tokens, names):
            writer.writerow([merge_map[token], name])
    print('\nClustering complete. Time:', datetime.datetime.now() - check_st)

    # Every token mapped to 0 is its own client, so count those one by one
    # instead of collapsing them all into a single pseudo-cluster.
    real_clusters = {cid for cid in merge_map.values() if cid != 0}
    unmatched_tokens = sum(1 for cid in merge_map.values() if cid == 0)
    print('\nNumber of distinct client name strings before clustering :',
          len(names))
    print('Estimated number of clients :',
          len(real_clusters) + unmatched_tokens)


# Parameters - names of the files to process, the output file name and the
# cutoff ratio for the matches.
# The difflib documentation says that match ratios over 0.6 indicate a good
# match.
# Only the first column from each csv is used. It is assumed that this is
# where the client names are.
if __name__ == '__main__':
    Cluster_My_Clients(
        ['PlanStepPayees.csv', 'Companies.csv', 'CaseClaims.csv'],
        'Client Clusters',
        0.7,
    )