When I run the following code on arrays with more than 10,000 elements, it takes hours, and I don't know how to make it more efficient. Any ideas?
from scipy.special import rel_entr
from scipy.stats import entropy as KL
import numpy as np

# Upper bound on the number of float64 values in one broadcast temporary.
# The pairwise computation is processed in row blocks of this size so that
# peak memory stays modest even for clusters with many thousands of rows.
_MAX_BLOCK_ELEMENTS = 1 << 20


def dis(di, dj):
    """Return the Jensen-Shannon style divergence between two vectors.

    The midpoint ``m = 0.5 * (di + dj)`` is formed from the raw inputs and
    the result is the mean of ``KL(di, m)`` and ``KL(dj, m)``.
    ``scipy.stats.entropy`` normalises each argument to sum to 1 before
    computing the divergence (natural logarithm).

    Args:
        di: 1-D array-like of non-negative weights.
        dj: 1-D array-like of non-negative weights, same length as ``di``.

    Returns:
        The divergence as a float.
    """
    di = np.asanyarray(di)
    dj = np.asanyarray(dj)
    m = 0.5 * (di + dj)
    return 0.5 * (KL(di, m) + KL(dj, m))


def _as_clusters(C):
    """Convert each cluster of C to a float array, one array per cluster."""
    # Converted one by one (not np.asarray(C)) so that clusters with
    # different numbers of rows are accepted.
    return [np.asarray(cluster, dtype=float) for cluster in C]


def _pairwise_dis_sum(A, B):
    """Return the sum of ``dis(a, b)`` over every row a of A and b of B.

    A and B are 2-D float arrays with the same number of columns. The
    arithmetic mirrors ``dis``: the midpoint is built from the raw rows and
    then normalised, exactly as ``scipy.stats.entropy`` would do.
    """
    n_a, n_b = A.shape[0], B.shape[0]
    if n_a == 0 or n_b == 0:
        return 0.0

    b_raw = B[None, :, :]
    b_norm = b_raw / b_raw.sum(axis=-1, keepdims=True)
    # Number of rows of A per block so that a (rows, n_b, dim) temporary
    # stays near _MAX_BLOCK_ELEMENTS values.
    rows_per_block = max(1, _MAX_BLOCK_ELEMENTS // max(1, B.size))

    total = 0.0
    for start in range(0, n_a, rows_per_block):
        a_raw = A[start:start + rows_per_block, None, :]
        a_norm = a_raw / a_raw.sum(axis=-1, keepdims=True)
        mid = 0.5 * (a_raw + b_raw)
        mid /= mid.sum(axis=-1, keepdims=True)
        kl_sum = rel_entr(a_norm, mid).sum() + rel_entr(b_norm, mid).sum()
        total += 0.5 * float(kl_sum)
    return total


def Intra_Cluster_dist(C):
    """Return the mean within-cluster divergence, averaged over clusters.

    For each cluster with ``n`` rows the contribution is
    ``2 / (n * (n - 1))`` times the sum of ``dis`` over all ordered pairs of
    rows. Pairs of identical rows contribute 0, so they need no special
    handling. Clusters with fewer than two rows contribute 0.

    NOTE(review): both (i, j) and (j, i) are summed, together with the
    factor 2. This matches the previous behaviour; confirm it is the
    intended normalisation.

    Args:
        C: sequence of clusters; each cluster is a 2-D array-like whose rows
            are non-negative vectors with a positive sum. Clusters may have
            different numbers of rows.

    Returns:
        The averaged intra-cluster divergence as a float.

    Raises:
        ZeroDivisionError: if ``C`` contains no clusters.
    """
    clusters = _as_clusters(C)
    factor = 1.0 / float(len(clusters))
    total = 0.0
    for cluster in clusters:
        n = cluster.shape[0]
        if n < 2:
            continue
        pair_sum = _pairwise_dis_sum(cluster, cluster)
        total += 2.0 * pair_sum / (float(n) * float(n - 1))
    return float(factor * total)


def Inter_Cluster_dist(C):
    """Return the mean between-cluster divergence over all cluster pairs.

    For every ordered pair of distinct clusters the mean of ``dis`` over all
    row pairs is accumulated, and the total is divided by ``K * (K - 1)``
    where ``K`` is the number of clusters. Empty clusters contribute 0.

    Args:
        C: sequence of clusters; each cluster is a 2-D array-like whose rows
            are non-negative vectors with a positive sum. Lists and NumPy
            arrays are both accepted.

    Returns:
        The averaged inter-cluster divergence as a float.

    Raises:
        ZeroDivisionError: if ``C`` contains fewer than two clusters.
    """
    clusters = _as_clusters(C)
    k = float(len(clusters))
    factor = 1.0 / (k * (k - 1.0))
    total = 0.0
    for i, first in enumerate(clusters):
        n_first = first.shape[0]
        for second in clusters[i + 1:]:
            n_second = second.shape[0]
            if n_first == 0 or n_second == 0:
                continue
            pair_sum = _pairwise_dis_sum(first, second)
            # The divergence is symmetric, so (first, second) and
            # (second, first) contribute the same amount: count it twice.
            total += 2.0 * pair_sum / (float(n_first) * float(n_second))
    return float(factor * total)


def H_score(C):
    """Return the ratio of intra-cluster to inter-cluster divergence.

    Raises:
        ZeroDivisionError: if the inter-cluster divergence is 0, or if ``C``
            has fewer than two clusters.
    """
    return float(Intra_Cluster_dist(C)) / float(Inter_Cluster_dist(C))