Agrupamiento K-means en Python

Estoy tratando de implementar el código de este sitio web para estimar qué valor de K debería usar en mi agrupamiento K-means.

Selection of K in K-means Clustering, Reloaded

Sin embargo, no lo estoy consiguiendo. En particular, intento obtener f(k) en función del número de grupos k, para así determinar el valor ideal de k que debo usar.

Mi formato de datos es el siguiente:

Cada una de las coordenadas tiene 5 dimensiones / variables, es decir, son puntos de datos que viven en un espacio de cinco dimensiones. La lista de coordenadas está debajo, donde, por ejemplo, el primer punto de datos tiene coordenadas ( 35.38361202590826,-24.022420305129415, 0.9608968122051765, -11.700331772145386, -9.4393980963685) .

     # Sample data set. Each VariableN list holds the N-th coordinate of every
     # data point: the i-th entries of the five lists together form point i
     # (e.g. the first entries give the first point).
     Variable1 = [35.38361202590826, 3.0, 10.0, 10.04987562112089, 5.385164807134505, 24.35159132377184, 10.77032961426901, 10.816653826391967, 18.384776310850235, 14.317821063276353, 24.18677324489565, 3.0, 24.33105012119288, 8.94427190999916, 2.82842712474619, 4.123105625617661, 4.47213595499958, 13.453624047073712, 12.529964086141668, 19.4164878389476, 5.385164807134505, 5.0, 24.041630560342618, 30.083217912982647, 15.132745950421555, 1.414213562373095, 21.470910553583888, 12.649110640673516, 9.0, 9.055385138137416, 16.124515496597102, 18.027756377319946, 7.615773105863908, 4.47213595499958, 5.0, 16.124515496597102, 8.246211251235321, 3.0, 23.02172886644268, 2.23606797749979, 10.0, 13.416407864998737, 14.7648230602334, 12.649110640673516, 2.82842712474619, 9.899494936611665, 12.806248474865697, 13.0, 10.19803902718557, 10.440306508910549]
     Variable2 = [-24.022420305129415, -40.0, -21.0, -36.020346285601605, -14.298541039632994, -10.225204451297113, -7.242118188905023, -10.816653826391967, -16.263455967290593, -0.9079593845004517, -5.70559779110359, -1.0, -17.426292654367874, -0.4472135954999579, -12.727922061357855, -38.32062875574061, -15.205262246998569, -13.89960053482201, -6.943355894868313, -18.43793805396085, -14.298541039632994, -8.0, -9.899494936611665, -10.537436550735357, -9.251460406371256, -1.414213562373095, -0.23287321641631115, -4.743416490252569, -10.0, -25.951408627588936, -5.457528321925173, -11.648704120729812, -15.231546211727816, -9.838699100999074, -2.2, 4.713319914389921, -3.395498750508662, -32.0, -16.59301967354925, -4.47213595499958, -3.4, -13.416407864998737, 4.944183868793753, -3.478505426185217, -21.213203435596423, -18.384776310850235, -6.871645523098667, -21.0, -5.491251783869154, -8.620436566990362]
     Variable3 = [0.9608968122051765, 22.0, 21.0, 18.507691737905798, 15.412713068695306, -8.08982038917884, -0.7427813527082074, -7.211102550927978, -14.849242404917499, -0.4190581774617469, -10.170848236315095, -7.0, 1.150792911137501, -5.366563145999495, -12.727922061357855, 4.85071250072666, 9.838699100999074, -8.473553267217696, 6.065460321953928, -10.249021432229634, 4.642383454426297, -9.0, 9.899494936611665, 4.354587344310195, -8.854969246098202, -8.48528137423857, -10.292996165600954, -11.067971810589327, -30.0, -10.932721081409808, -14.6360986815266, -22.188007849009164, 0.0, -7.155417527999327, -5.4, -12.279438724331637, 19.40285000290664, -7.0, 18.938629784469825, 8.94427190999916, 3.8, -8.94427190999916, -43.549455173073746, -8.538149682454623, -11.31370849898476, 1.4142135623730951, -10.619815808425212, 12.0, 7.060180864974626, -7.854175538813441]
     Variable4 = [-11.700331772145386, -8.0, -5.0, -2.9851115706299676, -10.398938937914904, -8.459406092237773, -7.242118188905023, -10.539303728279352, -21.920310216782973, -8.03194840135015, -10.791021909261136, -10.0, -9.69954025101608, -2.6832815729997477, -23.33452377915607, -7.761140001162655, -17.44133022449836, -4.980070779856015, -2.7134954071899156, -6.48933015307002, -12.441587657862476, -5.2, -18.384776310850235, -10.603918800266811, -14.604091070057484, -4.949747468305833, -1.3506646552146047, -7.905694150420948, -14.0, -29.706080514133717, -2.4806946917841692, -23.574758339572238, -3.2826608214930637, -5.813776741499453, -13.4, -4.9613893835683385, -11.884245626780316, -19.0, -5.473090258814675, -2.23606797749979, -2.0, -2.6832815729997477, -6.163297699455227, -12.01665510863984, -12.727922061357855, -12.020815280171307, -8.589556903873333, -18.53846153846154, -5.491251783869154, -4.789131426105757]
     Variable5 = [-9.4393980963685, -4.0, -2.0, -0.29851115706299675, -9.84185292338375, 6.118696639531204, -6.127946159842712, -2.218800784900916, 10.606601717798213, 0.6984302957695782, 0.7442084075352507, -0.0, 3.452378733412503, 1.3416407864998738, -6.363961030678928, 6.305926250944657, -5.813776741499453, -0.4459764877482998, -0.7980868844676221, 7.673890419106611, -1.4855627054164149, 1.4, -2.8284271247461903, -2.925218979383948, 3.9649116027305387, 0.7071067811865475, 0.4191717895493601, 1.5811388300841895, -4.0, 4.748555621218401, 4.341215710622296, 4.714951667914447, -5.120950881529179, 4.919349550499537, 6.2, 0.6201736729460423, -6.305926250944657, -9.0, -6.168085847235585, 0.0, -1.0, 1.3416407864998738, 3.3186987612451224, 4.427188724235731, 4.242640687119285, 4.949747468305833, 5.9346029517670305, 2.3076923076923075, -3.1378581622109447, 1.436739427831727]

    Puedo usar scikit-learn para crear grupos con estas coordenadas; sin embargo, me interesa encontrar el valor óptimo de k, y scikit-learn no tiene una función que estime el valor óptimo de K con esta técnica (ni con ninguna otra, que yo sepa).

    Puedes probar el código en el último comentario de Monte Shaffer. Aquí hay una versión simplificada:

     import random

     import numpy as np
     from numpy import zeros


     class KMeansFK:
         """K-means clustering that also computes an f(K) score for each K.

         ``run(maxk)`` fits K = 1 .. maxk - 1 and stores the resulting f(K)
         values in ``self.fs``.  After any fit, ``self.mu`` is the list of
         cluster centers and ``self.clusters`` maps a center index to the list
         of points assigned to that center.
         """

         def __init__(self, K, X):
             """Store the data set.

             Args:
                 K: Initial number of clusters.
                 X: Sequence of points, one point per row (points x dimensions).
             """
             self.K = K
             # An array guarantees that ``x - center`` is element-wise even when
             # the caller passes plain nested lists.
             self.X = np.asarray(X, dtype=float)
             self.N = len(self.X)
             self.mu = None
             self.clusters = None
             self.method = None

         def _cluster_points(self):
             """Assign every point to its nearest center (Euclidean distance)."""
             mu = self.mu
             clusters = {}
             for x in self.X:
                 # ``min`` returns the first index on ties.
                 nearest = min(
                     range(len(mu)), key=lambda i: np.linalg.norm(x - mu[i])
                 )
                 clusters.setdefault(nearest, []).append(x)
             self.clusters = clusters

         def _reevaluate_centers(self):
             """Replace the centers by the mean of each non-empty cluster."""
             clusters = self.clusters
             # Empty clusters have no key, so they are dropped here; the next
             # assignment pass renumbers the remaining centers from 0.
             self.mu = [np.mean(clusters[k], axis=0) for k in sorted(clusters)]

         def _has_converged(self):
             """Return True when the centers did not change and are all distinct."""
             new_centers = {tuple(c) for c in self.mu}
             old_centers = {tuple(c) for c in self.oldmu}
             return (
                 new_centers == old_centers
                 and len(new_centers) == len(self.oldmu)
             )

         def find_centers(self, K, method='random'):
             """Iterate assignment and center updates until the centers are stable.

             Args:
                 K: Number of clusters to fit.
                 method: ``'++'`` keeps the centers prepared by ``init_centers``
                     (seeding them first if none match ``K``); any other value
                     starts from ``K`` points sampled from the data.
             """
             self.method = method
             self.K = K
             if method != '++':
                 # Initialize to K random centers.
                 # https://stackoverflow.com/questions/44372231/population-must-be-a-sequence-or-set-for-dicts-use-listd
                 self.mu = random.sample(list(self.X), K)
             elif self.mu is None or len(self.mu) != K:
                 # No usable seeding is available yet, so create one.
                 self.init_centers(K)
             # Always run at least one pass so ``self.clusters`` is populated
             # and consistent with ``self.mu`` when this method returns.
             while True:
                 self.oldmu = self.mu
                 # Assign all points in X to clusters.
                 self._cluster_points()
                 # Reevaluate centers.
                 self._reevaluate_centers()
                 if self._has_converged():
                     break

         def _dist_from_centers(self):
             """Store in ``self.D2`` each point's squared distance to its nearest center."""
             centers = self.mu
             self.D2 = np.array(
                 [min(np.linalg.norm(x - c) ** 2 for c in centers) for x in self.X]
             )

         def _choose_next_center(self):
             """Draw a point with probability proportional to ``self.D2``."""
             total = self.D2.sum()
             if total == 0:
                 # Every point coincides with a center; no weighting is possible.
                 return self.X[random.randrange(len(self.X))]
             self.probs = self.D2 / total
             self.cumprobs = self.probs.cumsum()
             r = random.random()
             # First index whose cumulative probability exceeds r.  Clamp because
             # rounding can leave the last cumulative value slightly below 1.
             ind = int(np.searchsorted(self.cumprobs, r, side='right'))
             ind = min(ind, len(self.X) - 1)
             return self.X[ind]

         def init_centers(self, K):
             """Seed ``K`` centers: one uniform draw, the rest weighted by D2."""
             self.K = K
             self.mu = random.sample(list(self.X), 1)
             while len(self.mu) < self.K:
                 self._dist_from_centers()
                 self.mu.append(self._choose_next_center())

         def get_ak(self, k, Nd):
             """Return the weight a_k used in f(K).

             a_2 = 1 - 3 / (4 * Nd) and a_k = a_(k-1) + (1 - a_(k-1)) / 6 for k > 2.

             Args:
                 k: Number of clusters, at least 2.
                 Nd: Number of dimensions of the data.

             Raises:
                 ValueError: If ``k`` is smaller than 2.
             """
             if k < 2:
                 raise ValueError("a_k is only defined for k >= 2")
             a = 1 - 3.0 / (4.0 * Nd)
             for _ in range(k - 2):
                 a = a + (1.0 - a) / 6.0
             return a

         def fK(self, thisk, Skm1=0):
             """Fit ``thisk`` clusters and return ``(f(thisk), S_thisk)``.

             Args:
                 thisk: Number of clusters to fit.
                 Skm1: Distortion S obtained for ``thisk - 1`` clusters.

             Returns:
                 Tuple ``(fs, Sk)``.  ``Sk`` is the sum of squared distances from
                 every point to its cluster center.  ``fs`` is 1 when
                 ``thisk == 1`` or ``Skm1 == 0``, otherwise ``Sk / (a_k * Skm1)``.
             """
             Nd = len(self.X[0])
             self.find_centers(thisk, method='++')
             mu, clusters = self.mu, self.clusters
             # Iterate over the clusters that actually exist: fewer than thisk
             # may survive if a center ends up without points.
             Sk = sum(
                 np.linalg.norm(mu[i] - point) ** 2
                 for i, members in clusters.items()
                 for point in members
             )
             if thisk == 1 or Skm1 == 0:
                 fs = 1
             else:
                 fs = Sk / (self.get_ak(thisk, Nd) * Skm1)
             return fs, Sk

         def run(self, maxk):
             """Compute f(K) for K = 1 .. maxk - 1 and store the values in ``self.fs``."""
             ks = range(1, maxk)
             fs = zeros(len(ks))
             Sk = 0  # Not used for K = 1, where f is 1 by definition.
             for k in ks:
                 self.init_centers(k)
                 fs[k - 1], Sk = self.fK(k, Skm1=Sk)
             self.fs = fs

    Y luego ejecutarlo en sus datos:

     # Each VariableN list holds one coordinate for all points, so stacking the
     # lists yields (dimensions x points).  Transpose so that every row of X is
     # one 5-dimensional data point, which is the layout KMeansFK iterates over.
     # NOTE(review): assumes the class is importable as ``kmeans.KMeansFK`` and
     # that numpy is imported as ``np`` - confirm against your module layout.
     X = np.array([Variable1, Variable2, Variable3, Variable4, Variable5]).T
     km = kmeans.KMeansFK(2, X)
     km.run(5)

    Ahora km.fs contiene los valores de f(k) para k = 1, …, maxk − 1, y km.clusters contiene los grupos obtenidos para el último k evaluado.