diff options
author | Aaron LI <aly@aaronly.me> | 2017-10-16 10:59:31 +0800 |
---|---|---|
committer | Aaron LI <aly@aaronly.me> | 2017-10-16 10:59:31 +0800 |
commit | 020c5df2758d299f72d4badc98f8255edfa61b3a (patch) | |
tree | 558ed7e4286ce88bab7f76f121987c3994a747ab /cluster | |
parent | 2a4f07ece389ab3454afeeeced8480a1c958f8f9 (diff) | |
download | atoolbox-020c5df2758d299f72d4badc98f8255edfa61b3a.tar.bz2 |
Move some scripts
Diffstat (limited to 'cluster')
-rw-r--r-- | cluster/kMeans.py | 76 |
1 files changed, 0 insertions, 76 deletions
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Credit: Machine Learning in Action: Chapter 10
#
# Aaron LI
# 2015/06/23
#

"""
k-means clustering algorithm
"""


import numpy as np


def loadDataSet(fileName):
    """
    Load a tab-separated table of floats from a text file.

    Parameters
    ----------
    fileName : str
        Path of the text file; each non-blank line holds the
        tab-separated numeric fields of one sample.

    Returns
    -------
    numpy.ndarray
        One row per non-blank line of the file.

    Raises
    ------
    ValueError
        If a field cannot be converted to ``float``.
    """
    dataMat = []
    # The context manager guarantees the handle is closed, even when a
    # malformed field raises while parsing.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Blank (e.g. trailing) lines would otherwise reach
                # float('') and abort the whole load.
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return np.array(dataMat)


def distEclud(vecA, vecB):
    """
    Return the Euclidean distance between two vectors.

    Parameters
    ----------
    vecA, vecB : numpy.ndarray
        Arrays that can be subtracted from each other element-wise.

    Returns
    -------
    float
        Square root of the sum of squared element-wise differences.
    """
    return np.sqrt(np.sum(np.power(vecA - vecB, 2)))


def randCent(dataSet, k):
    """
    Create ``k`` random centroids inside the bounding box of the data.

    Parameters
    ----------
    dataSet : numpy.ndarray
        2-D array indexed as ``dataSet[:, j]`` per column (feature).
    k : int
        Number of centroids to generate.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(k, n)`` where ``n`` is the number of columns;
        column ``j`` is drawn uniformly between the minimum and maximum
        of ``dataSet[:, j]``.
    """
    n = np.shape(dataSet)[1]
    centroids = np.zeros((k, n))
    for j in range(n):
        minJ = np.min(dataSet[:, j])
        rangeJ = float(np.max(dataSet[:, j]) - minJ)
        centroids[:, j] = minJ + rangeJ * np.random.rand(k)
    return centroids


def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """
    Cluster the rows of ``dataSet`` into ``k`` groups with k-means.

    Parameters
    ----------
    dataSet : numpy.ndarray
        2-D array with one sample per row.
    k : int
        Number of clusters.
    distMeas : callable, optional
        ``distMeas(centroid, sample)`` returning a distance.
    createCent : callable, optional
        ``createCent(dataSet, k)`` returning the initial ``(k, n)``
        centroid array; the returned array is updated in place.

    Returns
    -------
    dict
        ``'k'``: the requested number of clusters;
        ``'centroids'``: final centroid array;
        ``'labels'``: integer cluster index of every sample;
        ``'distance2'``: squared distance of every sample to its centroid;
        ``'accessment'``: the raw ``(m, 2)`` array of (label, distance**2);
        ``'iterations'``: number of assignment passes performed.
    """
    m = np.shape(dataSet)[0]
    clusterAssment = np.zeros((m, 2))
    # Start from an invalid label: 0 is a real cluster index, so leaving
    # the zeros in place would let a first pass that assigns every sample
    # to cluster 0 look like "nothing changed" and stop before the moved
    # centroid is ever re-checked.
    clusterAssment[:, 0] = -1
    centroids = createCent(dataSet, k)
    clusterChanged = True
    iterations = 0
    while clusterChanged:
        clusterChanged = False
        iterations += 1
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            # to find the nearest centroid
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist**2
        for cent in range(k):
            ptsInClust = dataSet[clusterAssment[:, 0] == cent]
            # An empty cluster has no mean: np.mean would yield NaN and
            # the centroid could never win a comparison again.  Keep its
            # previous position instead.
            if len(ptsInClust) > 0:
                centroids[cent, :] = np.mean(ptsInClust, axis=0)
    result = {
        'k': k,
        'centroids': centroids,
        'labels': clusterAssment[:, 0].astype(int),
        'distance2': clusterAssment[:, 1],
        'accessment': clusterAssment,
        'iterations': iterations
    }
    return result