diff options
author | Aaron LI <aaronly.me@gmail.com> | 2016-03-31 10:57:34 +0800 |
---|---|---|
committer | Aaron LI <aaronly.me@gmail.com> | 2016-03-31 10:57:34 +0800 |
commit | c9c896dea2ba43551c4e10bd49666105449e9bd7 (patch) | |
tree | e94b73f17b2d776c2acd4c9549657f500c3dc7ce /cluster | |
parent | 2b6cb9b655a53d43b32a8a211287c82f4f59999a (diff) | |
download | atoolbox-c9c896dea2ba43551c4e10bd49666105449e9bd7.tar.bz2 |
add all scripts/tools
Diffstat (limited to 'cluster')
-rw-r--r-- | cluster/kMeans.py | 76 |
1 files changed, 76 insertions, 0 deletions
diff --git a/cluster/kMeans.py b/cluster/kMeans.py new file mode 100644 index 0000000..f4868c6 --- /dev/null +++ b/cluster/kMeans.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Credit: Machine Learning in Action: Chapter 10 +# +# Aaron LI +# 2015/06/23 +# + +""" +k-means clustering algorithm +""" + + +import numpy as np + + +def loadDataSet(fileName): + dataMat = [] + fr = open(fileName) + for line in fr.readlines(): + curLine = line.strip().split('\t') + fltLine = list(map(float, curLine)) + dataMat.append(fltLine) + return np.array(dataMat) + + +def distEclud(vecA, vecB): + return np.sqrt(np.sum(np.power(vecA - vecB, 2))) + + +def randCent(dataSet, k): + n = np.shape(dataSet)[1] + centroids = np.zeros((k, n)) + for j in range(n): + minJ = np.min(dataSet[:, j]) + rangeJ = float(np.max(dataSet[:, j]) - minJ) + centroids[:, j] = minJ + rangeJ * np.random.rand(k) + return centroids + + +def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent): + m = np.shape(dataSet)[0] + clusterAssment = np.zeros((m, 2)) + centroids = createCent(dataSet, k) + clusterChanged = True + iterations = 0 + while clusterChanged: + clusterChanged = False + iterations += 1 + for i in range(m): + minDist = np.inf + minIndex = -1 + # to find the nearest centroid + for j in range(k): + distJI = distMeas(centroids[j, :], dataSet[i, :]) + if distJI < minDist: + minDist = distJI + minIndex = j + if clusterAssment[i, 0] != minIndex: + clusterChanged = True + clusterAssment[i, :] = minIndex, minDist**2 + #print(centroids) + for cent in range(k): + ptsInClust = dataSet[np.nonzero(clusterAssment[:, 0] == cent)] + centroids[cent, :] = np.mean(ptsInClust, axis=0) + result = { + 'k': k, + 'centroids': centroids, + 'labels': clusterAssment[:, 0].astype(int), + 'distance2': clusterAssment[:, 1], + 'accessment': clusterAssment, + 'iterations': iterations + } + return result + |