aboutsummaryrefslogtreecommitdiffstats
path: root/python/kMeans.py
blob: f4868c6e7ca824ac593497818bff5ef0e45a7a2e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Credit: Machine Learning in Action: Chapter 10
#
# Aaron LI
# 2015/06/23
#

"""
k-means clustering algorithm
"""


import numpy as np


def loadDataSet(fileName):
    """
    Load a tab-separated text file of numbers into a NumPy array.

    Every non-blank line is stripped, split on tab characters, and each
    field is converted with ``float``; one line becomes one row.

    Parameters:
        fileName: path of the text file to read.

    Returns:
        np.ndarray built from the list of parsed rows (2-D when all rows
        have the same number of fields).

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a field cannot be converted to float.
    """
    dataMat = []
    # The context manager closes the file even if a conversion fails.
    with open(fileName) as fr:
        for line in fr:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line) carry no data;
                # without this guard float('') would raise ValueError.
                continue
            dataMat.append([float(field) for field in line.split('\t')])
    return np.array(dataMat)


def distEclud(vecA, vecB):
    """
    Return the Euclidean distance between vecA and vecB.

    The element-wise difference is squared, summed over all elements,
    and the square root of that sum is returned.
    """
    delta = vecA - vecB
    sumOfSquares = np.sum(np.power(delta, 2))
    return np.sqrt(sumOfSquares)


def randCent(dataSet, k):
    """
    Generate k random centroids inside the bounding box of dataSet.

    For each column of dataSet, k values are drawn with np.random.rand
    and scaled to lie between that column's minimum and maximum.

    Returns a (k, number-of-columns) float array, one centroid per row.
    """
    numFeatures = np.shape(dataSet)[1]
    centroids = np.zeros((k, numFeatures))
    for col in range(numFeatures):
        column = dataSet[:, col]
        low = np.min(column)
        span = float(np.max(column) - low)
        # One batch of k uniform [0, 1) samples per column, stretched to
        # the column's range and shifted to start at its minimum.
        samples = np.random.rand(k)
        centroids[:, col] = low + span * samples
    return centroids


def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """
    Cluster the rows of dataSet into k groups with the k-means iteration.

    Each pass assigns every row to its nearest centroid (according to
    distMeas) and then moves each centroid to the mean of the rows
    assigned to it.  Iteration stops after the first pass in which no
    row changes cluster.

    Parameters:
        dataSet: 2-D array, one sample per row.
        k: number of clusters.
        distMeas: callable(centroid, sample) returning a distance.
        createCent: callable(dataSet, k) returning the k initial
            centroids as a (k, number-of-columns) array.

    Returns:
        dict with the keys
        'k'          - the requested number of clusters
        'centroids'  - (k, number-of-columns) float array of centroids
        'labels'     - int array, cluster index of every row
        'distance2'  - squared distance of every row to its centroid, as
                       measured during the last assignment pass
        'accessment' - (m, 2) array holding labels and distance2 as columns
        'iterations' - number of assignment passes performed
    """
    m = np.shape(dataSet)[0]
    # Column 0: index of the assigned cluster; column 1: squared distance.
    clusterAssment = np.zeros((m, 2))
    # Start every row as "unassigned" (-1).  With an initial label of 0 a
    # first pass that puts all rows in cluster 0 would look like "nothing
    # changed" and end the loop with distances measured against the
    # initial centroids rather than the returned ones.
    clusterAssment[:, 0] = -1
    # Private float copy: the caller's array is not modified in place and
    # mean updates are not truncated by an integer dtype.
    centroids = np.array(createCent(dataSet, k), dtype=float)
    clusterChanged = True
    iterations = 0
    while clusterChanged:
        clusterChanged = False
        iterations += 1
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            # to find the nearest centroid
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist**2
        for cent in range(k):
            ptsInClust = dataSet[clusterAssment[:, 0] == cent]
            # An empty cluster keeps its previous centroid: the mean of
            # zero rows is NaN, which would disable the centroid for good.
            if len(ptsInClust) > 0:
                centroids[cent, :] = np.mean(ptsInClust, axis=0)
    result = {
            'k': k,
            'centroids': centroids,
            'labels': clusterAssment[:, 0].astype(int),
            'distance2': clusterAssment[:, 1],
            'accessment': clusterAssment,
            'iterations': iterations
    }
    return result