Spectral clustering on Zachary's karate club¶

Importing packages¶

import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from scipy.linalg import fractional_matrix_power
from sklearn.cluster import KMeans

Reading the dataset¶

karate.paj is downloaded from http://networkdata.ics.uci.edu/data/karate/.

with open('karate.paj', 'r') as infile:
    karateLines = list(filter(lambda x : len(x) > 0, infile.read().split('\n')))

edgeInd = karateLines.index('*Edges')
vertexLines = karateLines[1:edgeInd]
edgeLines = karateLines[edgeInd+1:]

print(vertexLines)
print(edgeLines)

# generate adjacency matrix
nVertices = len(vertexLines)
print('num. of vertices:', nVertices)
adMat = np.zeros((nVertices, nVertices), np.float)

for line in edgeLines:
    v1, v2 = list(map(lambda x : int(x) - 1, line.split(' ')))
    adMat[v1, v2] = 1.0
    adMat[v2, v1] = 1.0

print('adjancency matrix:')
print(adMat)

['1 "1"', '2 "2"', '3 "3"', '4 "4"', '5 "5"', '6 "6"', '7 "7"', '8 "8"', '9 "9"', '10 "10"', '11 "11"', '12 "12"', '13 "13"', '14 "14"', '15 "15"', '16 "16"', '17 "17"', '18 "18"', '19 "19"', '20 "20"', '21 "21"', '22 "22"', '23 "23"', '24 "24"', '25 "25"', '26 "26"', '27 "27"', '28 "28"', '29 "29"', '30 "30"', '31 "31"', '32 "32"', '33 "33"', '34 "34"']
['1 2', '1 3', '2 3', '1 4', '2 4', '3 4', '1 5', '1 6', '1 7', '5 7', '6 7', '1 8', '2 8', '3 8', '4 8', '1 9', '3 9', '3 10', '1 11', '5 11', '6 11', '1 12', '1 13', '4 13', '1 14', '2 14', '3 14', '4 14', '6 17', '7 17', '1 18', '2 18', '1 20', '2 20', '1 22', '2 22', '24 26', '25 26', '3 28', '24 28', '25 28', '3 29', '24 30', '27 30', '2 31', '9 31', '1 32', '25 32', '26 32', '29 32', '3 33', '9 33', '15 33', '16 33', '19 33', '21 33', '23 33', '24 33', '30 33', '31 33', '32 33', '9 34', '10 34', '14 34', '15 34', '16 34', '19 34', '20 34', '21 34', '23 34', '24 34', '27 34', '28 34', '29 34', '30 34', '31 34', '32 34', '33 34']
num. of vertices: 34
adjancency matrix:
[[0. 1. 1. ... 1. 0. 0.]
 [1. 0. 1. ... 0. 0. 0.]
 [1. 1. 0. ... 0. 1. 0.]
 ...
 [1. 0. 0. ... 0. 1. 1.]
 [0. 0. 1. ... 1. 0. 1.]
 [0. 0. 0. ... 1. 1. 0.]]

nxGraph = nx.from_numpy_matrix(adMat)
# generate a circular layout
nxPos_x = np.sin(np.linspace(0, 2.0 * np.pi, nVertices, endpoint=False)).tolist()
nxPos_y = np.cos(np.linspace(0, 2.0 * np.pi, nVertices, endpoint=False)).tolist()
nxPos_tup = zip(nxPos_x, nxPos_y)
nxPos = dict()
for i, loc in enumerate(nxPos_tup):
    nxPos[i] = loc
nx.draw(nxGraph, nxPos)
ax = plt.gca()
ax.set_aspect(1.0)
ax.set_title('Visualization of Karate club')
plt.show()

/home/alan/.local/lib/python3.6/site-packages/networkx/drawing/nx_pylab.py:563: MatplotlibDeprecationWarning: 
The iterable function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use np.iterable instead.
  if not cb.iterable(width):
/home/alan/.local/lib/python3.6/site-packages/networkx/drawing/nx_pylab.py:611: MatplotlibDeprecationWarning: 
The is_numlike function was deprecated in Matplotlib 3.0 and will be removed in 3.2. Use isinstance(..., numbers.Number) instead.
  if cb.is_numlike(alpha):

Spectral clustering¶

# compute the degree matrix
dMat = np.diag(np.sum(adMat, axis=0))
print(dMat)

[[16.  0.  0. ...  0.  0.  0.]
 [ 0.  9.  0. ...  0.  0.  0.]
 [ 0.  0. 10. ...  0.  0.  0.]
 ...
 [ 0.  0.  0. ...  6.  0.  0.]
 [ 0.  0.  0. ...  0. 12.  0.]
 [ 0.  0.  0. ...  0.  0. 17.]]

# compute the lapacian
laplacian_rcut = dMat - adMat
dMat_inv_sqrt = fractional_matrix_power(dMat, -0.5)
laplacian_ncut = np.identity(dMat.shape[0]) - dMat_inv_sqrt @ adMat @ dMat_inv_sqrt

def spectral_clustering(laplacian):
    eigVals, eigVecs = np.linalg.eig(laplacian)
    # find the second smallest eigenvalue
    eigValInds = list(zip(range(eigVals.size), eigVals.tolist()))
    eigValInds.sort(key=lambda x : x[1])
    #print(eigValInds)
    eigInd = eigValInds[1][0]
    vec = eigVecs[:, eigInd].reshape((-1, 1))
    #print(vec)
    kmeans = KMeans(n_clusters=2)
    kmeans.fit(vec)
    assignment = kmeans.predict(vec)
    group1 = np.where(assignment == 0)[0] + 1
    group2 = np.where(assignment == 1)[0] + 1
    print('people in group 1:', group1)
    print('people in group 2:', group2)

Comparison¶

ratio cut

spectral_clustering(laplacian_rcut)

people in group 1: [ 2  3  4  8  9 10 14 15 16 19 20 21 23 24 25 26 27 28 29 30 31 32 33 34]
people in group 2: [ 1  5  6  7 11 12 13 17 18 22]

normalized cut

spectral_clustering(laplacian_ncut)

people in group 1: [ 1  2  4  5  6  7  8 11 12 13 14 17 18 20 22]
people in group 2: [ 3  9 10 15 16 19 21 23 24 25 26 27 28 29 30 31 32 33 34]

Result according to Zachary's paper:

Mr. Hi's: [1 2 3 4 5 6 7 8 9 11 12 13 14 17 18 20 22]

Officers': [10 15 16 19 21 23 24 25 26 27 28 29 30 31 32 33 34]