# **Elements of Machine Learning 2024** <font size=4 color='gray'>Alan Reyes-Figueroa</font>
#### <font color='gray'>Clustering Metrics</font>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import rand_score, adjusted_rand_score, homogeneity_completeness_v_measure
from sklearn.metrics import mutual_info_score, normalized_mutual_info_score, adjusted_mutual_info_score
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score
from sklearn.metrics import fowlkes_mallows_score, calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score, silhouette_samples, silhouette_score

from sklearn.datasets import load_iris
from sklearn.cluster import k_means, KMeans, Birch, DBSCAN, OPTICS, mean_shift, KMeans, MeanShift
from sklearn.cluster import AgglomerativeClustering, SpectralClustering

from warnings import filterwarnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE: this also hides deprecation warnings raised by the libraries used below.
filterwarnings('ignore')

# <center> Metrics for Clustering Methods </center>

### External Metrics

In [None]:
# Load the Iris dataset that ships with scikit-learn.
data = load_iris()

In [None]:
# Feature matrix (XX) and ground-truth class labels (yy) of the loaded dataset.
XX, yy = data['data'], data['target']

In [None]:
# Small hand-made example: reference labels (y) versus predicted labels (yhat).
# The label values themselves are arbitrary identifiers (0, 5, 2 vs. 0, 1, 2).
y = np.array([0] * 3 + [5] * 3 + [2] * 4)

yhat = np.array([0] * 4 + [1] * 2 + [2] * 4)

In [None]:
# Rand index and adjusted Rand index between the two toy labelings.
RI, ARI = rand_score(y, yhat), adjusted_rand_score(y, yhat)

In [None]:
# Show both Rand-based scores, one per line.
print(RI, ARI, sep='\n')

In [None]:
# Mutual-information scores between the two toy labelings:
# raw, normalized and adjusted variants.
MI, NMI, AMI = (
    mutual_info_score(y, yhat),
    normalized_mutual_info_score(y, yhat),
    adjusted_mutual_info_score(y, yhat),
)

In [None]:
# Show the three mutual-information scores, one per line.
print(MI, NMI, AMI, sep='\n')

In [None]:
# Homogeneity, completeness and V-measure of the toy labelings.
# The V-measure is computed with a non-default weight (beta=0.1).
HS, CS, V = (
    homogeneity_score(y, yhat),
    completeness_score(y, yhat),
    v_measure_score(y, yhat, beta=0.1),
)

In [None]:
# Show homogeneity, completeness and V-measure, one per line.
print(HS, CS, V, sep='\n')

In [None]:
# Homogeneity, completeness and V-measure obtained from a single call
# (no beta is passed here, so the library default is used).
HCV = homogeneity_completeness_v_measure(y, yhat)

In [None]:
# Show the combined homogeneity / completeness / V-measure result.
print(HCV)

In [None]:
# Fowlkes-Mallows score between the two toy labelings.
FMI = fowlkes_mallows_score(y, yhat)

In [None]:
# Show the Fowlkes-Mallows score.
print(FMI)

In [None]:
# Calinski-Harabasz score of the Iris features grouped by their ground-truth
# classes. Unlike the scores above, this metric takes the data itself plus a
# single labeling rather than two labelings.
CHI = calinski_harabasz_score(XX, yy)

In [None]:
# Show the Calinski-Harabasz score of the ground-truth partition.
print(CHI)

In [None]:
# Davies-Bouldin score of the Iris features grouped by their ground-truth classes.
DB = davies_bouldin_score(XX, yy)

In [None]:
# Show the Davies-Bouldin score of the ground-truth partition.
print(DB)

### Test

In [None]:
# Fit K-means with 3 clusters on the Iris features; the fixed random_state
# makes the run repeatable.
km = KMeans(n_clusters=3, random_state=2023).fit(XX)

In [None]:
# Cluster index assigned to each sample by the fitted K-means model
# (displayed as the cell output).
km.labels_

In [None]:
# Calinski-Harabasz score of the K-means partition: score the labels found by
# the fitted model (km.labels_), not the ground-truth classes in yy, which
# would merely repeat the value already stored in CHI.
CHI2 = calinski_harabasz_score(XX, km.labels_)

In [None]:
# Show the Calinski-Harabasz score stored in CHI2.
print(CHI2)

In [None]:
# Fit Mean-Shift on the Iris features with a bandwidth of 0.85
# (no cluster count is given to this estimator).
ms = MeanShift(bandwidth=0.85).fit(XX)

In [None]:
# Cluster index assigned to each sample by the fitted Mean-Shift model.
ms_labs = ms.labels_

In [None]:
# Calinski-Harabasz score of the Mean-Shift partition.
CHI3 = calinski_harabasz_score(XX, ms_labs)

In [None]:
# Show the Calinski-Harabasz score of the Mean-Shift partition.
print(CHI3)

### Tabla Comparativa

In [None]:
# Display the ground-truth class labels used as the reference below.
yy

In [None]:
# Clustering algorithms to compare. All are asked for 3 clusters except
# Mean-Shift, which is configured through a bandwidth instead.

km = KMeans(n_clusters=3, random_state=2023)
ms = MeanShift(bandwidth=0.85)
bi = Birch(n_clusters=3)
sp = SpectralClustering(n_clusters=3, affinity='rbf', gamma=1.0)
# AgglomerativeClustering's `affinity` argument was deprecated in scikit-learn
# 1.2 and removed in 1.4 (renamed to `metric`). Ward linkage only supports
# Euclidean distances, which is also the default, so the argument is omitted
# and the cell runs on both old and new library versions.
ah = AgglomerativeClustering(n_clusters=3, linkage='ward')

In [None]:
# Estimators and their display names, listed in the same order.
methods = [km, ms, sp, bi, ah]
names = ['K-means', 'Mean-Shift', 'Spectral', 'Birch', 'Agglomerative']

# (row label, scoring function) pairs; their order defines the row order of
# the results table built from them.
_external = [
    ('RI', rand_score),
    ('ARI', adjusted_rand_score),
    ('MI', mutual_info_score),
    ('NMI', normalized_mutual_info_score),
    ('AMI', adjusted_mutual_info_score),
    ('H', homogeneity_score),
    ('C', completeness_score),
    ('V', v_measure_score),
    ('FMI', fowlkes_mallows_score),
    ('CHI', calinski_harabasz_score),
]
metrics = [scorer for _, scorer in _external]
metr_names = [label for label, _ in _external]

In [None]:
# Table dimensions: n clustering methods (columns) and m metrics (rows).
n, m = len(methods), len(metrics)
# Weight handed to the V-measure when the table is filled in.
beta = 1.0

In [None]:
# Fit every clustering method on the Iris features and score its labels.
# yhat[i] keeps the labels of method i; Met[j, i] is metric j for method i.
yhat = []
Met = np.zeros((m, n))

for i, method in enumerate(methods):
    labels = method.fit(XX).labels_
    yhat.append(labels)

    for j, metric in enumerate(metrics):
        # Dispatch on the scoring function itself rather than on its position
        # in `metrics`, so reordering that list cannot silently break the call.
        if metric is v_measure_score:
            # Only the V-measure accepts the `beta` weight.
            Met[j, i] = metric(yy, labels, beta=beta)
        elif metric is calinski_harabasz_score:
            # Scored against the data itself, not against the ground truth.
            Met[j, i] = metric(XX, labels)
        else:
            Met[j, i] = metric(yy, labels)

In [None]:
# Results table: one row per metric, one column per clustering method.
df = pd.DataFrame(data=Met, index=metr_names, columns=names)

In [None]:
# Display the results table.
df

### Visualización

In [None]:
# Per-method mean over every metric row except the last one (CHI).
# NOTE(review): CHI is presumably excluded because it is not on the same
# scale as the other scores — confirm.
df.iloc[:-1,:].mean()

In [None]:
# Project the data onto its first two right singular vectors for 2-D plotting.
# np.linalg.svd returns the right factor already transposed (its ROWS are the
# singular vectors), so the projection basis is the first two rows turned into
# columns; slicing columns instead would mix components of all vectors.
# The factor is named Vt so the V-measure value stored in `V` earlier is not
# overwritten, and full_matrices=False skips the unused square U factor.
U, S, Vt = np.linalg.svd(XX, full_matrices=False)
Proj = XX @ Vt[:2].T

In [None]:
# 2x3 grid on the shared 2-D projection: ground truth in the first panel,
# then one panel per clustering method, points colored by label.
plt.figure(figsize=(12, 8))
panels = [('Ground-Truth', yy)] + list(zip(names, yhat))
for position, (title, labels) in enumerate(panels, start=1):
    plt.subplot(2, 3, position)
    plt.scatter(Proj[:, 0], Proj[:, 1], marker='o', c=labels)
    plt.xlabel(title)
plt.show()

In [None]:
# Same comparison laid out as a single 1x6 strip; the figure handle is kept
# in `fig` so it can be saved afterwards.
fig = plt.figure(figsize=(15, 2))
strip = [('Ground-Truth', yy)] + list(zip(names, yhat))
for slot, (caption, coloring) in enumerate(strip, start=1):
    plt.subplot(1, 6, slot)
    plt.scatter(Proj[:, 0], Proj[:, 1], marker='o', c=coloring)
    plt.xlabel(caption)
plt.show()

In [None]:
#fig.savefig('metrics.jpg', bbox_inches=('tight'))

### Internal Metrics

In [None]:
# Same estimators and display names as before, now paired with metrics that
# need only the data and a labeling (no ground truth).
methods = [km, ms, sp, bi, ah]
names = ['K-means', 'Mean-Shift', 'Spectral', 'Birch', 'Agglomerative']

# (row label, scoring function) pairs; their order defines the table rows.
_internal = [
    ('Davies-Bouldin', davies_bouldin_score),
    ('Silhouette', silhouette_score),
]
metrics = [scorer for _, scorer in _internal]
metr_names = [label for label, _ in _internal]

In [None]:
# Table dimensions: n clustering methods (columns) and m metrics (rows).
n, m = len(methods), len(metrics)
# Reset alongside the sizes; neither internal metric listed here takes it.
beta = 1.0

In [None]:
# Refit every clustering method and score its labels against the data alone.
# yhat[col] keeps the labels of method col; Met[row, col] is metric row for it.
yhat = []
Met = np.zeros((m, n))

for col, method in enumerate(methods):
    labels = method.fit(XX).labels_
    yhat.append(labels)
    # Fill the whole column at once: one value per metric, in list order.
    Met[:, col] = [metric(XX, labels) for metric in metrics]

In [None]:
# Results table: one row per metric, one column per clustering method.
df = pd.DataFrame(data=Met, index=metr_names, columns=names)

In [None]:
# Display the results table.
df