# **Elements of Machine Learning 2024** <font size=4 color='gray'>Alan Reyes-Figueroa</font>
#### <font color='gray'>Agrupamiento Jerárquico</font>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering

from sklearn.datasets import load_iris, make_circles, make_moons, make_blobs

# Agrupamiento Jerárquico

### Burbujas

In [None]:
# Seed NumPy's global generator before sampling the blobs.
# (Original note: "2024, 0" -- presumably other seeds that were tried.)
np.random.seed(10)
data, colors = make_blobs(
    n_features=2,
    n_samples=250,
)

In [None]:
# Show the shape of the sample matrix followed by the shape of the label vector.
print(*(arr.shape for arr in (data, colors)))

In [None]:
# Scatter plot of the two features, colored by the generated blob label.
plt.figure()
plt.scatter(x=data[:, 0], y=data[:, 1], c=colors)
plt.show()

In [None]:
# Parameters for the blobs experiment.
#   k    -- number of clusters requested from every model.
#   dist -- metric used by the single, complete and average linkages.
#
# The blobs are continuous 2-D points, so a geometric metric is required.
# The Jaccard dissimilarity is meant for boolean vectors and gives
# degenerate pairwise distances on this kind of data.

k = 3
dist = 'euclidean'

In [None]:
# Fit one agglomerative model per linkage criterion on the same data.
# 'single', 'complete' and 'average' use the metric selected in `dist`;
# 'ward' is always paired with the Euclidean metric.
model_s, model_c, model_a, model_w = (
    AgglomerativeClustering(n_clusters=k, metric=metric, linkage=method).fit(data)
    for method, metric in (
        ('single', dist),
        ('complete', dist),
        ('average', dist),
        ('ward', 'euclidean'),
    )
)

# Cluster label assigned to every sample by each fitted model.
labels_s, labels_c, labels_a, labels_w = (
    fitted.labels_ for fitted in (model_s, model_c, model_a, model_w)
)

In [None]:
# Side-by-side comparison: one panel per linkage criterion,
# points colored by the cluster each model assigned.
labels = [labels_s, labels_c, labels_a, labels_w]
models = ['single', 'complete', 'average', 'ward']

plt.figure(figsize=(16, 4))
for panel, (method_name, assignment) in enumerate(zip(models, labels), start=1):
    plt.subplot(1, 4, panel)
    plt.scatter(x=data[:, 0], y=data[:, 1], c=assignment)
    plt.title('method = {}'.format(method_name))
plt.show()

In [None]:
# Display the raw label array produced by the average-linkage model.
labels_a

In [None]:
# Build the merge hierarchy with SciPy: 'weighted' linkage over Euclidean
# distances, with optimal_ordering enabled for the leaf order.
link = linkage(
    y=data,
    method='weighted',
    metric='euclidean',
    optimal_ordering=True,
)

In [None]:
# Draw the hierarchy stored in `link`; color_threshold=5.0 is the cut
# height used to color the sub-trees. The returned dict is kept in `D`.
plt.figure(figsize=(10, 4))
D = dendrogram(Z=link, color_threshold=5.0)
plt.show()

In [None]:
# Inspect which entries the dictionary returned by dendrogram() contains.
D.keys()

In [None]:
# Print the 'color_list' entry of the dendrogram result as a plain list.
print(list(D['color_list']))

### Moons

In [None]:
# Noisy two-moons sample; the fixed random_state keeps it reproducible.
data, colors = make_moons(
    random_state=2021,
    noise=0.1,
    n_samples=250,
)

In [None]:
# Show the shape of the sample matrix followed by the shape of the label vector.
print(*(arr.shape for arr in (data, colors)))

In [None]:
# Scatter plot of the two features, colored by the generated moon label.
plt.figure()
plt.scatter(x=data[:, 0], y=data[:, 1], c=colors)
plt.show()

In [None]:
# Parameters for the moons experiment: number of clusters and the metric
# used by the single, complete and average linkages.
k, dist = 2, 'euclidean'

In [None]:
# Fit one agglomerative model per linkage criterion on the same data.
# 'single', 'complete' and 'average' use the metric selected in `dist`;
# 'ward' is always paired with the Euclidean metric.
model_s, model_c, model_a, model_w = (
    AgglomerativeClustering(n_clusters=k, metric=metric, linkage=method).fit(data)
    for method, metric in (
        ('single', dist),
        ('complete', dist),
        ('average', dist),
        ('ward', 'euclidean'),
    )
)

# Cluster label assigned to every sample by each fitted model.
labels_s, labels_c, labels_a, labels_w = (
    fitted.labels_ for fitted in (model_s, model_c, model_a, model_w)
)

In [None]:
# Side-by-side comparison: one panel per linkage criterion,
# points colored by the cluster each model assigned.
labels = [labels_s, labels_c, labels_a, labels_w]
models = ['single', 'complete', 'average', 'ward']

plt.figure(figsize=(16, 4))
for panel, (method_name, assignment) in enumerate(zip(models, labels), start=1):
    plt.subplot(1, 4, panel)
    plt.scatter(x=data[:, 0], y=data[:, 1], c=assignment)
    plt.title('method = {}'.format(method_name))
plt.show()

In [None]:
# Single-linkage hierarchy of the samples under the Euclidean metric,
# with optimal_ordering enabled for the leaf order.
link = linkage(
    y=data,
    method='single',
    metric='euclidean',
    optimal_ordering=True,
)

# Plot the tree; color_threshold=0.170 is the cut height used for coloring.
plt.figure(figsize=(6, 4))
D = dendrogram(Z=link, color_threshold=0.170)
plt.show()

### Circles

In [None]:
# Two concentric circles with a small amount of noise.
# random_state is pinned (same value as the moons sample) so that re-running
# this cell on its own always yields the same points; without it the sample
# depends on whatever state the global NumPy generator happens to be in.
data, colors = make_circles(n_samples=200, noise=0.01, random_state=2021)

In [None]:
# Show the shape of the sample matrix followed by the shape of the label vector.
print(*(arr.shape for arr in (data, colors)))

In [None]:
# Square figure so the circles are not visually stretched by the canvas;
# points are colored by the generated circle label.
plt.figure(figsize=(4, 4))
plt.scatter(x=data[:, 0], y=data[:, 1], c=colors)
plt.show()

In [None]:
# Parameters for the circles experiment: number of clusters and the metric
# used by the single, complete and average linkages.
# Alternative noted by the author: 'jaccard'
# (Jaccard similarity index, with distance D = 1 - index).
k, dist = 2, 'manhattan'

In [None]:
# Fit one agglomerative model per linkage criterion on the same data.
# 'single', 'complete' and 'average' use the metric selected in `dist`;
# 'ward' is always paired with the Euclidean metric.
model_s, model_c, model_a, model_w = (
    AgglomerativeClustering(n_clusters=k, metric=metric, linkage=method).fit(data)
    for method, metric in (
        ('single', dist),
        ('complete', dist),
        ('average', dist),
        ('ward', 'euclidean'),
    )
)

# Cluster label assigned to every sample by each fitted model.
labels_s, labels_c, labels_a, labels_w = (
    fitted.labels_ for fitted in (model_s, model_c, model_a, model_w)
)

In [None]:
# Side-by-side comparison: one panel per linkage criterion,
# points colored by the cluster each model assigned.
labels = [labels_s, labels_c, labels_a, labels_w]
models = ['single', 'complete', 'average', 'ward']

plt.figure(figsize=(16, 4))
for panel, (method_name, assignment) in enumerate(zip(models, labels), start=1):
    plt.subplot(1, 4, panel)
    plt.scatter(x=data[:, 0], y=data[:, 1], c=assignment)
    plt.title('method = {}'.format(method_name))
plt.show()

In [None]:
# Single-linkage hierarchy of the samples under the Euclidean metric,
# with optimal_ordering enabled for the leaf order.
link = linkage(
    y=data,
    method='single',
    metric='euclidean',
    optimal_ordering=True,
)

# Plot the tree; color_threshold=0.150 is the cut height used for coloring.
plt.figure(figsize=(6, 4))
D = dendrogram(Z=link, color_threshold=0.150)
plt.show()

### Iris Dataset

In [None]:
# Load the Iris dataset; the returned object is indexed by the keys
# 'data' and 'target' in the next cell.
data = load_iris()

In [None]:
X = data['data']
y = data['target']

In [None]:
# Scatter plot of feature columns 2 and 3, colored by the true class label.
plt.figure()
plt.scatter(x=X[:, 2], y=X[:, 3], c=y)
plt.show()

In [None]:
# Parameters for the Iris experiment: number of clusters and the metric
# passed to the clustering models.
k, dist = 3, 'euclidean'

In [None]:
# Fit one agglomerative model per linkage criterion on the Iris feature
# matrix X and keep the per-sample cluster labels of each.

# method = 'single'
model_s = AgglomerativeClustering(n_clusters=k, metric=dist, linkage='single')
model_s.fit(X)
labels_s = model_s.labels_

# method = 'complete'
model_c = AgglomerativeClustering(n_clusters=k, metric=dist, linkage='complete')
model_c.fit(X)
labels_c = model_c.labels_

# method = 'average'
model_a = AgglomerativeClustering(n_clusters=k, metric=dist, linkage='average')
model_a.fit(X)
labels_a = model_a.labels_

# method = 'ward'
# Ward linkage only accepts the Euclidean metric, so it is pinned here
# instead of following `dist` (as the other sections of this notebook do);
# otherwise changing `dist` in the parameter cell would make this fit raise.
model_w = AgglomerativeClustering(n_clusters=k, metric='euclidean', linkage='ward')
model_w.fit(X)
labels_w = model_w.labels_

In [None]:
# Side-by-side comparison on feature columns 2 and 3: one panel per
# linkage criterion, points colored by the cluster each model assigned.
labels = [labels_s, labels_c, labels_a, labels_w]
models = ['single', 'complete', 'average', 'ward']

plt.figure(figsize=(16, 4))
for panel, (method_name, assignment) in enumerate(zip(models, labels), start=1):
    plt.subplot(1, 4, panel)
    plt.scatter(x=X[:, 2], y=X[:, 3], c=assignment)
    plt.title('method = {}'.format(method_name))
plt.show()