# <center> **Fitting distributions to data** </center>
## <font size=4> **Elements of Machine Learning 2025** </font> <font color=gray size=4> -- Alan Reyes-Figueroa </font>

In [None]:
import pandas as pd
import numpy as np
import scipy as sp
import scipy.stats as st

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_diabetes

In [None]:
import warnings
# Silence every warning for the rest of the session: no category or module
# filter is given, so this also hides deprecation and numerical warnings.
warnings.filterwarnings('ignore')

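En este notebook vamos a usar la librería $\texttt{scipy.stats}$ para ajustar distribuciones de probabilidad a un conjunto de datos. Vamos a considerar varias distribuciones continuas conocidas, seleccionar la que mejor se ajusta según la prueba de Kolmogorov-Smirnov y, finalmente, generar datos aleatorios a partir de la distribución ajustada para compararlos con los datos originales.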

In [None]:
# Load the scikit-learn diabetes dataset and pull out features and target.
data = load_diabetes()
X = data.data
y_ = data.target

In [None]:
# Wrap the target vector in a named pandas Series.
SR_y = pd.Series(data=y_, name="y_ (Target Vector Distribution)")

In [None]:
# Histogram of the target with a kernel density estimate on top.
# `sns.distplot` is deprecated; `sns.histplot` with `kde=True` and
# `stat="density"` is its documented replacement (`cut=3` and `alpha=0.4`
# mirror the old distplot defaults).
fig, ax = plt.subplots()
sns.histplot(
    x=SR_y,
    bins=25,
    color="g",
    kde=True,
    stat="density",
    kde_kws={"cut": 3},
    alpha=0.4,
    ax=ax,
)
ax.set_title("Distribution of the target vector")
ax.set_ylabel("Density")
plt.show()

In [None]:
def get_best_distribution(data, dist_names):
    """Fit several ``scipy.stats`` distributions to ``data`` and pick the best.

    Every candidate is fitted with its own ``fit`` method and then scored with
    a one-sample Kolmogorov-Smirnov test of ``data`` against the fitted
    distribution. The candidate with the largest p-value is selected.

    Note: the test is run on the same data that was used to estimate the
    parameters, so the p-values are best read as a ranking score rather than
    as exact significance levels.

    Parameters
    ----------
    data : array-like
        One-dimensional sample to fit.
    dist_names : iterable of str
        Names of distributions available as attributes of ``scipy.stats``.

    Returns
    -------
    tuple
        ``(best_name, best_p_value, best_params)`` where ``best_params`` is
        the tuple returned by the winning distribution's ``fit``.

    Raises
    ------
    ValueError
        If ``dist_names`` yields no distribution names.
    AttributeError
        If a name is not an attribute of ``scipy.stats``.
    """
    dist_results = []
    params = {}
    for dist_name in dist_names:
        dist = getattr(st, dist_name)
        param = dist.fit(data)
        params[dist_name] = param

        # Only the p-value is used for the ranking; the KS statistic is dropped.
        _, p = st.kstest(data, dist_name, args=param)
        print(f'p value for {dist_name} = {p}')
        dist_results.append((dist_name, p))

    # Fail with a clear message instead of the opaque error max() raises on
    # an empty sequence.
    if not dist_results:
        raise ValueError('dist_names must contain at least one distribution name')

    # The best fit is the candidate with the largest KS p-value.
    best_dist, best_p = max(dist_results, key=lambda item: item[1])
    best_params = params[best_dist]

    print(f'\nBest fitting distribution: {best_dist}')
    print(f'Best p value: {best_p}')
    print(f'Parameters for the best fit: {best_params}')

    return best_dist, best_p, best_params

In [None]:
# Names of the candidate scipy.stats distributions.
dist_names = [
    'norm',
    'exponweib',
    'weibull_max',
    'weibull_min',
    'pareto',
    'genextreme',
]

In [None]:
# Fit every candidate to the target vector; keep the name, p-value and
# fitted parameters of the best-ranked distribution.
dist, pbest, params = get_best_distribution(y_, dist_names)

## Ejemplo Weibull Min

In [None]:
np.random.seed(12345)       # it is always good practice to set a seed for reproducibility.

In [None]:
# Draw a synthetic sample of size N from a Weibull-min distribution fitted
# to the target vector.
#
# The parameters are fitted here explicitly instead of being unpacked from
# `params`: `params` belongs to whichever distribution ranked best, which is
# not guaranteed to be weibull_min nor to have exactly three parameters.

N = 450
c, loc, scale = st.weibull_min.fit(y_)
sample = st.weibull_min.rvs(c, loc, scale, size=N)

In [None]:
# Overlay the observed target and the synthetic sample on one density scale.
# `sns.distplot` is deprecated; `sns.histplot` with `kde=True` and
# `stat="density"` is its documented replacement.
fig, ax = plt.subplots()
hist_style = dict(
    bins=25,
    kde=True,
    stat="density",
    kde_kws={"cut": 3},
    alpha=0.4,
    ax=ax,
)
sns.histplot(x=SR_y, color="g", label="Observed target", **hist_style)
sns.histplot(x=sample, color="orange", label="Weibull-min sample", **hist_style)
ax.set_title("Observed target vs. fitted Weibull-min sample")
ax.set_ylabel("Density")
ax.legend()
plt.show()

In [None]:
# Show the sizes of the observed target and of the synthetic sample.
print(y_.shape, sample.shape)

In [None]:
# The second argument is an array of observations rather than a distribution
# name, so kstest compares the two samples against each other.
D, p = st.kstest(y_, sample)

In [None]:
# Kolmogorov-Smirnov statistic and p-value.
print(D, p)