# **Elements of Machine Learning 2024** <font size=4 color='gray'>Alan Reyes-Figueroa</font>
#### <font color='gray'>Imputación de Datos</font>

# Introduction

In this tutorial we will explore some basic techniques of data imputation, that is, how to fill missing values in a dataset.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import KNNImputer

In [None]:
#pd.set_option('max_rows', 30)

In [None]:
# Load the dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='hpi-data-2016.csv')

In [None]:
# Preview the first 10 rows of the dataset.
data.head(10)

In [None]:
# Dimensions of the dataset as (rows, columns).
data.shape

# Missing data

Entries with missing values are given the value `NaN`, short for "Not a Number". For technical reasons these `NaN` values are always of the `float64` dtype.

Pandas provides some methods specific to missing data. To select `NaN` entries you can use `pd.isnull()` (or its companion `pd.notnull()`); both are also available as DataFrame methods. For example, the following counts the missing entries in each column:

In [None]:
# isnull() marks each missing entry as True; summing counts them per column.
data.isnull().sum()

Replacing missing values is a common operation. Pandas provides a really handy method for this problem: `fillna()`. `fillna()` provides a few different strategies for handling such entries. For example, we can simply replace each `NaN` with the string `"Unknown"`:

In [None]:
# Replace every NaN in the column with the placeholder string "Unknown".
# The result is assigned back to the DataFrame instead of calling
# fillna(inplace=True) on the selected column: the in-place form acts on an
# intermediate object and does not update `data` under pandas copy-on-write.
# The column is cast to object first because it will now hold both numbers
# and strings.
data['GINI-index'] = data['GINI-index'].astype(object).fillna("Unknown")

In [None]:
# Preview the first 15 rows to inspect the filled column.
data.head(15)

## Fill with zeros

In [None]:
# Replace with 0: reload the original data from disk and work on a
# separate copy so that `data` keeps its missing values.
data = pd.read_csv(filepath_or_buffer='hpi-data-2016.csv')
data2 = data.copy(deep=True)

In [None]:
# Every missing GINI index becomes 0 in the working copy.
data2['GINI-index'] = data2['GINI-index'].fillna(value=0)

In [None]:
# Preview the zero-filled copy.
data2.head()

In [None]:
# Mean of the original column, for comparison with the zero-filled copy.
data['GINI-index'].mean()

In [None]:
# Mean of the zero-filled column.
data2['GINI-index'].mean()

In [None]:
# Histogram of the original column, for comparison with the zero-filled copy.
data['GINI-index'].hist()

In [None]:
# Histogram of the zero-filled column.
data2['GINI-index'].hist()

## Fill with summary statistics

In [None]:
# Replace with the mean: compute the column mean on the original data and
# prepare a working copy to be filled.
media = data['GINI-index'].mean()
print(media)

data3 = data.copy()

In [None]:
# Every missing GINI index becomes the column mean computed above.
data3['GINI-index'] = data3['GINI-index'].fillna(value=media)

In [None]:
# Preview the mean-filled copy.
data3.head()

In [None]:
# Mean of the mean-filled column.
data3['GINI-index'].mean()

In [None]:
data['GINI-index'].hist()      # pandas histogram drops missing values

In [None]:
# Histogram of the mean-filled column.
data3['GINI-index'].hist()

In [None]:
# Replace with the median.
# Start again from the original data: data3 was filled with the mean in an
# earlier cell, so it has no NaN left and fillna(mediana) on it would change
# nothing.
mediana = data['GINI-index'].median()
print(mediana)
data3 = data.copy()
data3['GINI-index'] = data3['GINI-index'].fillna(mediana)

In [None]:
# Most frequent value(s) of the Region column.
moda = data.Region.mode()
print(moda)

In [None]:
# Most frequent value(s) of the GINI index column.
moda = data.loc[:, 'GINI-index'].mode()
print(moda)

In [None]:
# Frequency of each distinct GINI value.
data['GINI-index'].value_counts()

## Fill with group or cluster mean 

Sometimes it is more convenient to replace missing values with group means, where the groups correspond to the values of some categorical variable, or to some clustering scheme.

In [None]:
# Distinct region labels present in the dataset.
regions = data['Region'].unique()
regions

In [None]:
# Mean GINI index within each region.
data.groupby(by='Region')['GINI-index'].mean()

In [None]:
# Same per-region means, extracted as a plain NumPy array.
means = data.groupby(by='Region')['GINI-index'].mean().to_numpy()
means

In [None]:
# Fill each missing GINI index with the mean of its region.
# The group means are computed on data4 itself (a fresh copy of the original
# data), not on data3: data3 was already imputed in earlier cells, so its
# group means would include imputed values.
# transform('mean') returns one value per row, aligned with data4's index,
# so it can be passed directly to fillna.
data4 = data.copy()
region_means = data4.groupby('Region')['GINI-index'].transform('mean')
data4['GINI-index'] = data4['GINI-index'].fillna(region_means)
data4

In [None]:
# Histogram of the column after group-mean imputation.
data4['GINI-index'].hist()

## Fill with linear regression estimations

In [None]:
# Pairwise correlations between the numeric columns.
# numeric_only=True is required because the dataset also has text columns
# (e.g. Region); without it, recent pandas versions raise an error when
# they try to convert those columns to float.
data.corr(numeric_only=True)

In [None]:
# Scatter/histogram grid for the two variables used in the regression.
# pairplot is a figure-level seaborn function that creates its own figure,
# so no plt.figure() call precedes it (that would only leave an extra,
# empty figure).
sns.pairplot(data[['Inequality-of-Outcomes', 'GINI-index']])
plt.show()

In [None]:
# Keep only the rows that have no missing value in any column; these are
# the observations used to fit the regression.
datadropna = data.dropna(axis=0, how='any')

In [None]:
# Display the complete-case subset.
datadropna

In [None]:
# Predictor (x) and target (y) of the regression, as NumPy arrays.
x = datadropna['Inequality-of-Outcomes'].to_numpy()
y = datadropna['GINI-index'].to_numpy()

In [None]:
# Inspect the predictor values.
x

In [None]:
# Inspect the target values.
y

In [None]:
# Shape of the predictor array.
x.shape

In [None]:
# Number of observations used for the fit.
n = len(x)
n

In [None]:
# Build the design matrix: a column of ones (intercept) next to the
# predictor column.
X = np.column_stack([np.ones(n), x])
X.shape

In [None]:
# Compute the linear regression coefficients (intercept first, then slope)
# by least squares. np.linalg.lstsq minimises ||X c - y|| directly, which is
# numerically more stable than forming X.T @ X and inverting it explicitly.
c = np.linalg.lstsq(X, y, rcond=None)[0]

In [None]:
# Fitted coefficients.
c

In [None]:
def regresion(x, c):
    """Evaluate the linear model with coefficients ``c`` at ``x``.

    Parameters
    ----------
    x : scalar or 1-D array
        Feature value(s) of one observation, without the constant term.
    c : 1-D array
        Model coefficients, intercept first.

    Returns
    -------
    The dot product of ``c`` with ``[1, x...]``.
    """
    # Prepend the constant 1 so the intercept is handled by the dot product.
    features = np.concatenate(([1.0], np.atleast_1d(x)))
    return np.dot(c, features)

In [None]:
# Fresh working copy of the original data for regression-based imputation.
data4 = data.copy(deep=True)

In [None]:
# Display the working copy before imputation.
data4

In [None]:
# Fill the missing GINI values with the regression prediction obtained from
# the row's Inequality-of-Outcomes value.
# The write goes through .loc on the DataFrame itself: chained indexing
# (data4['GINI-index'][i] = ...) assigns to an intermediate object, triggers
# SettingWithCopyWarning and does not update data4 under copy-on-write.
# The predictions get their own name so that the training target `y` is not
# overwritten.
missing = data4['GINI-index'].isna()
predicted = data4.loc[missing, 'Inequality-of-Outcomes'].apply(
    lambda value: regresion(value, c)
)
data4.loc[missing, 'GINI-index'] = predicted

In [None]:
# Display the working copy after regression imputation.
data4

In [None]:
# Histogram of the original column, for comparison.
data['GINI-index'].hist()

In [None]:
# Histogram of the column after regression imputation.
data4['GINI-index'].hist()

## K nearest neighbour imputation

In [None]:
from sklearn.impute import KNNImputer

In [None]:
# Configure the KNN imputer.
imputer = KNNImputer(
    n_neighbors=10,
    weights='uniform',
    metric='nan_euclidean',
)

In [None]:
# Feature matrix: every column from position 2 onwards.
# NOTE(review): assumes those columns are all numeric - confirm against the CSV.
X = data.iloc[:, 2:].to_numpy()

# Fit the imputer on the incomplete data.
imputer.fit(X)

In [None]:
# transform the dataset: apply the fitted imputer to X and keep the result
Xtrans = imputer.transform(X)

In [None]:
# Copy the original data and replace the GINI column with its KNN-imputed
# version. The imputed matrix Xtrans computed in the previous cell is reused
# instead of running imputer.transform(X) a second time.
# NOTE(review): taking the last column assumes 'GINI-index' is the last
# column of the dataset - confirm against the CSV.
data4 = data.copy()
data4['GINI-index'] = Xtrans[:, -1]

In [None]:
# Display the data after KNN imputation.
data4

In [None]:
# Histogram of the KNN-imputed column, using 35 bins.
data4['GINI-index'].hist(bins=35)

In [None]:
# Histogram of the original column with the same 35 bins, for comparison.
data['GINI-index'].hist(bins=35)