## Aprendizaje Estadístico 2024
### <font size=3 color='gray'>Alan Reyes-Figueroa</font>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB, CategoricalNB, MultinomialNB

from sklearn.datasets import load_iris

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

En este notebook replicamos la construcción de un clasificador bayesiano naïve (*Naive Bayes*) para predecir si un vehículo es propenso o no a ser robado, en función de tres atributos categóricos: color, tipo y procedencia.

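En este caso cada atributo es binario, de modo que, bajo el supuesto naïve de independencia condicional, las distribuciones condicionales $f_0(\mathbf{x})$ y $f_1(\mathbf{x})$ se factorizan como producto de distribuciones Bernoulli (una por atributo).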

## Load data

In [2]:
# Read the car-theft table. The CSV is comma-separated and its first row
# holds the column names, which are the defaults of `read_csv`.
df = pd.read_csv('car_data.csv')
df.shape

(10, 4)

In [3]:
# Display the loaded table (10 rows, 4 columns).
df

Unnamed: 0,Color,Type,Origin,Stolen
0,Red,Sports,Domestic,Yes
1,Red,Sports,Domestic,No
2,Red,Sports,Domestic,Yes
3,Yellow,Sports,Domestic,No
4,Yellow,Sports,Imported,Yes
5,Yellow,SUV,Imported,No
6,Yellow,SUV,Imported,Yes
7,Yellow,SUV,Domestic,No
8,Red,SUV,Imported,No
9,Red,Sports,Imported,Yes


In [4]:
# Split the table: the last column is the label, everything before it
# is a predictor.
target = df.iloc[:, -1]
features = df.drop(columns=target.name)

In [5]:
# Inspect the predictor columns (Color, Type, Origin).
features

Unnamed: 0,Color,Type,Origin
0,Red,Sports,Domestic
1,Red,Sports,Domestic
2,Red,Sports,Domestic
3,Yellow,Sports,Domestic
4,Yellow,Sports,Imported
5,Yellow,SUV,Imported
6,Yellow,SUV,Imported
7,Yellow,SUV,Domestic
8,Red,SUV,Imported
9,Red,Sports,Imported


In [6]:
# Inspect the label column (Stolen: Yes / No).
target

0    Yes
1     No
2    Yes
3     No
4    Yes
5     No
6    Yes
7     No
8     No
9    Yes
Name: Stolen, dtype: object

In [7]:
# One-hot encode the categorical predictors: every level of every attribute
# becomes its own 0/1 column. The dtype is pinned so the result is uint8 on
# every pandas version (pandas >= 2.0 would otherwise produce bool columns).
X = pd.get_dummies(features, dtype=np.uint8)

# Binary label: 1 when the vehicle was stolen ('Yes'), 0 otherwise.
# Comparing against the label explicitly avoids relying on the positional
# order of the dummy columns. The Series keeps the name of `target`
# ('Stolen'); a Series has no `columns` attribute, so assigning to
# `y.columns` would not rename it.
y = (target == 'Yes').astype(np.uint8)

print(X.shape, y.shape)

(10, 6) (10,)


In [8]:
# Inspect the one-hot encoded predictor matrix.
X

Unnamed: 0,Color_Red,Color_Yellow,Type_SUV,Type_Sports,Origin_Domestic,Origin_Imported
0,1,0,0,1,1,0
1,1,0,0,1,1,0
2,1,0,0,1,1,0
3,0,1,0,1,1,0
4,0,1,0,1,0,1
5,0,1,1,0,0,1
6,0,1,1,0,0,1
7,0,1,1,0,1,0
8,1,0,1,0,0,1
9,1,0,0,1,0,1


In [9]:
# Inspect the binary label vector (1 = 'Yes', 0 = 'No').
y

0    1
1    0
2    1
3    0
4    1
5    0
6    1
7    0
8    0
9    1
Name: Yes, dtype: uint8

## Naive Bayes

In [10]:
# Fit a categorical naive Bayes classifier on the encoded predictors X and
# the binary label y. `fit` returns the estimator itself, so `model` and
# `cnb` are two names for the same fitted object.
# NOTE(review): X is one-hot encoded, so every attribute shows up as two
# complementary 0/1 columns and its evidence is effectively counted twice
# under the naive independence assumption -- confirm this is intended.
model = CategoricalNB().fit(X, y)
cnb = model

In [11]:
# Build one row per possible vehicle profile (2 colors x 2 types x 2 origins).
# Each attribute contributes a pair of one-hot columns, in the same order as
# the training matrix:
#   (Color_Red, Color_Yellow), (Type_SUV, Type_Sports),
#   (Origin_Domestic, Origin_Imported)
# The pair [0, 1] selects the second level and [1, 0] the first one.
# For a single vehicle, use a one-row array instead,
# e.g. np.array([[1,0,1,0,1,0]]).
point = np.array([
    color + kind + origin
    for color in ([0, 1], [1, 0])
    for kind in ([0, 1], [1, 0])
    for origin in ([0, 1], [1, 0])
])

In [12]:
# Wrap the grid in a DataFrame that reuses the column labels of the
# training matrix.
data = pd.DataFrame(point, columns=X.columns)
data.shape

(8, 6)

In [13]:
# Inspect the grid of vehicle profiles to be classified.
data

Unnamed: 0,Color_Red,Color_Yellow,Type_SUV,Type_Sports,Origin_Domestic,Origin_Imported
0,0,1,0,1,0,1
1,0,1,0,1,1,0
2,0,1,1,0,0,1
3,0,1,1,0,1,0
4,1,0,0,1,0,1
5,1,0,0,1,1,0
6,1,0,1,0,0,1
7,1,0,1,0,1,0


In [14]:
# Predicted class for every row of `data`; 1 corresponds to the 'Yes'
# (stolen) label encoded in y, 0 to 'No'.
cnb.predict(data)

array([1, 0, 0, 0, 1, 1, 0, 0], dtype=uint8)