## Aprendizaje Estadístico 2024
### <font size=3 color='gray'>Alan Reyes-Figueroa</font>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB, CategoricalNB, MultinomialNB

from sklearn.datasets import load_iris

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer

En este notebook haremos un miniejemplo de cómo usar un clasificador bayesiano ingenuo (Naive Bayes) Multinomial, en el que las distribuciones condicionales de cada clase son multinomiales, para hacer análisis de sentimientos en comentarios sobre películas.

Clasificamos cada comentario como: 
 - Positive = 1 = True
 - Negative = 0 = False

## Load data

In [2]:
# Toy labelled corpus: each row is [comment text, is_positive].
# dtype=object keeps the labels as real Python booleans. Without it NumPy
# coerces the mixed str/bool rows to a single unicode dtype and the labels
# silently become the strings 'True' / 'False'.
data = np.array([['I loved the movie', True],
                 ['I hated the movie', False],
                 ['A great movie. Good movie', True],
                 ['Poor acting', False],
                 ['Great acting. A good movie.', True]], dtype=object)

In [3]:
# Wrap the raw rows in a two-column table: the comment text and its label.
column_names = ['Comment', 'Positive']
docs = pd.DataFrame(data, columns=column_names)
docs.shape

(5, 2)

In [4]:
# Display the labelled comments table.
docs

Unnamed: 0,Comment,Positive
0,I loved the movie,True
1,I hated the movie,False
2,A great movie. Good movie,True
3,Poor acting,False
4,Great acting. A good movie.,True


In [5]:
# The corpus is the plain list of comment strings, in row order.
corpus = docs['Comment'].tolist()
corpus

['I loved the movie',
 'I hated the movie',
 'A great movie. Good movie',
 'Poor acting',
 'Great acting. A good movie.']

In [6]:
# Bag-of-words extractor over word tokens.
# The custom token pattern keeps single-character tokens such as 'a' and 'i',
# and treats the listed punctuation as token characters. '.' and whitespace
# are not in the character class, so they act as separators.
vectorizer = CountVectorizer(
    token_pattern='[a-zA-Z0-9$&+,:;=?@#|<>^*()%!-]+',  # Regexp
    analyzer='word',
    lowercase=True,
    max_features=5000,
    stop_words=None,
    preprocessor=None,
    tokenizer=None,
)

In [7]:
# Learn the vocabulary from the corpus and build the document-term count matrix
# (one row per comment, one column per token).
wm = vectorizer.fit_transform(corpus)

In [8]:
# Print the count matrix in dense form; this is only readable because the corpus is tiny.
print(wm.todense())

[[0 0 0 0 0 1 1 1 0 1]
 [0 0 0 0 1 1 0 1 0 1]
 [1 0 1 1 0 0 0 2 0 0]
 [0 1 0 0 0 0 0 0 1 0]
 [1 1 1 1 0 0 0 1 0 0]]


In [9]:
# Show the learned vocabulary as a token -> column-index mapping.
# The numbers are NOT counts: they are each token's column position in the
# count matrix.
vocabulary = vectorizer.vocabulary_
print(vocabulary)

{'i': 5, 'loved': 6, 'the': 9, 'movie': 7, 'hated': 4, 'a': 0, 'great': 3, 'good': 2, 'poor': 8, 'acting': 1}


In [10]:
# Tokens listed in column order; used below as the column labels of the count table.
tokens = vectorizer.get_feature_names_out()
print(tokens)

['a' 'acting' 'good' 'great' 'hated' 'i' 'loved' 'movie' 'poor' 'the']


In [11]:
# Label the rows Doc0, Doc1, ... (one per comment) and expose the count matrix
# as a table with one column per token.
doc_names = [f'Doc{row:d}' for row in range(wm.shape[0])]
df = pd.DataFrame(wm.toarray(), columns=tokens, index=doc_names)

In [12]:
# Display the document-term count table.
df

Unnamed: 0,a,acting,good,great,hated,i,loved,movie,poor,the
Doc0,0,0,0,0,0,1,1,1,0,1
Doc1,0,0,0,0,1,1,0,1,0,1
Doc2,1,0,1,1,0,0,0,2,0,0
Doc3,0,1,0,0,0,0,0,0,1,0
Doc4,1,1,1,1,0,0,0,1,0,0


In [13]:
# The label column ('Positive') is the last column of `docs`.
target = docs['Positive']

In [14]:
# Display the raw labels.
target

0     True
1    False
2     True
3    False
4     True
Name: Positive, dtype: object

In [15]:
# One-hot encode the labels and keep the last indicator column (the positive
# class) as the 0/1 response vector.
# - `y` is a Series, so it has no `.columns`; assigning to it only sets a stray
#   attribute. `.rename('+')` is what actually labels the vector.
# - `get_dummies` returns uint8 or bool depending on the pandas version; the
#   explicit cast keeps the 0/1 encoding stable.
y = pd.get_dummies(target).iloc[:, -1].astype('uint8').rename('+')

print(y.shape)

(5,)


In [16]:
# Display the encoded response vector (1 = positive, 0 = negative).
y

0    1
1    0
2    1
3    0
4    1
Name: True, dtype: uint8

## Naive Bayes

In [17]:
# Fit a Multinomial Naive Bayes classifier on the token counts.
# The fitted estimator is used through `mnb` below; `model` holds the value
# returned by `fit`.
mnb = MultinomialNB()
model = mnb.fit(X=df, y=y)

In [18]:
# Show the training counts again, as a reference for the column order used
# when building the test vectors below.
df

Unnamed: 0,a,acting,good,great,hated,i,loved,movie,poor,the
Doc0,0,0,0,0,0,1,1,1,0,1
Doc1,0,0,0,0,1,1,0,1,0,1
Doc2,1,0,1,1,0,0,0,2,0,0
Doc3,0,1,0,0,0,0,0,0,1,0
Doc4,1,1,1,1,0,0,0,1,0,0


In [19]:
# New comments to classify (encoded by hand in the next cell):
# Example 1:   'I loved'
# Example 2:   'I hated'
# Example 3:   'Great!'

In [20]:
# Hand-encoded count vectors for the three example comments.
# Columns follow the vocabulary order:
#   a, acting, good, great, hated, i, loved, movie, poor, the
point = np.zeros((3, 10), dtype=int)
point[0, [5, 6]] = 1   # 'I loved' -> i, loved
point[1, [4, 5]] = 1   # 'I hated' -> hated, i
point[2, 3] = 1        # 'Great!'  -> great

In [21]:
# Wrap the example vectors in a frame with the same column labels as the
# training table, so the features line up by name.
# NOTE: this rebinds `data`, which previously held the raw labelled corpus.
data = pd.DataFrame(point, columns=df.columns)
data.shape

(3, 10)

In [22]:
# Display the encoded example comments.
data

Unnamed: 0,a,acting,good,great,hated,i,loved,movie,poor,the
0,0,0,0,0,0,1,1,0,0,0
1,0,0,0,0,1,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0


In [23]:
# Predict the class of each example comment (1 = positive, 0 = negative).
mnb.predict(data)

array([1, 0, 1], dtype=uint8)