## *Elements of Machine Learning* 2024
### <font size=3 color='gray'>Alan Reyes-Figueroa</font>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB, CategoricalNB, MultinomialNB

from sklearn.datasets import load_iris

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer

## Load data

In [None]:
# Toy sentiment corpus: one row per review, as (comment text, is-positive flag).
# dtype=object keeps each flag as a real Python bool; without it NumPy coerces
# the mixed str/bool rows to a single string dtype and the labels silently
# become the strings 'True' / 'False'.
data = np.array(
    [
        ['I loved the movie', True],
        ['I hated the movie', False],
        ['A great movie. Good movie', True],
        ['Poor acting', False],
        ['Great acting. A good movie.', True],
    ],
    dtype=object,
)

In [None]:
# Wrap the raw rows in a two-column DataFrame and report its shape.
docs = pd.DataFrame(data, columns=['Comment', 'Positive'])
docs.shape

In [None]:
# Display the comments table as the cell output.
docs

In [None]:
# Pull the comment texts out as a plain Python list and echo it.
corpus = docs['Comment'].tolist()
corpus

In [None]:
# Bag-of-words counter over lowercased word tokens.
# The custom token pattern matches runs of ONE or more letters, digits or the
# listed symbols, so single-character words are kept; '.' is not in the class,
# so periods act as token separators.
vectorizer = CountVectorizer(
    token_pattern='[a-zA-Z0-9$&+,:;=?@#|<>^*()%!-]+',  # Regexp
    max_features=5000,  # upper bound on the vocabulary size
    analyzer='word',
    lowercase=True,
    stop_words=None,
    tokenizer=None,
    preprocessor=None,
)

In [None]:
# Learn the vocabulary from the corpus and build the document-term count matrix.
wm = vectorizer.fit_transform(corpus)

In [None]:
# Print the count matrix in dense form (one row per document).
print(wm.todense())

In [None]:
# Show the learned vocabulary: a mapping from each token to its column index
# in the count matrix (the numbers are positions, not counts).
vocabulary = vectorizer.vocabulary_
print(vocabulary)

In [None]:
# Token names of the fitted vectorizer; used below as the column labels of the
# count matrix.
tokens = vectorizer.get_feature_names_out()
print(tokens)

In [None]:
# Label each document row Doc0, Doc1, ... and build a dense, labelled
# document-term table. The row count comes from wm.shape[0], so the sparse
# matrix is not iterated (and sliced row by row) just to number its rows.
doc_names = [f'Doc{idx:d}' for idx in range(wm.shape[0])]
df = pd.DataFrame(data=wm.toarray(), index=doc_names, columns=tokens)

In [None]:
# Display the document-term table as the cell output.
df

In [None]:
# Label column of the comments table (its last column, 'Positive').
target = docs['Positive']

In [None]:
# Display the label column as the cell output.
target

In [None]:
# Binary label vector: the last indicator column produced by get_dummies
# (for this two-class target, the column flagging the positive comments).
# The result is a Series, so it is named with rename(); assigning to
# `y.columns` would only set an unused attribute and rename nothing.
y = pd.get_dummies(target).iloc[:, -1].rename('+')

print(y.shape)

In [None]:
# Display the label vector as the cell output.
y

## Naive Bayes

In [None]:
# Fit a multinomial Naive Bayes classifier on the token counts (df) and the
# binary labels (y).
mnb = MultinomialNB()
model = mnb.fit(df, y)  # `model` holds fit()'s return value; prediction below goes through `mnb`

In [None]:
# Three hand-built query vectors (one per row) over a 10-column vocabulary;
# a 1 marks a token that is present. The column positions are interpreted
# against df.columns when the array is wrapped in a DataFrame.
point = np.zeros((3, 10), dtype=int)
point[0, [5, 6]] = 1
point[1, [4, 5]] = 1
point[2, 3] = 1

In [None]:
# Label the query vectors with the training columns so they line up with df.
# NOTE: this rebinds `data`, which earlier in the notebook holds the raw
# comment rows; re-run that earlier cell before rebuilding `docs`.
data = pd.DataFrame(point, columns=df.columns)
data.shape

In [None]:
# Display the query table as the cell output.
data

In [None]:
# Predict the class of each query row with the fitted classifier.
mnb.predict(data)