# **Aprendizaje Estadístico 2024** <font size=4 color='gray'>Alan Reyes-Figueroa</font>
#### <font color='gray'>Factoración NNMF</font>

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA, NMF
from sklearn.metrics.pairwise import cosine_similarity

# Data

In [None]:
# Load the user ratings; the file is comma-separated with a header row,
# which is what read_csv assumes by default.
ratings = pd.read_csv('ratings.csv')
ratings.shape

In [None]:
# Load the movie catalogue; the file is comma-separated with a header row,
# which is what read_csv assumes by default.
movies = pd.read_csv('movies.csv')
movies.shape

In [None]:
# Preview the first rows of the ratings table.
ratings.head()

In [None]:
# Preview the first rows of the movies table.
movies.head()

In [None]:
# Build the user x movie matrix: one row per userId, one column per movieId,
# with 0 standing for "no rating" (fill_value=0).
table = pd.pivot_table(ratings, index='userId', columns='movieId', values='rating', fill_value=0)
# NOTE(review): astype(int) truncates toward zero. If the ratings file contains
# fractional values (e.g. half-star ratings), 3.5 becomes 3 and 0.5 becomes 0,
# i.e. indistinguishable from "not rated" -- confirm this is intended.
table = table.astype(int)

In [None]:
# (number of users, number of movies) of the pivoted ratings table.
table.shape

In [None]:
# Preview the first rows of the user x movie table.
table.head()

In [None]:
# Plain NumPy view of the ratings table (rows = users, columns = movies).
X = table.values
X.shape

In [None]:
# Heat map of the full ratings matrix; the colour scale is pinned to 0..5,
# where 0 is the fill value used for missing ratings.
plt.figure(figsize=(15,6))
plt.imshow(X, vmin=0, vmax=5)
plt.title('Ratings Table')
plt.ylabel('Users')
plt.xlabel('Movies')
plt.show()

In [None]:
# Basic size statistics of the ratings data set.
n_ratings = ratings.shape[0]
# dropna=False counts a missing id as its own value, like len(unique()) does.
n_movies = ratings['movieId'].nunique(dropna=False)
n_users = ratings['userId'].nunique(dropna=False)

print(f"Number of ratings: {n_ratings}")
print(f"Number of unique movieId's: {n_movies}")
print(f"Number of unique users: {n_users}")
print(f"Average ratings per user: {round(n_ratings/n_users, 2)}")
print(f"Average ratings per movie: {round(n_ratings/n_movies, 2)}")

In [None]:
# Number of ratings submitted by each user, as a two-column frame
# (userId, n_ratings).
user_freq = (
    ratings.groupby('userId')['movieId']
    .count()
    .reset_index(name='n_ratings')
)

In [None]:
# Preview the per-user rating counts.
user_freq.head()

In [None]:
# Mean rating of every movie (indexed by movieId), used below to find the
# lowest- and highest-rated movies.
mean_rating = ratings.groupby('movieId')[['rating']].mean()

In [None]:
# Preview the per-movie mean ratings.
mean_rating.head()

In [None]:
# Lowest-rated movie: idxmin returns a single movieId (the first one that
# attains the minimum mean), so ties are not listed.
lowest_rated = mean_rating['rating'].idxmin()
movies.loc[movies['movieId'] == lowest_rated]

In [None]:

# Highest-rated movie: idxmax returns a single movieId (the first one that
# attains the maximum mean), so ties are not listed.
highest_rated = mean_rating['rating'].idxmax()
movies.loc[movies['movieId'] == highest_rated]

In [None]:
# Show every rating given to the highest-rated movie (one row per user who
# rated it), to see how many ratings back its mean.
ratings[ratings['movieId']==highest_rated]

In [None]:
# Show every rating given to the lowest-rated movie (one row per user who
# rated it), to see how many ratings back its mean.
ratings[ratings['movieId']==lowest_rated]

In [None]:
# The two extreme movies above are backed by very few ratings, so their raw
# means are unreliable. Collect the number of ratings and the mean per movie.
# NOTE(review): the original comment announced a Bayesian average, but no such
# average is computed in the visible cells -- only 'count' and 'mean'.
movie_stats = ratings.groupby('movieId')[['rating']].agg(['count', 'mean'])

In [None]:
# Preview the stats; the columns are still two-level: ('rating', 'count'/'mean').
movie_stats.head()

In [None]:
# Drop the outer 'rating' column level so the columns are just 'count' and 'mean'.
movie_stats.columns = movie_stats.columns.droplevel()

In [None]:
# Preview the stats again, now with flat column names.
movie_stats.head()

In [None]:
# Most-rated movies first; ties in 'count' are broken by the higher mean.
movie_stats = movie_stats.sort_values(by=['count', 'mean'], ascending=False)

In [None]:
# Inspect the 1050th most-rated movie (position 1049 after the sort above).
# NOTE(review): presumably used to choose the rating-count cut-off applied
# in the next section -- confirm.
movie_stats.iloc[1049,:]

### Select the movies with the most ratings

In [None]:
# Keep only the movies that received at least `cut` ratings.
cut = 25
movies_sample = movie_stats[movie_stats['count'] >= cut]

In [None]:
# movieIds of the selected movies, as a plain Python list.
lista = movies_sample.index.tolist()

In [None]:
# Number of non-zero entries in each column of X, i.e. ratings per movie.
ratnumber = (X > 0).sum(0)
# Column positions ordered from the most-rated to the least-rated movie.
idX = np.argsort(ratnumber)[::-1]

In [None]:
# Histogram (with KDE) of log10(1 + number of ratings) per movie.
# histplot is axes-level, so it draws into the figure created here; the
# figure-level displot opens its own figure and would leave this one blank.
plt.figure()
sns.histplot(np.log10(1. + ratnumber), kde=True)
plt.show()

In [None]:
# Ratings per movie, sorted from the most-rated to the least-rated movie.
plt.figure()
plt.plot(ratnumber[idX])
plt.show()

In [None]:
# Restrict the ratings table to the movies that passed the rating-count cut
# (the columns follow the order of movies_sample, i.e. most-rated first).
df = table.loc[:, movies_sample.index]
Xs = df.values

print(df.shape, Xs.shape)

In [None]:
# Preview the reduced user x movie table.
df.head()

In [None]:
# Heat map of the reduced ratings matrix, with the colour scale pinned to 0..5.
plt.figure(figsize=(15,5))
plt.imshow(Xs, vmin=0, vmax=5)
plt.title('Sampled Ratings Table')
plt.ylabel('Users')
plt.xlabel('Movies')
plt.show()

### Lista de Categorías

In [None]:
# Genre strings of every movie in the catalogue.
genres = movies['genres']

In [None]:
# Number of movies that carry a genre string.
len(genres)

In [None]:
# Each genre string packs several labels separated by '|'; split the first one.
genres[0].split('|')

In [None]:
# Collect every distinct genre label in order of first appearance.
# dict.fromkeys de-duplicates while keeping insertion order, and iterating the
# Series values avoids relying on the index labels being exactly 0..n-1.
cat = list(dict.fromkeys(g for entry in genres for g in entry.split('|')))

In [None]:
# Show the list of distinct genre labels.
print(cat)

# NNMF Factorization

In [None]:
# Number of components requested from the factorization below.
k = 20

In [None]:
# Non-negative factorization Xs ~ W @ H with k components. The random
# initialisation is seeded (random_state=2024) so reruns are reproducible.
model = NMF(n_components=k, init='random', max_iter=500, random_state=2024)

# W: one row per user (row order of Xs), one column per component.
W = model.fit_transform(Xs)
# H: one row per component, one column per movie (column order of Xs).
H = model.components_

In [None]:
# Shapes of the two factors.
print(W.shape, H.shape)

In [None]:
# Heat map of the user factor W (rows = users, columns = components).
# The colour scale is pinned to 0..5; factor entries are not restricted to
# that range, so anything above 5 is shown with the top colour.
plt.figure(figsize=(15,5))
plt.imshow(W, vmin=0, vmax=5)
plt.title('W')
plt.ylabel('Users')
plt.xlabel('Cat')
plt.show()

In [None]:
# Heat map of the movie factor H (rows = components, columns = movies).
# The colour scale is pinned to 0..5; factor entries are not restricted to
# that range, so anything above 5 is shown with the top colour.
plt.figure(figsize=(15,5))
plt.imshow(H, vmin=0, vmax=5)
plt.title('H')
plt.ylabel('Cat')
plt.xlabel('Movies')
plt.show()

In [None]:
# Side-by-side close-up of the user factors for the first 50 rows of W.
panels = [
    (slice(0, 25), 'Users 0-24'),
    (slice(25, 50), 'Users 25-49'),
]
plt.figure(figsize=(12,12))
for position, (rows, label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.imshow(W[rows, :], vmin=0, vmax=5)
    plt.title('W')
    plt.ylabel(label)
    plt.xlabel('Cat')
plt.show()

In [None]:
# Components ranked from strongest to weakest for userId 20.
# Rows of W follow the row order of df, so the userId label is translated
# into its row position instead of being used directly as a row number.
user = 20
np.argsort(W[df.index.get_loc(user)])[::-1]

In [None]:
# Column positions of the 10 movies with the largest weight in row 1 of H
# (the component with index 1), strongest first.
row = 1
top = np.argsort(H[row])[::-1][:10]
top

In [None]:
# movieIds of those top 10 movies (the columns of df are movieIds).
print(df.columns[top])

In [None]:
# Titles and genres of the component's top 10 movies, listed in rank order.
# An inner merge keeps the order of the left keys, whereas filtering `movies`
# with isin() would list them in the order they appear in `movies`.
pd.DataFrame({'movieId': df.columns[top]}).merge(movies, on='movieId', how='inner')

# Sistema de Recomendación

**Ejercicio:** Construir un sistema de recomendación a partir de la factoración NNMF.

Con base en las matrices **df**, **W**, **H**, **movies**, se debe construir una función *recommender* que reciba como inputs:

* UserId   = el código del usuario (tomando como base los index de **df**).

* TopNeigh = número de usuarios vecinos a considerar.

* TopMov   = número de películas a recomendar.

La función debe devolver la lista o el DataFrame de las películas en la tabla **movies** que el usuario no haya visto, y que su sistema considere que son las más idóneas para el usuario.

In [None]:
def recommender(userId, topNeigh=10, topMov=10, *, ratings_table=None,
                user_factors=None, item_factors=None, movies_table=None):
    """Recommend unseen movies to a user from the NMF factors.

    The user's nearest neighbours are found by cosine similarity between rows
    of the user factor matrix. The neighbours' factor rows are averaged
    (weighted by similarity) into a taste profile, which is projected through
    the movie factor matrix to score every movie. Movies the user has already
    rated (non-zero entry in the ratings table) are excluded.

    Parameters
    ----------
    userId : label
        User identifier, looked up in the index of the ratings table.
    topNeigh : int, default 10
        Number of neighbouring users to consider (capped at the number of
        other users available).
    topMov : int, default 10
        Maximum number of movies to recommend.
    ratings_table : pandas.DataFrame, optional
        User x movie ratings (index = userId, columns = movieId, 0 = unrated).
        Defaults to the notebook-level ``df``.
    user_factors : array-like, optional
        User factor matrix, rows aligned with ``ratings_table``.
        Defaults to the notebook-level ``W``.
    item_factors : array-like, optional
        Movie factor matrix, columns aligned with ``ratings_table``.
        Defaults to the notebook-level ``H``.
    movies_table : pandas.DataFrame, optional
        Movie catalogue with a ``movieId`` column.
        Defaults to the notebook-level ``movies``.

    Returns
    -------
    pandas.DataFrame
        Rows of the movie catalogue for the recommended movies, best first,
        with an extra ``predicted_score`` column.

    Raises
    ------
    KeyError
        If ``userId`` is not in the index of the ratings table.
    ValueError
        If ``topNeigh`` is smaller than 1 or ``topMov`` is negative.
    """
    # Fall back to the notebook globals only when no override is supplied.
    ratings_table = df if ratings_table is None else ratings_table
    user_factors = W if user_factors is None else user_factors
    item_factors = H if item_factors is None else item_factors
    movies_table = movies if movies_table is None else movies_table

    user_factors = np.asarray(user_factors, dtype=float)
    item_factors = np.asarray(item_factors, dtype=float)

    if userId not in ratings_table.index:
        raise KeyError(f"userId {userId!r} is not in the ratings table index")
    if topNeigh < 1:
        raise ValueError("topNeigh must be at least 1")
    if topMov < 0:
        raise ValueError("topMov must be non-negative")

    # The userId is an index label; the factor rows are addressed by position.
    pos = ratings_table.index.get_loc(userId)

    # Cosine similarity between the target user and every user; users with an
    # all-zero factor row get similarity 0 instead of a division by zero.
    norms = np.linalg.norm(user_factors, axis=1)
    denom = norms * norms[pos]
    dots = user_factors @ user_factors[pos]
    sims = np.divide(dots, denom, out=np.zeros_like(dots), where=denom > 0)
    sims[pos] = -np.inf  # never pick the user as their own neighbour

    n_neigh = min(topNeigh, len(sims) - 1)
    neigh = np.argsort(-sims, kind='stable')[:n_neigh]

    if neigh.size:
        weights = sims[neigh]
        if weights.sum() <= 0:
            # No informative similarity: weigh all neighbours equally.
            weights = np.ones_like(weights)
        profile = weights @ user_factors[neigh] / weights.sum()
    else:
        # No other user available: score with the user's own factors.
        profile = user_factors[pos]

    scores = profile @ item_factors

    # Rank only the movies the user has not rated yet.
    seen = ratings_table.iloc[pos].to_numpy() > 0
    unseen = np.flatnonzero(~seen)
    ranked = unseen[np.argsort(-scores[unseen], kind='stable')][:topMov]

    picks = pd.DataFrame({
        'movieId': ratings_table.columns[ranked],
        'predicted_score': scores[ranked],
    })
    # An inner merge preserves the order of the left keys, i.e. the ranking.
    recommendations = picks.merge(movies_table, on='movieId', how='inner')
    ordered_cols = [c for c in recommendations.columns if c != 'predicted_score']
    recommendations = recommendations[ordered_cols + ['predicted_score']]
    return recommendations