# <center> **Exploración de datos (conjunto PENGUINS)** </center>
## <font size=4> **Inteligencia Artificial 2025** </font> <font color=gray size=4> -- Alan Reyes-Figueroa </font>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd

In [None]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings emitted by pandas/seaborn,
# so API changes in those libraries will go unnoticed until they become errors.
warnings.filterwarnings('ignore')

## Read data

In [None]:
df = sns.load_dataset('penguins')       # load the 'penguins' example dataset as a DataFrame

In [None]:
# (number of rows, number of columns) of the table as loaded.
df.shape

## DataFrame exploration

In [None]:
# Display the whole DataFrame (the notebook truncates long tables).
df

In [None]:
# First three rows.
df.head(3)

In [None]:
# Last three rows.
df.tail(3)

In [None]:
# Select a single column by name; the result is a pandas Series.
df['body_mass_g']

In [None]:
# Column labels of the table.
df.columns

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

#### Datos Faltantes

In [None]:
# First five rows, shown before the missing-value handling below.
df.head()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
df = df.dropna()     # remove every row that has at least one missing ("Null" / "NaN") value
# Shape after dropping the incomplete rows.
df.shape

In [None]:
# First five rows of the cleaned table.
df.head()

## Exploratory data analysis (EDA)

In [None]:
# Histogram of every numeric column of df (pandas skips non-numeric columns).
# An explicit figure size plus tight_layout keeps subplot titles and tick
# labels from overlapping; plt.show() renders the figure like the other
# plotting cells and avoids echoing the returned array of Axes.
df.hist(figsize=(10, 8))
plt.tight_layout()
plt.show()

In [None]:
# Box plot of each numeric column, one subplot per column.
# body_mass_g takes values in the thousands while the *_mm columns are in the
# tens/hundreds, so a single shared y-axis would flatten the millimetre boxes.
# Independent y-axes (sharey=False) keep every distribution readable.
numeric_df = df.select_dtypes(include='number')
numeric_df.plot(kind='box', subplots=True, layout=(1, numeric_df.shape[1]),
                sharey=False, figsize=(12, 4))
plt.tight_layout()
plt.show()

In [None]:
# Scatter plot of bill length against flipper length on a 6x6 inch figure.
ax = plt.figure(figsize=(6, 6)).add_subplot()
sns.scatterplot(x='bill_length_mm', y='flipper_length_mm', data=df, ax=ax)
plt.show()

In [None]:
# Same scatter plot, with the marker colour encoding the species.
ax = plt.figure(figsize=(6, 6)).add_subplot()
sns.scatterplot(
    x='bill_length_mm',
    y='flipper_length_mm',
    hue='species',
    data=df,
    ax=ax,
)
plt.show()

In [None]:
# Colour encodes the species, marker shape encodes the sex.
ax = plt.figure(figsize=(6, 6)).add_subplot()
sns.scatterplot(
    x='bill_length_mm',
    y='flipper_length_mm',
    hue='species',
    style='sex',
    data=df,
    ax=ax,
)
plt.show()

In [None]:
# Colour encodes the species, marker shape the sex, marker size the body mass.
# Optional: add sizes=(10, 200) to the call to set the marker-size range.
ax = plt.figure(figsize=(8, 6)).add_subplot()
sns.scatterplot(
    x='bill_length_mm',
    y='flipper_length_mm',
    hue='species',
    style='sex',
    size='body_mass_g',
    data=df,
    ax=ax,
)
plt.show()

## Histogramas

In [None]:
# One histogram per column of df, laid out on a grid with `cols` columns.
# The number of panels and rows is derived from df.columns instead of being
# hard-coded, so the grid stays correct if columns are added or removed.
cols = 4
maxx = len(df.columns)            # number of panels to draw
rows = -(-maxx // cols)           # ceiling division: rows needed for maxx panels

plt.figure(figsize=(12, 7))
for panel, column in enumerate(df.columns, start=1):
    plt.subplot(rows, cols, panel)
    sns.histplot(data=df, x=column)
plt.tight_layout()                # keep axis labels of neighbouring panels apart
plt.show()

In [None]:
# Work with the body-mass column on its own in the cells that follow.
X = df.body_mass_g

In [None]:
# Arithmetic mean of the column.
X.mean()

In [None]:
# Median of the column.
X.median()

In [None]:
# Most frequent value(s); returned as a Series because there may be ties.
X.mode()

In [None]:
# Smallest and largest value of the column.
print(X.min(), X.max())

In [None]:
# Frequency of each distinct value, most frequent first.
X.value_counts()

In [None]:
# Kernel density estimate of body mass.
# sns.displot is a figure-level function: it always creates its own figure, so
# a preceding plt.figure(figsize=...) only produces an extra empty figure and
# its size is ignored. The size is set through height/aspect instead
# (height=4, aspect=1 gives the intended 4x4 inch panel).
sns.displot(data=df, x='body_mass_g', kind='kde', fill=True, height=4, aspect=1)
plt.show()

In [None]:
# Kernel density estimate of body mass, one curve per species.
# sns.displot is a figure-level function: it always creates its own figure, so
# a preceding plt.figure(figsize=...) only produces an extra empty figure and
# its size is ignored. The panel size is set through height/aspect instead.
sns.displot(data=df, x='body_mass_g', kind='kde', fill=True, hue='species',
            height=4, aspect=1)
plt.show()

In [None]:
# Density histogram with a KDE overlay for columns 2..5 of df (the same four
# columns the loop previously indexed as df.columns[2+i]).
# sns.distplot is deprecated in seaborn, and kde='True' passed a string rather
# than a boolean; histplot(kde=True, stat='density') is the supported
# equivalent (density-normalised bars plus the KDE curve).
fig_dist, axes = plt.subplots(1, 4, figsize=(20, 6))
for ax, column in zip(axes, df.columns[2:6]):
    sns.histplot(data=df, x=column, kde=True, stat='density', ax=ax)
plt.show()

## Pairplots

In [None]:
# Pairwise scatter plots of the numeric columns, with KDEs on the diagonal.
# sns.pairplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(figsize=...) only produced an extra empty figure.
# To resize the grid, pass height= (inches per panel) and aspect= to pairplot.
sns.pairplot(data=df, diag_kind='kde')
plt.show()

In [None]:
# Pairwise scatter plots of the numeric columns, coloured by species.
# sns.pairplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(figsize=...) only produced an extra empty figure.
# To resize the grid, pass height= (inches per panel) and aspect= to pairplot.
sns.pairplot(data=df, diag_kind='kde', hue='species')
plt.show()

## Densidades 2D

In [None]:
# Bivariate KDE (contour plot) of bill length against bill depth.
# sns.displot creates its own figure, so the earlier plt.figure(figsize=...)
# only produced an extra empty figure; the panel height is set with height=.
sns.displot(data=df, x='bill_length_mm', y='bill_depth_mm', kind='kde', height=6)
plt.show()

In [None]:
# Bivariate histogram of bill length against bill depth, coloured by species
# (no kind= is given, so displot draws its default histogram).
# sns.displot creates its own figure, so the earlier plt.figure(figsize=...)
# only produced an extra empty figure; the panel height is set with height=.
sns.displot(data=df, x='bill_length_mm', y='bill_depth_mm', hue='species', height=6)
plt.show()

In [None]:
# Bivariate KDE of bill length against bill depth, one contour set per species.
# sns.displot creates its own figure, so the earlier plt.figure(figsize=...)
# only produced an extra empty figure; the panel height is set with height=.
sns.displot(data=df, x='bill_length_mm', y='bill_depth_mm', kind='kde',
            hue='species', height=6)
plt.show()

## Jointplots

In [None]:
# Joint scatter plot of bill length against bill depth with marginal
# distributions, coloured by species.
# sns.jointplot builds its own (square) figure, so the earlier
# plt.figure(figsize=...) only produced an extra empty figure.
# To resize the plot, pass height= (side length in inches) to jointplot.
sns.jointplot(data=df, x='bill_length_mm', y='bill_depth_mm', hue='species')
plt.show()

## Correlation

In [None]:
# Covariance matrix of the numeric columns.
# df still contains string columns (species, island, sex); since pandas 2.0
# DataFrame.cov() no longer drops them silently and raises instead, so the
# numeric columns are selected explicitly.
df.select_dtypes(include='number').cov()

In [None]:
# Pearson correlation matrix of the numeric columns.
# df still contains string columns (species, island, sex); since pandas 2.0
# DataFrame.corr() no longer drops them silently and raises instead, so the
# numeric columns are selected explicitly.
df.select_dtypes(include='number').corr()

In [None]:
# Correlation matrix of the numeric columns as a plain NumPy array.
# The numeric columns are selected explicitly because DataFrame.corr() raises
# on the string columns (species, island, sex) in pandas >= 2.0.
C = df.select_dtypes(include='number').corr().values

In [None]:
# Print the raw correlation array.
print(C)

In [None]:
# Correlation matrix of the numeric columns drawn as a colour image.
# The numeric columns are selected explicitly because DataFrame.corr() raises
# on the string columns (species, island, sex) in pandas >= 2.0.
corr = df.select_dtypes(include='number').corr()

fig = plt.figure()                 # kept in `fig` so the figure can be saved afterwards
plt.imshow(corr, vmin=-1, vmax=1, cmap=plt.cm.seismic)   # fixed [-1, 1] colour scale
plt.colorbar()
# Label the ticks with the column names so each cell of the matrix can be read.
tick_positions = range(len(corr.columns))
plt.xticks(tick_positions, corr.columns, rotation=45, ha='right')
plt.yticks(tick_positions, corr.columns)
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Save a figure to disk: writes `fig` to 'corr.png', with bbox_inches='tight'
# trimming the surrounding whitespace.
fig.savefig('corr.png', bbox_inches='tight')

## Heatmap (Joint Density)

In [None]:
# Joint density of body mass against bill depth: a filled 2D KDE in the centre
# with a histogram of each variable on the margins.
# sns.JointGrid creates its own figure, so the earlier plt.figure() call only
# produced an extra empty figure and has been removed.
g = sns.JointGrid(data=df, x="body_mass_g", y="bill_depth_mm", space=0)
# thresh=0 with 50 levels fills the whole clipped region with a smooth gradient.
g.plot_joint(sns.kdeplot, fill=True, clip=((2200, 6800), (10, 25)),
             thresh=0, levels=50, cmap="rocket")
g.plot_marginals(sns.histplot, color="#A3051A", alpha=1, bins=25)
plt.show()