# Librerías de manejo y análisis de datos
import pandas as pd
import numpy as np

# Librerías de visualización
import matplotlib.pyplot as plt
import seaborn as sb
from mpl_toolkits.mplot3d import Axes3D

# Librerías de preprocesamiento
from sklearn.preprocessing import StandardScaler

# Librerías de detección de anomalías
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest

# Librerías de clustering
from sklearn.cluster import KMeans

# Mount Google Drive so files stored there are reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive

# Path to the root directory in Google Drive
dataset_path = '/content/drive/My Drive'

# Load the dataset into a DataFrame and show its first rows
df = pd.read_csv(dataset_path + '/resources/CTG.csv')
df.head()

# Drop the administrative metadata columns: they carry no diagnostic
# information that is relevant to the analysis
metadata_columns = ["FileName", "Date", "SegFile", "b", "e"]
df = df.drop(columns=metadata_columns)
df.head()

# Check the dataset dimensions: (number of rows, number of columns)
df.shape

(2129, 35)

# Count the null values per column to assess the quality of the dataset
df.isnull().sum()

# Drop the rows that contain null values (simple deletion strategy)
df = df.dropna()

# Confirm that no null values remain in the dataset
df.isnull().sum()

# Inspect the data type of every column and the memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2126 entries, 0 to 2125
Data columns (total 35 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   LBE       2126 non-null   float64
 1   LB        2126 non-null   float64
 2   AC        2126 non-null   float64
 3   FM        2126 non-null   float64
 4   UC        2126 non-null   float64
 5   ASTV      2126 non-null   float64
 6   MSTV      2126 non-null   float64
 7   ALTV      2126 non-null   float64
 8   MLTV      2126 non-null   float64
 9   DL        2126 non-null   float64
 10  DS        2126 non-null   float64
 11  DP        2126 non-null   float64
 12  DR        2126 non-null   float64
 13  Width     2126 non-null   float64
 14  Min       2126 non-null   float64
 15  Max       2126 non-null   float64
 16  Nmax      2126 non-null   float64
 17  Nzeros    2126 non-null   float64
 18  Mode      2126 non-null   float64
 19  Mean      2126 non-null   float64
 20  Median    2126 non-null   float64
 21  Variance  2126 non-null   float64
 22  Tendency  2126 non-null   float64
 23  A         2126 non-null   float64
 24  B         2126 non-null   float64
 25  C         2126 non-null   float64
 26  D         2126 non-null   float64
 27  E         2126 non-null   float64
 28  AD        2126 non-null   float64
 29  DE        2126 non-null   float64
 30  LD        2126 non-null   float64
 31  FS        2126 non-null   float64
 32  SUSP      2126 non-null   float64
 33  CLASS     2126 non-null   float64
 34  NSP       2126 non-null   float64
dtypes: float64(35)
memory usage: 597.9 KB

# Count the instances per fetal-state class (NSP):
# 1 = Normal, 2 = Suspect, 3 = Pathological
# An imbalanced dataset can hurt model performance
#
# value_counts() orders its result by frequency, not by class code, so the
# counts are re-sorted by code and every tick label is looked up by code.
# Assigning the labels positionally would pair them with the wrong bars
# whenever the frequency order differs from 1, 2, 3.
nsp_tick_labels = {1: 'Normal (1)', 2: 'Sospechoso (2)', 3: 'Patológico (3)'}
class_counts = df['NSP'].value_counts().sort_index()

plt.figure(figsize=(8, 6))
plt.bar(class_counts.index, class_counts.values)
plt.xlabel('Estado Fetal (NSP)')
plt.ylabel('Número de Muestras')
plt.title('Balance del Dataset')
plt.xticks(
    class_counts.index,
    # Fall back to the raw code if an unexpected class value shows up
    [nsp_tick_labels.get(code, str(code)) for code in class_counts.index],
)
plt.show()

# Find the variables whose only observed values are 0 and 1
# so they can be treated as categorical (binary indicator) variables
binary_variables = [
    column
    for column in df.columns
    if all(value in [0, 1] for value in df[column].unique())
]

print("Variables binarias (tratadas como categóricas):", binary_variables)

Variables binarias (tratadas como categóricas): ['DS', 'DR', 'A', 'B', 'C', 'D', 'E', 'AD', 'DE', 'LD', 'FS', 'SUSP']

# Work on a copy so the original DataFrame is not altered during preprocessing,
# and cast every binary variable to an explicit integer type in a single pass
binary_dtypes = {column: int for column in binary_variables}
df_copy = df.copy().astype(binary_dtypes)

# Check the data types after the conversion
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2126 entries, 0 to 2125
Data columns (total 35 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   LBE       2126 non-null   float64
 1   LB        2126 non-null   float64
 2   AC        2126 non-null   float64
 3   FM        2126 non-null   float64
 4   UC        2126 non-null   float64
 5   ASTV      2126 non-null   float64
 6   MSTV      2126 non-null   float64
 7   ALTV      2126 non-null   float64
 8   MLTV      2126 non-null   float64
 9   DL        2126 non-null   float64
 10  DS        2126 non-null   int64  
 11  DP        2126 non-null   float64
 12  DR        2126 non-null   int64  
 13  Width     2126 non-null   float64
 14  Min       2126 non-null   float64
 15  Max       2126 non-null   float64
 16  Nmax      2126 non-null   float64
 17  Nzeros    2126 non-null   float64
 18  Mode      2126 non-null   float64
 19  Mean      2126 non-null   float64
 20  Median    2126 non-null   float64
 21  Variance  2126 non-null   float64
 22  Tendency  2126 non-null   float64
 23  A         2126 non-null   int64  
 24  B         2126 non-null   int64  
 25  C         2126 non-null   int64  
 26  D         2126 non-null   int64  
 27  E         2126 non-null   int64  
 28  AD        2126 non-null   int64  
 29  DE        2126 non-null   int64  
 30  LD        2126 non-null   int64  
 31  FS        2126 non-null   int64  
 32  SUSP      2126 non-null   int64  
 33  CLASS     2126 non-null   float64
 34  NSP       2126 non-null   float64
dtypes: float64(23), int64(12)
memory usage: 597.9 KB

# Count the values equal to 0 in every column
# A column made almost entirely of zeros has low variability and little discriminative power
df_copy.eq(0).sum()

# 'DR' (repetitive decelerations) has nearly all of its values at 0,
# which means low variability → drop it to avoid adding noise
df_copy = df_copy.drop(columns='DR')
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2126 entries, 0 to 2125
Data columns (total 34 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   LBE       2126 non-null   float64
 1   LB        2126 non-null   float64
 2   AC        2126 non-null   float64
 3   FM        2126 non-null   float64
 4   UC        2126 non-null   float64
 5   ASTV      2126 non-null   float64
 6   MSTV      2126 non-null   float64
 7   ALTV      2126 non-null   float64
 8   MLTV      2126 non-null   float64
 9   DL        2126 non-null   float64
 10  DS        2126 non-null   int64  
 11  DP        2126 non-null   float64
 12  Width     2126 non-null   float64
 13  Min       2126 non-null   float64
 14  Max       2126 non-null   float64
 15  Nmax      2126 non-null   float64
 16  Nzeros    2126 non-null   float64
 17  Mode      2126 non-null   float64
 18  Mean      2126 non-null   float64
 19  Median    2126 non-null   float64
 20  Variance  2126 non-null   float64
 21  Tendency  2126 non-null   float64
 22  A         2126 non-null   int64  
 23  B         2126 non-null   int64  
 24  C         2126 non-null   int64  
 25  D         2126 non-null   int64  
 26  E         2126 non-null   int64  
 27  AD        2126 non-null   int64  
 28  DE        2126 non-null   int64  
 29  LD        2126 non-null   int64  
 30  FS        2126 non-null   int64  
 31  SUSP      2126 non-null   int64  
 32  CLASS     2126 non-null   float64
 33  NSP       2126 non-null   float64
dtypes: float64(23), int64(11)
memory usage: 581.3 KB

# Cast the ordinal variables to integer:
# - Tendency: -1 (left asymmetric), 0 (symmetric), 1 (right asymmetric)
# - NSP: 1 (Normal), 2 (Suspect), 3 (Pathological)
for ordinal_column in ('Tendency', 'NSP'):
    df_copy[ordinal_column] = df_copy[ordinal_column].astype('int64')

# Split the variables by data type:
# - Numeric (float): continuous CTG measurements
# - Categorical (int, object): labels, classes and morphological patterns
# NOTE(review): 'CLASS' is still float64 at this point, so it ends up in the
# numeric list; if it is a categorical code it should be cast above — confirm.
int_columns = df_copy.select_dtypes(include=int).columns.to_list()
object_columns = df_copy.select_dtypes(include=object).columns.to_list()

lst_numvar = df_copy.select_dtypes(include=float).columns.to_list()
lst_catvar = int_columns + object_columns

print("Variables numéricas:", lst_numvar)
print("\nVariables categóricas:", lst_catvar)

Variables numéricas: ['LBE', 'LB', 'AC', 'FM', 'UC', 'ASTV', 'MSTV', 'ALTV', 'MLTV', 'DL', 'DP', 'Width', 'Min', 'Max', 'Nmax', 'Nzeros', 'Mode', 'Mean', 'Median', 'Variance', 'CLASS']

Variables categóricas: ['DS', 'Tendency', 'A', 'B', 'C', 'D', 'E', 'AD', 'DE', 'LD', 'FS', 'SUSP', 'NSP']

# Plot the frequency distribution of every categorical variable,
# using horizontal bars so long labels stay readable
# The grid is sized from the number of variables (4 per row) so none is
# silently dropped by zip() if the list grows beyond a fixed 4x4 layout.
n_cols = 4
n_rows = max(1, -(-len(lst_catvar) // n_cols))  # ceiling division
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(25, 10), squeeze=False)
flat_axes = axes.ravel()

for col, ax in zip(lst_catvar, flat_axes):
    df_copy[col].value_counts().plot.barh(ax=ax)
    ax.set_xlabel('Frecuencia')
    ax.set_title(col)
    ax.bar_label(ax.containers[0])  # Show the exact count on every bar

# Remove the leftover axes that received no data, instead of leaving empty frames
for ax in flat_axes[len(lst_catvar):]:
    fig.delaxes(ax)

# tight_layout() computes the subplot spacing itself, so no manual
# subplots_adjust() call is needed before it
plt.tight_layout()
plt.show()

# Histograms of the numeric variables: they show the shape of the
# distribution (normal, skewed, bimodal) of every continuous measurement
df_copy[lst_numvar].hist(grid=False, figsize=(10, 10))
plt.tight_layout()
plt.show()

# Grid with 3 plots per row. Ceiling division allocates exactly the rows that
# are needed; `len // cols + 1` would add a spare row (and extra figure height)
# whenever the number of variables is a multiple of `cols`.
cols = 3
rows = max(1, -(-len(lst_catvar) // cols))

fig, axes = plt.subplots(rows, cols, figsize=(cols * 7, rows * 3))
axes = axes.flatten()

# Denominator for the percentages: total number of instances (loop-invariant)
total = len(df_copy)

for idx, column in enumerate(lst_catvar):
    # Plot the distribution of every categorical variable split by NSP class
    # df_copy (the preprocessed dataset) is used for consistency with the analysis
    ax = sb.countplot(x=column, hue='NSP', data=df_copy[lst_catvar], palette="Set2", ax=axes[idx])
    ax.set_title(f"Dist. de {column}")
    ax.set_xlabel(column)
    ax.set_ylabel('Número de instancias')

    # Annotate every non-empty bar with its share of the total number of instances
    for p in ax.patches:
        height = p.get_height()
        if height > 0:
            percentage = f'{100 * height / total:.1f}%'
            x = p.get_x() + p.get_width() / 2
            y = p.get_y() + height
            ax.annotate(percentage, (x, y), ha='center', va='bottom', rotation=90)

# Remove the leftover subplots that hold no data
for ax in axes[len(lst_catvar):]:
    fig.delaxes(ax)

plt.tight_layout(rect=[0, 0, 0.85, 1])
plt.show()

# Compute the Pearson correlation matrix of the numeric variables
# The heat map makes highly correlated pairs of variables easy to spot
plt.figure(figsize=(30, 18))
corr = df_copy[lst_numvar].corr()
ax = sb.heatmap(corr, annot=True, cmap=plt.cm.RdBu, vmin=-1, vmax=1, center=0, square=True)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
# Render explicitly, like the other plotting cells; without this the cell's last
# expression is the list of tick Text objects, which gets echoed as output
plt.show()

[Text(0.5, 0, 'LBE'),
 Text(1.5, 0, 'LB'),
 Text(2.5, 0, 'AC'),
 Text(3.5, 0, 'FM'),
 Text(4.5, 0, 'UC'),
 Text(5.5, 0, 'ASTV'),
 Text(6.5, 0, 'MSTV'),
 Text(7.5, 0, 'ALTV'),
 Text(8.5, 0, 'MLTV'),
 Text(9.5, 0, 'DL'),
 Text(10.5, 0, 'DP'),
 Text(11.5, 0, 'Width'),
 Text(12.5, 0, 'Min'),
 Text(13.5, 0, 'Max'),
 Text(14.5, 0, 'Nmax'),
 Text(15.5, 0, 'Nzeros'),
 Text(16.5, 0, 'Mode'),
 Text(17.5, 0, 'Mean'),
 Text(18.5, 0, 'Median'),
 Text(19.5, 0, 'Variance'),
 Text(20.5, 0, 'CLASS')]

# Box-and-whisker plots of the numeric variables
# Points beyond the whiskers (1.5 × IQR) are outlier candidates
plt.figure(figsize=(15, 10))
sb.boxplot(data=df_copy[lst_numvar], orient="h", palette="Set2")
plt.title("Diagramas de caja y bigotes para variables numéricas")
plt.xlabel("Valores")
plt.ylabel("Variables")
plt.show()

# Descriptive statistics: mean, standard deviation, minimum, maximum and percentiles
# Helps to understand the scale and variability of every variable
df_copy.describe()

def iqr_outliers(df):
    """
    Flag outliers in every numeric column with the IQR rule.

    For each numeric column the bounds are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR;
    values strictly outside them are flagged. As a side effect the column
    name, its bounds and its outlier count are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect; non-numeric columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One boolean column per numeric input column, named
        '<column>_outlier' (True = outlier).
    """
    flags = pd.DataFrame()
    numeric_columns = df.select_dtypes(include=['number']).columns

    for name in numeric_columns:
        values = df[name]
        q1, q3 = values.quantile(0.25), values.quantile(0.75)
        spread = q3 - q1                    # interquartile range
        lower_bound = q1 - 1.5 * spread
        upper_bound = q3 + 1.5 * spread

        # Build the mask once; it feeds both the flag column and the count
        is_outlier = (values < lower_bound) | (values > upper_bound)
        flags[name + '_outlier'] = is_outlier

        print(name)
        print(f"Límite inferior: {lower_bound:.4f}, Límite superior: {upper_bound:.4f}")
        print("Total de outliers: " + str(df[is_outlier].shape[0]))

    return flags

# Run the IQR detection on the numeric variables and display the boolean flags
outliers_iqr = iqr_outliers(df_copy[lst_numvar])
outliers_iqr

LBE
Límite inferior: 105.0000, Límite superior: 161.0000
Total de outliers: 0
LB
Límite inferior: 105.0000, Límite superior: 161.0000
Total de outliers: 0
AC
Límite inferior: -6.0000, Límite superior: 10.0000
Total de outliers: 83
FM
Límite inferior: -3.0000, Límite superior: 5.0000
Total de outliers: 310
UC
Límite inferior: -5.0000, Límite superior: 11.0000
Total de outliers: 22
ASTV
Límite inferior: -11.5000, Límite superior: 104.5000
Total de outliers: 0
MSTV
Límite inferior: -0.8000, Límite superior: 3.2000
Total de outliers: 70
ALTV
Límite inferior: -16.5000, Límite superior: 27.5000
Total de outliers: 309
MLTV
Límite inferior: -4.7000, Límite superior: 20.1000
Total de outliers: 71
DL
Límite inferior: -4.5000, Límite superior: 7.5000
Total de outliers: 81
DP
Límite inferior: 0.0000, Límite superior: 0.0000
Total de outliers: 178
Width
Límite inferior: -57.5000, Límite superior: 194.5000
Total de outliers: 0
Min
Límite inferior: -12.5000, Límite superior: 199.5000
Total de outliers: 0
Max
Límite inferior: 119.0000, Límite superior: 207.0000
Total de outliers: 24
Nmax
Límite inferior: -4.0000, Límite superior: 12.0000
Total de outliers: 19
Nzeros
Límite inferior: 0.0000, Límite superior: 0.0000
Total de outliers: 502
Mode
Límite inferior: 100.5000, Límite superior: 176.5000
Total de outliers: 73
Mean
Límite inferior: 95.0000, Límite superior: 175.0000
Total de outliers: 45
Median
Límite inferior: 100.5000, Límite superior: 176.5000
Total de outliers: 28
Variance
Límite inferior: -31.0000, Límite superior: 57.0000
Total de outliers: 184
CLASS
Límite inferior: -5.5000, Límite superior: 14.5000
Total de outliers: 0

def count_below_zero_per_column(df):
    """
    Count the negative values found in each column.

    The notebook uses this as a sanity check: it treats the physiological
    variables (rates, counts) as quantities that should never be negative,
    so a non-zero count may point to measurement errors.

    Parameters
    ----------
    df : pandas.DataFrame
        Columns to inspect; every column must support the `< 0` comparison.

    Returns
    -------
    dict
        Maps each column name to the number of values strictly below zero.
    """
    return {name: (df[name] < 0).sum() for name in df.columns}

# Count the negative values of every numeric variable and display the result
below_zero_counts = count_below_zero_per_column(df_copy[lst_numvar])
below_zero_counts

{'LBE': np.int64(0),
 'LB': np.int64(0),
 'AC': np.int64(0),
 'FM': np.int64(0),
 'UC': np.int64(0),
 'ASTV': np.int64(0),
 'MSTV': np.int64(0),
 'ALTV': np.int64(0),
 'MLTV': np.int64(0),
 'DL': np.int64(0),
 'DP': np.int64(0),
 'Width': np.int64(0),
 'Min': np.int64(0),
 'Max': np.int64(0),
 'Nmax': np.int64(0),
 'Nzeros': np.int64(0),
 'Mode': np.int64(0),
 'Mean': np.int64(0),
 'Median': np.int64(0),
 'Variance': np.int64(0),
 'CLASS': np.int64(0)}

def iqr_outliers_adjust_lower_bound(df):
    """
    Flag IQR outliers with the lower bound clamped at zero.

    Works like the plain IQR rule (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR), except
    that a negative lower bound is replaced by 0 for every column other than
    'Tendency', the only variable treated here as allowed to be negative.
    This avoids a meaningless negative threshold for non-negative variables.
    The column name, its bounds and its outlier count are printed as a side
    effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect; non-numeric columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One boolean column per numeric input column, named
        '<column>_outlier' (True = outlier). Empty when `df` has no numeric
        columns.
    """
    outliers_df = pd.DataFrame()
    tendency = 'Tendency'

    for column in df.select_dtypes(include=['number']).columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        # Adjustment: variables that cannot be negative get a floor of 0
        if lower_bound < 0 and column != tendency:
            lower_bound = 0
        upper_bound = Q3 + 1.5 * IQR

        # Store the flags: previously the frame was returned without ever
        # being filled, so callers always received an empty DataFrame
        is_outlier = (df[column] < lower_bound) | (df[column] > upper_bound)
        outliers_df[column + '_outlier'] = is_outlier

        print("Columna: " + column)
        print(f"Límite inferior: {lower_bound:.4f}, Límite superior: {upper_bound:.4f}")
        print("Total de outliers: " + str(int(is_outlier.sum())))
        print(50 * '-')

    return outliers_df

# Re-run the IQR detection with the zero-clamped lower bound (rebinds outliers_iqr)
outliers_iqr = iqr_outliers_adjust_lower_bound(df_copy[lst_numvar])

Columna: LBE
Límite inferior: 105.0000, Límite superior: 161.0000
Total de outliers: 0
--------------------------------------------------
Columna: LB
Límite inferior: 105.0000, Límite superior: 161.0000
Total de outliers: 0
--------------------------------------------------
Columna: AC
Límite inferior: 0.0000, Límite superior: 10.0000
Total de outliers: 83
--------------------------------------------------
Columna: FM
Límite inferior: 0.0000, Límite superior: 5.0000
Total de outliers: 310
--------------------------------------------------
Columna: UC
Límite inferior: 0.0000, Límite superior: 11.0000
Total de outliers: 22
--------------------------------------------------
Columna: ASTV
Límite inferior: 0.0000, Límite superior: 104.5000
Total de outliers: 0
--------------------------------------------------
Columna: MSTV
Límite inferior: 0.0000, Límite superior: 3.2000
Total de outliers: 70
--------------------------------------------------
Columna: ALTV
Límite inferior: 0.0000, Límite superior: 27.5000
Total de outliers: 309
--------------------------------------------------
Columna: MLTV
Límite inferior: 0.0000, Límite superior: 20.1000
Total de outliers: 71
--------------------------------------------------
Columna: DL
Límite inferior: 0.0000, Límite superior: 7.5000
Total de outliers: 81
--------------------------------------------------
Columna: DP
Límite inferior: 0.0000, Límite superior: 0.0000
Total de outliers: 178
--------------------------------------------------
Columna: Width
Límite inferior: 0.0000, Límite superior: 194.5000
Total de outliers: 0
--------------------------------------------------
Columna: Min
Límite inferior: 0.0000, Límite superior: 199.5000
Total de outliers: 0
--------------------------------------------------
Columna: Max
Límite inferior: 119.0000, Límite superior: 207.0000
Total de outliers: 24
--------------------------------------------------
Columna: Nmax
Límite inferior: 0.0000, Límite superior: 12.0000
Total de outliers: 19
--------------------------------------------------
Columna: Nzeros
Límite inferior: 0.0000, Límite superior: 0.0000
Total de outliers: 502
--------------------------------------------------
Columna: Mode
Límite inferior: 100.5000, Límite superior: 176.5000
Total de outliers: 73
--------------------------------------------------
Columna: Mean
Límite inferior: 95.0000, Límite superior: 175.0000
Total de outliers: 45
--------------------------------------------------
Columna: Median
Límite inferior: 100.5000, Límite superior: 176.5000
Total de outliers: 28
--------------------------------------------------
Columna: Variance
Límite inferior: 0.0000, Límite superior: 57.0000
Total de outliers: 184
--------------------------------------------------
Columna: CLASS
Límite inferior: 0.0000, Límite superior: 14.5000
Total de outliers: 0
--------------------------------------------------

def out_std(df, column):
    """
    Report the outliers of one column using the three-sigma rule.

    The bounds are mean ± 3 standard deviations, which covers about 99.7% of
    the data under a normal distribution. A negative lower bound is replaced
    by 0 unless the column is 'Tendency'. The bounds and the number of rows
    outside them are printed; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Data that contains `column`.
    column : str
        Name of the column to analyse.
    """
    series = df[column]
    center = series.mean()
    cut_off = series.std() * 3

    upper_bound = center + cut_off
    lower_bound = center - cut_off
    # Adjustment: variables other than 'Tendency' get a floor of 0
    if lower_bound < 0 and column != 'Tendency':
        lower_bound = 0

    flagged_rows = df[(series < lower_bound) | (series > upper_bound)]

    print(f'Límite inferior: {lower_bound:.2f} \t Límite superior: {upper_bound:.2f}')
    print("Total de outliers: " + str(flagged_rows.shape[0]))


# Apply the three-sigma check to every numeric variable, one report per column
numeric_frame = df_copy[lst_numvar]
for col in numeric_frame.columns:
    print(col)
    out_std(numeric_frame, col)
    print('--' * 25)

LBE
Límite inferior: 103.78 	 Límite superior: 162.83
Total de outliers: 0
--------------------------------------------------
LB
Límite inferior: 103.78 	 Límite superior: 162.83
Total de outliers: 0
--------------------------------------------------
AC
Límite inferior: 0.00 	 Límite superior: 13.41
Total de outliers: 35
--------------------------------------------------
FM
Límite inferior: 0.00 	 Límite superior: 118.62
Total de outliers: 31
--------------------------------------------------
UC
Límite inferior: 0.00 	 Límite superior: 12.20
Total de outliers: 13
--------------------------------------------------
ASTV
Límite inferior: 0.00 	 Límite superior: 98.57
Total de outliers: 0
--------------------------------------------------
MSTV
Límite inferior: 0.00 	 Límite superior: 3.98
Total de outliers: 33
--------------------------------------------------
ALTV
Límite inferior: 0.00 	 Límite superior: 65.04
Total de outliers: 59
--------------------------------------------------
MLTV
Límite inferior: 0.00 	 Límite superior: 25.07
Total de outliers: 33
--------------------------------------------------
DL
Límite inferior: 0.00 	 Límite superior: 9.07
Total de outliers: 28
--------------------------------------------------
DP
Límite inferior: 0.00 	 Límite superior: 1.52
Total de outliers: 69
--------------------------------------------------
Width
Límite inferior: 0.00 	 Límite superior: 187.31
Total de outliers: 0
--------------------------------------------------
Min
Límite inferior: 4.90 	 Límite superior: 182.26
Total de outliers: 0
--------------------------------------------------
Max
Límite inferior: 110.19 	 Límite superior: 217.86
Total de outliers: 14
--------------------------------------------------
Nmax
Límite inferior: 0.00 	 Límite superior: 12.92
Total de outliers: 19
--------------------------------------------------
Nzeros
Límite inferior: 0.00 	 Límite superior: 2.44
Total de outliers: 28
--------------------------------------------------
Mode
Límite inferior: 88.31 	 Límite superior: 186.60
Total de outliers: 39
--------------------------------------------------
Mean
Límite inferior: 87.83 	 Límite superior: 181.39
Total de outliers: 26
--------------------------------------------------
Median
Límite inferior: 94.69 	 Límite superior: 181.49
Total de outliers: 16
--------------------------------------------------
Variance
Límite inferior: 0.00 	 Límite superior: 105.74
Total de outliers: 44
--------------------------------------------------
CLASS
Límite inferior: 0.00 	 Límite superior: 13.59
Total de outliers: 0
--------------------------------------------------

# Variables selected for the Isolation Forest anomaly analysis
# (the ones the notebook considers most diagnostically relevant from the correlation step)
correlated_vars = ['FM', 'UC', 'MSTV', 'ASTV', 'MLTV', 'ALTV', 'Mean', 'Median', 'Nmax', 'Max']
df_correlated = df_copy[correlated_vars]

fig, axs = plt.subplots(2, 5, figsize=(25, 10), facecolor='w', edgecolor='k')
axs = axs.ravel()
plt.subplots_adjust(wspace=0.5, hspace=0.5)

for i, column in enumerate(correlated_vars):
    # Use a plain NumPy column for both fitting and scoring. Fitting on a
    # DataFrame and then scoring the bare `xx` array makes scikit-learn emit
    # "X does not have valid feature names" on every scoring call.
    column_values = df_correlated[[column]].to_numpy()

    # Fit one univariate Isolation Forest per variable
    # contamination=0.05: assume ~5% of the data are anomalies
    isolation_forest = IsolationForest(contamination=0.05, random_state=42)
    isolation_forest.fit(column_values)

    # Evenly spaced points over the observed range, used to draw the score curve
    xx = np.linspace(df_correlated[column].min(), df_correlated[column].max(), 1000).reshape(-1, 1)
    anomaly_score = isolation_forest.decision_function(xx)  # Score: negative values = anomaly
    outlier = isolation_forest.predict(xx)                  # -1 = outlier, 1 = inlier

    # Anomaly score of every real data point
    data_scores = isolation_forest.decision_function(column_values)

    axs[i].plot(xx, anomaly_score, label='Score de Anomalía', color='blue', linewidth=2)
    axs[i].fill_between(xx.ravel(), np.min(anomaly_score), np.max(anomaly_score),
                        where=outlier == -1, color='red', alpha=0.3, label='Región Outlier')
    axs[i].scatter(df_correlated[column], data_scores, color='black', s=20, alpha=0.7, label='Datos')

    axs[i].set_title(f"Detección en {column}", fontsize=12, pad=10)
    axs[i].set_xlabel(column, fontsize=10)
    axs[i].set_ylabel("Score de Anomalía", fontsize=10)
    axs[i].grid(True, linestyle='--', alpha=0.6)
    axs[i].legend(loc='upper right', fontsize=8)

plt.tight_layout()
plt.show()

/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py:2739: UserWarning: X does not have valid feature names, but IsolationForest was fitted with feature names
  warnings.warn(
/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py:2739: UserWarning: X does not have valid feature names, but IsolationForest was fitted with feature names
  warnings.warn(
/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py:2739: UserWarning: X does not have valid feature names, but IsolationForest was fitted with feature names
  warnings.warn(
/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py:2739: UserWarning: X does not have valid feature names, but IsolationForest was fitted with feature names
  warnings.warn(
/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py:2739: UserWarning: X does not have valid feature names, but IsolationForest was fitted with feature names
  warnings.warn(
/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py:2739: UserWarning: X does not have valid feature names, but IsolationForest was fitted with feature names
  warnings.warn(
/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py:2739: UserWarning: X does not have valid feature names, but IsolationForest was fitted with feature names
  warnings.warn(
/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py:2739: UserWarning: X does not have valid feature names, but IsolationForest was fitted with feature names
  warnings.warn(
/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py:2739: UserWarning: X does not have valid feature names, but IsolationForest was fitted with feature names
  warnings.warn(
/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py:2739: UserWarning: X does not have valid feature names, but IsolationForest was fitted with feature names
  warnings.warn(
/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py:2739: UserWarning: X does not have valid feature names, but IsolationForest was fitted with feature names
  warnings.warn(
/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py:2739: UserWarning: X does not have valid feature names, but IsolationForest was fitted with feature names
  warnings.warn(
/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py:2739: UserWarning: X does not have valid feature names, but IsolationForest was fitted with feature names
  warnings.warn(
/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py:2739: UserWarning: X does not have valid feature names, but IsolationForest was fitted with feature names
  warnings.warn(
/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py:2739: UserWarning: X does not have valid feature names, but IsolationForest was fitted with feature names
  warnings.warn(
/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py:2739: UserWarning: X does not have valid feature names, but IsolationForest was fitted with feature names
  warnings.warn(
/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py:2739: UserWarning: X does not have valid feature names, but IsolationForest was fitted with feature names
  warnings.warn(
/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py:2739: UserWarning: X does not have valid feature names, but IsolationForest was fitted with feature names
  warnings.warn(
/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py:2739: UserWarning: X does not have valid feature names, but IsolationForest was fitted with feature names
  warnings.warn(
/usr/local/lib/python3.12/dist-packages/sklearn/utils/validation.py:2739: UserWarning: X does not have valid feature names, but IsolationForest was fitted with feature names
  warnings.warn(

# Select FM (fetal movements) and UC (uterine contractions)
# for the bivariate anomaly analysis
columnas = ['FM', 'UC']

# Standardize with StandardScaler: mean=0, std=1
# Required for distance-based algorithms such as LOF,
# so that no variable dominates merely because of its larger scale
scaler = StandardScaler()
data_scaled = scaler.fit_transform(df_copy[columnas])

print("Primeras 5 filas normalizadas:\n", data_scaled[:5])

Primeras 5 filas normalizadas:
 [[-0.19509607 -1.28579753]
 [-0.19509607  0.11947457]
 [-0.19509607  0.47079259]
 [-0.19509607  0.82211062]
 [-0.19509607  0.47079259]]

# Reference to the original (unscaled) data so the plot is drawn in real units
df_2 = df_copy[columnas]
X_2 = df_2.to_numpy()

# LOF: n_neighbors=50 → number of neighbours used to estimate the local density
# contamination='auto' → scikit-learn chooses the outlier threshold itself
# NOTE(review): the recorded run of this cell emits scikit-learn's "Duplicate
# values are leading to incorrect results" warning, which suggests increasing
# n_neighbors — confirm whether 50 is adequate for FM/UC.
clf = LocalOutlierFactor(n_neighbors=50, contamination='auto')
y_pred = clf.fit_predict(data_scaled)  # Fit on the scaled data

# Store the predictions in the DataFrame: 1 = inlier, -1 = outlier
df_copy['Outlier'] = y_pred

plt.figure(figsize=(10, 7))

# Boolean masks that separate inliers from outliers in the plot
# (vectorised comparison instead of a Python-level loop over the labels)
in_mask = y_pred == 1
out_mask = y_pred == -1

plt.title("Local Outlier Factor (LOF) — FM vs UC")

# Plot in the original space for better clinical interpretability
plt.scatter(X_2[in_mask, 0], X_2[in_mask, 1],
            color='blue', label='Inliers', alpha=0.6, s=30)
plt.scatter(X_2[out_mask, 0], X_2[out_mask, 1],
            color='red', label='Outliers', edgecolors='k', s=30)

plt.xlabel(columnas[0])
plt.ylabel(columnas[1])
plt.legend()
plt.grid(True)
plt.show()

/usr/local/lib/python3.12/dist-packages/sklearn/neighbors/_lof.py:322: UserWarning: Duplicate values are leading to incorrect results. Increase the number of neighbors for more accurate results.
  warnings.warn(

# Fit on the scaled data, for consistency with LOF
# Note: y_pred is rebound here to the SVM labels; the LOF labels stay
# available in df_copy['Outlier'].
one_SVM = OneClassSVM(kernel='rbf', gamma=0.001, nu=0.05)
one_SVM.fit(data_scaled)               # Fit on the scaled data
y_pred = one_SVM.predict(data_scaled)  # Predict on the scaled data (-1 = outlier)

# Split the rows in original scale with a boolean mask. The 'Inliers' series
# now holds only the inliers; before, it plotted every point, outliers included.
is_outlier = y_pred == -1
outliers_values = df_2[is_outlier]
inliers_values = df_2[~is_outlier]

plt.title("One-Class SVM: Outliers detectados — FM vs UC")
plt.scatter(inliers_values[columnas[0]], inliers_values[columnas[1]],
            alpha=0.6, label='Inliers')
plt.scatter(outliers_values[columnas[0]], outliers_values[columnas[1]],
            color='red', edgecolors='k', label='Outliers')
plt.axis('tight')
plt.xlabel(columnas[0])
plt.ylabel(columnas[1])
plt.legend()
plt.show()

# Compare the outliers detected by LOF against the real fetal-state labels (NSP)
lof_names = {1: 'Inlier', -1: 'Outlier'}
nsp_names = {1: 'Normal', 2: 'Sospechoso', 3: 'Patológico'}

outlier_eval = df_copy[['NSP', 'Outlier']].copy()
outlier_eval['Outlier_label'] = outlier_eval['Outlier'].map(lof_names)
outlier_eval['NSP_label'] = outlier_eval['NSP'].map(nsp_names)

# Cross tabulation: rows = NSP class, columns = LOF classification
ct = pd.crosstab(outlier_eval['NSP_label'], outlier_eval['Outlier_label'])
row_totals = ct.sum(axis=1)
ct_percent = (ct.div(row_totals, axis=0) * 100).round(1)  # share within each NSP class

print("Distribución de outliers LOF por clase NSP:")
print(ct)
print()
print("Porcentaje de outliers por clase NSP:")
print(ct_percent)

# Heat map of the raw counts to visualise the relationship
plt.figure(figsize=(8, 5))
sb.heatmap(ct, annot=True, fmt='d', cmap='YlOrRd')
plt.title('Outliers detectados por LOF vs Etiqueta NSP real')
plt.ylabel('Estado Fetal (NSP)')
plt.xlabel('Clasificación LOF')
plt.tight_layout()
plt.show()

Distribución de outliers LOF por clase NSP:
Outlier_label  Inlier  Outlier
NSP_label                     
Normal           1208      447
Patológico        105       71
Sospechoso        246       49

Porcentaje de outliers por clase NSP:
Outlier_label  Inlier  Outlier
NSP_label                     
Normal           73.0     27.0
Patológico       59.7     40.3
Sospechoso       83.4     16.6

# df_copy (the preprocessed dataset) is used for consistency with the analysis
# NOTE(review): these features are passed to K-Means in their original units
# (no scaler is applied in this cell) — confirm that this is intended.
cluster_features = ['LB', 'AC', 'FM', 'UC', 'ASTV', 'MSTV', 'Mean', 'Median']
df_3 = df_copy[cluster_features]
X_3 = df_3.to_numpy()

# Try K from 1 to 9 and compute the score of every fitted model
Nc = range(1, 10)
kmeans = [KMeans(n_clusters=k, random_state=42) for k in Nc]
score = [model.fit(X_3).score(X_3) for model in kmeans]

plt.rcParams['figure.figsize'] = (10, 10)
plt.plot(Nc, score, marker='o')
plt.xlabel('Número de Clusters (K)')
plt.ylabel('Score (inercia negativa)')
plt.title('Método del Codo para K-Means')
plt.grid(True)
plt.show()

# Fit K-Means with K=4 (chosen with the elbow method)
kmeans = KMeans(n_clusters=4, random_state=42).fit(X_3)
labels = kmeans.predict(X_3)   # Cluster label of every sample
C = kmeans.cluster_centers_    # Coordinates of the 4 centroids

print("Distribución de muestras por cluster:")
print(pd.Series(labels).value_counts().sort_index())

colores = ['red', 'blue', 'green', 'yellow']
asignar = [colores[row] for row in labels]  # Colour assigned to every sample

# 3D view: axes = FM (idx 2), LB (idx 0), AC (idx 1)
# The 3D axes are created through the figure API rather than by instantiating
# Axes3D by hand; the rect [0, 0, 1, 1] makes them span the whole figure.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_axes([0, 0, 1, 1], projection='3d')

# Samples coloured by cluster
ax.scatter(X_3[:, 2], X_3[:, 0], X_3[:, 1], c=asignar, s=50, alpha=0.6)

# Centroids drawn as stars (*)
ax.scatter(C[:, 2], C[:, 0], C[:, 1], marker='*', c=colores, s=1000, edgecolors='black')

ax.set_xlabel('FM (Movimientos Fetales)')
ax.set_ylabel('LB (Línea Base FHR)')
ax.set_zlabel('AC (Aceleraciones)')
ax.set_title('Clusters K-Means (K=4) — Visualización 3D')
plt.show()

Distribución de muestras por cluster:
0    1013
1     683
2      26
3     404
Name: count, dtype: int64

from sklearn.metrics import silhouette_score, davies_bouldin_score

# Internal clustering quality metrics, computed on the same features (X_3)
# and cluster assignments (labels) produced by K-Means.
#   - Silhouette: how well separated the clusters are; range -1 to 1,
#     values near 1 indicate well-defined clusters.
#   - Davies-Bouldin: ratio of within-cluster scatter to between-cluster
#     separation; range 0 to infinity, values near 0 are better.
sil = silhouette_score(X_3, labels)
db = davies_bouldin_score(X_3, labels)

# Both lines are emitted by a single print call, one per line.
print(
    f"Silhouette Score:       {sil:.4f}  (óptimo: cercano a 1)",
    f"Davies-Bouldin Score:   {db:.4f}  (óptimo: cercano a 0)",
    sep="\n",
)

Silhouette Score:       0.3596  (óptimo: cercano a 1)
Davies-Bouldin Score:   0.8644  (óptimo: cercano a 0)

# Cross-tabulate cluster membership against the fetal-state label (NSP) to
# see which classes dominate each cluster. A cluster with a high share of
# NSP=3 (mapped to 'Patológico' below) would be clinically relevant.
# labels and df_copy['NSP'] are paired by position, so both must describe the
# same rows in the same order.
cluster_nsp = pd.DataFrame({'Cluster': labels, 'NSP': df_copy['NSP'].values})
cluster_nsp['NSP_label'] = cluster_nsp['NSP'].map({1: 'Normal', 2: 'Sospechoso', 3: 'Patológico'})

# Raw counts per (cluster, class), then row-normalised so each cluster sums to 1.
ct_cluster = pd.crosstab(cluster_nsp['Cluster'], cluster_nsp['NSP_label'])
ct_prop = ct_cluster.div(ct_cluster.sum(axis=1), axis=0)

print("Composición de clusters por clase NSP (conteos):")
print(ct_cluster)
print()
print("Composición proporcional:")
print((ct_prop * 100).round(1))

# Proportional stacked bar chart: stacked=True is required, otherwise pandas
# draws the classes as side-by-side (grouped) bars instead of one bar per
# cluster split by class.
ct_prop.plot(kind='bar', stacked=True, colormap='Set2', figsize=(10, 5))
plt.title('Composición proporcional de cada cluster por estado fetal (NSP)')
plt.xlabel('Cluster (K-Means)')
plt.ylabel('Proporción')
plt.legend(title='Estado Fetal', bbox_to_anchor=(1.05, 1))
plt.tight_layout()
plt.show()

Composición de clusters por clase NSP (conteos):
NSP_label  Normal  Patológico  Sospechoso
Cluster                                  
0             720          37         256
1             668           9           6
2              14           8           4
3             253         122          29

Composición proporcional:
NSP_label  Normal  Patológico  Sospechoso
Cluster                                  
0            71.1         3.7        25.3
1            97.8         1.3         0.9
2            53.8        30.8        15.4
3            62.6        30.2         7.2

	LBE	LB	AC	FM	UC	ASTV	MSTV	ALTV	MLTV	DL	...	C	D	E	AD	DE	LD	FS	SUSP	CLASS	NSP
count	2126.000000	2126.000000	2126.000000	2126.000000	2126.000000	2126.000000	2126.000000	2126.00000	2126.000000	2126.000000	...	2126.000000	2126.000000	2126.000000	2126.000000	2126.000000	2126.000000	2126.000000	2126.000000	2126.000000	2126.000000
mean	133.303857	133.303857	2.722484	7.241298	3.659925	46.990122	1.332785	9.84666	8.187629	1.570085	...	0.024929	0.038100	0.033866	0.156162	0.118532	0.050329	0.032455	0.092662	4.509878	1.304327
std	9.840844	9.840844	3.560850	37.125309	2.847094	17.192814	0.883241	18.39688	5.628247	2.499229	...	0.155947	0.191482	0.180928	0.363094	0.323314	0.218675	0.177248	0.290027	3.026883	0.614377
min	106.000000	106.000000	0.000000	0.000000	0.000000	12.000000	0.200000	0.00000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	1.000000
25%	126.000000	126.000000	0.000000	0.000000	1.000000	32.000000	0.700000	0.00000	4.600000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	2.000000	1.000000
50%	133.000000	133.000000	1.000000	0.000000	3.000000	49.000000	1.200000	0.00000	7.400000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	4.000000	1.000000
75%	140.000000	140.000000	4.000000	2.000000	5.000000	61.000000	1.700000	11.00000	10.800000	3.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	7.000000	1.000000
max	160.000000	160.000000	26.000000	564.000000	23.000000	87.000000	7.000000	91.00000	50.700000	16.000000	...	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	10.000000	3.000000

1. Importación de Librerías¶

2. Carga y Preprocesamiento del Dataset¶

3. Análisis Exploratorio de Datos (EDA)¶

3.1 Balance del Dataset¶

3.2 Variables con Baja Variabilidad¶

3.3 Separación de Variables Numéricas y Categóricas¶

3.4 Distribución de Variables Categóricas¶

3.5 Distribución de Variables Numéricas¶

3.6 Relación de Variables Categóricas con el Estado Fetal (NSP)¶

3.7 Matriz de Correlación (Variables Numéricas)¶

3.8 Boxplots y Estadísticas Descriptivas¶

4. Detección de Outliers Estadísticos¶

4.1 Método del Rango Intercuartílico (IQR)¶

4.2 IQR con Ajuste de Límite Inferior¶

4.3 Regla de las 3 Desviaciones Estándar (3σ)¶

5. Detección de Anomalías con Machine Learning¶

5.1 Isolation Forest¶

5.2 Local Outlier Factor (LOF)¶

5.3 One-Class SVM¶

6. Evaluación de los Métodos de Detección¶

7. Agrupamiento (Clustering) con K-Means¶

7.1 Método del Codo (Elbow Method)¶

7.2 Visualización 3D y Análisis de Clusters¶

8. Conclusiones¶

Dataset¶

Detección de Outliers Estadísticos¶

Detección de Anomalías con ML¶

Clustering con K-Means¶

Aplicaciones Clínicas¶

	FileName	Date	SegFile	b	e	LBE	LB	AC	UC	...	AD	FS	CLASS	NSP
0	Variab10.txt	12/1/1996	CTG0001.txt	240.0	357.0	120.0	120.0	0.0	0.0	...	0.0	1.0	9.0	2.0
1	Fmcs_1.txt	5/3/1996	CTG0002.txt	5.0	632.0	132.0	132.0	4.0	4.0	...	1.0	0.0	6.0	1.0
2	Fmcs_1.txt	5/3/1996	CTG0003.txt	177.0	779.0	133.0	133.0	2.0	5.0	...	1.0	0.0	6.0	1.0
3	Fmcs_1.txt	5/3/1996	CTG0004.txt	411.0	1192.0	134.0	134.0	2.0	6.0	...	1.0	0.0	6.0	1.0
4	Fmcs_1.txt	5/3/1996	CTG0005.txt	533.0	1147.0	132.0	132.0	4.0	5.0	...	0.0	0.0	2.0	1.0

	LBE	LB	AC	UC	ASTV	MSTV	ALTV	MLTV	DL	...	AD	FS	CLASS	NSP
0	120.0	120.0	0.0	0.0	73.0	0.5	43.0	2.4	0.0	...	0.0	1.0	9.0	2.0
1	132.0	132.0	4.0	4.0	17.0	2.1	0.0	10.4	2.0	...	1.0	0.0	6.0	1.0
2	133.0	133.0	2.0	5.0	16.0	2.1	0.0	13.4	2.0	...	1.0	0.0	6.0	1.0
3	134.0	134.0	2.0	6.0	16.0	2.4	0.0	23.0	2.0	...	1.0	0.0	6.0	1.0
4	132.0	132.0	4.0	5.0	16.0	2.4	0.0	19.9	0.0	...	0.0	0.0	2.0	1.0

	0
LBE	0
LB	0
AC	891
FM	1311
UC	332
ASTV	0
MSTV	0
ALTV	1240
MLTV	137
DL	1231
DS	2119
DP	1948
DR	2126
Width	0
Min	0
Max	0
Nmax	107
Nzeros	1624
Mode	0
Mean	0
Median	0
Variance	187
Tendency	1115
A	1742
B	1547
C	2073
D	2045
E	2054
AD	1794
DE	1874
LD	2019
FS	2057
SUSP	1929
CLASS	0
NSP	0

	LBE_outlier	LB_outlier	AC_outlier	FM_outlier	UC_outlier	ASTV_outlier	MSTV_outlier	ALTV_outlier	MLTV_outlier	DL_outlier	...	Width_outlier	Min_outlier	Max_outlier	Nmax_outlier	Nzeros_outlier	Mode_outlier	Mean_outlier	Median_outlier	Variance_outlier	CLASS_outlier
0	False	False	False	False	False	False	False	True	False	False	...	False	False	False	False	False	False	False	False	True	False
1	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	True	False	False	False	False	False
2	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	True	False	False	False	False	False
3	False	False	False	False	False	False	False	False	True	False	...	False	False	False	False	False	False	False	False	False	False
4	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	False
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2121	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	False
2122	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	False
2123	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	False
2124	False	False	False	False	False	False	False	False	False	False	...	False	False	False	False	False	False	False	False	False	False
2125	False	False	False	False	False	False	False	True	False	False	...	False	False	False	False	True	False	False	False	False	False