Source code for ds_utils.metrics.error_analysis

"""Error Analysis Module.

This module provides functions to generate tabular error-analysis reports
to help identify specific feature ranges or categories where a model underperforms.
"""

from typing import List, Optional

import numpy as np
import pandas as pd


[docs] def generate_error_analysis_report( X: pd.DataFrame, y_true: np.ndarray, y_pred: np.ndarray, feature_columns: Optional[List[str]] = None, bins: int = 10, threshold: float = 0.5, min_count: int = 1, sort_metric: str = "error_rate", ascending: bool = False, ) -> pd.DataFrame: """Generate a tabular error-analysis report grouped by feature values. The report groups predictions by feature values and computes error metrics per group. For numerical features, values are binned into equal-width bins. For categorical features, raw values are used as groups. :param X: Feature DataFrame. :param y_true: True labels. :param y_pred: Predicted labels. :param feature_columns: List of columns to analyze. If None, all columns in X are used. :param bins: Number of bins for numerical features. :param threshold: Threshold for probability-based error definitions. Validated but not used in the current implementation; reserved for future probability-based error definitions. :param min_count: Minimum number of samples in a group to be included in the report. :param sort_metric: Metric to sort the report by. Valid options are: 'feature', 'group', 'count', 'error_count', 'error_rate', 'accuracy'. :param ascending: Whether to sort in ascending order. :return: DataFrame containing the error analysis report. :raises ValueError: If bins < 1, threshold not in [0, 1], min_count < 1, or invalid sort_metric. :raises KeyError: If any column in feature_columns is missing from X. """ if bins < 1: raise ValueError("bins must be at least 1") if not (0 <= threshold <= 1): raise ValueError("threshold must be between 0 and 1 inclusive") if min_count < 1: raise ValueError("min_count must be at least 1") valid_columns = ["feature", "group", "count", "error_count", "error_rate", "accuracy"] if sort_metric not in valid_columns: raise ValueError(f"sort_metric must be one of {valid_columns}") y_true = np.asarray(y_true) y_pred = np.asarray(y_pred) if len(y_true) != len(X) or len(y_pred) != len(X): raise ValueError("X, y_true, and y_pred must have the same number of samples") if feature_columns is not None: missing_cols = [col for col in feature_columns if col not in X.columns] if missing_cols: raise KeyError(f"The following columns are missing from X: {missing_cols}") cols_to_use = feature_columns else: cols_to_use = X.columns.tolist() internal_df = X[cols_to_use].copy() internal_df["__is_error__"] = y_true != y_pred all_reports = [] for col in cols_to_use: col_series = internal_df[col] if pd.api.types.is_numeric_dtype(col_series): groups = pd.cut(col_series, bins=bins) else: groups = col_series report = ( internal_df.groupby(groups, observed=True) .agg(count=("__is_error__", "size"), error_count=("__is_error__", "sum")) .reset_index() ) report.rename(columns={col: "group"}, inplace=True) report["feature"] = col report["error_rate"] = report["error_count"] / report["count"] report["accuracy"] = 1 - report["error_rate"] report = report[report["count"] >= min_count] all_reports.append(report[valid_columns]) if not all_reports: return pd.DataFrame(columns=valid_columns) final_report = pd.concat(all_reports).sort_values(by=sort_metric, ascending=ascending).reset_index(drop=True) return final_report