Source code for tab_right.base_architecture.seg_protocols

"""Protocol definitions for data segmentation analysis in tab-right.

This module defines protocol classes and type aliases for segmentation analysis,
including interfaces for segmentation calculations and feature-based segmentation.
"""

from dataclasses import dataclass
from typing import Any, Callable, Dict, List, Optional, Protocol, Union, runtime_checkable

import pandas as pd
from pandas.api.typing import DataFrameGroupBy

# Metric callable taking two pandas Series (true and predicted values, per the
# protocol docstrings in this module) and returning a Series of values.
MetricType = Callable[[pd.Series, pd.Series], pd.Series]
# Metric callable taking the same two Series but returning a single float score.
ScoreMetricType = Callable[[pd.Series, pd.Series], float]


[docs] @runtime_checkable @dataclass class BaseSegmentationCalc(Protocol): """Base protocol for segmentation performance calculations. Parameters ---------- gdf : DataFrameGroupBy Grouped DataFrame, each group represents a segment. label_col : str Column name for the true target values. prediction_col : Union[str, List[str]] Column names for the predicted values. Can be a single column or a list of columns. Can be probabilities (multiple columns) or classes or continuous values. segment_names : Optional[Dict[int, Any]], default=None Optional mapping from an integer segment ID to the original group name (category, interval, or tuple). If provided, these IDs should match the grouping keys if gdf is grouped by integer IDs. """ gdf: DataFrameGroupBy label_col: str prediction_col: Union[str, List[str]] segment_names: Optional[Dict[int, Any]] = None def _reduce_metric_results( self, results: Union[float, pd.Series], ) -> float: """Reduce the metric results to a single value, the metric produce series of values. if produce a single value, return it. it used for getting single value for each segment. Parameters ---------- results : Union[float, pd.Series] The metric results to reduce. Returns ------- float The reduced metric result. """ def __call__(self, metric: Callable) -> pd.DataFrame: """Call method to apply the metric to each group in the DataFrameGroupBy object. Parameters ---------- metric : Callable[[pd.Series, pd.Series], pd.Series] A function that takes two pandas Series (true and predicted values) and returns a float representing the error metric. Returns ------- pd.DataFrame DataFrame containing the calculated error metrics for each segment. Expected columns: - `segment_id`: The ID of the segment (either the original group key or an assigned int). - `name`: The name of the segment (category or bin range string). - `score`: The avg error metric for each segment. """
@runtime_checkable
@dataclass
class DoubleSegmentation(Protocol):
    """Class schema for double segmentation: segmentation based on two features.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame to analyze.
    label_col : str
        The name of the column containing the true target values.
    prediction_col : str
        The name of the column containing the predicted values (classes or
        continuous values).
        NOTE(review): earlier documentation also mentioned probabilities in
        multiple columns, but the annotation allows a single column name
        only; confirm which is intended.
    """

    df: pd.DataFrame
    label_col: str
    prediction_col: str

    def _group_2_features(
        self,
        feature1: str,
        feature2: str,
        bins_1: int,
        bins_2: int,
    ) -> BaseSegmentationCalc:
        """Group the DataFrame by two features and wrap the grouped data.

        Parameters
        ----------
        feature1 : str
            The name of the first feature, which we want to find the
            segmentation for.
        feature2 : str
            The name of the second feature, which we want to find the
            segmentation for.
        bins_1 : int
            The number of bins to use for the first feature, if the feature
            is continuous.
        bins_2 : int
            The number of bins to use for the second feature, if the feature
            is continuous.

        Returns
        -------
        BaseSegmentationCalc
            A SegmentationCalc instance with grouped data. The DataFrame is
            grouped by the two features, and the segments are defined by the
            bins.
        """

    def __call__(
        self,
        feature1_col: str,
        feature2_col: str,
        score_metric: ScoreMetricType,
        bins_1: int,
        bins_2: int,
    ) -> pd.DataFrame:
        """Apply grouping and scoring to the segments of two features.

        Parameters
        ----------
        feature1_col : str
            The name of the first feature, which we want to find the
            segmentation for.
        feature2_col : str
            The name of the second feature, which we want to find the
            segmentation for.
        score_metric : ScoreMetricType
            A function that takes two pandas Series (true and predicted
            values) and returns a float representing the error metric.
        bins_1 : int
            The number of bins to use for the first feature, if the feature
            is continuous. Ignored if the feature is categorical.
            NOTE(review): earlier documentation stated ``default=4``, but
            this signature declares no default; confirm against the
            implementations.
        bins_2 : int
            The number of bins to use for the second feature, if the feature
            is continuous. Ignored if the feature is categorical.
            NOTE(review): same ``default=4`` discrepancy as ``bins_1``.

        Returns
        -------
        pd.DataFrame
            A DataFrame with one row per segment. Columns:

            - `segment_id`: The ID of the segment, for grouping.
            - `feature_1`: (str) the range or category of the first feature.
            - `feature_2`: (str) the range or category of the second feature.
            - `score`: (float) The calculated error metric for the segment.
        """