# Source code for tab_right.base_architecture.drift_protocols

"""Protocol definitions for drift detection and analysis in tab-right.

This module defines protocol classes and type aliases used for implementing
drift detection functionality across different feature types. These protocols
establish a consistent interface for all drift detection implementations.
"""

from dataclasses import dataclass
from typing import Dict, Iterable, Mapping, Optional, Protocol, runtime_checkable

import pandas as pd



# NOTE(review): @dataclass on a Protocol generates __init__/__repr__/__eq__ on
# the protocol class itself; kept as-is for compatibility -- confirm intended.
@runtime_checkable
@dataclass
class DriftCalcP(Protocol):
    """Protocol for drift calculation implementations.

    This protocol defines the interface that all drift calculation classes
    must implement. It specifies methods for detecting distributional shifts
    between two datasets.

    Parameters
    ----------
    df1 : pd.DataFrame
        The reference DataFrame containing the baseline distribution.
    df2 : pd.DataFrame
        The current DataFrame to compare against the reference.
    kind : Optional[Dict[str, str]]
        Controls how columns are treated:

        - None: a general policy is used to determine column types.
        - Dict[str, str]: explicit mapping from column name to
          "continuous" or "categorical".

    Notes
    -----
    Implementations of this protocol are responsible for:

    1. Comparing distribution shifts between reference and current data
    2. Automatically selecting appropriate metrics based on data types
    3. Providing normalized scores for comparison across features

    All methods declared here are interface stubs: their bodies are empty
    and return ``None``; concrete classes supply the behaviour.

    """

    df1: pd.DataFrame
    df2: pd.DataFrame
    kind: Optional[Dict[str, str]]

    def __call__(self, columns: Optional[Iterable[str]] = None, bins: int = 10, **kwargs: Mapping) -> pd.DataFrame:
        """Calculate drift between two DataFrames.

        Parameters
        ----------
        columns : Optional[Iterable[str]], default None
            Specific columns to analyze. If None, analyzes all common columns.
        bins : int, default 10
            Number of bins for histograms when analyzing continuous features.
        **kwargs : Mapping
            Additional parameters specific to the drift calculation implementation.

        Returns
        -------
        pd.DataFrame
            A DataFrame containing the drift metrics for each column.
            Must contain at least the following columns:

            - "feature": The name of the feature.
            - "type": The type of metric used (e.g., "wasserstein", "cramer_v").
            - "score": The calculated drift score (normalized to [0,1] when applicable).

            May also include:

            - "raw_score": The original, unnormalized score for continuous features.
            - "threshold": Optional threshold value for drift significance.

        """
        ...

    def get_prob_density(
        self,
        columns: Optional[Iterable[str]] = None,
        bins: int = 10,
    ) -> pd.DataFrame:
        """Get the probability density functions for the features.

        Parameters
        ----------
        columns : Optional[Iterable[str]], default None
            Specific columns to analyze. If None, analyzes all common columns.
        bins : int, default 10
            Number of bins for histograms when analyzing continuous features.

        Returns
        -------
        pd.DataFrame
            A DataFrame containing the probability density functions.
            Must contain at least the following columns:

            - "feature": The name of the feature.
            - "bin": The bin or category.
            - "ref_density": The density in the reference dataset.
            - "cur_density": The density in the current dataset.

        """
        ...

    @classmethod
    def _categorical_drift_calc(cls, s1: pd.Series, s2: pd.Series) -> float:
        """Calculate drift for categorical features (normalized to [0,1]).

        Parameters
        ----------
        s1 : pd.Series
            Reference distribution.
        s2 : pd.Series
            Current distribution.

        Returns
        -------
        float
            Normalized drift score between 0 and 1.

        Notes
        -----
        Typically implemented using Cramér's V statistic:
        V = sqrt(χ² / (n * min(k-1, r-1)))
        where:

        - χ² is the chi-squared statistic
        - n is the total number of observations
        - k is the number of categories in the first variable
        - r is the number of categories in the second variable

        """
        ...

    @classmethod
    def _continuous_drift_calc(cls, s1: pd.Series, s2: pd.Series, bins: int = 10) -> float:
        """Calculate drift for continuous features (normalized to [0,1]).

        Parameters
        ----------
        s1 : pd.Series
            Reference distribution.
        s2 : pd.Series
            Current distribution.
        bins : int, default 10
            Number of bins for histogram comparison.

        Returns
        -------
        float
            Normalized drift score between 0 and 1.

        Notes
        -----
        Can be implemented using various metrics:

        - Wasserstein distance (normalized)
        - Kolmogorov-Smirnov test statistic

        """
        ...