# Source code for tab_right.base_architecture.drift_protocols
"""Protocol definitions for drift detection and analysis in tab-right.
This module defines protocol classes and type aliases used for implementing
drift detection functionality across different feature types. These protocols
establish a consistent interface for all drift detection implementations.
"""
from dataclasses import dataclass
from typing import Dict, Iterable, Mapping, Optional, Protocol, runtime_checkable
import pandas as pd
@runtime_checkable
@dataclass
class DriftCalcP(Protocol):
    """Protocol for drift calculation implementations.

    This protocol defines the interface that all drift calculation classes
    must implement. It specifies methods for detecting distributional shifts
    between two datasets.

    Parameters
    ----------
    df1 : pd.DataFrame
        The reference DataFrame containing the baseline distribution.
    df2 : pd.DataFrame
        The current DataFrame to compare against the reference.
    kind : Optional[Dict[str, str]]
        Controls how columns are treated:

        - None: general policy to determine column types
        - Dict[str, str]: explicit mapping from column name to
          "continuous" or "categorical"

    Notes
    -----
    Implementations of this protocol are responsible for:

    1. Comparing distribution shifts between reference and current data
    2. Automatically selecting appropriate metrics based on data types
    3. Providing normalized scores for comparison across features
    """

    # Data members every implementation must expose (also the dataclass fields).
    df1: pd.DataFrame
    df2: pd.DataFrame
    kind: Optional[Dict[str, str]]

    # NOTE(review): ``**kwargs: Mapping`` annotates each keyword *value* as a
    # Mapping; confirm whether ``Any`` was intended. Left unchanged so the
    # declared interface stays exactly as published.
    def __call__(
        self,
        columns: Optional[Iterable[str]] = None,
        bins: int = 10,
        **kwargs: Mapping,
    ) -> pd.DataFrame:
        """Calculate drift between two DataFrames.

        Parameters
        ----------
        columns : Optional[Iterable[str]], default None
            Specific columns to analyze. If None, analyzes all common columns.
        bins : int, default 10
            Number of bins for histograms when analyzing continuous features.
        **kwargs : Mapping
            Additional parameters specific to the drift calculation
            implementation.

        Returns
        -------
        pd.DataFrame
            A DataFrame containing the drift metrics for each column.
            Must contain at least the following columns:

            - "feature": The name of the feature.
            - "type": The type of metric used (e.g., "wasserstein", "cramer_v").
            - "score": The calculated drift score (normalized to [0,1] when
              applicable).

            May also include:

            - "raw_score": The original, unnormalized score for continuous
              features.
            - "threshold": Optional threshold value for drift significance.
        """
        ...

    def get_prob_density(
        self,
        columns: Optional[Iterable[str]] = None,
        bins: int = 10,
    ) -> pd.DataFrame:
        """Get the probability density functions for the features.

        Parameters
        ----------
        columns : Optional[Iterable[str]], default None
            Specific columns to analyze. If None, analyzes all common columns.
        bins : int, default 10
            Number of bins for histograms when analyzing continuous features.

        Returns
        -------
        pd.DataFrame
            A DataFrame containing the probability density functions.
            Must contain at least the following columns:

            - "feature": The name of the feature.
            - "bin": The bin or category.
            - "ref_density": The density in the reference dataset.
            - "cur_density": The density in the current dataset.
        """
        ...

    @classmethod
    def _categorical_drift_calc(cls, s1: pd.Series, s2: pd.Series) -> float:
        """Calculate drift for categorical features (normalized to [0,1]).

        Parameters
        ----------
        s1 : pd.Series
            Reference distribution.
        s2 : pd.Series
            Current distribution.

        Returns
        -------
        float
            Normalized drift score between 0 and 1.

        Notes
        -----
        Typically implemented using Cramér's V statistic:

            V = sqrt(χ² / (n * min(k-1, r-1)))

        where:

        - χ² is the chi-squared statistic
        - n is the total number of observations
        - k is the number of categories in the first variable
        - r is the number of categories in the second variable
        """
        ...

    @classmethod
    def _continuous_drift_calc(cls, s1: pd.Series, s2: pd.Series, bins: int = 10) -> float:
        """Calculate drift for continuous features (normalized to [0,1]).

        Parameters
        ----------
        s1 : pd.Series
            Reference distribution.
        s2 : pd.Series
            Current distribution.
        bins : int, default 10
            Number of bins for histogram comparison.

        Returns
        -------
        float
            Normalized drift score between 0 and 1.

        Notes
        -----
        Can be implemented using various metrics:

        - Wasserstein distance (normalized)
        - Kolmogorov-Smirnov test statistic
        """
        ...