fix: remake psi algorithm

18cbaee5 · Alexander Jason · 2b3a8b85 · 18cbaee5
Commit 18cbaee5 authored 2 months ago by Alexander Jason
--- a/airflow/dags/functions/drift_detection.py
+++ b/airflow/dags/functions/drift_detection.py
@@ -8,17 +8,33 @@ def calculate_psi(expected, actual):
    Args:
        expected (np.array): Expected distribution (training data).
        actual (np.array): Current distribution (new data).
-        bins (int): Number of bins for comparison.
+        Note: there is no ``bins`` argument; the histogram always uses 10 evenly spaced bin edges (9 bins) spanning the combined range of both inputs.
    
    Returns:
        float: The PSI value.
    """
-    bins = 10
-    expected_hist, _ = np.histogram(expected, bins=bins)
-    actual_hist, _ = np.histogram(actual, bins=bins)
-    expected_perc = expected_hist / sum(expected_hist)
-    actual_perc = actual_hist / sum(actual_hist)
-    psi = np.sum((expected_perc - actual_perc) * np.log(expected_perc / actual_perc))
+    # Combine expected and actual datasets to determine bin edges
+    full_dataset = np.concatenate((expected, actual))
+
+    # Determine bin edges
+    bin_edges = np.linspace(min(min(expected), min(actual)), max(max(expected), max(actual)),  10)
+
+    # Calculate histograms for expected and actual distributions
+    expected_hist, _ = np.histogram(expected, bins=bin_edges)
+    actual_hist, _ = np.histogram(actual, bins=bin_edges)
+
+    # Convert counts to proportions
+    expected_proportions = expected_hist / np.sum(expected_hist)
+    actual_proportions = actual_hist / np.sum(actual_hist)
+
+    # Replace zero proportions to avoid division by zero or log of zero errors
+    expected_proportions = np.where(expected_proportions == 0, 1e-6, expected_proportions)
+    actual_proportions = np.where(actual_proportions == 0, 1e-6, actual_proportions)
+
+    # Calculate PSI
+    psi_values = (actual_proportions - expected_proportions) * np.log(actual_proportions / expected_proportions)
+    psi = np.sum(psi_values)
+
    return psi

 def log_psi(expected, actual):