From 8c92b941ccb630e245ff891939f26ce1164db17b Mon Sep 17 00:00:00 2001
From: yansans <66671259+yansans@users.noreply.github.com>
Date: Wed, 8 Jan 2025 14:46:31 +0700
Subject: [PATCH] fix: update nan totalcharges

---
 spark/clean.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/spark/clean.py b/spark/clean.py
index 49b0c56..463099a 100644
--- a/spark/clean.py
+++ b/spark/clean.py
@@ -1,4 +1,5 @@
 from pyspark.sql import SparkSession
+from pyspark.sql import functions as F
 
 spark = SparkSession.builder \
     .appName("TelcoCustomerChurn") \
@@ -18,7 +19,15 @@ df = df.withColumn("TotalCharges", col("TotalCharges").cast("double"))
 # tenure_zero_rows = df.filter(col("tenure") == 0)
 # tenure_zero_rows.show()
 
-df = df.fillna({"TotalCharges": 0})
+df = df.withColumn(  # rows with tenure == 0 get TotalCharges 0.0; all other rows keep their value
+    "TotalCharges",
+    F.when(F.col("tenure") == 0, F.lit(0.0)).otherwise(F.col("TotalCharges"))
+)
+# Approximate median (relative error 0.01) of TotalCharges over rows whose tenure is not 0.
+percentile = df.filter(F.col("tenure") != 0).approxQuantile("TotalCharges", [0.5], 0.01)
+median_total_charges = percentile[0] if percentile else 0.0  # empty result: no non-null values
+# Fill the TotalCharges values that are still missing with that median.
+df = df.fillna({"TotalCharges": median_total_charges})
 
 # missing_counts = df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns])
 # missing_counts.show()
-- 
GitLab