From 8c92b941ccb630e245ff891939f26ce1164db17b Mon Sep 17 00:00:00 2001 From: yansans <66671259+yansans@users.noreply.github.com> Date: Wed, 8 Jan 2025 14:46:31 +0700 Subject: [PATCH] fix: update nan totalcharges --- spark/clean.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/spark/clean.py b/spark/clean.py index 49b0c56..463099a 100644 --- a/spark/clean.py +++ b/spark/clean.py @@ -1,4 +1,5 @@ from pyspark.sql import SparkSession +from pyspark.sql import functions as F spark = SparkSession.builder \ .appName("TelcoCustomerChurn") \ @@ -18,7 +19,15 @@ df = df.withColumn("TotalCharges", col("TotalCharges").cast("double")) # tenure_zero_rows = df.filter(col("tenure") == 0) # tenure_zero_rows.show() -df = df.fillna({"TotalCharges": 0}) +df = df.withColumn( + "TotalCharges", + F.when(F.col("tenure") == 0, 0).otherwise(F.col("TotalCharges")) +) + +percentile = df.filter(F.col("tenure") != 0).approxQuantile("TotalCharges", [0.5], 0.01) +median_total_charges = percentile[0] + +df = df.fillna({"TotalCharges": median_total_charges}) # missing_counts = df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]) # missing_counts.show() -- GitLab