Skip to content
Snippets Groups Projects
Commit c26ea0f0 authored by yansans's avatar yansans
Browse files

feat: spark cleanup

parent 77863697
Branches
No related merge requests found
from pyspark.sql import SparkSession
# Obtain the Spark session for this job (an existing session is reused if
# one is already active), registered under the "TelcoCustomerChurn" app name.
spark = (
    SparkSession.builder
    .appName("TelcoCustomerChurn")
    .getOrCreate()
)

# Load the raw churn CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("../data/TelcoCustomerChurn.csv")
)
from pyspark.sql.functions import col, when, count, mean, isnan
# Basic cleanup, applied as a single chain:
#   1. drop the "customerID" column,
#   2. force "TotalCharges" to double,
#   3. replace any null "TotalCharges" with 0.
# NOTE(review): the nulls filled in step 3 presumably come from values that
# could not be parsed as numbers by the cast in step 2 — confirm against the
# raw data (the tenure == 0 check below suggests those are the affected rows).
df = (
    df.drop("customerID")
    .withColumn("TotalCharges", col("TotalCharges").cast("double"))
    .na.fill({"TotalCharges": 0})
)

# Optional diagnostics (left disabled):
# Per-column null counts; run before the fill to see the gaps, after to confirm none remain.
# missing_counts = df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns])
# missing_counts.show()
# Rows with zero tenure.
# tenure_zero_rows = df.filter(col("tenure") == 0)
# tenure_zero_rows.show()
# Collapse redundant categories: in the service columns listed below, the
# values "No phone service" and "No internet service" are rewritten to a
# plain "No"; every other value is kept as it is.
columns_to_replace = [
    "MultipleLines",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
]
for service_column in columns_to_replace:
    current_value = col(service_column)
    has_no_service = current_value.isin(["No phone service", "No internet service"])
    # Overwrite the column in place (same name), so the schema is unchanged.
    df = df.withColumn(
        service_column,
        when(has_no_service, "No").otherwise(current_value),
    )
# df.show()
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment