Skip to content
Snippets Groups Projects
Commit 2e41819f authored by yansans's avatar yansans
Browse files

init: preprocessing

parent 4d51a832
No related merge requests found
from pyspark.ml.feature import StringIndexer
from pyspark.sql import SparkSession

# Obtain the Spark session used by every step below.
spark = (
    SparkSession.builder
    .appName("Preprocessing")
    .getOrCreate()
)

# Load the already-cleaned dataset; the first row holds the column names and
# column types are inferred from the values.
df = spark.read.csv(
    "../data/clean.csv",
    header=True,
    inferSchema=True,
)
# Quick sanity checks while developing: df.printSchema(), df.count()

# Feature / target views of the full dataset. The train/test split below is
# taken from `df` itself, so both splits still carry the "churn" column.
data_y = df.select("churn")  # target column only
data_X = df.drop("churn")  # every column except the target

# 70/30 train/test split with a fixed seed so the split is repeatable.
data_train, data_test = df.randomSplit(weights=[0.7, 0.3], seed=1)
# Split sizes can be inspected with data_train.count() / data_test.count()

# Encode the "churn" label as a numeric index in "churn_encoded". The mapping
# is learned from the training split only and then applied unchanged to both
# splits, so the test data never influences the encoding.
indexer = StringIndexer(
    inputCol="churn",
    outputCol="churn_encoded",
)
indexer_model = indexer.fit(data_train)
data_train_encoded = indexer_model.transform(data_train)
data_test_encoded = indexer_model.transform(data_test)
# To eyeball the result: <split>.select("churn", "churn_encoded").show(5)
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment