From 4d51a8328d10ea4d3d5f31028ecdc4faf0ecd142 Mon Sep 17 00:00:00 2001
From: yansans <66671259+yansans@users.noreply.github.com>
Date: Wed, 8 Jan 2025 14:13:20 +0700
Subject: [PATCH] feat: rename and save file

---
NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch; confirm the context-line
indentation against spark/clean.py before applying.

 spark/clean.py | 46 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/spark/clean.py b/spark/clean.py
index 8ab12d4..49b0c56 100644
--- a/spark/clean.py
+++ b/spark/clean.py
@@ -36,4 +36,48 @@ for col_name in columns_to_replace:
         .otherwise(col(col_name))
     )
 
-# df.show()
\ No newline at end of file
+# df.show()
+
+# Rename every column to snake_case.
+import re
+
+
+def to_snake_case(name):
+    """Convert a camelCase or PascalCase name to snake_case.
+
+    An underscore is inserted wherever a lowercase letter or digit is
+    directly followed by an uppercase letter; the result is lowercased.
+    """
+    return re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', name).lower()
+
+
+# toDF renames all columns positionally in one call instead of chaining one
+# withColumnRenamed per column.
+df = df.toDF(*[to_snake_case(name) for name in df.columns])
+
+# df.show(5)
+
+# Write the cleaned data out as a single CSV file.
+import shutil
+import os
+
+output_dir = "../data/clean"
+final_file = os.path.join("../data", "clean.csv")
+
+# Remove the output of any previous run before writing.
+if os.path.exists(output_dir):
+    shutil.rmtree(output_dir)
+
+# coalesce(1) reduces the data to one partition before the write.
+df.coalesce(1).write.option("header", "true").csv(output_dir)
+
+# Promote the part-* file to its final name. Fail loudly instead of deleting
+# output_dir when no part file is present.
+part_files = [
+    entry for entry in os.listdir(output_dir) if entry.startswith("part-")
+]
+if not part_files:
+    raise FileNotFoundError("no part-* file found in {}".format(output_dir))
+shutil.move(os.path.join(output_dir, part_files[0]), final_file)
+
+shutil.rmtree(output_dir)
-- 
GitLab