From 4d51a8328d10ea4d3d5f31028ecdc4faf0ecd142 Mon Sep 17 00:00:00 2001
From: yansans <66671259+yansans@users.noreply.github.com>
Date: Wed, 8 Jan 2025 14:13:20 +0700
Subject: [PATCH] feat: rename and save file

---
 spark/clean.py | 38 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/spark/clean.py b/spark/clean.py
index 8ab12d4..49b0c56 100644
--- a/spark/clean.py
+++ b/spark/clean.py
@@ -36,4 +36,40 @@ for col_name in columns_to_replace:
         .otherwise(col(col_name))
     )
 
-# df.show()
\ No newline at end of file
+# df.show()
+
+# rename to snake_case
+import re
+
+column_list = df.columns
+
+def to_snake_case(name):
+    """Insert "_" at each lower/digit-to-upper boundary, then lowercase."""
+    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower()
+
+# Rename positionally in one step; `column` avoids shadowing the `col` function.
+new_columns = [to_snake_case(column) for column in column_list]
+df = df.toDF(*new_columns)
+
+#df.show(5)
+
+# output to one csv file
+import shutil
+import os
+
+output_dir = "../data/clean"
+# Remove output_dir if it already exists before writing into it again.
+if os.path.exists(output_dir):
+    shutil.rmtree(output_dir)
+# coalesce(1) reduces df to a single partition before the CSV write; header row enabled.
+df.coalesce(1).write.option("header", "true").csv(output_dir)
+
+final_file = os.path.join("../data", "clean.csv")
+
+# Promote the "part-*" data file Spark wrote into output_dir to clean.csv;
+# everything else in that directory is discarded by the rmtree below.
+for file in os.listdir(output_dir):
+    if file.startswith("part-"):
+        shutil.move(os.path.join(output_dir, file), final_file)
+
+shutil.rmtree(output_dir)
\ No newline at end of file
-- 
GitLab