"""Randomly sample rows of a CSV file and write the sample to a new CSV.

Originally added as ``dags/data_drift.py`` by commit
a49dedba5250d852c4cfa027ccee976c089e85c8 ("data drift",
Fri, 10 Jan 2025 16:01:52 +0700).
"""
import pandas as pd


def sample_data(input_csv, output_csv, sample_fraction, random_state=None):
    """Sample a fraction of the rows of ``input_csv`` and save them.

    A summary and the first rows of the data are printed to stdout both
    before and after sampling.

    Args:
        input_csv: Path of the CSV file to read (passed to ``pd.read_csv``).
        output_csv: Path the sampled rows are written to, without the
            index column.
        sample_fraction: Fraction of rows to keep; passed as ``frac`` to
            ``DataFrame.sample``.
        random_state: Passed through to ``DataFrame.sample``; supply a
            fixed value to make the sample reproducible.

    Returns:
        The sampled ``DataFrame`` (same rows that were written to
        ``output_csv``).
    """
    data = pd.read_csv(input_csv)

    print("Data sebelum sampling:")
    # DataFrame.info() writes its report to stdout itself and returns None;
    # wrapping it in print() would emit an extra "None" line.
    data.info()
    print(data.head())

    sampled_data = data.sample(frac=sample_fraction, random_state=random_state)

    print("\nData setelah sampling:")
    sampled_data.info()
    print(sampled_data.head())

    sampled_data.to_csv(output_csv, index=False)
    print(f"\nHasil sampling disimpan ke {output_csv}")

    return sampled_data


def main():
    """Sample 20% of the churn data set with a fixed seed."""
    input_csv = "telco_customer_churn.csv"
    output_csv = "telco_customer_churn_drift.csv"
    sample_fraction = 0.2
    random_state = 42

    sample_data(input_csv, output_csv, sample_fraction, random_state)


if __name__ == "__main__":
    main()