"""Draw a random sample of rows from a CSV file and save it as a new CSV."""

import pandas as pd


def sample_data(input_csv, output_csv, sample_fraction, random_state=None):
    """Sample a fraction of the rows of a CSV file and write them to a new CSV.

    A summary and the first rows of the data are printed to stdout both
    before and after sampling.

    Args:
        input_csv: Path (or anything ``pandas.read_csv`` accepts) of the
            source CSV file.
        output_csv: Destination the sampled rows are written to, without the
            index column.
        sample_fraction: Fraction of rows to keep; passed as ``frac`` to
            ``DataFrame.sample``.
        random_state: Seed forwarded to ``DataFrame.sample`` so the sample
            can be reproduced. ``None`` gives a different sample each run.

    Returns:
        The sampled ``pandas.DataFrame`` that was written to ``output_csv``.
    """
    data = pd.read_csv(input_csv)

    print("Data sebelum sampling:")
    # DataFrame.info() prints its summary itself and returns None, so it must
    # not be wrapped in print() -- that would emit a stray "None" line.
    data.info()
    print(data.head())

    sampled_data = data.sample(frac=sample_fraction, random_state=random_state)

    print("\nData setelah sampling:")
    sampled_data.info()
    print(sampled_data.head())

    sampled_data.to_csv(output_csv, index=False)
    print(f"\nHasil sampling disimpan ke {output_csv}")

    return sampled_data


if __name__ == "__main__":
    input_csv = "telco_customer_churn.csv"
    output_csv = "telco_customer_churn_drift.csv"
    sample_fraction = 0.2
    random_state = 42  # fixed seed so repeated runs produce the same sample

    sample_data(input_csv, output_csv, sample_fraction, random_state)