feat: create basic ML model

e40d249f · Naufal-Nalendra · 4e8349a4 · e40d249f · e40d249f · e40d249f
Commit e40d249f authored 6 months ago by Naufal-Nalendra
--- a/.gitignore
+++ b/.gitignore
+model/mlruns
\ No newline at end of file
--- a/data/churn.csv
+++ b/data/churn.csv
--- a/model/model.py
+++ b/model/model.py
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
+from sklearn.ensemble import VotingClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
+import mlflow
+import mlflow.sklearn
+
+# Train a soft-voting churn classifier and record the run in MLflow.
+#
+# Steps: load the churn CSV, separate the 'Churn' target, hold out 20% of the
+# rows for testing, fit a VotingClassifier over three scikit-learn models,
+# then log hyperparameters, test-set metrics and the fitted model to MLflow.
+
+# Shared hyperparameters. They are defined once so that the values logged to
+# MLflow below are guaranteed to be the values the models were built with.
+N_ESTIMATORS = 100
+RANDOM_STATE = 42
+
+# Load dataset
+# TODO: clean dataset.
+# NOTE(review): the feature columns are handed to the estimators without any
+# encoding or imputation - presumably churn.csv is already fully numeric and
+# complete; confirm before relying on this script.
+# The path is resolved against the current working directory, so the script
+# must be started from a directory that sits next to data/ (e.g. model/).
+data = pd.read_csv('../data/churn.csv')
+
+# Split into features and target
+X = data.drop(columns=['Churn'])
+# Encode the label as 1 (churned) / 0 (stayed). Any value other than
+# 'Yes'/'No' is not in the mapping and therefore becomes NaN.
+y = data['Churn'].map({'Yes': 1, 'No': 0})
+
+# Split into train and test sets (80% / 20%, reproducible via RANDOM_STATE)
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=RANDOM_STATE
+)
+
+# Define individual models
+logistic_model = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
+random_forest_model = RandomForestClassifier(
+    n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE
+)
+gradient_boosting_model = GradientBoostingClassifier(
+    n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE
+)
+
+# Combine the three models; 'soft' voting predicts from the estimators'
+# predicted class probabilities rather than from their hard class votes.
+ensemble_model = VotingClassifier(
+    estimators=[
+        ('logistic', logistic_model),
+        ('random_forest', random_forest_model),
+        ('gradient_boosting', gradient_boosting_model),
+    ],
+    voting='soft',
+)
+
+# Start MLflow run; everything logged inside the block belongs to this run
+with mlflow.start_run():
+    # Train the model
+    ensemble_model.fit(X_train, y_train)
+
+    # Make predictions on the held-out rows
+    y_pred = ensemble_model.predict(X_test)
+
+    # Evaluate model performance on the test set
+    accuracy = accuracy_score(y_test, y_pred)
+    precision = precision_score(y_test, y_pred)
+    recall = recall_score(y_test, y_pred)
+    f1 = f1_score(y_test, y_pred)
+
+    # Log parameters (same constants the models above were constructed with)
+    mlflow.log_param("n_estimators", N_ESTIMATORS)
+    mlflow.log_param("random_state", RANDOM_STATE)
+
+    # Log metrics
+    mlflow.log_metric("accuracy", accuracy)
+    mlflow.log_metric("precision", precision)
+    mlflow.log_metric("recall", recall)
+    mlflow.log_metric("f1_score", f1)
+
+    # Log the model
+    # NOTE(review): the artifact path says "random_forest_model" although the
+    # object stored is the whole voting ensemble. The name is kept unchanged
+    # so existing runs/loaders keep working; consider renaming deliberately.
+    mlflow.sklearn.log_model(ensemble_model, "random_forest_model")
+
+    print("Model training and logging complete.")
+    print(f"Metrics - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")