diff --git a/models/Gradient_Boosting.joblib b/models/Gradient_Boosting.joblib new file mode 100644 index 0000000000000000000000000000000000000000..3455361eecab029a6cec4202a5eb73d62d29d33c Binary files /dev/null and b/models/Gradient_Boosting.joblib differ diff --git a/models/K-Nearest_Neighbors.joblib b/models/K-Nearest_Neighbors.joblib new file mode 100644 index 0000000000000000000000000000000000000000..b750959d9b601e983d695c0faa0db43bddd52bfe Binary files /dev/null and b/models/K-Nearest_Neighbors.joblib differ diff --git a/models/Logistic_Regression.joblib b/models/Logistic_Regression.joblib new file mode 100644 index 0000000000000000000000000000000000000000..a6577106112ba5333fe14e7b1c9b70cc1e3bcbc9 Binary files /dev/null and b/models/Logistic_Regression.joblib differ diff --git a/models/Random_Forest.joblib b/models/Random_Forest.joblib new file mode 100644 index 0000000000000000000000000000000000000000..4ae9b15db751bb261d077bcbcd0fa63bd6c2f9a8 Binary files /dev/null and b/models/Random_Forest.joblib differ diff --git a/models/Support_Vector_Machine.joblib b/models/Support_Vector_Machine.joblib new file mode 100644 index 0000000000000000000000000000000000000000..118dc2fa6ff6ae1ced1377b1cb5b0a524eb24b7c Binary files /dev/null and b/models/Support_Vector_Machine.joblib differ diff --git a/models/XGBoost.joblib b/models/XGBoost.joblib new file mode 100644 index 0000000000000000000000000000000000000000..3a785476e65488ba5b5ec2c70a1baf4a8ae8bd07 Binary files /dev/null and b/models/XGBoost.joblib differ diff --git a/notebooks/customer_churn_machine_learning_models.ipynb b/notebooks/customer_churn_machine_learning_models.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..f4da41f5504d82521093635f0c661e85aeff8352 --- /dev/null +++ b/notebooks/customer_churn_machine_learning_models.ipynb @@ -0,0 +1,293 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Customer Churn Prediction: Machine Learning Models \n", + 
"\n", + "This notebook will load the training, validation, and test datasets, train multiple machine learning models, evaluate their performance, and save the models for future use. " + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "# Import Libraries \n", + "import pandas as pd \n", + "import numpy as np \n", + "from sklearn.model_selection import train_test_split \n", + "from sklearn.ensemble import RandomForestClassifier \n", + "from sklearn.linear_model import LogisticRegression \n", + "from sklearn.metrics import accuracy_score, classification_report \n", + "from sklearn.preprocessing import StandardScaler \n", + "import joblib \n", + "\n", + "from sklearn.svm import SVC \n", + "from sklearn.ensemble import GradientBoostingClassifier \n", + "from sklearn.neighbors import KNeighborsClassifier \n", + "from xgboost import XGBClassifier\n", + "\n", + "# Set display options for better readability \n", + "pd.set_option('display.max_columns', None) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Load the Data \n", + "\n", + "Load the training, validation, and test datasets that we saved in the preprocessing step. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training set size: 5977\n", + "Validation set size: 527\n", + "Test set size: 528\n" + ] + } + ], + "source": [ + "# Load the datasets \n", + "X_train = pd.read_csv('../data/X_train.csv') \n", + "y_train = pd.read_csv('../data/y_train.csv') \n", + "X_val = pd.read_csv('../data/X_val.csv') \n", + "y_val = pd.read_csv('../data/y_val.csv') \n", + "X_test = pd.read_csv('../data/X_test.csv') \n", + "y_test = pd.read_csv('../data/y_test.csv') \n", + "\n", + "# Display the shapes of the datasets \n", + "print(f\"Training set size: {X_train.shape[0]}\") \n", + "print(f\"Validation set size: {X_val.shape[0]}\") \n", + "print(f\"Test set size: {X_test.shape[0]}\") " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Scale the Data \n", + "\n", + "Scale the features using StandardScaler. " + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the scaler \n", + "scaler = StandardScaler() \n", + "\n", + "# Fit the scaler on the training data and transform the training, validation, and test sets \n", + "X_train_scaled = scaler.fit_transform(X_train) \n", + "X_val_scaled = scaler.transform(X_val) \n", + "X_test_scaled = scaler.transform(X_test) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. 
Model Training \n",
+    "\n",
+    "Train multiple machine learning models "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Logistic Regression Validation Accuracy: 0.8027\n",
+      "Random Forest Validation Accuracy: 0.7742\n",
+      "Support Vector Machine Validation Accuracy: 0.7989\n",
+      "Gradient Boosting Validation Accuracy: 0.7951\n",
+      "K-Nearest Neighbors Validation Accuracy: 0.7666\n",
+      "XGBoost Validation Accuracy: 0.7837\n"
+     ]
+    }
+   ],
+   "source": [
+    "# One seed shared by every estimator that accepts random_state, so a re-run\n",
+    "# reproduces the same fitted (and later saved) models\n",
+    "# NOTE(review): the recorded outputs predate the seeds added to SVC and XGBoost; re-run to refresh\n",
+    "RANDOM_STATE = 127\n",
+    "\n",
+    "# Candidate classifiers, keyed by the display name used in reports and filenames\n",
+    "models = {\n",
+    "    'Logistic Regression': LogisticRegression(max_iter=2000),\n",
+    "    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),\n",
+    "    # probability=True fits an internal cross-validated calibration that shuffles\n",
+    "    # the data, so it needs a seed to be reproducible\n",
+    "    'Support Vector Machine': SVC(probability=True, random_state=RANDOM_STATE),\n",
+    "    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=RANDOM_STATE),\n",
+    "    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),\n",
+    "    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=RANDOM_STATE)\n",
+    "}\n",
+    "\n",
+    "# y_train is a DataFrame; flatten it once to the 1-D array that fit() expects\n",
+    "# instead of recomputing it for every model\n",
+    "y_train_flat = y_train.values.ravel()\n",
+    "\n",
+    "# Train models and evaluate on the validation set\n",
+    "for model_name, model in models.items():\n",
+    "    model.fit(X_train_scaled, y_train_flat)\n",
+    "    y_val_pred = model.predict(X_val_scaled)\n",
+    "    accuracy = accuracy_score(y_val, y_val_pred)\n",
+    "    print(f\"{model_name} Validation Accuracy: {accuracy:.4f}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Save the Models \n",
+    "\n",
+    "Save the trained models for future use. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Logistic Regression saved successfully.\n",
+      "Random Forest saved successfully.\n",
+      "Support Vector Machine saved successfully.\n",
+      "Gradient Boosting saved successfully.\n",
+      "K-Nearest Neighbors saved successfully.\n",
+      "XGBoost saved successfully.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Persist the fitted scaler next to the models: every estimator was trained on\n",
+    "# scaler-transformed features, so new data must pass through this same fitted\n",
+    "# transform before it is handed to a reloaded model\n",
+    "joblib.dump(scaler, '../models/scaler.joblib')\n",
+    "\n",
+    "# Save each model (spaces in the display name become underscores in the filename)\n",
+    "for model_name, model in models.items():\n",
+    "    joblib.dump(model, f'../models/{model_name.replace(\" \", \"_\")}.joblib')\n",
+    "    print(f\"{model_name} saved successfully.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Load and Evaluate the Models \n",
+    "\n",
+    "Load the saved models and evaluate them on the test set. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Logistic Regression Test Accuracy: 0.8030\n",
+      " precision recall f1-score support\n",
+      "\n",
+      " 0 0.84 0.90 0.87 388\n",
+      " 1 0.66 0.54 0.59 140\n",
+      "\n",
+      " accuracy 0.80 528\n",
+      " macro avg 0.75 0.72 0.73 528\n",
+      "weighted avg 0.79 0.80 0.80 528\n",
+      "\n",
+      "Random Forest Test Accuracy: 0.7841\n",
+      " precision recall f1-score support\n",
+      "\n",
+      " 0 0.82 0.91 0.86 388\n",
+      " 1 0.63 0.44 0.52 140\n",
+      "\n",
+      " accuracy 0.78 528\n",
+      " macro avg 0.73 0.68 0.69 528\n",
+      "weighted avg 0.77 0.78 0.77 528\n",
+      "\n",
+      "Support Vector Machine Test Accuracy: 0.8087\n",
+      " precision recall f1-score support\n",
+      "\n",
+      " 0 0.84 0.92 0.88 388\n",
+      " 1 0.69 0.51 0.58 140\n",
+      "\n",
+      " accuracy 0.81 528\n",
+      " macro avg 0.76 0.71 0.73 528\n",
+      "weighted avg 0.80 0.81 0.80 528\n",
+      "\n",
+      "Gradient Boosting Test Accuracy: 0.8030\n",
+      " precision recall f1-score support\n",
+      "\n",
+      " 0 0.84 0.91 0.87 388\n",
+      " 1 0.67 0.51 0.58 140\n",
+      "\n",
+      " accuracy 
0.80 528\n",
+      " macro avg 0.75 0.71 0.72 528\n",
+      "weighted avg 0.79 0.80 0.79 528\n",
+      "\n",
+      "K-Nearest Neighbors Test Accuracy: 0.7746\n",
+      " precision recall f1-score support\n",
+      "\n",
+      " 0 0.83 0.87 0.85 388\n",
+      " 1 0.58 0.52 0.55 140\n",
+      "\n",
+      " accuracy 0.77 528\n",
+      " macro avg 0.71 0.69 0.70 528\n",
+      "weighted avg 0.77 0.77 0.77 528\n",
+      "\n",
+      "XGBoost Test Accuracy: 0.7765\n",
+      " precision recall f1-score support\n",
+      "\n",
+      " 0 0.82 0.88 0.85 388\n",
+      " 1 0.60 0.48 0.53 140\n",
+      "\n",
+      " accuracy 0.78 528\n",
+      " macro avg 0.71 0.68 0.69 528\n",
+      "weighted avg 0.76 0.78 0.77 528\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Reload every persisted estimator from disk and score it on the test split\n",
+    "for model_name in models:\n",
+    "    model_path = f'../models/{model_name.replace(\" \", \"_\")}.joblib'\n",
+    "    loaded_model = joblib.load(model_path)\n",
+    "    y_test_pred = loaded_model.predict(X_test_scaled)\n",
+    "\n",
+    "    # Headline accuracy first, then the per-class precision/recall breakdown\n",
+    "    accuracy = accuracy_score(y_test, y_test_pred)\n",
+    "    print(f\"{model_name} Test Accuracy: {accuracy:.4f}\")\n",
+    "    print(classification_report(y_test, y_test_pred))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}