diff --git a/Dockerfile.airflow b/Dockerfile.airflow index 8f76273bc279c6fa53709eb90d8fbda1189522d7..938b1a32321a6520cebc61785474d98178e9d46f 100644 --- a/Dockerfile.airflow +++ b/Dockerfile.airflow @@ -11,6 +11,8 @@ ENV JAVA_HOME /usr/lib/jvm/java-17-openjdk-amd64/ RUN export JAVA_HOME +RUN mkdir -p /mlflow && chown -R airflow:root /mlflow && chmod 777 /mlflow + USER airflow RUN pip install apache-airflow diff --git a/Dockerfile.mlflow b/Dockerfile.mlflow index 471ecfa586edc56d8b123df57c4a50d94113fb1e..69012ac57461db252170a2c3374e7ae9413c6982 100644 --- a/Dockerfile.mlflow +++ b/Dockerfile.mlflow @@ -13,9 +13,11 @@ WORKDIR $MLFLOW_HOME COPY ./requirements/mlflow.txt /tmp/requirements.txt +RUN pip install -r /tmp/requirements.txt + EXPOSE 5000 CMD ["mlflow", "server", \ "--backend-store-uri", "postgresql://mlflow:mlflow@mlflow-db:5432/mlflow", \ - "--default-artifact-root", "/mlflow/artifacts", \ + "--default-artifact-root", "s3://mlflow-artifacts", \ "--host", "0.0.0.0"] diff --git a/airflow/airflow.db b/airflow/airflow.db index e2c8f2dfbdc40e38f71a8e1d116ab2643da855c3..3849e8df3286bc97eefdae25ab42e35e6ba799c8 100644 Binary files a/airflow/airflow.db and b/airflow/airflow.db differ diff --git a/airflow/dags/functions/train_model.py b/airflow/dags/functions/train_model.py index ef7947800c220d3e62921e5b458540bb8231abb9..4af1e3e3bca6b14b3f6757e88b315b1728aeba83 100644 --- a/airflow/dags/functions/train_model.py +++ b/airflow/dags/functions/train_model.py @@ -10,7 +10,7 @@ import mlflow.sklearn def train_model(df_dict_str): df_dict = ast.literal_eval(df_dict_str) df = pd.DataFrame(df_dict) - + print("=============================================================") print(df.head()) print("=============================================================") @@ -19,13 +19,6 @@ def train_model(df_dict_str): print("Train model") print("=============================================================") - # Drop non-relevant columns (e.g., ID columns) - if 'customerID' in df.columns: - df.drop(columns=['customerID'], inplace=True) - - # Select the specified features - # TODO: nanti kalo ada var features - # X = df[features] X = df.drop(columns=["Churn"]) y = df["Churn"] @@ -36,6 +29,7 @@ def train_model(df_dict_str): mlflow.set_tracking_uri("http://mlflow:5000") model_name = "customer_churn_model" + # Start MLflow run with mlflow.start_run(run_name="model_training"): # Train model @@ -44,12 +38,11 @@ def train_model(df_dict_str): # Evaluate model y_pred = model.predict(X_test) acc = accuracy_score(y_test, y_pred) - f1 = f1_score(y_test, y_pred) + f1 = f1_score(y_test, y_pred) # Log parameters and metrics mlflow.log_param("model_type", "RandomForest") mlflow.log_param("n_estimators", 100) - # mlflow.log_param("features", features) mlflow.log_metric("accuracy", acc) mlflow.log_metric("f1_score", f1) diff --git a/docker-compose.yml b/docker-compose.yml index fe2e645f00bb90cbea4f4039eb694eefbdcd0c61..98ee0f59216e01fb2a99b4c8050b9f72bed36587 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -81,8 +81,6 @@ services: MLFLOW_TRACKING_URI: "http://0.0.0.0:5000" BACKEND_STORE_URI: "postgresql://mlflow:mlflow@mlflow-db:5432/mlflow" ARTIFACT_STORE_URI: "s3://mlflow-artifacts" - volumes: - - ./artifacts:/mlflow/artifacts build: context: . dockerfile: Dockerfile.mlflow