diff --git a/Dockerfile.temp b/Dockerfile.airflow
similarity index 67%
rename from Dockerfile.temp
rename to Dockerfile.airflow
index 6c1cda86e32ca51c5a503441db70309885bce367..8f76273bc279c6fa53709eb90d8fbda1189522d7 100644
--- a/Dockerfile.temp
+++ b/Dockerfile.airflow
@@ -1,4 +1,5 @@
 FROM apache/airflow:latest
+
 USER root
 
 RUN apt-get update && \
@@ -7,8 +8,15 @@ RUN apt-get update && \
     apt-get clean
 
 ENV JAVA_HOME /usr/lib/jvm/java-17-openjdk-amd64/
+
 RUN export JAVA_HOME
 
 USER airflow
 
-RUN pip install apache-airflow-providers-apache-spark
+RUN pip install --no-cache-dir \
+    "apache-airflow==${AIRFLOW_VERSION}" \
+    apache-airflow-providers-apache-spark \
+    boto3 \
+    mlflow \
+    pyspark \
+    scikit-learn
diff --git a/Dockerfile.mlflow b/Dockerfile.mlflow
new file mode 100644
index 0000000000000000000000000000000000000000..471ecfa586edc56d8b123df57c4a50d94113fb1e
--- /dev/null
+++ b/Dockerfile.mlflow
@@ -0,0 +1,22 @@
+FROM python:3.9-slim
+
+ENV MLFLOW_HOME=/mlflow
+ENV PATH=$MLFLOW_HOME/bin:$PATH
+
+RUN apt-get update && apt-get install -y \
+    curl \
+    libpq-dev \
+    gcc \
+    && apt-get clean
+
+WORKDIR $MLFLOW_HOME
+
+COPY ./requirements/mlflow.txt /tmp/requirements.txt
+RUN pip install --no-cache-dir -r /tmp/requirements.txt
+
+EXPOSE 5000
+
+CMD ["mlflow", "server", \
+     "--backend-store-uri", "postgresql://mlflow:mlflow@mlflow-db:5432/mlflow", \
+     "--default-artifact-root", "/mlflow/artifacts", \
+     "--host", "0.0.0.0"]
diff --git a/airflow/airflow-webserver.pid b/airflow/airflow-webserver.pid
index 409940768f2a684935a7d15a29f96e82c487f439..2bd5a0a98a36cc08ada88b804d3be047e6aa5b8a 100644
--- a/airflow/airflow-webserver.pid
+++ b/airflow/airflow-webserver.pid
@@ -1 +1 @@
-23
+22
diff --git a/airflow/airflow.db b/airflow/airflow.db
index 907a8e95c536a310efee44d72a78275f219ca44d..e2c8f2dfbdc40e38f71a8e1d116ab2643da855c3 100644
Binary files a/airflow/airflow.db and b/airflow/airflow.db differ
diff --git a/airflow/dags/functions/train_model.py b/airflow/dags/functions/train_model.py
index 
6d814bef439295edebeaf71e860c9ca0e5e09601..ef7947800c220d3e62921e5b458540bb8231abb9 100644
--- a/airflow/dags/functions/train_model.py
+++ b/airflow/dags/functions/train_model.py
@@ -34,7 +34,7 @@ def train_model(df_dict_str):
 
     model = RandomForestClassifier(n_estimators=100, random_state=42)
 
-    mlflow.set_tracking_uri("http://127.0.0.1:5000")
+    mlflow.set_tracking_uri("http://mlflow:5000")
     model_name = "customer_churn_model"
     # Start MLflow run
     with mlflow.start_run(run_name="model_training"):
diff --git a/docker-compose.yml b/docker-compose.yml
index 679b080e04cc2f882d92803674a86c34181ead53..fe2e645f00bb90cbea4f4039eb694eefbdcd0c61 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,9 +1,10 @@
 version: "3"
+
 services:
   sleek-airflow:
     build:
       context: .
-      dockerfile: Dockerfile.temp
+      dockerfile: Dockerfile.airflow
     volumes:
       - ./airflow:/opt/airflow
       - ./workspace:/opt/workspace
@@ -71,9 +72,57 @@ services:
     volumes:
       - minio-data:/data
 
+  mlflow:
+    image: mlflow/mlflow:latest
+    container_name: mlflow-server
+    ports:
+      - "5000:5000"
+    environment:
+      MLFLOW_TRACKING_URI: "http://0.0.0.0:5000"
+      BACKEND_STORE_URI: "postgresql://mlflow:mlflow@mlflow-db:5432/mlflow"
+      ARTIFACT_STORE_URI: "s3://mlflow-artifacts"
+    volumes:
+      - ./artifacts:/mlflow/artifacts
+    build:
+      context: .
+      dockerfile: Dockerfile.mlflow
+    depends_on:
+      - mlflow-db
+      - minio
+    networks:
+      - spark-cluster
+
+  mlflow-db:
+    image: postgres:13
+    container_name: mlflow-db
+    environment:
+      POSTGRES_USER: mlflow
+      POSTGRES_PASSWORD: mlflow
+      POSTGRES_DB: mlflow
+    ports:
+      - "5432:5432"
+    volumes:
+      - postgres_data:/var/lib/postgresql/data
+    networks:
+      - spark-cluster
+
+  minio-client:
+    image: minio/mc
+    depends_on:
+      - minio
+    entrypoint: >
+      /bin/sh -c "
+      until (/usr/bin/mc alias set mlflow http://minio:9000 minioadmin minioadmin) do echo 'waiting for minio'; sleep 5; done;
+      /usr/bin/mc mb -p mlflow/mlflow-artifacts;
+      /usr/bin/mc anonymous set public mlflow/mlflow-artifacts;
+      "
+    networks:
+      - spark-cluster
+
 volumes:
   shared-workspace:
   minio-data:
+  postgres_data:
 
 networks:
   spark-cluster:
diff --git a/requirements/mlflow.txt b/requirements/mlflow.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a3740c51f482113726628756c553381def4ba805
--- /dev/null
+++ b/requirements/mlflow.txt
@@ -0,0 +1,3 @@
+mlflow
+boto3
+psycopg2-binary