Commit 61b8f78c authored by rayhanp1402's avatar rayhanp1402
Merge commit with parents a60bcda9 and faa9a281
Pipeline #66495 canceled with stages
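This merge replaces the previous single-stage CI configuration, in which one train job installed numpy, pandas, scikit-learn, and mlflow, started a local MLflow server backed by sqlite, and ran model/train_model.py on the main branch only. The new configuration is a three-stage pipeline (setup, deploy, test) that prepares an Airflow environment, deploys a DAG, and runs it end to end against a Postgres-backed Airflow instance.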
stages:
  - setup
  - deploy
  - test

variables:
  # Define variables
  AIRFLOW_HOME: /opt/airflow
  AIRFLOW_DAGS_DIR: /opt/airflow/dags
  AIRFLOW_DATA_DIR: /opt/airflow/data
  DATA_PATH: "data/churn.csv"
  MLFLOW_TRACKING_URI: http://mlflow:5000  # assumes an mlflow host is reachable from the jobs
  MLFLOW_EXPERIMENT_NAME: "Customer Churn Model"

# Setup Airflow environment
setup_airflow:
  stage: setup
  image: python:3.11
  script:
    # Install dependencies
    - python -m venv venv
    - source venv/bin/activate
    - pip install --upgrade pip
    - pip install apache-airflow[celery,postgres,s3]==2.7.1 mlflow pandas scikit-learn
    # Initialize Airflow directories
    - mkdir -p $AIRFLOW_DATA_DIR
    # Copy the data to the Airflow data directory
    - cp $DATA_PATH $AIRFLOW_DATA_DIR
  cache:
    key: "$CI_COMMIT_REF_SLUG"
    paths:
      # Note: GitLab only caches paths inside the project directory, so this
      # absolute path will not actually be cached between jobs
      - $AIRFLOW_DATA_DIR
# Deploy DAGs
deploy_dag:
  stage: deploy
  image: python:3.11
  dependencies:
    - setup_airflow
  script:
    # Deploy DAG to Airflow
    - mkdir -p $AIRFLOW_HOME/dags $AIRFLOW_HOME/logs $AIRFLOW_HOME/plugins
    - cp dags/model_dag.py $AIRFLOW_DAGS_DIR
    # List deployed DAGs for verification
    - ls -l $AIRFLOW_DAGS_DIR
  artifacts:
    paths:
      # Note: like cache paths, artifact paths must live inside the project
      # directory; /opt/airflow/dags will not be collected or passed on
      - $AIRFLOW_DAGS_DIR
# Test the DAG execution
test_dag:
  stage: test
  image: python:3.11
  dependencies:
    - deploy_dag
  services:
    - name: postgres:13
      alias: postgres
  variables:
    POSTGRES_USER: airflow
    POSTGRES_PASSWORD: airflow
    POSTGRES_DB: airflow
    AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
    AIRFLOW__CORE__LOAD_EXAMPLES: "False"
    AIRFLOW__CORE__EXECUTOR: "LocalExecutor"
  before_script:
    # Set up the Airflow environment
    - python -m venv venv
    - source venv/bin/activate
    - pip install --upgrade pip
    - pip install apache-airflow[postgres]==2.7.1 apache-airflow-providers-apache-spark==2.1.1
    # The >= specifier must be quoted, or the shell treats it as a redirection
    - pip install pyspark==3.5.0 "apache-airflow-providers-openlineage>=1.8.0" pandas mlflow scikit-learn
  script:
    # Artifacts cannot restore files outside the project directory, so place
    # the DAG and data where Airflow expects them before starting
    - mkdir -p $AIRFLOW_DAGS_DIR $AIRFLOW_DATA_DIR
    - cp dags/model_dag.py $AIRFLOW_DAGS_DIR
    - cp $DATA_PATH $AIRFLOW_DATA_DIR
    - airflow db init
    - airflow scheduler &
    - airflow webserver -p 8080 &
    - sleep 20
    # Trigger the DAG
    - airflow dags trigger model_training_and_tracking
    # Wait for the DAG run to complete: poll plain-format output for queued or running runs
    - |
      while airflow dags list-runs -d model_training_and_tracking --state queued -o plain | grep -q model_training_and_tracking ||
            airflow dags list-runs -d model_training_and_tracking --state running -o plain | grep -q model_training_and_tracking; do
        echo "Waiting for DAG run to complete..."
        sleep 10
      done
    # Check the task state for the most recent run (the CLI needs a run id or execution date)
    - |
      RUN_ID=$(airflow dags list-runs -d model_training_and_tracking -o plain | grep model_training_and_tracking | head -n 1 | awk '{print $2}')
      airflow tasks state model_training_and_tracking train_and_log_model "$RUN_ID"
    # Fetch logs for the task by reading the log files; the Airflow CLI has no
    # "tasks logs" subcommand, and this glob follows the default log layout
    - cat $AIRFLOW_HOME/logs/dag_id=model_training_and_tracking/*/task_id=train_and_log_model/*.log
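The test job triggers a DAG with id model_training_and_tracking that contains a task train_and_log_model, but dags/model_dag.py itself is not part of this commit. The following is a minimal sketch of what that file could look like, assuming the task trains a scikit-learn classifier on the churn CSV and logs it to the configured MLflow experiment; the Churn label column, the model choice, and the hyperparameters are illustrative assumptions, not taken from the repository.

# Hypothetical sketch of dags/model_dag.py. Only the DAG id, task id, data path,
# and MLflow settings come from the CI config above; everything else is assumed.
from datetime import datetime

import mlflow
import mlflow.sklearn
import pandas as pd
from airflow.decorators import dag, task
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


@dag(
    dag_id="model_training_and_tracking",  # matches "airflow dags trigger" in test_dag
    start_date=datetime(2024, 1, 1),
    schedule=None,  # run only when triggered, as the CI job does
    catchup=False,
)
def model_training_and_tracking():
    @task(task_id="train_and_log_model")  # matches "airflow tasks state" in test_dag
    def train_and_log_model():
        mlflow.set_tracking_uri("http://mlflow:5000")  # MLFLOW_TRACKING_URI
        mlflow.set_experiment("Customer Churn Model")  # MLFLOW_EXPERIMENT_NAME

        df = pd.read_csv("/opt/airflow/data/churn.csv")  # AIRFLOW_DATA_DIR + churn.csv
        # Assumed layout: a "Churn" label column, all other columns numeric features
        X = df.drop(columns=["Churn"])
        y = df["Churn"]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        with mlflow.start_run():
            model = RandomForestClassifier(n_estimators=100, random_state=42)
            model.fit(X_train, y_train)
            accuracy = accuracy_score(y_test, model.predict(X_test))
            mlflow.log_param("n_estimators", 100)
            mlflow.log_metric("accuracy", accuracy)
            mlflow.sklearn.log_model(model, "model")

    train_and_log_model()


model_training_and_tracking()

A DAG written this way can also be smoke-tested locally, without a scheduler or webserver, via: airflow dags test model_training_and_tracking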