diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 56f235f7b66e9dcc4a5d1ce9cb07062bcdeddb47..d42835a43d1ec34bcdb4f6725118919abcb63798 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,14 +1,24 @@
-before_script:
-  - if [ ! -d "venv" ]; then python3 -m venv .venv; fi
-  - source .venv/bin/activate
-  - pip install -r requirements.txt
+# Build the Docker Compose images, then start the stack in the background.
+stages:
+  - build
+  - run
 
-test_job:
-  stage: test
+build:
+  stage: build
   tags:
     - macos
   only:
-    - main
+    - test
   script:
-    - echo "Running tests"
-    - .venv/bin/python test.py
+    - echo "Build Docker Container"
+    - docker compose build
+
+run:
+  stage: run
+  tags:
+    - macos
+  only:
+    - test
+  script:
+    - echo "Run Docker Container"
+    - docker compose up -d
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..484cbe9720d8893189b0ca29c760f441c972b9cc
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,25 @@
+FROM apache/airflow:2.7.1-python3.11
+
+USER root
+
+# System packages: a C toolchain and a JDK (pyspark needs a JVM).
+# The Debian JVM directory is suffixed with the CPU architecture, so expose it
+# under a stable path instead of hard-coding arm64 in JAVA_HOME.
+RUN apt-get update && \
+    apt-get install -y gcc python3-dev openjdk-11-jdk && \
+    ln -sfn "/usr/lib/jvm/java-11-openjdk-$(dpkg --print-architecture)" \
+        /usr/lib/jvm/java-11-openjdk && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk
+
+USER airflow
+
+# Pin Airflow to the version shipped in the base image so that resolving the
+# providers below cannot silently upgrade or downgrade it.
+RUN pip install --no-cache-dir \
+    "apache-airflow==${AIRFLOW_VERSION}" \
+    apache-airflow-providers-apache-spark \
+    'apache-airflow-providers-openlineage>=1.8.0' \
+    pyspark
diff --git a/airflow.env b/airflow.env
new file mode 100644
index 0000000000000000000000000000000000000000..329e1eee0e3154c554eede5cc6e60c7e55cb4571
--- /dev/null
+++ b/airflow.env
@@ -0,0 +1,7 @@
+# Airflow reads overrides named AIRFLOW__{SECTION}__{KEY} (double underscores).
+AIRFLOW__CORE__LOAD_EXAMPLES=FALSE
+AIRFLOW__CORE__EXECUTOR=LocalExecutor
+AIRFLOW__WEBSERVER__BASE_URL=http://localhost:8080
+AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres:5432/airflow
+# Development placeholder: replace with a random secret outside local use.
+AIRFLOW__WEBSERVER__SECRET_KEY=password
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000000000000000000000000000000000000..af4f61c26161b77796b8c484931b1f30bc80d291
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,95 @@
+# Local development stack: Airflow (webserver + scheduler) backed by Postgres,
+# plus a standalone Spark master and worker.
+version: '3'
+
+# Settings shared by the Spark master and worker.
+x-spark-common: &spark-common
+  # NOTE(review): `latest` is a moving tag; consider pinning a Spark version
+  # compatible with the pyspark installed in the Airflow image.
+  image: bitnami/spark:latest
+  volumes:
+    - ./jobs:/opt/bitnami/spark/jobs
+  networks:
+    - airflow
+
+# Settings shared by the Airflow webserver and scheduler.
+x-airflow-common: &airflow-common
+  build:
+    context: .
+    dockerfile: Dockerfile
+  env_file:
+    - airflow.env
+  volumes:
+    - ./jobs:/opt/airflow/jobs
+    - ./dags:/opt/airflow/dags
+    - ./logs:/opt/airflow/logs
+  # Wait until Postgres accepts connections, not merely until its container
+  # has started; otherwise `airflow db init` can fail on a cold start.
+  depends_on:
+    postgres:
+      condition: service_healthy
+  networks:
+    - airflow
+
+services:
+  spark-master:
+    <<: *spark-common
+    command: bin/spark-class org.apache.spark.deploy.master.Master
+    ports:
+      # Container port 8080 is published on 9090 so it does not collide with
+      # the Airflow webserver's 8080.
+      - "9090:8080"
+      - "7077:7077"
+
+  spark-worker:
+    <<: *spark-common
+    command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://spark-master:7077
+    depends_on:
+      - spark-master
+    environment:
+      SPARK_MODE: worker
+      # Quoted so the value is a string rather than a YAML integer.
+      SPARK_WORKER_CORES: "2"
+      SPARK_WORKER_MEMORY: 1g
+      SPARK_MASTER_URL: spark://spark-master:7077
+
+  postgres:
+    image: postgres:14.0
+    environment:
+      - POSTGRES_USER=airflow
+      - POSTGRES_PASSWORD=airflow
+      - POSTGRES_DB=airflow
+    # Readiness probe used by the Airflow services' `service_healthy` condition.
+    healthcheck:
+      test: ["CMD", "pg_isready", "-U", "airflow"]
+      interval: 10s
+      timeout: 5s
+      retries: 5
+    networks:
+      - airflow
+
+  webserver:
+    <<: *airflow-common
+    command: bash -c "airflow db init && airflow webserver"
+    ports:
+      - "8080:8080"
+    # A merge key is shallow: this depends_on replaces the shared one, so
+    # postgres is listed again alongside scheduler.
+    depends_on:
+      postgres:
+        condition: service_healthy
+      scheduler:
+        condition: service_started
+
+  scheduler:
+    <<: *airflow-common
+    # Initialises the metadata DB and a default admin/admin account
+    # (development credentials only) before starting the scheduler.
+    command: >-
+      bash -c "airflow db init && airflow db migrate &&
+      airflow users create --username admin --firstname Admin --lastname Admin
+      --role Admin --email admin@example.com --password admin &&
+      airflow scheduler"
+
+networks:
+  airflow: {}
diff --git a/requirements.txt b/requirements.txt
index e214769a654c0d5b900d76a14e8910929d9dc52f..3955dc92fab04652fe945fd703f45f1d9b481663 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,3 @@
 pyspark
+apache-airflow
+apache-airflow-providers-apache-spark