diff --git a/Dockerfile.temp b/Dockerfile.temp
new file mode 100644
index 0000000000000000000000000000000000000000..6c1cda86e32ca51c5a503441db70309885bce367
--- /dev/null
+++ b/Dockerfile.temp
@@ -0,0 +1,17 @@
+FROM apache/airflow:latest
+USER root
+
+RUN apt-get update && \
+  apt-get install -y git openjdk-17-jdk && \
+  apt-get clean && \
+  rm -rf /var/lib/apt/lists/*
+
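+# spark-submit (invoked by the SparkSubmitOperator) needs a local JDK; the
+# path below is where Debian's openjdk-17-jdk package installs on amd64.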
+ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
+# ENV persists across layers; a `RUN export` would not outlive its own layer.
+
+USER airflow
+
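+# Installing as the airflow user puts the provider in Airflow's environment.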
+RUN pip install apache-airflow-providers-apache-spark
diff --git a/airflow/airflow-webserver.pid b/airflow/airflow-webserver.pid
index f5c89552bd3e62bfce023a230e90d141f7a46b2f..2bd5a0a98a36cc08ada88b804d3be047e6aa5b8a 100644
--- a/airflow/airflow-webserver.pid
+++ b/airflow/airflow-webserver.pid
@@ -1 +1 @@
-32
+22
diff --git a/airflow/airflow.db b/airflow/airflow.db
index 8eeb1c7af263e7ddde55342aa0bc49fc43968beb..f9568a1d587b8d949abc3e561275d03af28b8bb5 100644
Binary files a/airflow/airflow.db and b/airflow/airflow.db differ
diff --git a/airflow/dags/__pycache__/pyspark_dag.cpython-312.pyc b/airflow/dags/__pycache__/pyspark_dag.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e61ec5ac67db3fae9a2a0e6de6ffee5901b07947
Binary files /dev/null and b/airflow/dags/__pycache__/pyspark_dag.cpython-312.pyc differ
diff --git a/airflow/dags/__pycache__/pyspark_functions.cpython-312.pyc b/airflow/dags/__pycache__/pyspark_functions.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..153fd0ee0be64d76855bc4619719757934025ebd
Binary files /dev/null and b/airflow/dags/__pycache__/pyspark_functions.cpython-312.pyc differ
diff --git a/airflow/dags/__pycache__/test_dag.cpython-312.pyc b/airflow/dags/__pycache__/test_dag.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..11ece8b187243d74b82c12a08c5b121e96c3687a
Binary files /dev/null and b/airflow/dags/__pycache__/test_dag.cpython-312.pyc differ
diff --git a/airflow/dags/__pycache__/welcome_dag.cpython-312.pyc b/airflow/dags/__pycache__/welcome_dag.cpython-312.pyc
index 6f135560f187375c54b9a6628250c0c5dacc4cdd..1e9e002c9e84a7d86a9bdbb4a2a6e5b4352db186 100644
Binary files a/airflow/dags/__pycache__/welcome_dag.cpython-312.pyc and b/airflow/dags/__pycache__/welcome_dag.cpython-312.pyc differ
diff --git a/airflow/dags/test_dag.py b/airflow/dags/test_dag.py
new file mode 100644
index 0000000000000000000000000000000000000000..1df5d0b2c3a4bc3c4126664089857817e85ba42a
--- /dev/null
+++ b/airflow/dags/test_dag.py
@@ -0,0 +1,26 @@
+from airflow import DAG
+from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator
+from datetime import datetime
+
+with DAG(
+    dag_id='spark_submit_test',
+    schedule=None,  # 'schedule' replaces the deprecated 'schedule_interval'
+    start_date=datetime(2025, 1, 8),
+    catchup=False,
+) as dag:
+
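+    # conn_id='spark_default' resolves to the AIRFLOW_CONN_SPARK_DEFAULT
+    # URI defined in docker-compose.yml (spark://spark-master:7077).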
+    spark_submit_task = SparkSubmitOperator(
+        task_id='submit_spark_job',
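+        # These paths live on the shared /opt/workspace bind mount, so the
+        # same files are visible to Airflow and to the Spark workers.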
+        application='/opt/workspace/wordcount.py',
+        conn_id='spark_default',
+        name='wordcount_job',
+        application_args=['/opt/workspace/input.txt', '/opt/workspace/output/'],
+        conf={'spark.executor.memory': '512m'},
+        verbose=True,
+    )
+
+    # A single task needs no dependency wiring.
diff --git a/airflow/standalone_admin_password.txt b/airflow/standalone_admin_password.txt
index 11c68bd43b69f4c7f263a472564d8f85933e0c58..01c68be6bc141aa75a41f09419862c3f04c302c6 100644
--- a/airflow/standalone_admin_password.txt
+++ b/airflow/standalone_admin_password.txt
@@ -1 +1 @@
-SbyPAHK96d2ZCYR3
\ No newline at end of file
+me4cz7vXfYQsx4HF
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index 2b790a339a0f6d0e90af2326d1952e1c209ed409..a8d278d7a3235f615920ce1fc189fd0d766a5759 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,12 +1,63 @@
-version: '3'
+version: "3"
 services:
   sleek-airflow:
-    build: 
+    build:
       context: .
-      dockerfile: Dockerfile
+      dockerfile: Dockerfile.temp
     volumes:
       - ./airflow:/opt/airflow
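+      # /opt/workspace is shared with the Spark containers so the DAG, the
+      # job script, and the data use identical paths on every node.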
+      - ./workspace:/opt/workspace
     ports:
       - "8080:8080"
-    command: airflow standalone
+    networks:
+      - airflow-spark-network
+    environment:
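+      # AIRFLOW_CONN_* variables define Airflow connections from URIs; this
+      # one backs the DAG's conn_id='spark_default'.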
+      - AIRFLOW_CONN_SPARK_DEFAULT=spark://spark-master:7077
+    command: bash -c "rm -f /opt/airflow/airflow-webserver.pid && airflow db init && (airflow scheduler & airflow webserver)"
+  spark-master:
+    image: andreper/spark-master:3.0.0
+    container_name: spark-master
+    ports:
+      - "8081:8080"
+      - "7077:7077"
+    networks:
+      - airflow-spark-network
+    volumes:
+      - ./workspace:/opt/workspace
+  spark-worker-1:
+    image: andreper/spark-worker:3.0.0
+    container_name: spark-worker-1
+    environment:
+      - SPARK_WORKER_CORES=1
+      - SPARK_WORKER_MEMORY=512m
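+      # 1 core / 512m per worker matches spark.executor.memory in the DAG.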
+    ports:
+      - "8082:8081"
+    networks:
+      - airflow-spark-network
+    volumes:
+      - ./workspace:/opt/workspace
+    depends_on:
+      - spark-master
+  spark-worker-2:
+    image: andreper/spark-worker:3.0.0
+    container_name: spark-worker-2
+    environment:
+      - SPARK_WORKER_CORES=1
+      - SPARK_WORKER_MEMORY=512m
+    ports:
+      - "8083:8081"
+    networks:
+      - airflow-spark-network
+    volumes:
+      - ./workspace:/opt/workspace
+    depends_on:
+      - spark-master
 
+networks:
+  airflow-spark-network:
+    driver: bridge
diff --git a/pre-process.ipynb b/pre-process.ipynb
deleted file mode 100644
index 7ad204662ca5faf48bb73918d4aca9ed06eb6e7a..0000000000000000000000000000000000000000
--- a/pre-process.ipynb
+++ /dev/null
@@ -1,22 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "language_info": {
-   "name": "python"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/requirements.txt b/requirements.txt
index bbd6d1e70810390671c8daa671df271df5b3bca0..4bdb0ff2118622396a429f4ea89f65e514c37bcf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 apache-airflow
-pendulum
\ No newline at end of file
+pendulum
+pyspark
\ No newline at end of file
diff --git a/workspace/input.txt b/workspace/input.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1764b02c24ac3b9d8dfa0550b5c1353a7f2f1915
--- /dev/null
+++ b/workspace/input.txt
@@ -0,0 +1,2 @@
+Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum
+Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?
\ No newline at end of file
diff --git a/workspace/wordcount.py b/workspace/wordcount.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a9361f943a08f53e8545e25a828a3b0daace570
--- /dev/null
+++ b/workspace/wordcount.py
@@ -0,0 +1,62 @@
+from pyspark.sql import SparkSession
+import sys
+
+def main():
+    # Check if input and output paths are provided
+    if len(sys.argv) != 3:
+        print("Usage: wordcount.py <input_path> <output_path>")
+        sys.exit(-1)
+
+    input_path = sys.argv[1]
+    output_path = sys.argv[2]
+    print("=============================================================")
+    print("input_path: ", input_path)
+    print("output_path: ", output_path)
+    print("=============================================================")
+
+    # Initialize SparkSession
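+    # No .master() is set here: spark-submit supplies the master URL from
+    # the Airflow connection (spark://spark-master:7077).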
+    spark = SparkSession.builder \
+        .appName("WordCount") \
+        .getOrCreate()
+
+    print("=============================================================")
+    print("Spark Session Created")
+    print("=============================================================")
+
+
+    # Read the input as a DataFrame of lines, then drop to the RDD API
+    text_file = spark.read.text(input_path).rdd
+
+    print("=============================================================")
+    print("Text File Read")
+    print("=============================================================")
+
+
+    # Count words
+    word_counts = (
+        text_file.flatMap(lambda line: line.value.split())  # Split lines into words
+        .map(lambda word: (word, 1))  # Create (word, 1) tuples
+        .reduceByKey(lambda a, b: a + b)  # Reduce by key to sum word counts
+    )
+
+    print("=============================================================")
+    print("Word count: ", word_counts)
+    print("=============================================================")
+
+
+    # Save the counts as text, one part file per partition; this fails if
+    # the output directory already exists.
+    word_counts.saveAsTextFile(output_path)
+
+    print("=============================================================")
+    print("Word counts saved")
+    print("=============================================================")
+
+
+    # Stop the SparkSession
+    spark.stop()
+
+if __name__ == "__main__":
+    main()