diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..26d4a6c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+# Exclude the .env file (holds the Postgres credentials read by docker-compose)
+.env
+
+# Exclude the postgres_data directory (bind-mounted Postgres data volume)
+postgres_data
+postgres_data/*
+
diff --git a/Dockerfile_dagster b/Dockerfile_dagster
new file mode 100644
index 0000000..179e8bd
--- /dev/null
+++ b/Dockerfile_dagster
@@ -0,0 +1,20 @@
+# Dagster libraries to run both dagster-webserver and the dagster-daemon. Does not
+# need to have access to any pipeline code.
+
+FROM python:3.10-slim
+
+RUN pip install --no-cache-dir \
+    dagster \
+    dagster-graphql \
+    dagster-webserver \
+    dagster-postgres \
+    dagster-docker
+
+# Set $DAGSTER_HOME (trailing slash is required by the multi-source COPY below) and copy the instance and workspace YAML there
+ENV DAGSTER_HOME=/opt/dagster/dagster_home/
+
+RUN mkdir -p "$DAGSTER_HOME"
+
+COPY dagster.yaml workspace.yaml $DAGSTER_HOME
+
+WORKDIR $DAGSTER_HOME
diff --git a/Dockerfile_user_code_gtfs b/Dockerfile_user_code_gtfs
new file mode 100644
index 0000000..27fc72a
--- /dev/null
+++ b/Dockerfile_user_code_gtfs
@@ -0,0 +1,20 @@
+FROM python:3.10-slim
+
+# Install the dagster libraries needed to run the gRPC server that exposes this code location
+# to dagster-webserver and dagster-daemon, and to load the DagsterInstance (pip cache is not kept).
+
+RUN pip install --no-cache-dir \
+    dagster \
+    dagster-postgres \
+    dagster-docker
+
+WORKDIR /opt/dagster/app
+COPY user_code/gtfs /opt/dagster/app/
+
+COPY definitions.py /opt/dagster/app/
+
+# Run dagster gRPC server on port 4000
+
+EXPOSE 4000
+
+CMD ["dagster", "api", "grpc", "-h", "0.0.0.0", "-p", "4000", "-f", "definitions.py"]
diff --git a/dagster.yaml b/dagster.yaml
new file mode 100644
index 0000000..33cec38
--- /dev/null
+++ b/dagster.yaml
@@ -0,0 +1,70 @@
+scheduler:
+  module: dagster.core.scheduler
+  class: DagsterDaemonScheduler
+
+
+run_coordinator:
+  module: dagster.core.run_coordinator
+  class: QueuedRunCoordinator
+  config:
+    max_concurrent_runs: 5
+    tag_concurrency_limits:
+      - key: "operation"
+        value: "example"
+        limit: 5
+
+run_launcher:
+  module: dagster_docker
+  class: DockerRunLauncher
+  config:
+    env_vars:
+      - DAGSTER_POSTGRES_USER
+      - DAGSTER_POSTGRES_PASSWORD
+      - DAGSTER_POSTGRES_DB
+    network: dagster
+    container_kwargs:
+      volumes: # Make docker client accessible to any launched containers as well
+        - /var/run/docker.sock:/var/run/docker.sock
+        - /tmp/io_manager_storage:/tmp/io_manager_storage
+
+run_storage:
+  module: dagster_postgres.run_storage
+  class: PostgresRunStorage
+  config:
+    postgres_db:
+      hostname: dagster_postgresql
+      username:
+        env: DAGSTER_POSTGRES_USER
+      password:
+        env: DAGSTER_POSTGRES_PASSWORD
+      db_name:
+        env: DAGSTER_POSTGRES_DB
+      port: 5432
+
+schedule_storage:
+  module: dagster_postgres.schedule_storage
+  class: PostgresScheduleStorage
+  config:
+    postgres_db:
+      hostname: dagster_postgresql
+      username:
+        env: DAGSTER_POSTGRES_USER
+      password:
+        env: DAGSTER_POSTGRES_PASSWORD
+      db_name:
+        env: DAGSTER_POSTGRES_DB
+      port: 5432
+
+event_log_storage:
+  module: dagster_postgres.event_log
+  class: PostgresEventLogStorage
+  config:
+    postgres_db:
+      hostname: dagster_postgresql
+      username:
+        env: DAGSTER_POSTGRES_USER
+      password:
+        env: DAGSTER_POSTGRES_PASSWORD
+      db_name:
+        env: DAGSTER_POSTGRES_DB
+      port: 5432
diff --git a/data/gtfs/agency_list.csv b/data/gtfs/agency_list.csv
new file mode 100644
index 0000000..228aaab
--- /dev/null
+++ b/data/gtfs/agency_list.csv
@@ -0,0 +1,2 @@
+Name,GTFS,GTFS-RT_vehicles,GTFS-RT_trips,GTFS-RT_alerts
+Madison Metro,mdb-394,mdb-2097,mdb-2096,mdb-2095
diff --git a/definitions.py b/definitions.py
new file mode 100644
index 0000000..473a0f4
diff --git a/docker-compose.yaml b/docker-compose.yaml
new file mode 100644
index 0000000..d4ae94a
--- /dev/null
+++ b/docker-compose.yaml
@@ -0,0 +1,107 @@
+---
+
+services:
+  # This service runs the postgres DB used by dagster for run storage, schedule storage,
+  # and event log storage. Depending on the hardware you run this Compose on, you may be able
+  # to reduce the interval and timeout in the healthcheck to speed up your `docker-compose up` times.
+  dagster_postgresql:
+    image: postgres:17
+    container_name: dagster_postgresql
+    environment:
+      POSTGRES_USER: ${POSTGRES_USER}
+      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
+      POSTGRES_DB: ${POSTGRES_DB}
+    volumes:
+      - ./postgres_data:/var/lib/postgresql/data
+    networks:
+      - dagster
+    healthcheck:
+      test: ['CMD-SHELL', 'pg_isready -U ${POSTGRES_USER} -d ${POSTGRES_DB}']
+      interval: 10s
+      timeout: 8s
+      retries: 5
+
+  # This service runs the gRPC server that loads your user code, in both dagster-webserver
+  # and dagster-daemon. By setting DAGSTER_CURRENT_IMAGE to its own image, we tell the
+  # run launcher to use this same image when launching runs in a new container as well.
+  # Multiple containers like this can be deployed separately - each just needs to run on
+  # its own port, and have its own entry in the workspace.yaml file that's loaded by the
+  # webserver.
+  dagster_user_code_gtfs:
+    build:
+      context: .
+      dockerfile: ./Dockerfile_user_code_gtfs
+    container_name: dagster_user_code_gtfs
+    image: dagster_user_code_gtfs
+    restart: always
+    environment:
+      DAGSTER_POSTGRES_USER: ${POSTGRES_USER}
+      DAGSTER_POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
+      DAGSTER_POSTGRES_DB: ${POSTGRES_DB}
+      DAGSTER_CURRENT_IMAGE: 'dagster_user_code_gtfs'
+    networks:
+      - dagster
+
+  # This service runs dagster-webserver, which loads your user code from the user code container.
+  # Since our instance uses the QueuedRunCoordinator, any runs submitted from the webserver will be put on
+  # a queue and later dequeued and launched by dagster-daemon.
+  dagster_webserver:
+    build:
+      context: .
+      dockerfile: ./Dockerfile_dagster
+    entrypoint:
+      - dagster-webserver
+      - -h
+      - '0.0.0.0'
+      - -p
+      - '3000'
+      - -w
+      - workspace.yaml
+    container_name: dagster_webserver
+    ports:
+      - '3001:3000'
+    environment:
+      DAGSTER_POSTGRES_USER: ${POSTGRES_USER}
+      DAGSTER_POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
+      DAGSTER_POSTGRES_DB: ${POSTGRES_DB}
+    volumes: # Make docker client accessible so we can terminate containers from the webserver
+      - /var/run/docker.sock:/var/run/docker.sock
+      - /tmp/io_manager_storage:/tmp/io_manager_storage
+    networks:
+      - dagster
+    depends_on:
+      dagster_postgresql:
+        condition: service_healthy
+      dagster_user_code_gtfs:
+        condition: service_started
+
+  # This service runs the dagster-daemon process, which is responsible for taking runs
+  # off of the queue and launching them, as well as creating runs from schedules or sensors.
+  dagster_daemon:
+    build:
+      context: .
+      dockerfile: ./Dockerfile_dagster
+    entrypoint:
+      - dagster-daemon
+      - run
+    container_name: dagster_daemon
+    restart: on-failure
+    environment:
+      DAGSTER_POSTGRES_USER: ${POSTGRES_USER}
+      DAGSTER_POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
+      DAGSTER_POSTGRES_DB: ${POSTGRES_DB}
+    volumes: # Make docker client accessible so we can launch containers using host docker
+      - /var/run/docker.sock:/var/run/docker.sock
+      - /tmp/io_manager_storage:/tmp/io_manager_storage
+    networks:
+      - dagster
+    depends_on:
+      dagster_postgresql:
+        condition: service_healthy
+      dagster_user_code_gtfs:
+        condition: service_started
+
+networks:
+  dagster:
+    driver: bridge
+    name: dagster
diff --git a/workspace.yaml b/workspace.yaml
new file mode 100644
index 0000000..5a35fc2
--- /dev/null
+++ b/workspace.yaml
@@ -0,0 +1,6 @@
+load_from:
+  # Each entry here corresponds to a service in the docker-compose file that exposes user code.
+  - grpc_server:
+      host: dagster_user_code_gtfs
+      port: 4000
+      location_name: "gtfs"