initial dagster setup

This commit is contained in:
Ben Varick 2025-11-05 17:24:58 -08:00
parent af2213f0ab
commit 7791d034ae
Signed by: ben
SSH key fingerprint: SHA256:jWnpFDAcacYM5aPFpYRqlsamlDyKNpSj3jj+k4ojtUo
8 changed files with 232 additions and 0 deletions

7
.gitignore vendored Normal file

@@ -0,0 +1,7 @@
# Exclude the .env file
.env
# Exclude the postgres_data directory
postgres_data
postgres_data/*
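
Since the .env file is excluded from version control, here is a minimal sketch of what it is assumed to contain: the three Postgres variables that docker-compose.yaml interpolates. The values below are placeholders, not part of this commit.

POSTGRES_USER=dagster
POSTGRES_PASSWORD=change_me
POSTGRES_DB=dagster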

20
Dockerfile_dagster Normal file

@@ -0,0 +1,20 @@
# Dagster libraries to run both dagster-webserver and the dagster-daemon. Does not
# need to have access to any pipeline code.
FROM python:3.10-slim
RUN pip install \
    dagster \
    dagster-graphql \
    dagster-webserver \
    dagster-postgres \
    dagster-docker
# Set $DAGSTER_HOME and copy dagster instance and workspace YAML there
ENV DAGSTER_HOME=/opt/dagster/dagster_home/
RUN mkdir -p $DAGSTER_HOME
COPY dagster.yaml workspace.yaml $DAGSTER_HOME
WORKDIR $DAGSTER_HOME
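
Both this image and the user-code image install dagster unpinned, so the two can drift apart if rebuilt at different times. One way to keep the Dockerfiles aligned is to pin the core package to a single version in both; this is a sketch under that assumption (the version number is a placeholder), not something this commit does.

ARG DAGSTER_VERSION=1.8.0
RUN pip install \
    "dagster==${DAGSTER_VERSION}" \
    dagster-graphql \
    dagster-webserver \
    dagster-postgres \
    dagster-docker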

20
Dockerfile_user_code_gtfs Normal file

@@ -0,0 +1,20 @@
FROM python:3.10-slim
# Checkout and install dagster libraries needed to run the gRPC server
# exposing your repository to dagster-webserver and dagster-daemon, and to load the DagsterInstance
RUN pip install \
    dagster \
    dagster-postgres \
    dagster-docker
WORKDIR /opt/dagster/app
COPY user_code/gtfs /opt/dagster/app
COPY definitions.py /opt/dagster/app
# Run dagster gRPC server on port 4000
EXPOSE 4000
CMD ["dagster", "api", "grpc", "-h", "0.0.0.0", "-p", "4000", "-f", "definitions.py"]

70
dagster.yaml Normal file

@@ -0,0 +1,70 @@
scheduler:
  module: dagster.core.scheduler
  class: DagsterDaemonScheduler

run_coordinator:
  module: dagster.core.run_coordinator
  class: QueuedRunCoordinator
  config:
    max_concurrent_runs: 5
    tag_concurrency_limits:
      - key: "operation"
        value: "example"
        limit: 5

run_launcher:
  module: dagster_docker
  class: DockerRunLauncher
  config:
    env_vars:
      - DAGSTER_POSTGRES_USER
      - DAGSTER_POSTGRES_PASSWORD
      - DAGSTER_POSTGRES_DB
    network: dagster
    container_kwargs:
      volumes: # Make docker client accessible to any launched containers as well
        - /var/run/docker.sock:/var/run/docker.sock
        - /tmp/io_manager_storage:/tmp/io_manager_storage

run_storage:
  module: dagster_postgres.run_storage
  class: PostgresRunStorage
  config:
    postgres_db:
      hostname: dagster_postgresql
      username:
        env: DAGSTER_POSTGRES_USER
      password:
        env: DAGSTER_POSTGRES_PASSWORD
      db_name:
        env: DAGSTER_POSTGRES_DB
      port: 5432

schedule_storage:
  module: dagster_postgres.schedule_storage
  class: PostgresScheduleStorage
  config:
    postgres_db:
      hostname: dagster_postgresql
      username:
        env: DAGSTER_POSTGRES_USER
      password:
        env: DAGSTER_POSTGRES_PASSWORD
      db_name:
        env: DAGSTER_POSTGRES_DB
      port: 5432

event_log_storage:
  module: dagster_postgres.event_log
  class: PostgresEventLogStorage
  config:
    postgres_db:
      hostname: dagster_postgresql
      username:
        env: DAGSTER_POSTGRES_USER
      password:
        env: DAGSTER_POSTGRES_PASSWORD
      db_name:
        env: DAGSTER_POSTGRES_DB
      port: 5432
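
To show how the tag_concurrency_limits entry above would take effect, here is a hypothetical job tagged with operation=example; runs of such a job are what the QueuedRunCoordinator counts against the limit of 5. The op and job names are illustrative, not part of this commit.

from dagster import job, op

@op
def say_hello() -> str:
    # Placeholder op; real GTFS ops would live in user_code/gtfs.
    return "hello"

@job(tags={"operation": "example"})  # matches the tag_concurrency_limits key/value above
def example_job():
    say_hello()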


@@ -0,0 +1,2 @@
Name,GTFS,GTFS-RT_vehicles,GTFS-RT_trips,GTFS-RT_alerts
Madison Metro,mdb-394,mdb-2097,mdb-2096,mdb-2095
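
This file is a small per-agency registry of GTFS and GTFS-RT feed identifiers (the mdb- prefix suggests Mobility Database IDs). A sketch of how the registry might be read from Python; the file path and function name are assumptions, since the commit does not show where the CSV lives or how it is consumed.

import csv
from pathlib import Path

# Hypothetical location; the CSV's actual path is not shown in this commit.
FEEDS_CSV = Path("user_code/gtfs/feeds.csv")

def load_feed_registry(path: Path = FEEDS_CSV) -> list[dict]:
    # One dict per agency, keyed by the header row:
    # Name, GTFS, GTFS-RT_vehicles, GTFS-RT_trips, GTFS-RT_alerts
    with path.open(newline="") as f:
        return list(csv.DictReader(f))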

0
definitions.py Normal file
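
definitions.py is committed empty, yet Dockerfile_user_code_gtfs starts the gRPC server with -f definitions.py, so the file will likely need to expose a Definitions object for the "gtfs" code location to load. A minimal placeholder sketch, with an assumed asset name that is not part of this commit:

from dagster import Definitions, asset

@asset
def gtfs_placeholder() -> str:
    # Stand-in asset so the code location has something to load;
    # real GTFS assets would go here (or be imported from user_code/gtfs).
    return "todo"

defs = Definitions(assets=[gtfs_placeholder])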

107
docker-compose.yaml Normal file

@@ -0,0 +1,107 @@
---
services:
  # This service runs the postgres DB used by dagster for run storage, schedule storage,
  # and event log storage. Depending on the hardware you run this Compose on, you may be able
  # to reduce the interval and timeout in the healthcheck to speed up your `docker-compose up` times.
  dagster_postgresql:
    image: postgres:17
    container_name: dagster_postgresql
    environment:
      POSTGRES_USER: ${POSTGRES_USER}
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
      POSTGRES_DB: ${POSTGRES_DB}
    volumes:
      - ./postgres_data:/var/lib/postgresql/data
    networks:
      - dagster
    healthcheck:
      test: ['CMD-SHELL', 'pg_isready -U ${POSTGRES_USER} -d ${POSTGRES_DB}']
      interval: 10s
      timeout: 8s
      retries: 5

  # This service runs the gRPC server that loads your user code, for both dagster-webserver
  # and dagster-daemon. By setting DAGSTER_CURRENT_IMAGE to its own image, we tell the
  # run launcher to use this same image when launching runs in a new container as well.
  # Multiple containers like this can be deployed separately - each just needs to run on
  # its own port, and have its own entry in the workspace.yaml file that's loaded by the
  # webserver.
  dagster_user_code_gtfs:
    build:
      context: .
      dockerfile: ./Dockerfile_user_code_gtfs
    container_name: dagster_user_code_gtfs
    image: dagster_user_code_gtfs
    restart: always
    environment:
      DAGSTER_POSTGRES_USER: ${POSTGRES_USER}
      DAGSTER_POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
      DAGSTER_POSTGRES_DB: ${POSTGRES_DB}
      DAGSTER_CURRENT_IMAGE: 'dagster_user_code_gtfs'
    networks:
      - dagster

  # This service runs dagster-webserver, which loads your user code from the user code container.
  # Since our instance uses the QueuedRunCoordinator, any runs submitted from the webserver will be put on
  # a queue and later dequeued and launched by dagster-daemon.
  dagster_webserver:
    build:
      context: .
      dockerfile: ./Dockerfile_dagster
    entrypoint:
      - dagster-webserver
      - -h
      - '0.0.0.0'
      - -p
      - '3000'
      - -w
      - workspace.yaml
    container_name: dagster_webserver
    ports:
      - 3001:3000
    environment:
      DAGSTER_POSTGRES_USER: ${POSTGRES_USER}
      DAGSTER_POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
      DAGSTER_POSTGRES_DB: ${POSTGRES_DB}
    volumes: # Make docker client accessible so we can terminate containers from the webserver
      - /var/run/docker.sock:/var/run/docker.sock
      - /tmp/io_manager_storage:/tmp/io_manager_storage
    networks:
      - dagster
    depends_on:
      dagster_postgresql:
        condition: service_healthy
      dagster_user_code_gtfs:
        condition: service_started

  # This service runs the dagster-daemon process, which is responsible for taking runs
  # off of the queue and launching them, as well as creating runs from schedules or sensors.
  dagster_daemon:
    build:
      context: .
      dockerfile: ./Dockerfile_dagster
    entrypoint:
      - dagster-daemon
      - run
    container_name: dagster_daemon
    restart: on-failure
    environment:
      DAGSTER_POSTGRES_USER: ${POSTGRES_USER}
      DAGSTER_POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
      DAGSTER_POSTGRES_DB: ${POSTGRES_DB}
    volumes: # Make docker client accessible so we can launch containers using host docker
      - /var/run/docker.sock:/var/run/docker.sock
      - /tmp/io_manager_storage:/tmp/io_manager_storage
    networks:
      - dagster
    depends_on:
      dagster_postgresql:
        condition: service_healthy
      dagster_user_code_gtfs:
        condition: service_started

networks:
  dagster:
    driver: bridge
    name: dagster
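
With a .env file in place (see the sketch near the top), the stack would typically be brought up as below; because of the 3001:3000 port mapping, the Dagster webserver UI would then be reachable at http://localhost:3001. The exact commands are an assumption about usage, not part of this commit.

docker compose up --build -d
docker compose logs -f dagster_daemon    # watch the daemon dequeue and launch runs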

6
workspace.yaml Normal file

@@ -0,0 +1,6 @@
load_from:
  # Each entry here corresponds to a service in the docker-compose file that exposes user code.
  - grpc_server:
      host: dagster_user_code_gtfs
      port: 4000
      location_name: "gtfs"
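
The docker-compose comments note that additional user-code containers each need their own entry in this file. A hypothetical second entry, with an assumed service name and port, would look like this:

load_from:
  - grpc_server:
      host: dagster_user_code_gtfs
      port: 4000
      location_name: "gtfs"
  # Hypothetical second code location served by another container on its own port:
  - grpc_server:
      host: dagster_user_code_other
      port: 4001
      location_name: "other"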