From 2b47a45b8fe4edf0e8e929df1b8b776a31b3cfc5476a35ff1382bf258eb0b1df Mon Sep 17 00:00:00 2001 From: Ben Varick Date: Wed, 5 Nov 2025 18:22:26 -0800 Subject: [PATCH] added asset to read agency_list.csv and add it to table in gtfs.duckdb --- .gitignore | 5 +++++ Dockerfile_user_code_gtfs | 6 +++--- dagster.yaml | 1 + definitions.py | 0 docker-compose.yaml | 2 ++ user_code/gtfs/__init__.py | 2 ++ user_code/gtfs/assets.py | 18 ++++++++++++++++++ user_code/gtfs/definitions.py | 12 ++++++++++++ workspace.yaml | 2 +- 9 files changed, 44 insertions(+), 4 deletions(-) delete mode 100644 definitions.py create mode 100644 user_code/gtfs/__init__.py create mode 100644 user_code/gtfs/assets.py create mode 100644 user_code/gtfs/definitions.py diff --git a/.gitignore b/.gitignore index 26d4a6c..ea2f8b0 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,8 @@ postgres_data postgres_data/* +#Exclude data directory +data +#except for agency_list.csv +!data/gtfs/agency_list.csv + diff --git a/Dockerfile_user_code_gtfs b/Dockerfile_user_code_gtfs index 27fc72a..c3ae1ea 100644 --- a/Dockerfile_user_code_gtfs +++ b/Dockerfile_user_code_gtfs @@ -6,13 +6,13 @@ FROM python:3.10-slim RUN pip install \ dagster \ dagster-postgres \ - dagster-docker + dagster-docker \ + dagster-duckdb \ + pandas WORKDIR /opt/dagster/app COPY user_code/gtfs /opt/dagster/app -COPY definitions.py /opt/dagster/app - # Run dagster gRPC server on port 4000 EXPOSE 4000 diff --git a/dagster.yaml b/dagster.yaml index 33cec38..bf75022 100644 --- a/dagster.yaml +++ b/dagster.yaml @@ -26,6 +26,7 @@ run_launcher: volumes: # Make docker client accessible to any launched containers as well - /var/run/docker.sock:/var/run/docker.sock - /tmp/io_manager_storage:/tmp/io_manager_storage + - /home/ben/code/gtfs-dagster/data:/opt/dagster/app/data run_storage: module: dagster_postgres.run_storage diff --git a/definitions.py b/definitions.py deleted file mode 100644 index 473a0f4..0000000 diff --git a/docker-compose.yaml b/docker-compose.yaml index d4ae94a..6bdc67f 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -39,6 +39,8 @@ services: DAGSTER_POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} DAGSTER_POSTGRES_DB: ${POSTGRES_DB} DAGSTER_CURRENT_IMAGE: 'dagster_user_code_gtfs' + volumes: + - ./data:/opt/dagster/app/data networks: - dagster diff --git a/user_code/gtfs/__init__.py b/user_code/gtfs/__init__.py new file mode 100644 index 0000000..4c4960f --- /dev/null +++ b/user_code/gtfs/__init__.py @@ -0,0 +1,2 @@ +# user_code/gtfs/__init__.py +from .assets import * diff --git a/user_code/gtfs/assets.py b/user_code/gtfs/assets.py new file mode 100644 index 0000000..6387270 --- /dev/null +++ b/user_code/gtfs/assets.py @@ -0,0 +1,18 @@ +import pandas as pd +from dagster import asset +from dagster_duckdb import DuckDBResource + + +@asset +def agency_list(duckdb: DuckDBResource) -> None: + """Load agency list from CSV into DuckDB.""" + + # Read the CSV (path is relative to container working directory) + df = pd.read_csv('data/gtfs/agency_list.csv') + + # Write to DuckDB + with duckdb.get_connection() as conn: + conn.execute(""" + CREATE TABLE IF NOT EXISTS agency_list AS + SELECT * FROM df + """) diff --git a/user_code/gtfs/definitions.py b/user_code/gtfs/definitions.py new file mode 100644 index 0000000..ef43db8 --- /dev/null +++ b/user_code/gtfs/definitions.py @@ -0,0 +1,12 @@ +from dagster import Definitions +from dagster_duckdb import DuckDBResource +from assets import agency_list + +defs = Definitions( + assets=[agency_list], + resources={ + "duckdb": DuckDBResource( + database="data/gtfs/gtfs.duckdb" + ) + } +) diff --git a/workspace.yaml b/workspace.yaml index 5a35fc2..6ab94f4 100644 --- a/workspace.yaml +++ b/workspace.yaml @@ -3,4 +3,4 @@ load_from: - grpc_server: host: dagster_user_code_gtfs port: 4000 - location_name: "gtfs" \ No newline at end of file + location_name: "gtfs_user_code" \ No newline at end of file