From e068cbed208f2ec877d1adc096c06d686a624145dca7006a2853a71676f7348f Mon Sep 17 00:00:00 2001
From: Ben Varick
Date: Sun, 7 Dec 2025 09:27:41 -0700
Subject: [PATCH] edited sensors to account for a fresh start, edited README

---
 README.md                          | 37 ++++++++------
 user_code/assets/config.py         |  6 ++-
 user_code/sensors/gtfs_realtime.py | 80 ++++++++++++++++++------------
 user_code/sensors/gtfs_static.py   | 79 ++++++++++++++++++-----------
 4 files changed, 124 insertions(+), 78 deletions(-)

diff --git a/README.md b/README.md
index e53afa2..73842fd 100644
--- a/README.md
+++ b/README.md
@@ -2,24 +2,31 @@
 Dagster setup that scrapes GTFS and GTFS-RT for specified transit agencies and adds them to a DuckDB
 
-## Input
-You define which agencies and feeds to scrape with the file`config/agency_list.csv`
+## Quick start
 
-To include the transit agencies that you want to scrape, add the relevant IDs from mobilitydatabase.org
-
-See `config/agency_list.csv.sample` for an example.
-
-## set your environment
-
-### .env file
+1. Set up the `.env` file: copy `env.sample` to `.env` and change:
 
-- Postgres database password - make it something random before the first run
-- MobilityDatabase.org API token
-- Location of data, config, and postgres_data directories (default is in working directory)
+   - Postgres database password - set it to something random before the first run
+   - MobilityDatabase.org API token
+   - Location of the `data`, `config`, and `postgres_data` directories (default is the working directory). `config` is part of the repo, since it ships with sample configuration files.
+2. Edit `config/agency_list.csv`
+   - This file defines which agencies and feeds to scrape.
+   - Add the relevant Feed IDs from mobilitydatabase.org for each transit agency you want to scrape.
+   - See `config/agency_list.csv.sample` for an example.
 
-
-# Run it
+3. Build the Docker containers: `docker compose build`
+
+4. Start the Docker containers: `docker compose up -d`
 
-access the Dagster web ui at 127.0.0.1:3001
+
+5. Access the Dagster web UI at 127.0.0.1:3001
+
+6. Materialize the first asset: `agency_list`
+
+## To-do
+1. Change the MobilityDatabase lookup from the keyed API to the CSV on their GitHub page.
+2. Load data into DuckDB
+3. Transform data in DuckDB
+4. Analyze data
\ No newline at end of file
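A note on step 2 of the quick start above: this patch does not include `config/agency_list.csv.sample`, so the exact column layout is an assumption here. Judging by the fields the sensors below select (`feed_id`, `provider`, `producer_url`), a quick pre-flight check of the CSV could look like this sketch:

```python
# Minimal sketch: sanity-check config/agency_list.csv before the first run.
# The column names are assumptions inferred from the sensors in this patch;
# consult config/agency_list.csv.sample for the real layout.
import pandas as pd

df = pd.read_csv("config/agency_list.csv")

# Hypothetical required columns
expected = {"feed_id", "provider", "producer_url"}
missing = expected - set(df.columns)
if missing:
    raise ValueError(f"agency_list.csv is missing columns: {missing}")

print(f"{len(df)} feeds configured")
```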
diff --git a/user_code/assets/config.py b/user_code/assets/config.py
index bd45f61..223c806 100644
--- a/user_code/assets/config.py
+++ b/user_code/assets/config.py
@@ -1,4 +1,5 @@
 import pandas as pd
+from pathlib import Path
 from dagster import (
     asset,
 )
@@ -11,6 +12,10 @@ from dagster_duckdb import DuckDBResource
 def agency_list(duckdb: DuckDBResource) -> None:
     """Load agency list from CSV into DuckDB."""
 
+    # Ensure the database directory exists
+    db_path = Path(duckdb.database)
+    db_path.parent.mkdir(parents=True, exist_ok=True)
+
     # Read the CSV (path is relative to container working directory)
     df = pd.read_csv('config/agency_list.csv')
 
@@ -20,4 +25,3 @@ def agency_list(duckdb: DuckDBResource) -> None:
         CREATE OR REPLACE TABLE agency_list AS
         SELECT * FROM df
     """)
-
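The new `mkdir` guard matters on a fresh clone: DuckDB does not create missing parent directories for the database file, so the first materialization would otherwise fail with an IO error. The repo's `Definitions` wiring is not part of this patch, so the sketch below is only an assumption about how the `duckdb` resource is configured; the database path and import path are made up for illustration:

```python
# Hypothetical wiring sketch; the actual Definitions file is not in this
# patch. DuckDBResource(database=...) is the real dagster_duckdb API, but
# the path "data/warehouse.duckdb" and the import path are assumed.
from dagster import Definitions
from dagster_duckdb import DuckDBResource

from user_code.assets.config import agency_list

defs = Definitions(
    assets=[agency_list],
    resources={
        # On a fresh start data/ may not exist yet, which is why the
        # agency_list asset creates the parent directory before connecting.
        "duckdb": DuckDBResource(database="data/warehouse.duckdb"),
    },
)
```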
diff --git a/user_code/sensors/gtfs_realtime.py b/user_code/sensors/gtfs_realtime.py
index 3e27602..ae44709 100644
--- a/user_code/sensors/gtfs_realtime.py
+++ b/user_code/sensors/gtfs_realtime.py
@@ -1,13 +1,13 @@
 from dagster import (
     sensor,
+    SensorEvaluationContext,
     RunRequest,
     SkipReason,
-    SensorEvaluationContext,
     DefaultSensorStatus,
+    AssetKey,
 )
 from dagster_duckdb import DuckDBResource
 
-
 @sensor(
     name="gtfs_rt_vehicles_sensor",
     minimum_interval_seconds=60,
@@ -22,40 +22,56 @@ def gtfs_rt_vehicles_sensor(
     Sensor that triggers gtfs_rt_vehicles_downloads every 60 seconds.
     Fetches feed metadata once and passes it to each partition run.
     """
-    with duckdb.get_connection() as conn:
-        # Get all active feeds with their metadata in one query
-        feeds = conn.execute("""
-            SELECT feed_id, provider, producer_url
-            FROM gtfs_rt_vehicles_metadata
-            WHERE producer_url IS NOT NULL AND producer_url != ''
-            ORDER BY feed_id
-        """).fetchall()
+    # Check if upstream asset has been materialized at least once
+    upstream_asset_key = AssetKey("gtfs_rt_vehicles_partitions")
+    latest_materialization = context.instance.get_latest_materialization_event(upstream_asset_key)
 
-    if not feeds:
-        return SkipReason("No GTFS-RT vehicle feeds configured")
+    if latest_materialization is None:
+        return SkipReason(
+            "Waiting for upstream asset 'gtfs_rt_vehicles_partitions' to be materialized. "
+            "Run the upstream assets first."
+        )
 
-    # Create a RunRequest for each partition with metadata
-    run_requests = [
-        RunRequest(
-            partition_key=feed_id,
-            run_config={
-                "ops": {
-                    "gtfs_rt_vehicles_downloads": {
-                        "config": {
-                            "provider": provider,
-                            "producer_url": producer_url,
-                        }
-                    }
-                }
-            },
-            tags={
-                "feed_id": feed_id,
-                "sensor": "gtfs_rt_vehicles_sensor"
-            }
-        )
-        for feed_id, provider, producer_url in feeds
-    ]
+    try:
+        with duckdb.get_connection() as conn:
+            # Get all active feeds with their metadata in one query
+            feeds = conn.execute("""
+                SELECT feed_id, provider, producer_url
+                FROM gtfs_rt_vehicles_metadata
+                WHERE producer_url IS NOT NULL AND producer_url != ''
+                ORDER BY feed_id
+            """).fetchall()
+
+            if not feeds:
+                return SkipReason("No GTFS-RT vehicle feeds configured")
+
+            # Create a RunRequest for each partition with metadata
+            run_requests = [
+                RunRequest(
+                    partition_key=feed_id,
+                    run_config={
+                        "ops": {
+                            "gtfs_rt_vehicles_downloads": {
+                                "config": {
+                                    "provider": provider,
+                                    "producer_url": producer_url,
+                                }
+                            }
+                        }
+                    },
+                    tags={
+                        "feed_id": feed_id,
+                        "sensor": "gtfs_rt_vehicles_sensor"
+                    }
+                )
+                for feed_id, provider, producer_url in feeds
+            ]
 
-    context.log.info(f"Triggering downloads for {len(run_requests)} GTFS-RT vehicle feeds")
+            context.log.info(f"Triggering downloads for {len(run_requests)} GTFS-RT vehicle feeds")
 
-    return run_requests
+            return run_requests
+
+    except Exception as e:
+        # Handle case where table doesn't exist yet or other DB errors
+        context.log.warning(f"Database query failed: {e}")
+        return SkipReason(f"Database not ready or query failed: {e}")
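The fresh-start guard above can be exercised without any database at all. A minimal sketch, assuming the sensor can be invoked directly as a plain function with Dagster's testing helpers (`build_sensor_context` and `DagsterInstance.ephemeral`); the DuckDB path is an arbitrary placeholder, since the guard returns before a connection is ever opened:

```python
# Sketch: on a brand-new instance there are no materializations, so the
# sensor should hit the SkipReason branch instead of querying DuckDB.
from dagster import DagsterInstance, build_sensor_context
from dagster_duckdb import DuckDBResource

from user_code.sensors.gtfs_realtime import gtfs_rt_vehicles_sensor

instance = DagsterInstance.ephemeral()
context = build_sensor_context(instance=instance)

result = gtfs_rt_vehicles_sensor(
    context=context,
    duckdb=DuckDBResource(database="data/warehouse.duckdb"),  # assumed path
)
print(result)  # SkipReason about the unmaterialized upstream asset
```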
diff --git a/user_code/sensors/gtfs_static.py b/user_code/sensors/gtfs_static.py
index d95f5d2..47390ef 100644
--- a/user_code/sensors/gtfs_static.py
+++ b/user_code/sensors/gtfs_static.py
@@ -6,6 +6,7 @@ from dagster import (
     DefaultSensorStatus,
     SensorResult,
     EventLogEntry,
+    AssetKey,
 )
 from dagster_duckdb import DuckDBResource
 
@@ -24,43 +25,61 @@ def gtfs_static_hourly_sensor(
     Sensor that triggers gtfs_feed_downloads every 60 minutes.
     Fetches feed metadata once and passes it to each partition run.
     """
-    with duckdb.get_connection() as conn:
-        # Get all active feeds with their metadata in one query
-        feeds = conn.execute("""
-            SELECT feed_id, provider, producer_url
-            FROM gtfs_feed_metadata
-            WHERE producer_url IS NOT NULL AND producer_url != ''
-            ORDER BY feed_id
-        """).fetchall()
+    # Check if upstream asset has been materialized at least once
+    # Update this asset name if your upstream asset has a different name
+    upstream_asset_key = AssetKey("gtfs_feed_partitions")
+    latest_materialization = context.instance.get_latest_materialization_event(upstream_asset_key)
 
-    if not feeds:
-        return SkipReason("No GTFS feeds configured")
+    if latest_materialization is None:
+        return SkipReason(
+            "Waiting for upstream asset 'gtfs_feed_partitions' to be materialized. "
+            "Run the upstream assets first."
+        )
 
-    # Create a RunRequest for each partition with metadata
-    run_requests = [
-        RunRequest(
-            partition_key=feed_id,
-            run_config={
-                "ops": {
-                    "gtfs_feed_downloads": {
-                        "config": {
-                            "provider": provider,
-                            "producer_url": producer_url,
-                        }
-                    }
-                }
-            },
-            tags={
-                "feed_id": feed_id,
-                "sensor": "gtfs_static_sensor"
-            }
-        )
-        for feed_id, provider, producer_url in feeds
-    ]
+    try:
+        with duckdb.get_connection() as conn:
+            # Get all active feeds with their metadata in one query
+            feeds = conn.execute("""
+                SELECT feed_id, provider, producer_url
+                FROM gtfs_feed_metadata
+                WHERE producer_url IS NOT NULL AND producer_url != ''
+                ORDER BY feed_id
+            """).fetchall()
+
+            if not feeds:
+                return SkipReason("No GTFS feeds configured")
+
+            # Create a RunRequest for each partition with metadata
+            run_requests = [
+                RunRequest(
+                    partition_key=feed_id,
+                    run_config={
+                        "ops": {
+                            "gtfs_feed_downloads": {
+                                "config": {
+                                    "provider": provider,
+                                    "producer_url": producer_url,
+                                }
+                            }
+                        }
+                    },
+                    tags={
+                        "feed_id": feed_id,
+                        "sensor": "gtfs_static_sensor"
+                    }
+                )
+                for feed_id, provider, producer_url in feeds
+            ]
 
-    context.log.info(f"Triggering downloads for {len(run_requests)} GTFS feeds")
+            context.log.info(f"Triggering downloads for {len(run_requests)} GTFS feeds")
 
-    return run_requests
+            return run_requests
+
+    except Exception as e:
+        # Handle case where table doesn't exist yet or other DB errors
+        context.log.warning(f"Database query failed: {e}")
+        return SkipReason(f"Database not ready or query failed: {e}")
 
 @sensor(
     name="gtfs_static_partition_update_sensor",
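Both sensors hand each feed's metadata to the run via `run_config`, keyed under `ops.<asset name>.config` with `provider` and `producer_url`. The download assets themselves are not part of this patch, so the following is only a sketch of the config shape they would have to accept; the class name and body are hypothetical, and the per-feed partitions definition is omitted:

```python
# Sketch of the op-side config implied by the sensors' run_config.
# Only the keys "provider" and "producer_url" come from this patch;
# everything else here is an assumption.
from dagster import Config, asset

class FeedDownloadConfig(Config):
    provider: str
    producer_url: str

# Hypothetical downstream asset; the real one is partitioned by feed_id.
@asset(name="gtfs_feed_downloads")
def gtfs_feed_downloads(config: FeedDownloadConfig) -> None:
    # Fetch config.producer_url for config.provider and store the result
    ...
```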