edited sensors to account for a fresh start, edited README

parent b2571a8a48 · commit e068cbed20
4 changed files with 124 additions and 78 deletions

README.md (33 changes)
@@ -2,24 +2,31 @@
 Dagster setup that scrapes GTFS and GTFS-RT for specified transit agencies and adds them to a DuckDB

-## Input
+## Quick start

-You define which agencies and feeds to scrape with the file `config/agency_list.csv`
-
-To include the transit agencies that you want to scrape, add the relevant IDs from mobilitydatabase.org
-
-See `config/agency_list.csv.sample` for an example.
-
-## set your environment
-
-### .env file
+1. Edit the .env file.

 copy `env.sample` to `.env` and change:
 - Postgres database password - make it something random before the first run
 - MobilityDatabase.org API token
-- Location of data, config, and postgres_data directories (default is in working directory)
+- Location of `data`, `config`, and `postgres_data` directories (default is in working directory). `config` is part of the repo as it comes with sample configuration files.
+
+2. Edit `config/agency_list.csv`
+   - See `config/agency_list.csv.sample` for an example.
+   - Define which agencies and feeds to scrape with the file.
+   - To include the transit agencies that you want to scrape, add the relevant Feed IDs from mobilitydatabase.org

-# Run it
+3. Build the docker containers

 `docker compose build`

+4. Run the docker containers
+
 `docker compose up -d`

-access the Dagster web ui at 127.0.0.1:3001
+5. Access the Dagster web ui at 127.0.0.1:3001
+
+6. Materialize the first asset: `agency_list`
+
+## To-do:
+1. Change mobilitydata from using the API with a key, to using the csv on their GitHub page.
+2. Load data into duckdb
+3. Transform data in duckdb
+4. Analyze data
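For step 2 of the quick start above, `config/agency_list.csv` could look roughly like the sketch below. The real column layout lives in `config/agency_list.csv.sample`, which is not shown in this commit, so the column names and feed IDs here are placeholders only.

```csv
# Hypothetical layout -- check config/agency_list.csv.sample for the actual columns
agency_name,mdb_feed_id
Example Transit Agency,mdb-123
Another Example Agency,mdb-456
```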
@@ -1,4 +1,5 @@
 import pandas as pd
+from pathlib import Path
 from dagster import (
     asset,
 )
@@ -11,6 +12,10 @@ from dagster_duckdb import DuckDBResource
 def agency_list(duckdb: DuckDBResource) -> None:
     """Load agency list from CSV into DuckDB."""

+    # Ensure the database directory exists
+    db_path = Path(duckdb.database)
+    db_path.parent.mkdir(parents=True, exist_ok=True)
+
     # Read the CSV (path is relative to container working directory)
     df = pd.read_csv('config/agency_list.csv')

@@ -20,4 +25,3 @@ def agency_list(duckdb: DuckDBResource) -> None:
             CREATE OR REPLACE TABLE agency_list AS
             SELECT * FROM df
         """)
-
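Read together, the two hunks above give the asset roughly the shape sketched below. This is a sketch for orientation, not the file verbatim: the connection block in the middle of the function is not part of this diff, so the `duckdb.get_connection()` wrapper is an assumption based on the sensors later in this commit.

```python
from pathlib import Path

import pandas as pd
from dagster import asset
from dagster_duckdb import DuckDBResource


@asset
def agency_list(duckdb: DuckDBResource) -> None:
    """Load agency list from CSV into DuckDB."""
    # New in this commit: on a fresh checkout the data/ directory may not exist
    # yet, so create the DuckDB file's parent directory before connecting.
    db_path = Path(duckdb.database)
    db_path.parent.mkdir(parents=True, exist_ok=True)

    # Read the CSV (path is relative to container working directory)
    df = pd.read_csv("config/agency_list.csv")

    # Assumed connection block (not shown in this diff): write the DataFrame
    # into DuckDB via the dagster_duckdb resource, as the sensors do.
    with duckdb.get_connection() as conn:
        conn.execute("""
            CREATE OR REPLACE TABLE agency_list AS
            SELECT * FROM df
        """)
```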
@@ -1,13 +1,13 @@
 from dagster import (
     sensor,
+    SensorEvaluationContext,
     RunRequest,
     SkipReason,
-    SensorEvaluationContext,
     DefaultSensorStatus,
+    AssetKey,
 )
 from dagster_duckdb import DuckDBResource


 @sensor(
     name="gtfs_rt_vehicles_sensor",
     minimum_interval_seconds=60,
@@ -22,6 +22,17 @@ def gtfs_rt_vehicles_sensor(
     Sensor that triggers gtfs_rt_vehicles_downloads every 60 seconds.
     Fetches feed metadata once and passes it to each partition run.
     """
+    # Check if upstream asset has been materialized at least once
+    upstream_asset_key = AssetKey("gtfs_rt_vehicles_partitions")
+    latest_materialization = context.instance.get_latest_materialization_event(upstream_asset_key)
+
+    if latest_materialization is None:
+        return SkipReason(
+            "Waiting for upstream asset 'gtfs_rt_vehicles_partitions' to be materialized. "
+            "Run the upstream assets first."
+        )
+
+    try:
         with duckdb.get_connection() as conn:
             # Get all active feeds with their metadata in one query
             feeds = conn.execute("""
@@ -59,3 +70,8 @@ def gtfs_rt_vehicles_sensor(
         context.log.info(f"Triggering downloads for {len(run_requests)} GTFS-RT vehicle feeds")

         return run_requests
+
+    except Exception as e:
+        # Handle case where table doesn't exist yet or other DB errors
+        context.log.warning(f"Database query failed: {e}")
+        return SkipReason(f"Database not ready or query failed: {e}")
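Assembled, the fresh-start guard added to `gtfs_rt_vehicles_sensor` works roughly as sketched below. The feeds query and the `RunRequest` fan-out are simplified placeholders, and the `asset_selection` and `default_status` arguments are assumptions (the unchanged parts of the sensor are not in this diff).

```python
from dagster import (
    AssetKey,
    DefaultSensorStatus,
    RunRequest,
    SensorEvaluationContext,
    SkipReason,
    sensor,
)
from dagster_duckdb import DuckDBResource


@sensor(
    name="gtfs_rt_vehicles_sensor",
    minimum_interval_seconds=60,
    asset_selection=["gtfs_rt_vehicles_downloads"],  # assumption, mirroring the static sensor below
    default_status=DefaultSensorStatus.RUNNING,      # assumption; not visible in this diff
)
def gtfs_rt_vehicles_sensor(context: SensorEvaluationContext, duckdb: DuckDBResource):
    # Fresh start: until the partitions asset has been materialized once,
    # there is nothing to fan out over, so skip instead of failing.
    upstream_asset_key = AssetKey("gtfs_rt_vehicles_partitions")
    if context.instance.get_latest_materialization_event(upstream_asset_key) is None:
        return SkipReason(
            "Waiting for upstream asset 'gtfs_rt_vehicles_partitions' to be materialized. "
            "Run the upstream assets first."
        )

    try:
        with duckdb.get_connection() as conn:
            # Placeholder query; the real sensor pulls feed metadata in one query.
            feeds = conn.execute("SELECT feed_id FROM agency_list").fetchall()
        return [RunRequest(partition_key=str(feed_id)) for (feed_id,) in feeds]
    except Exception as e:
        # A fresh database may not have the tables yet; skip rather than error.
        context.log.warning(f"Database query failed: {e}")
        return SkipReason(f"Database not ready or query failed: {e}")
```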
@@ -6,6 +6,7 @@ from dagster import (
     DefaultSensorStatus,
     SensorResult,
     EventLogEntry,
+    AssetKey,
 )
 from dagster_duckdb import DuckDBResource
@@ -24,6 +25,18 @@ def gtfs_static_hourly_sensor(
     Sensor that triggers gtfs_feed_downloads every 60 minutes.
     Fetches feed metadata once and passes it to each partition run.
     """
+    # Check if upstream asset has been materialized at least once
+    # Update this asset name if your upstream asset has a different name
+    upstream_asset_key = AssetKey("gtfs_feed_partitions")
+    latest_materialization = context.instance.get_latest_materialization_event(upstream_asset_key)
+
+    if latest_materialization is None:
+        return SkipReason(
+            "Waiting for upstream asset 'gtfs_feed_partitions' to be materialized. "
+            "Run the upstream assets first."
+        )
+
+    try:
         with duckdb.get_connection() as conn:
             # Get all active feeds with their metadata in one query
             feeds = conn.execute("""
@@ -62,6 +75,12 @@ def gtfs_static_hourly_sensor(

         return run_requests
+
+    except Exception as e:
+        # Handle case where table doesn't exist yet or other DB errors
+        context.log.warning(f"Database query failed: {e}")
+        return SkipReason(f"Database not ready or query failed: {e}")


 @sensor(
     name="gtfs_static_partition_update_sensor",
     asset_selection=["gtfs_feed_downloads"],
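A quick way to confirm the fresh-start behaviour this commit is after: on an empty Dagster instance both sensors should return a `SkipReason` instead of raising. The sketch below assumes the sensors can be invoked directly with a test instance and resources; the import path and database location are placeholders, not the repo's actual layout.

```python
from dagster import SkipReason, build_sensor_context, instance_for_test
from dagster_duckdb import DuckDBResource

# Hypothetical import path; adjust to wherever the sensor module lives in this repo.
from transit_scraper.sensors import gtfs_rt_vehicles_sensor


def test_sensor_skips_on_fresh_instance():
    # instance_for_test() starts with no materialization events, which is
    # exactly the fresh-start case this commit handles.
    with instance_for_test() as instance:
        context = build_sensor_context(
            instance=instance,
            resources={"duckdb": DuckDBResource(database="data/warehouse.duckdb")},
        )
        result = gtfs_rt_vehicles_sensor(context)
        assert isinstance(result, SkipReason)
```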