edited sensors to account for a fresh start, edited README

parent b2571a8a48 · commit e068cbed20
4 changed files with 124 additions and 78 deletions

README.md (33 changes)
@@ -2,24 +2,31 @@
 Dagster setup that scrapes GTFS and GTFS-RT for specified transit agencies and adds them to a DuckDB

-## Input
+## Quick start

-You define which agencies and feeds to scrape with the file `config/agency_list.csv`
-
-To include the transit agencies that you want to scrape, add the relevant IDs from mobilitydatabase.org
-
-See `config/agency_list.csv.sample` for an example.
-
-## set your environment
-
-### .env file
+1. Edit the .env file.

 copy `env.sample` to `.env` and change:
 - Postgres database password - make it something random before the first run
 - MobilityDatabase.org API token
-- Location of data, config, and postgres_data directories (default is in working directory)
+- Location of `data`, `config`, and `postgres_data` directories (default is in working directory). `config` is part of the repo as it comes with sample configuration files.
+
+2. Edit `config/agency_list.csv`
+   - See `config/agency_list.csv.sample` for an example.
+   - Define which agencies and feeds to scrape with the file.
+   - To include the transit agencies that you want to scrape, add the relevant Feed IDs from mobilitydatabase.org

-# Run it
+3. Build the docker containers

 `docker compose build`

+4. Run the docker containers
+
 `docker compose up -d`

-access the Dagster web ui at 127.0.0.1:3001
+5. Access the Dagster web ui at 127.0.0.1:3001
+
+6. Materialize the first asset: `agency_list`
+
+## To-do:
+1. Change mobilitydata from using the API with a key, to using the csv on their GitHub page.
+2. Load data into duckdb
+3. Transform data in duckdb
+4. Analyze data
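For step 2 of the quick start above, `config/agency_list.csv` could look roughly like the sketch below. The real column layout lives in `config/agency_list.csv.sample`, which is not shown in this commit, so the column names and feed IDs here are placeholders only.

```csv
# Hypothetical layout -- check config/agency_list.csv.sample for the actual columns
agency_name,mdb_feed_id
Example Transit Agency,mdb-123
Another Example Agency,mdb-456
```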
@@ -1,4 +1,5 @@
 import pandas as pd
+from pathlib import Path
 from dagster import (
     asset,
 )
@@ -11,6 +12,10 @@ from dagster_duckdb import DuckDBResource
 def agency_list(duckdb: DuckDBResource) -> None:
     """Load agency list from CSV into DuckDB."""

+    # Ensure the database directory exists
+    db_path = Path(duckdb.database)
+    db_path.parent.mkdir(parents=True, exist_ok=True)
+
     # Read the CSV (path is relative to container working directory)
     df = pd.read_csv('config/agency_list.csv')

@@ -20,4 +25,3 @@ def agency_list(duckdb: DuckDBResource) -> None:
             CREATE OR REPLACE TABLE agency_list AS
             SELECT * FROM df
         """)
-
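Read together, the two hunks above give the asset roughly the shape sketched below. This is a sketch for orientation, not the file verbatim: the connection block in the middle of the function is not part of this diff, so the `duckdb.get_connection()` wrapper is an assumption based on the sensors later in this commit.

```python
from pathlib import Path

import pandas as pd
from dagster import asset
from dagster_duckdb import DuckDBResource


@asset
def agency_list(duckdb: DuckDBResource) -> None:
    """Load agency list from CSV into DuckDB."""
    # New in this commit: on a fresh checkout the data/ directory may not exist
    # yet, so create the DuckDB file's parent directory before connecting.
    db_path = Path(duckdb.database)
    db_path.parent.mkdir(parents=True, exist_ok=True)

    # Read the CSV (path is relative to container working directory)
    df = pd.read_csv("config/agency_list.csv")

    # Assumed connection block (not shown in this diff): write the DataFrame
    # into DuckDB via the dagster_duckdb resource, as the sensors do.
    with duckdb.get_connection() as conn:
        conn.execute("""
            CREATE OR REPLACE TABLE agency_list AS
            SELECT * FROM df
        """)
```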
@@ -1,13 +1,13 @@
 from dagster import (
     sensor,
+    SensorEvaluationContext,
     RunRequest,
     SkipReason,
-    SensorEvaluationContext,
     DefaultSensorStatus,
+    AssetKey,
 )
 from dagster_duckdb import DuckDBResource


 @sensor(
     name="gtfs_rt_vehicles_sensor",
     minimum_interval_seconds=60,
@@ -22,6 +22,17 @@ def gtfs_rt_vehicles_sensor(
     Sensor that triggers gtfs_rt_vehicles_downloads every 60 seconds.
     Fetches feed metadata once and passes it to each partition run.
     """
+    # Check if upstream asset has been materialized at least once
+    upstream_asset_key = AssetKey("gtfs_rt_vehicles_partitions")
+    latest_materialization = context.instance.get_latest_materialization_event(upstream_asset_key)
+
+    if latest_materialization is None:
+        return SkipReason(
+            "Waiting for upstream asset 'gtfs_rt_vehicles_partitions' to be materialized. "
+            "Run the upstream assets first."
+        )
+
+    try:
         with duckdb.get_connection() as conn:
             # Get all active feeds with their metadata in one query
             feeds = conn.execute("""
@@ -59,3 +70,8 @@ def gtfs_rt_vehicles_sensor(
         context.log.info(f"Triggering downloads for {len(run_requests)} GTFS-RT vehicle feeds")

         return run_requests
+
+    except Exception as e:
+        # Handle case where table doesn't exist yet or other DB errors
+        context.log.warning(f"Database query failed: {e}")
+        return SkipReason(f"Database not ready or query failed: {e}")
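Assembled, the fresh-start guard added to `gtfs_rt_vehicles_sensor` works roughly as sketched below. The feeds query and the `RunRequest` fan-out are simplified placeholders, and the `asset_selection` and `default_status` arguments are assumptions (the unchanged parts of the sensor are not in this diff).

```python
from dagster import (
    AssetKey,
    DefaultSensorStatus,
    RunRequest,
    SensorEvaluationContext,
    SkipReason,
    sensor,
)
from dagster_duckdb import DuckDBResource


@sensor(
    name="gtfs_rt_vehicles_sensor",
    minimum_interval_seconds=60,
    asset_selection=["gtfs_rt_vehicles_downloads"],  # assumption, mirroring the static sensor below
    default_status=DefaultSensorStatus.RUNNING,      # assumption; not visible in this diff
)
def gtfs_rt_vehicles_sensor(context: SensorEvaluationContext, duckdb: DuckDBResource):
    # Fresh start: until the partitions asset has been materialized once,
    # there is nothing to fan out over, so skip instead of failing.
    upstream_asset_key = AssetKey("gtfs_rt_vehicles_partitions")
    if context.instance.get_latest_materialization_event(upstream_asset_key) is None:
        return SkipReason(
            "Waiting for upstream asset 'gtfs_rt_vehicles_partitions' to be materialized. "
            "Run the upstream assets first."
        )

    try:
        with duckdb.get_connection() as conn:
            # Placeholder query; the real sensor pulls feed metadata in one query.
            feeds = conn.execute("SELECT feed_id FROM agency_list").fetchall()
        return [RunRequest(partition_key=str(feed_id)) for (feed_id,) in feeds]
    except Exception as e:
        # A fresh database may not have the tables yet; skip rather than error.
        context.log.warning(f"Database query failed: {e}")
        return SkipReason(f"Database not ready or query failed: {e}")
```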
@@ -6,6 +6,7 @@ from dagster import (
     DefaultSensorStatus,
     SensorResult,
     EventLogEntry,
+    AssetKey,
 )
 from dagster_duckdb import DuckDBResource
@@ -24,6 +25,18 @@ def gtfs_static_hourly_sensor(
     Sensor that triggers gtfs_feed_downloads every 60 minutes.
     Fetches feed metadata once and passes it to each partition run.
     """
+    # Check if upstream asset has been materialized at least once
+    # Update this asset name if your upstream asset has a different name
+    upstream_asset_key = AssetKey("gtfs_feed_partitions")
+    latest_materialization = context.instance.get_latest_materialization_event(upstream_asset_key)
+
+    if latest_materialization is None:
+        return SkipReason(
+            "Waiting for upstream asset 'gtfs_feed_partitions' to be materialized. "
+            "Run the upstream assets first."
+        )
+
+    try:
         with duckdb.get_connection() as conn:
             # Get all active feeds with their metadata in one query
             feeds = conn.execute("""
@@ -62,6 +75,12 @@ def gtfs_static_hourly_sensor(

         return run_requests
+
+    except Exception as e:
+        # Handle case where table doesn't exist yet or other DB errors
+        context.log.warning(f"Database query failed: {e}")
+        return SkipReason(f"Database not ready or query failed: {e}")


 @sensor(
     name="gtfs_static_partition_update_sensor",
     asset_selection=["gtfs_feed_downloads"],
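A quick way to confirm the fresh-start behaviour this commit is after: on an empty Dagster instance both sensors should return a `SkipReason` instead of raising. The sketch below assumes the sensors can be invoked directly with a test instance and resources; the import path and database location are placeholders, not the repo's actual layout.

```python
from dagster import SkipReason, build_sensor_context, instance_for_test
from dagster_duckdb import DuckDBResource

# Hypothetical import path; adjust to wherever the sensor module lives in this repo.
from transit_scraper.sensors import gtfs_rt_vehicles_sensor


def test_sensor_skips_on_fresh_instance():
    # instance_for_test() starts with no materialization events, which is
    # exactly the fresh-start case this commit handles.
    with instance_for_test() as instance:
        context = build_sensor_context(
            instance=instance,
            resources={"duckdb": DuckDBResource(database="data/warehouse.duckdb")},
        )
        result = gtfs_rt_vehicles_sensor(context)
        assert isinstance(result, SkipReason)
```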