edited sensors to account for a fresh start, edited README

This commit is contained in:
Ben Varick 2025-12-07 09:27:41 -07:00
parent b2571a8a48
commit e068cbed20
Signed by: ben
SSH key fingerprint: SHA256:jWnpFDAcacYM5aPFpYRqlsamlDyKNpSj3jj+k4ojtUo
4 changed files with 124 additions and 78 deletions

View file

@ -6,6 +6,7 @@ from dagster import (
DefaultSensorStatus,
SensorResult,
EventLogEntry,
AssetKey,
)
from dagster_duckdb import DuckDBResource
@ -24,43 +25,61 @@ def gtfs_static_hourly_sensor(
Sensor that triggers gtfs_feed_downloads every 60 minutes.
Fetches feed metadata once and passes it to each partition run.
"""
with duckdb.get_connection() as conn:
# Get every feed that has a non-empty producer_url, with its metadata, in one query
feeds = conn.execute("""
SELECT feed_id, provider, producer_url
FROM gtfs_feed_metadata
WHERE producer_url IS NOT NULL AND producer_url != ''
ORDER BY feed_id
""").fetchall()
# Check if upstream asset has been materialized at least once
# Update this asset name if your upstream asset has a different name
upstream_asset_key = AssetKey("gtfs_feed_partitions")
latest_materialization = context.instance.get_latest_materialization_event(upstream_asset_key)
if not feeds:
return SkipReason("No GTFS feeds configured")
if latest_materialization is None:
return SkipReason(
"Waiting for upstream asset 'gtfs_feed_partitions' to be materialized. "
"Run the upstream assets first."
)
# Create a RunRequest for each partition with metadata
run_requests = [
RunRequest(
partition_key=feed_id,
run_config={
"ops": {
"gtfs_feed_downloads": {
"config": {
"provider": provider,
"producer_url": producer_url,
try:
with duckdb.get_connection() as conn:
# Get every feed that has a non-empty producer_url, with its metadata, in one query
feeds = conn.execute("""
SELECT feed_id, provider, producer_url
FROM gtfs_feed_metadata
WHERE producer_url IS NOT NULL AND producer_url != ''
ORDER BY feed_id
""").fetchall()
if not feeds:
return SkipReason("No GTFS feeds configured")
# Create a RunRequest for each partition with metadata
run_requests = [
RunRequest(
partition_key=feed_id,
run_config={
"ops": {
"gtfs_feed_downloads": {
"config": {
"provider": provider,
"producer_url": producer_url,
}
}
}
},
tags={
"feed_id": feed_id,
"sensor": "gtfs_static_sensor"
}
},
tags={
"feed_id": feed_id,
"sensor": "gtfs_static_sensor"
}
)
for feed_id, provider, producer_url in feeds
]
)
for feed_id, provider, producer_url in feeds
]
context.log.info(f"Triggering downloads for {len(run_requests)} GTFS feeds")
context.log.info(f"Triggering downloads for {len(run_requests)} GTFS feeds")
return run_requests
except Exception as e:
# Handle case where table doesn't exist yet or other DB errors
context.log.warning(f"Database query failed: {e}")
return SkipReason(f"Database not ready or query failed: {e}")
return run_requests
@sensor(
name="gtfs_static_partition_update_sensor",