edited sensors to account for a fresh start, edited README

parent b2571a8a48
commit e068cbed20

4 changed files with 124 additions and 78 deletions

@@ -1,4 +1,5 @@
 import pandas as pd
+from pathlib import Path
 from dagster import (
     asset,
 )

@@ -11,6 +12,10 @@ from dagster_duckdb import DuckDBResource
 def agency_list(duckdb: DuckDBResource) -> None:
     """Load agency list from CSV into DuckDB."""

+    # Ensure the database directory exists
+    db_path = Path(duckdb.database)
+    db_path.parent.mkdir(parents=True, exist_ok=True)
+
     # Read the CSV (path is relative to container working directory)
     df = pd.read_csv('config/agency_list.csv')

@@ -20,4 +25,3 @@ def agency_list(duckdb: DuckDBResource) -> None:
         CREATE OR REPLACE TABLE agency_list AS
         SELECT * FROM df
     """)
-

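For orientation, here is a minimal sketch of the whole asset as it plausibly reads after this commit. Only the hunks above appear in the diff; the @asset decorator form and the connection block are assumptions filled in for completeness. Creating the parent directory up front is what lets the very first materialization on an empty volume succeed.

import pandas as pd
from pathlib import Path
from dagster import asset
from dagster_duckdb import DuckDBResource

@asset
def agency_list(duckdb: DuckDBResource) -> None:
    """Load agency list from CSV into DuckDB."""

    # Ensure the database directory exists so DuckDB can create the file on a fresh start
    db_path = Path(duckdb.database)
    db_path.parent.mkdir(parents=True, exist_ok=True)

    # Read the CSV (path is relative to container working directory)
    df = pd.read_csv('config/agency_list.csv')

    with duckdb.get_connection() as conn:
        # DuckDB resolves `df` from the enclosing Python scope (replacement scan)
        conn.execute("""
            CREATE OR REPLACE TABLE agency_list AS
            SELECT * FROM df
        """)
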
@@ -1,13 +1,13 @@
 from dagster import (
     sensor,
+    SensorEvaluationContext,
     RunRequest,
     SkipReason,
-    SensorEvaluationContext,
     DefaultSensorStatus,
-    AssetKey,
 )
 from dagster_duckdb import DuckDBResource


 @sensor(
     name="gtfs_rt_vehicles_sensor",
+    minimum_interval_seconds=60,

@@ -22,40 +22,56 @@ def gtfs_rt_vehicles_sensor(
     Sensor that triggers gtfs_rt_vehicles_downloads every 60 seconds.
     Fetches feed metadata once and passes it to each partition run.
     """
-    with duckdb.get_connection() as conn:
-        # Get all active feeds with their metadata in one query
-        feeds = conn.execute("""
-            SELECT feed_id, provider, producer_url
-            FROM gtfs_rt_vehicles_metadata
-            WHERE producer_url IS NOT NULL AND producer_url != ''
-            ORDER BY feed_id
-        """).fetchall()
-    # Check if upstream asset has been materialized at least once
-    upstream_asset_key = AssetKey("gtfs_rt_vehicles_partitions")
-    latest_materialization = context.instance.get_latest_materialization_event(upstream_asset_key)
-
-    if not feeds:
-        return SkipReason("No GTFS-RT vehicle feeds configured")
-    if latest_materialization is None:
-        return SkipReason(
-            "Waiting for upstream asset 'gtfs_rt_vehicles_partitions' to be materialized. "
-            "Run the upstream assets first."
-        )
-
-    # Create a RunRequest for each partition with metadata
-    run_requests = [
-        RunRequest(
-            partition_key=feed_id,
-            run_config={
-                "ops": {
-                    "gtfs_rt_vehicles_downloads": {
-                        "config": {
-                            "provider": provider,
-                            "producer_url": producer_url,
-                        }
-                    }
-                }
-            },
-            tags={
-                "feed_id": feed_id,
-                "sensor": "gtfs_rt_vehicles_sensor"
-            }
-        )
-        for feed_id, provider, producer_url in feeds
-    ]
-
-    context.log.info(f"Triggering downloads for {len(run_requests)} GTFS-RT vehicle feeds")
-
-    return run_requests
+    try:
+        with duckdb.get_connection() as conn:
+            # Get all active feeds with their metadata in one query
+            feeds = conn.execute("""
+                SELECT feed_id, provider, producer_url
+                FROM gtfs_rt_vehicles_metadata
+                WHERE producer_url IS NOT NULL AND producer_url != ''
+                ORDER BY feed_id
+            """).fetchall()
+
+        if not feeds:
+            return SkipReason("No GTFS-RT vehicle feeds configured")
+
+        # Create a RunRequest for each partition with metadata
+        run_requests = [
+            RunRequest(
+                partition_key=feed_id,
+                run_config={
+                    "ops": {
+                        "gtfs_rt_vehicles_downloads": {
+                            "config": {
+                                "provider": provider,
+                                "producer_url": producer_url,
+                            }
+                        }
+                    }
+                },
+                tags={
+                    "feed_id": feed_id,
+                    "sensor": "gtfs_rt_vehicles_sensor"
+                }
+            )
+            for feed_id, provider, producer_url in feeds
+        ]
+
+        context.log.info(f"Triggering downloads for {len(run_requests)} GTFS-RT vehicle feeds")
+
+        return run_requests
+
+    except Exception as e:
+        # Handle case where table doesn't exist yet or other DB errors
+        context.log.warning(f"Database query failed: {e}")
+        return SkipReason(f"Database not ready or query failed: {e}")

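The fresh-start motivation in a nutshell: on a brand-new deployment the DuckDB file has no gtfs_rt_vehicles_metadata table yet, so the sensor's first query raises an error and, without the new try/except, the tick fails instead of skipping. A minimal standalone reproduction (an in-memory DuckDB stands in for a fresh database file; the table name is taken from the diff):

import duckdb

conn = duckdb.connect()  # in-memory DB plays the role of a fresh, empty DuckDB file
try:
    conn.execute("SELECT feed_id FROM gtfs_rt_vehicles_metadata").fetchall()
except duckdb.CatalogException as e:
    # The sensor's broad `except Exception` turns this into a SkipReason,
    # so Dagster records a skipped tick rather than a failed one.
    print(f"Database not ready or query failed: {e}")

Catching the broad Exception, as the commit does, also covers lock and I/O errors, at the cost of masking unrelated bugs behind a skipped tick.
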
@@ -6,6 +6,7 @@ from dagster import (
     DefaultSensorStatus,
     SensorResult,
+    EventLogEntry,
     AssetKey,
 )
 from dagster_duckdb import DuckDBResource


@@ -24,43 +25,61 @@ def gtfs_static_hourly_sensor(
     Sensor that triggers gtfs_feed_downloads every 60 minutes.
     Fetches feed metadata once and passes it to each partition run.
     """
-    with duckdb.get_connection() as conn:
-        # Get all active feeds with their metadata in one query
-        feeds = conn.execute("""
-            SELECT feed_id, provider, producer_url
-            FROM gtfs_feed_metadata
-            WHERE producer_url IS NOT NULL AND producer_url != ''
-            ORDER BY feed_id
-        """).fetchall()
-    # Check if upstream asset has been materialized at least once
-    # Update this asset name if your upstream asset has a different name
-    upstream_asset_key = AssetKey("gtfs_feed_partitions")
-    latest_materialization = context.instance.get_latest_materialization_event(upstream_asset_key)
-
-    if not feeds:
-        return SkipReason("No GTFS feeds configured")
-    if latest_materialization is None:
-        return SkipReason(
-            "Waiting for upstream asset 'gtfs_feed_partitions' to be materialized. "
-            "Run the upstream assets first."
-        )
-
-    # Create a RunRequest for each partition with metadata
-    run_requests = [
-        RunRequest(
-            partition_key=feed_id,
-            run_config={
-                "ops": {
-                    "gtfs_feed_downloads": {
-                        "config": {
-                            "provider": provider,
-                            "producer_url": producer_url,
-                        }
-                    }
-                }
-            },
-            tags={
-                "feed_id": feed_id,
-                "sensor": "gtfs_static_sensor"
-            }
-        )
-        for feed_id, provider, producer_url in feeds
-    ]
-
-    context.log.info(f"Triggering downloads for {len(run_requests)} GTFS feeds")
-
-    return run_requests
+    try:
+        with duckdb.get_connection() as conn:
+            # Get all active feeds with their metadata in one query
+            feeds = conn.execute("""
+                SELECT feed_id, provider, producer_url
+                FROM gtfs_feed_metadata
+                WHERE producer_url IS NOT NULL AND producer_url != ''
+                ORDER BY feed_id
+            """).fetchall()
+
+        if not feeds:
+            return SkipReason("No GTFS feeds configured")
+
+        # Create a RunRequest for each partition with metadata
+        run_requests = [
+            RunRequest(
+                partition_key=feed_id,
+                run_config={
+                    "ops": {
+                        "gtfs_feed_downloads": {
+                            "config": {
+                                "provider": provider,
+                                "producer_url": producer_url,
+                            }
+                        }
+                    }
+                },
+                tags={
+                    "feed_id": feed_id,
+                    "sensor": "gtfs_static_sensor"
+                }
+            )
+            for feed_id, provider, producer_url in feeds
+        ]
+
+        context.log.info(f"Triggering downloads for {len(run_requests)} GTFS feeds")
+
+        return run_requests
+
+    except Exception as e:
+        # Handle case where table doesn't exist yet or other DB errors
+        context.log.warning(f"Database query failed: {e}")
+        return SkipReason(f"Database not ready or query failed: {e}")


 @sensor(
     name="gtfs_static_partition_update_sensor",

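On the receiving end, the run_config shape these RunRequests build ("ops" → asset name → "config") would be consumed roughly as sketched below. The FeedConfig class name and the asset signature are assumptions inferred from the keys "provider" and "producer_url" in the diff, not code from this commit.

from dagster import Config, asset

class FeedConfig(Config):  # hypothetical name; fields inferred from the sensor's run_config
    provider: str
    producer_url: str

@asset
def gtfs_feed_downloads(config: FeedConfig) -> None:
    # run_config={"ops": {"gtfs_feed_downloads": {"config": {...}}}} arrives here
    # as config.provider and config.producer_url for the partitioned run.
    ...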