added gtfs_feed_metadata
This commit is contained in:
parent
ed220222a7
commit
8fd1406c32
8 changed files with 239 additions and 7 deletions
|
|
@ -8,7 +8,8 @@ RUN pip install \
|
||||||
dagster-postgres \
|
dagster-postgres \
|
||||||
dagster-docker \
|
dagster-docker \
|
||||||
dagster-duckdb \
|
dagster-duckdb \
|
||||||
pandas
|
pandas \
|
||||||
|
requests
|
||||||
|
|
||||||
WORKDIR /opt/dagster/app
|
WORKDIR /opt/dagster/app
|
||||||
COPY user_code/gtfs /opt/dagster/app
|
COPY user_code/gtfs /opt/dagster/app
|
||||||
|
|
@ -17,4 +18,4 @@ COPY user_code/gtfs /opt/dagster/app
|
||||||
|
|
||||||
EXPOSE 4000
|
EXPOSE 4000
|
||||||
|
|
||||||
CMD ["dagster", "api", "grpc", "-h", "0.0.0.0", "-p", "4000", "-f", "definitions.py"]
|
CMD ["dagster", "api", "grpc", "-h", "0.0.0.0", "-p", "4000", "-f", "definitions.py"]
|
||||||
|
|
|
||||||
|
|
@ -21,6 +21,7 @@ run_launcher:
|
||||||
- DAGSTER_POSTGRES_USER
|
- DAGSTER_POSTGRES_USER
|
||||||
- DAGSTER_POSTGRES_PASSWORD
|
- DAGSTER_POSTGRES_PASSWORD
|
||||||
- DAGSTER_POSTGRES_DB
|
- DAGSTER_POSTGRES_DB
|
||||||
|
- MOBILITY_DB_REFRESH_TOKEN
|
||||||
network: dagster
|
network: dagster
|
||||||
container_kwargs:
|
container_kwargs:
|
||||||
volumes: # Make docker client accessible to any launched containers as well
|
volumes: # Make docker client accessible to any launched containers as well
|
||||||
|
|
@ -28,6 +29,11 @@ run_launcher:
|
||||||
- /tmp/io_manager_storage:/tmp/io_manager_storage
|
- /tmp/io_manager_storage:/tmp/io_manager_storage
|
||||||
- /home/ben/code/gtfs-dagster/data:/opt/dagster/app/data
|
- /home/ben/code/gtfs-dagster/data:/opt/dagster/app/data
|
||||||
|
|
||||||
|
auto_materialize:
|
||||||
|
enabled: true
|
||||||
|
run_tags:
|
||||||
|
source: auto-materialize
|
||||||
|
|
||||||
run_storage:
|
run_storage:
|
||||||
module: dagster_postgres.run_storage
|
module: dagster_postgres.run_storage
|
||||||
class: PostgresRunStorage
|
class: PostgresRunStorage
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,8 @@ services:
|
||||||
POSTGRES_USER: ${POSTGRES_USER}
|
POSTGRES_USER: ${POSTGRES_USER}
|
||||||
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
|
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
|
||||||
POSTGRES_DB: ${POSTGRES_DB}
|
POSTGRES_DB: ${POSTGRES_DB}
|
||||||
|
env_file:
|
||||||
|
- .env
|
||||||
volumes:
|
volumes:
|
||||||
- ./postgres_data:/var/lib/postgresql/data
|
- ./postgres_data:/var/lib/postgresql/data
|
||||||
networks:
|
networks:
|
||||||
|
|
@ -39,6 +41,9 @@ services:
|
||||||
DAGSTER_POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
|
DAGSTER_POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
|
||||||
DAGSTER_POSTGRES_DB: ${POSTGRES_DB}
|
DAGSTER_POSTGRES_DB: ${POSTGRES_DB}
|
||||||
DAGSTER_CURRENT_IMAGE: 'dagster_user_code_gtfs'
|
DAGSTER_CURRENT_IMAGE: 'dagster_user_code_gtfs'
|
||||||
|
MOBILITY_DB_REFRESH_TOKEN: ${MOBILITY_DB_REFRESH_TOKEN}
|
||||||
|
env_file:
|
||||||
|
- .env
|
||||||
volumes:
|
volumes:
|
||||||
- ./data:/opt/dagster/app/data
|
- ./data:/opt/dagster/app/data
|
||||||
networks:
|
networks:
|
||||||
|
|
@ -66,6 +71,8 @@ services:
|
||||||
DAGSTER_POSTGRES_USER: ${POSTGRES_USER}
|
DAGSTER_POSTGRES_USER: ${POSTGRES_USER}
|
||||||
DAGSTER_POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
|
DAGSTER_POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
|
||||||
DAGSTER_POSTGRES_DB: ${POSTGRES_DB}
|
DAGSTER_POSTGRES_DB: ${POSTGRES_DB}
|
||||||
|
env_file:
|
||||||
|
- .env
|
||||||
volumes: # Make docker client accessible so we can terminate containers from the webserver
|
volumes: # Make docker client accessible so we can terminate containers from the webserver
|
||||||
- /var/run/docker.sock:/var/run/docker.sock
|
- /var/run/docker.sock:/var/run/docker.sock
|
||||||
- /tmp/io_manager_storage:/tmp/io_manager_storage
|
- /tmp/io_manager_storage:/tmp/io_manager_storage
|
||||||
|
|
@ -92,6 +99,8 @@ services:
|
||||||
DAGSTER_POSTGRES_USER: ${POSTGRES_USER}
|
DAGSTER_POSTGRES_USER: ${POSTGRES_USER}
|
||||||
DAGSTER_POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
|
DAGSTER_POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
|
||||||
DAGSTER_POSTGRES_DB: ${POSTGRES_DB}
|
DAGSTER_POSTGRES_DB: ${POSTGRES_DB}
|
||||||
|
env_file:
|
||||||
|
- .env
|
||||||
volumes: # Make docker client accessible so we can launch containers using host docker
|
volumes: # Make docker client accessible so we can launch containers using host docker
|
||||||
- /var/run/docker.sock:/var/run/docker.sock
|
- /var/run/docker.sock:/var/run/docker.sock
|
||||||
- /tmp/io_manager_storage:/tmp/io_manager_storage
|
- /tmp/io_manager_storage:/tmp/io_manager_storage
|
||||||
|
|
|
||||||
2
user_code/__init__.py
Normal file
2
user_code/__init__.py
Normal file
|
|
@ -0,0 +1,2 @@
|
||||||
|
# user_code/__init__.py
|
||||||
|
# This file makes user_code a package
|
||||||
|
|
@ -1,2 +1,2 @@
|
||||||
# user_code/gtfs/__init__.py
|
# user_code/gtfs/__init__.py
|
||||||
from .assets import *
|
from . import assets, resources
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,14 @@
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from dagster import asset
|
from dagster import (
|
||||||
|
asset,
|
||||||
|
AssetExecutionContext,
|
||||||
|
Output,
|
||||||
|
MetadataValue,
|
||||||
|
AutoMaterializePolicy
|
||||||
|
)
|
||||||
from dagster_duckdb import DuckDBResource
|
from dagster_duckdb import DuckDBResource
|
||||||
|
from resources import MobilityDatabaseAPI # Direct import instead of relative
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
@asset
|
@asset
|
||||||
|
|
@ -16,3 +24,116 @@ def agency_list(duckdb: DuckDBResource) -> None:
|
||||||
CREATE OR REPLACE TABLE agency_list AS
|
CREATE OR REPLACE TABLE agency_list AS
|
||||||
SELECT * FROM df
|
SELECT * FROM df
|
||||||
""")
|
""")
|
||||||
|
|
||||||
|
|
||||||
|
@asset(
|
||||||
|
deps=["agency_list"],
|
||||||
|
group_name="gtfs_metadata",
|
||||||
|
auto_materialize_policy=AutoMaterializePolicy.eager()
|
||||||
|
)
|
||||||
|
def gtfs_feed_metadata(
|
||||||
|
context: AssetExecutionContext,
|
||||||
|
duckdb: DuckDBResource,
|
||||||
|
mobility_db: MobilityDatabaseAPI
|
||||||
|
) -> Output[None]:
|
||||||
|
"""
|
||||||
|
Fetch GTFS feed metadata from Mobility Database API for all agencies
|
||||||
|
and store in DuckDB.
|
||||||
|
"""
|
||||||
|
|
||||||
|
with duckdb.get_connection() as conn:
|
||||||
|
# Create the metadata table if it doesn't exist
|
||||||
|
conn.execute("""
|
||||||
|
CREATE TABLE IF NOT EXISTS gtfs_feed_metadata (
|
||||||
|
feed_id VARCHAR PRIMARY KEY,
|
||||||
|
provider VARCHAR,
|
||||||
|
status VARCHAR,
|
||||||
|
official BOOLEAN,
|
||||||
|
producer_url VARCHAR,
|
||||||
|
authentication_type INTEGER,
|
||||||
|
authentication_info_url VARCHAR,
|
||||||
|
api_key_parameter_name VARCHAR,
|
||||||
|
license_url VARCHAR,
|
||||||
|
feed_contact_email VARCHAR,
|
||||||
|
raw_json JSON,
|
||||||
|
fetched_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
||||||
|
)
|
||||||
|
""")
|
||||||
|
|
||||||
|
# Get all GTFS feed IDs from agency_list
|
||||||
|
feed_ids = conn.execute("""
|
||||||
|
SELECT DISTINCT GTFS as feed_id
|
||||||
|
FROM agency_list
|
||||||
|
WHERE GTFS IS NOT NULL AND GTFS != ''
|
||||||
|
""").fetchall()
|
||||||
|
|
||||||
|
context.log.info(f"Found {len(feed_ids)} feeds to fetch")
|
||||||
|
|
||||||
|
successful = 0
|
||||||
|
failed = 0
|
||||||
|
|
||||||
|
for (feed_id,) in feed_ids:
|
||||||
|
try:
|
||||||
|
feed_info = mobility_db.get_feed_info(feed_id)
|
||||||
|
|
||||||
|
# Extract relevant fields
|
||||||
|
source_info = feed_info.get("source_info", {})
|
||||||
|
|
||||||
|
# Insert or update the record
|
||||||
|
conn.execute("""
|
||||||
|
INSERT OR REPLACE INTO gtfs_feed_metadata (
|
||||||
|
feed_id,
|
||||||
|
provider,
|
||||||
|
status,
|
||||||
|
official,
|
||||||
|
producer_url,
|
||||||
|
authentication_type,
|
||||||
|
authentication_info_url,
|
||||||
|
api_key_parameter_name,
|
||||||
|
license_url,
|
||||||
|
feed_contact_email,
|
||||||
|
raw_json
|
||||||
|
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||||
|
""", [
|
||||||
|
feed_id,
|
||||||
|
feed_info.get("provider"),
|
||||||
|
feed_info.get("status"),
|
||||||
|
feed_info.get("official"),
|
||||||
|
source_info.get("producer_url"),
|
||||||
|
source_info.get("authentication_type"),
|
||||||
|
source_info.get("authentication_info_url"),
|
||||||
|
source_info.get("api_key_parameter_name"),
|
||||||
|
source_info.get("license_url"),
|
||||||
|
feed_info.get("feed_contact_email"),
|
||||||
|
json.dumps(feed_info)
|
||||||
|
])
|
||||||
|
|
||||||
|
context.log.info(f"✓ Fetched and stored metadata for {feed_id}")
|
||||||
|
successful += 1
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
context.log.error(f"✗ Failed to fetch {feed_id}: {e}")
|
||||||
|
failed += 1
|
||||||
|
|
||||||
|
# Get summary stats
|
||||||
|
total_records = conn.execute(
|
||||||
|
"SELECT COUNT(*) FROM gtfs_feed_metadata"
|
||||||
|
).fetchone()[0]
|
||||||
|
|
||||||
|
# Get preview for metadata
|
||||||
|
preview_df = conn.execute("""
|
||||||
|
SELECT feed_id, provider, status, producer_url
|
||||||
|
FROM gtfs_feed_metadata
|
||||||
|
LIMIT 5
|
||||||
|
""").df()
|
||||||
|
|
||||||
|
return Output(
|
||||||
|
None,
|
||||||
|
metadata={
|
||||||
|
"total_feeds": len(feed_ids),
|
||||||
|
"successful": successful,
|
||||||
|
"failed": failed,
|
||||||
|
"total_records_in_db": total_records,
|
||||||
|
"preview": MetadataValue.md(preview_df.to_markdown(index=False))
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
|
||||||
|
|
@ -1,12 +1,20 @@
|
||||||
from dagster import Definitions
|
from dagster import Definitions, load_assets_from_modules, EnvVar
|
||||||
from dagster_duckdb import DuckDBResource
|
from dagster_duckdb import DuckDBResource
|
||||||
from assets import agency_list
|
|
||||||
|
import assets
|
||||||
|
from resources import MobilityDatabaseAPI
|
||||||
|
|
||||||
|
all_assets = load_assets_from_modules([assets])
|
||||||
|
|
||||||
defs = Definitions(
|
defs = Definitions(
|
||||||
assets=[agency_list],
|
assets=all_assets,
|
||||||
resources={
|
resources={
|
||||||
"duckdb": DuckDBResource(
|
"duckdb": DuckDBResource(
|
||||||
database="data/gtfs/gtfs.duckdb"
|
database="data/gtfs/gtfs.duckdb"
|
||||||
|
),
|
||||||
|
"mobility_db": MobilityDatabaseAPI(
|
||||||
|
refresh_token=EnvVar("MOBILITY_DB_REFRESH_TOKEN"),
|
||||||
|
rate_limit_delay=0.5
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
|
||||||
85
user_code/gtfs/resources.py
Normal file
85
user_code/gtfs/resources.py
Normal file
|
|
@ -0,0 +1,85 @@
|
||||||
|
from dagster import ConfigurableResource
|
||||||
|
import requests
|
||||||
|
from typing import Optional
|
||||||
|
from time import sleep
|
||||||
|
import logging
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class MobilityDatabaseAPI(ConfigurableResource):
|
||||||
|
"""Resource for interacting with the Mobility Database API with OAuth2 token management."""
|
||||||
|
|
||||||
|
base_url: str = "https://api.mobilitydatabase.org"
|
||||||
|
refresh_token: str # Long-lived refresh token
|
||||||
|
rate_limit_delay: float = 0.5 # Seconds between requests
|
||||||
|
|
||||||
|
# These will be set at runtime, not in config
|
||||||
|
_access_token: Optional[str] = None
|
||||||
|
_token_expires_at: Optional[datetime] = None
|
||||||
|
|
||||||
|
def _get_access_token(self) -> str:
|
||||||
|
"""
|
||||||
|
Get a valid access token, refreshing if necessary.
|
||||||
|
Access tokens are valid for 1 hour.
|
||||||
|
"""
|
||||||
|
# If we have a token and it's not expired (with 5 min buffer), use it
|
||||||
|
if self._access_token and self._token_expires_at:
|
||||||
|
if datetime.now() < self._token_expires_at - timedelta(minutes=5):
|
||||||
|
return self._access_token
|
||||||
|
|
||||||
|
# Need to get a new token
|
||||||
|
logger.info("Fetching new access token from Mobility Database API")
|
||||||
|
|
||||||
|
url = f"{self.base_url}/v1/tokens"
|
||||||
|
headers = {"Content-Type": "application/json"}
|
||||||
|
data = {"refresh_token": self.refresh_token}
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = requests.post(url, headers=headers, json=data, timeout=30)
|
||||||
|
response.raise_for_status()
|
||||||
|
|
||||||
|
token_data = response.json()
|
||||||
|
self._access_token = token_data.get("access_token")
|
||||||
|
|
||||||
|
# Tokens are valid for 1 hour
|
||||||
|
self._token_expires_at = datetime.now() + timedelta(hours=1)
|
||||||
|
|
||||||
|
logger.info("Successfully obtained new access token")
|
||||||
|
return self._access_token
|
||||||
|
|
||||||
|
except requests.exceptions.RequestException as e:
|
||||||
|
logger.error(f"Error fetching access token: {e}")
|
||||||
|
raise RuntimeError(f"Failed to obtain access token: {e}")
|
||||||
|
|
||||||
|
def get_feed_info(self, feed_id: str) -> dict:
|
||||||
|
"""
|
||||||
|
Fetch feed information from the Mobility Database API.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
feed_id: The MDB feed ID (e.g., 'mdb-394')
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dictionary containing feed information
|
||||||
|
"""
|
||||||
|
access_token = self._get_access_token()
|
||||||
|
|
||||||
|
url = f"{self.base_url}/v1/feeds/{feed_id}"
|
||||||
|
headers = {
|
||||||
|
"Authorization": f"Bearer {access_token}"
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info(f"Fetching feed info for {feed_id}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = requests.get(url, headers=headers, timeout=30)
|
||||||
|
response.raise_for_status()
|
||||||
|
|
||||||
|
# Rate limiting
|
||||||
|
sleep(self.rate_limit_delay)
|
||||||
|
|
||||||
|
return response.json()
|
||||||
|
except requests.exceptions.RequestException as e:
|
||||||
|
logger.error(f"Error fetching feed {feed_id}: {e}")
|
||||||
|
raise
|
||||||
Loading…
Add table
Add a link
Reference in a new issue