added AQI

This commit is contained in:
Ben Varick 2025-11-18 21:20:25 -07:00
parent 647b961a4f
commit f79c8d5a90
Signed by: ben
SSH key fingerprint: SHA256:jWnpFDAcacYM5aPFpYRqlsamlDyKNpSj3jj+k4ojtUo
4 changed files with 202 additions and 32 deletions

2
.gitignore vendored
View file

@ -6,3 +6,5 @@
#exclude API keys #exclude API keys
api_keys/* api_keys/*
#exclude data
data/*

View file

@ -1,8 +1,8 @@
City,Type,State,Country City,Type,State,Country,CBSA-Code
Madison,city,WI,United States Madison,city,WI,United States,31540
Bellingham,city,WA,United States Bellingham,city,WA,United States,13380
Portland,city,OR,United States Portland,city,OR,United States,38900
Port Angeles,city,WA,United States Port Angeles,city,WA,United States,38820
Boston,city,MA,United States Boston,city,MA,United States,14460
Boise City,city,ID,United States Boise City,city,ID,United States,14260
Salt Lake City,city,UT,United States Salt Lake City,city,UT,United States,41620

1 City Type State Country CBSA-Code
2 Madison city WI United States 31540
3 Bellingham city WA United States 13380
4 Portland city OR United States 38900
5 Port Angeles city WA United States 38820
6 Boston city MA United States 14460
7 Boise City city ID United States 14260
8 Salt Lake City city UT United States 41620

View file

@ -11,3 +11,5 @@ Encoding: UTF-8
RnwWeave: Sweave RnwWeave: Sweave
LaTeX: pdfLaTeX LaTeX: pdfLaTeX
BuildType: Makefile

View file

@ -36,7 +36,7 @@ census_api_key(key = substr(read_file(file = "api_keys/census_api_key"), 1, 40))
## Date ranges ## Date ranges
```{r date_range, eval = TRUE, echo = TRUE, results = "show", warning = FALSE, error = TRUE, message = FALSE} ```{r date_range, eval = TRUE, echo = TRUE, results = "show", warning = FALSE, error = TRUE, message = FALSE}
date_start <- "2010-01-01" date_start <- "2014-01-01"
date_end <- "2024-12-31" date_end <- "2024-12-31"
``` ```
@ -44,42 +44,42 @@ date_end <- "2024-12-31"
```{r cities, eval = TRUE, echo = TRUE, results = "show", warning = FALSE, error = TRUE, message = FALSE} ```{r cities, eval = TRUE, echo = TRUE, results = "show", warning = FALSE, error = TRUE, message = FALSE}
cities <- read_csv(file = "cities.csv") cities <- read_csv(file = "cities.csv")
cities <- cities %>% cities <- cities |>
mutate(city_name = paste0(City, " ", Type)) mutate(city_name = paste0(City, " ", Type))
``` ```
# Get data # Data
## Census data ## Census data
```{r census, eval = TRUE, echo = TRUE, results = "show", warning = FALSE, error = TRUE, message = FALSE} ```{r census, eval = TRUE, echo = TRUE, results = "show", warning = FALSE, error = TRUE, message = FALSE}
populations <- list(NULL) populations <- list(NULL)
for(city in cities$city_name){ for(city in cities$city_name){
state <- cities %>% filter(city_name == city) %>% pull(State) state <- cities |> filter(city_name == city) |> pull(State)
populations[[city]] <- get_acs( populations[[city]] <- get_acs(
geography = "place", geography = "place",
variables = "B01003_001", variables = "B01003_001",
state = state, state = state,
year = 2023, year = 2023,
geometry = TRUE geometry = TRUE
) %>% ) |>
filter(str_detect(NAME, city)) filter(str_detect(NAME, city))
} }
populations <- bind_rows(populations) populations <- bind_rows(populations)
city_center <- populations %>% city_center <- populations |>
st_centroid() %>% st_centroid() |>
st_transform(4326) %>% # Convert to WGS84 (standard lat/lon) st_transform(4326) %>%
mutate( mutate(
lon = st_coordinates(.)[,1], lon = st_coordinates(.)[,1],
lat = st_coordinates(.)[,2] lat = st_coordinates(.)[,2]
) %>% ) |>
st_drop_geometry() %>% st_drop_geometry() |>
select(lat, lon) select(lat, lon)
cities <- bind_cols(cities, populations, city_center) cities <- bind_cols(cities, populations, city_center)
cities <- cities %>% mutate( cities <- cities |> mutate(
density = estimate/as.double(st_area(geometry))*1000000 density = estimate/as.double(st_area(geometry))*1000000
) )
@ -120,12 +120,12 @@ ggplot(data = cities) +
```{r weather, eval = TRUE, echo = TRUE, results = "show", warning = FALSE, error = TRUE, message = FALSE} ```{r weather, eval = TRUE, echo = TRUE, results = "show", warning = FALSE, error = TRUE, message = FALSE}
weather <- list(NULL) weather <- list(NULL)
for(city in cities$City){ for(city in cities$City){
city_info <- cities %>% filter(City == city) city_info <- cities |> filter(City == city)
city_run <- weather_history( city_run <- weather_history(
location = c(city_info %>% pull(lat), city_info %>% pull(lon)), location = c(city_info |> pull(lat), city_info |> pull(lon)),
start = date_start, start = date_start,
end = date_end, end = date_end,
daily = c("apparent_temperature_max", "apparent_temperature_min"), daily = c("apparent_temperature_max", "apparent_temperature_min", "precipitation_hours"),
response_units = list(temperature_unit = "fahrenheit") response_units = list(temperature_unit = "fahrenheit")
) )
city_run$city <- city city_run$city <- city
@ -134,23 +134,27 @@ for(city in cities$City){
weather <- bind_rows(weather) weather <- bind_rows(weather)
weather_summary <- weather %>% weather |>
mutate(year = year(ymd(date)), mutate(year = year(ymd(date)),
month = month(ymd(date))) %>% month = month(ymd(date))) |>
group_by(year, city) %>% group_by(year, city) |>
summarise(days_above_80 = sum(daily_apparent_temperature_max > 80)) %>% summarise(days_above_80 = sum(daily_apparent_temperature_max > 80)) |>
group_by(city) %>% group_by(city) |>
summarise(median_days_above_80 = median(days_above_80)) summarise(median_days_above_80 = median(days_above_80)) |>
ggplot() +
ggplot(data = weather_summary) +
geom_col(aes(x = city, geom_col(aes(x = city,
y = median_days_above_80)) + y = median_days_above_80)) +
labs(title = "Days above 80°F", labs(title = "Days above 80°F (apparent temperature)",
y = "Median days per year", y = "Median days per year",
x = "City", x = "City",
fill = NULL) fill = NULL)
ggplot(data = weather %>% pivot_longer(cols = starts_with("daily"), names_to = "max_min", values_to = "temperature") %>% filter(max_min %in% c("daily_apparent_temperature_min", "daily_apparent_temperature_max"))) + weather |>
pivot_longer(cols = starts_with("daily"),
names_to = "max_min",
values_to = "temperature") |>
filter(max_min %in% c("daily_apparent_temperature_min", "daily_apparent_temperature_max")) |>
ggplot() +
geom_violin(aes(x = city, geom_violin(aes(x = city,
y = temperature, y = temperature,
fill = max_min)) + fill = max_min)) +
@ -160,5 +164,167 @@ ggplot(data = weather %>% pivot_longer(cols = starts_with("daily"), names_to = "
y = "°F", y = "°F",
x = "City", x = "City",
fill = NULL) fill = NULL)
# Rain: median number of days per year with more than 4 hours of
# precipitation, by city.
# Fix: na.rm = TRUE in sum() — a single missing daily_precipitation_hours
# value would otherwise turn the whole year's count (and the median) into
# NA.  Matches the na.rm handling used in the AQI summaries.
# Also dropped the unused `month` column (this summary is per year only).
weather |>
  mutate(year = year(ymd(date))) |>
  group_by(year, city) |>
  summarise(days_above_4hr = sum(daily_precipitation_hours > 4, na.rm = TRUE)) |>
  group_by(city) |>
  summarise(median_days_above_4hr = median(days_above_4hr)) |>
  ggplot() +
  geom_col(aes(x = city,
               y = median_days_above_4hr)) +
  labs(title = "Days with more than 4 hrs of rain",
       y = "Median days per year",
       x = "City",
       fill = NULL)
# Rain seasonality: median number of days per month with more than
# 4 hours of precipitation, by city.
# Fix: na.rm = TRUE in sum() so a missing daily_precipitation_hours
# value cannot NA-out a whole month (consistent with the AQI summaries).
weather |>
  mutate(year = year(ymd(date)),
         month = month(ymd(date))) |>
  group_by(year, month, city) |>
  summarise(days_above_4hr = sum(daily_precipitation_hours > 4, na.rm = TRUE)) |>
  group_by(city, month) |>
  summarise(median_days_above_4hr = median(days_above_4hr)) |>
  ggplot() +
  geom_line(aes(x = month,
                y = median_days_above_4hr,
                color = city)) +
  labs(title = "Days with more than 4 hrs of rain",
       y = "Median days per month",
       x = "Month",
       color = "City")
``` ```
## Air Quality
```{r air_quality, eval = TRUE, echo = TRUE, results = "show", warning = FALSE, error = TRUE, message = FALSE}
# Make sure the local data directory exists before downloading.
if (!dir.exists("data")) {
  dir.create("data")
}
# Years covered by the analysis, derived from the global date range.
years <- year(ymd(date_start)):year(ymd(date_end))
# One data frame per downloaded year accumulates here; combined with
# bind_rows() after the download loop.
aqi_list <- list()
# Download, extract, and read the EPA daily AQI-by-CBSA file for each year.
#
# Bug fix: the original called `next` inside the tryCatch() error
# handlers.  `next` evaluated inside a function (the handler) is itself
# an error in R ("no loop for break/next, jumping to top level"), so a
# failed download would have aborted the chunk instead of skipping that
# year.  The file.exists() guards below already make the follow-up steps
# safe, so the handlers just report and fall through.
# Loop variable renamed to `yr` to avoid shadowing lubridate::year,
# which is called later in this chunk.
for (yr in years) {
  # EPA AirData bulk file for this year.
  url <- paste0("https://aqs.epa.gov/aqsweb/airdata/daily_aqi_by_cbsa_", yr, ".zip")
  # Local file paths.
  zip_file <- file.path("data", paste0("daily_aqi_by_cbsa_", yr, ".zip"))
  csv_file <- file.path("data", paste0("daily_aqi_by_cbsa_", yr, ".csv"))
  # Download zip if it doesn't exist yet.
  if (file.exists(zip_file)) {
    cat(paste0("Zip file for year ", yr, " already exists. Skipping download.\n"))
  } else {
    cat(paste0("Downloading data for year ", yr, "...\n"))
    tryCatch({
      download.file(url, zip_file, mode = "wb")
      cat(paste0("Successfully downloaded data for year ", yr, "\n"))
    }, error = function(e) {
      cat(paste0("Error downloading data for year ", yr, ": ", e$message, "\n"))
      # download.file() can leave a partial file behind on failure;
      # remove it so a rerun retries instead of unzipping junk.
      if (file.exists(zip_file)) {
        file.remove(zip_file)
      }
    })
  }
  # Extract the CSV if the zip exists and it hasn't been extracted yet.
  if (file.exists(zip_file)) {
    if (file.exists(csv_file)) {
      cat(paste0("CSV for year ", yr, " already extracted. Skipping extraction.\n"))
    } else {
      cat(paste0("Extracting zip for year ", yr, "...\n"))
      tryCatch({
        unzip(zip_file, exdir = "data")
        cat(paste0("Successfully extracted data for year ", yr, "\n"))
      }, error = function(e) {
        cat(paste0("Error extracting data for year ", yr, ": ", e$message, "\n"))
      })
    }
  }
  # Read the CSV if it exists (download and extraction both succeeded).
  if (file.exists(csv_file)) {
    cat(paste0("Reading CSV for year ", yr, "...\n"))
    tryCatch({
      aqi_year <- read.csv(csv_file)
      aqi_list[[as.character(yr)]] <- aqi_year
      cat(paste0("Successfully read ", nrow(aqi_year), " rows for year ", yr, "\n"))
    }, error = function(e) {
      cat(paste0("Error reading CSV for year ", yr, ": ", e$message, "\n"))
    })
  }
}
# Stack the per-year data frames, keep only the CBSAs for our cities,
# and derive year/month columns for the summaries below.
aqi_data <- bind_rows(aqi_list) |>
  filter(CBSA.Code %in% pull(cities, `CBSA-Code`)) |>
  mutate(year = year(ymd(Date)),
         month = month(ymd(Date)))
# AQI threshold (50 = top of the EPA "Good" band).
# NOTE: spelling ("threshhold") kept as-is because later statements in
# this chunk reference the variable by this name.
aqi_threshhold <- 50
# The EPA files are *daily* AQI values (daily_aqi_by_cbsa_*), so summing
# rows above the threshold counts days, not hours — labels fixed
# accordingly, and the title now tracks the threshold variable instead
# of hard-coding 50.
aqi_data |>
  group_by(year, CBSA) |>
  summarise(days_above = sum(AQI >= aqi_threshhold, na.rm = TRUE)) |>
  group_by(CBSA) |>
  summarise(mean_days_above = mean(days_above, na.rm = TRUE)) |>
  ggplot() +
  geom_col(aes(x = CBSA,
               y = mean_days_above)) +
  labs(title = paste0("Days above ", aqi_threshhold, " AQI"),
       y = "Mean days per year",
       x = "City",
       fill = NULL)
# Same summary at twice the threshold (AQI >= 100, "Unhealthy for
# Sensitive Groups" and worse).  The underlying data are daily AQI
# values, so these counts are days — the original labels said "hours".
aqi_data |>
  group_by(year, CBSA) |>
  summarise(days_above = sum(AQI >= 2 * aqi_threshhold, na.rm = TRUE)) |>
  group_by(CBSA) |>
  summarise(mean_days_above = mean(days_above, na.rm = TRUE)) |>
  ggplot() +
  geom_col(aes(x = CBSA,
               y = mean_days_above)) +
  labs(title = paste0("Days above ", aqi_threshhold * 2, " AQI"),
       y = "Mean days per year",
       x = "City",
       fill = NULL)
# Distribution of daily AQI per city, clipped to the 0-500 AQI scale.
ggplot(data = aqi_data) +
  geom_violin(aes(x = CBSA, y = AQI)) +
  labs(title = "AQI",
       y = "AQI",
       x = "City",
       fill = NULL) +
  coord_cartesian(ylim = c(0, 500))
# Seasonality: mean number of days per month at or above the AQI
# threshold, by city.  Daily data, so the title now says days — the
# y-axis label already did; the original title said "Hours".
aqi_data |>
  group_by(year, month, CBSA) |>
  summarise(days_above = sum(AQI >= aqi_threshhold, na.rm = TRUE)) |>
  group_by(CBSA, month) |>
  summarise(mean_days_above = mean(days_above, na.rm = TRUE)) |>
  ggplot() +
  geom_line(aes(x = month,
                y = mean_days_above,
                color = CBSA)) +
  scale_x_continuous(breaks = seq(1, 12, 1)) +
  scale_color_brewer(type = "qual") +
  labs(title = paste0("Days with an AQI of greater than or equal to ", aqi_threshhold),
       y = "Mean days per month",
       x = "Month",
       color = "City")
```