--- title: "City Compare" output: html_document: toc: true toc_depth: 5 toc_float: collapsed: false smooth_scroll: true editor_options: chunk_output_type: console --- # Input Data & Configuration ## Libraries ```{r libs, eval = TRUE, echo = FALSE, results = "show", warning = FALSE, error = TRUE, message = FALSE} date() rm(list=ls()) library(tidyverse) library(tidycensus) library(sf) library(openmeteo) library(maps) library(scales) ``` ## API keys ```{r api_keys, eval = TRUE, echo = FALSE, results = "show", warning = FALSE, error = TRUE, message = FALSE} # load census api key census_api_key(key = substr(read_file(file = "api_keys/census_api_key"), 1, 40)) ``` ## Date ranges ```{r date_range, eval = TRUE, echo = FALSE, results = "show", warning = FALSE, error = TRUE, message = FALSE} date_start <- "2014-01-01" date_end <- "2024-12-31" ``` ## Cities to compare ```{r cities, eval = TRUE, echo = FALSE, results = "show", warning = FALSE, error = TRUE, message = FALSE} cities <- read_csv(file = "cities.csv") cities <- cities |> mutate(city_name = paste0(City, " ", Type)) ``` # Data ## Census data ```{r census, eval = TRUE, echo = FALSE, results = "hide", warning = FALSE, error = TRUE, message = FALSE} all_cbsas <- get_acs( geography = "cbsa", variables = "B01003_001", year = 2023, geometry = TRUE ) populations <- list(NULL) for(i in seq_len(nrow(cities))){ city <- cities$City[i] state <- cities$State[i] populations[[city]] <- all_cbsas |> filter( str_detect(NAME, city) & str_detect(NAME, paste0(", .*", state)) # State appears after the comma ) } populations <- bind_rows(populations) city_center <- populations |> st_centroid() |> st_transform(4326) |> (\(x) mutate(x, lon = st_coordinates(x)[,1], lat = st_coordinates(x)[,2] ))() |> st_drop_geometry() |> select(lat, lon) cities <- bind_cols(cities, populations, city_center) cities <- cities |> mutate( density = estimate/as.double(st_area(geometry))*1000000 ) ``` ```{r plotPop, eval = TRUE, echo = FALSE, results = "show", warning = FALSE, error = TRUE, message = FALSE} ggplot(cities) + geom_col(aes(x = City, y = estimate, fill = City)) + labs(title = "City Population - Metro Areas", x = NULL, y = NULL) + scale_y_continuous(label = unit_format(unit = "K", scale = 1e-3, sep = ""), expand = expansion(mult = c(0,0.05))) + scale_fill_brewer(type = "qual", guide = NULL) ggplot(cities) + geom_col(aes(x = City, y = density, fill = City)) + labs(title = "City Density - Metro Areas", x = NULL, y = "people/km^2") + scale_y_continuous(expand = expansion(mult = c(0,0.05))) + scale_fill_brewer(type = "qual", guide = NULL) ``` ## Map cities ```{r cities_map, eval = TRUE, echo = FALSE, results = "show", warning = FALSE, error = TRUE, message = FALSE} ggplot(data = cities) + geom_polygon(data = map_data(map = "state"), aes(long, lat, group = group), fill = "white", colour = "grey50") + geom_point(aes(x = lon, y = lat, fill = City), shape = 21, color = "black", size = 4) + scale_fill_brewer(type = "qual") ``` ## weather ```{r weather, eval = TRUE, echo = FALSE, results = "show", warning = FALSE, error = TRUE, message = FALSE} weather <- list(NULL) for(city in cities$City){ city_info <- cities |> filter(City == city) city_run <- weather_history( location = c(city_info |> pull(lat), city_info |> pull(lon)), start = date_start, end = date_end, daily = c("apparent_temperature_max", "apparent_temperature_min", "precipitation_hours"), response_units = list(temperature_unit = "fahrenheit") ) city_run$city <- city weather[[city]] <- city_run } weather <- bind_rows(weather) weather |> mutate(year = year(ymd(date)), month = month(ymd(date))) |> group_by(year, city) |> summarise(days_above_80 = sum(daily_apparent_temperature_max > 80)) |> group_by(city) |> summarise(median_days_above_80 = median(days_above_80)) |> ggplot() + geom_col(aes(x = city, y = median_days_above_80, fill = city)) + scale_fill_brewer(type = "qual", guide = NULL) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) + labs(title = "Days above 80°F (apparent temperature)", y = "Median days per year", x = NULL, fill = NULL) weather |> pivot_longer(cols = starts_with("daily"), names_to = "max_min", values_to = "temperature") |> filter(max_min %in% c("daily_apparent_temperature_min", "daily_apparent_temperature_max")) |> ggplot() + geom_violin(aes(x = city, y = temperature, fill = max_min)) + scale_fill_manual(labels = c("daily max", "daily min"), values = c("firebrick", "dodgerblue")) + labs(title = "Apparent Temperature", y = "°F", x = NULL, fill = NULL) weather |> mutate(year = year(ymd(date)), month = month(ymd(date))) |> group_by(year, city) |> summarise(days_above_4hr = sum(daily_precipitation_hours > 4)) |> group_by(city) |> summarise(median_days_above_4hr = median(days_above_4hr)) |> ggplot() + geom_col(aes(x = city, y = median_days_above_4hr, fill = city)) + scale_fill_brewer(type = "qual", guide = NULL) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) + labs(title = "Days with more than 4 hrs of rain", y = "Median days per year", x = NULL, fill = NULL) weather |> mutate(year = year(ymd(date)), month = month(ymd(date))) |> group_by(year, month, city) |> summarise(days_above_4hr = sum(daily_precipitation_hours > 4)) |> group_by(city, month) |> summarise(median_days_above_4hr = median(days_above_4hr)) |> ggplot() + scale_x_continuous(breaks = seq(1,12,1), expand = expansion(mult = c(0,0))) + scale_y_continuous(expand = expansion(mult = c(0,0))) + geom_line(aes(x = month, y = median_days_above_4hr, color = city), size = 2) + scale_color_brewer(type = "qual") + labs(title = "Days with more than 4 hrs of rain", y = "Median days per month", x = "Month", color = "City") ``` ## Air Quality ```{r airQualityData, eval = TRUE, echo = FALSE, results = "hide", warning = FALSE, error = TRUE, message = FALSE} # download data # Create data directory if it doesn't exist if (!dir.exists("data")) { dir.create("data") } # Define years years <- seq(year(ymd(date_start)), year(ymd(date_end)), 1) # Initialize empty list to store dataframes aqi_list <- list() # Download and process files for each year for (year in years) { # Construct URL url <- paste0("https://aqs.epa.gov/aqsweb/airdata/daily_aqi_by_cbsa_", year, ".zip") # Define local file paths zip_file <- file.path("data", paste0("daily_aqi_by_cbsa_", year, ".zip")) csv_file <- file.path("data", paste0("daily_aqi_by_cbsa_", year, ".csv")) # Download zip if it doesn't exist if (file.exists(zip_file)) { cat(paste0("Zip file for year ", year, " already exists. Skipping download.\n")) } else { cat(paste0("Downloading data for year ", year, "...\n")) tryCatch({ download.file(url, zip_file, mode = "wb") cat(paste0("Successfully downloaded data for year ", year, "\n")) }, error = function(e) { cat(paste0("Error downloading data for year ", year, ": ", e$message, "\n")) next }) } # Extract zip if CSV doesn't exist if (file.exists(zip_file)) { if (file.exists(csv_file)) { cat(paste0("CSV for year ", year, " already extracted. Skipping extraction.\n")) } else { cat(paste0("Extracting zip for year ", year, "...\n")) tryCatch({ unzip(zip_file, exdir = "data") cat(paste0("Successfully extracted data for year ", year, "\n")) }, error = function(e) { cat(paste0("Error extracting data for year ", year, ": ", e$message, "\n")) next }) } } # Read CSV if it exists if (file.exists(csv_file)) { cat(paste0("Reading CSV for year ", year, "...\n")) tryCatch({ aqi_year <- read.csv(csv_file) aqi_list[[as.character(year)]] <- aqi_year cat(paste0("Successfully read ", nrow(aqi_year), " rows for year ", year, "\n")) }, error = function(e) { cat(paste0("Error reading CSV for year ", year, ": ", e$message, "\n")) }) } } # Combine all dataframes aqi_data <- bind_rows(aqi_list) aqi_data <- aqi_data |> filter(CBSA.Code %in% (cities |> pull(`CBSA-Code`))) |> mutate(year = year(ymd(Date)), month = month(ymd(Date))) |> left_join(cities, by = join_by("CBSA.Code" == "CBSA-Code")) ``` ```{r airQualityPlots, eval = TRUE, echo = FALSE, results = "show", warning = FALSE, error = TRUE, message = FALSE} aqi_threshhold <- 50 aqi_data |> group_by(year, City) |> summarise(hours_above = sum(AQI >= aqi_threshhold, na.rm = TRUE)) |> group_by(City) |> summarise(mean_hours_above = mean(hours_above, na.rm = TRUE)) |> ggplot() + geom_col(aes(x = City, y = mean_hours_above, fill = City)) + scale_fill_brewer(type = "qual", guide = NULL) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) + labs(title = "Hours above 50 AQI", y = "Mean hours per year", x = NULL, fill = NULL) aqi_data |> group_by(year, City) |> summarise(hours_above = sum(AQI >= 2*aqi_threshhold, na.rm = TRUE)) |> group_by(City) |> summarise(mean_hours_above = mean(hours_above, na.rm = TRUE)) |> ggplot() + geom_col(aes(x = City, y = mean_hours_above, fill = City)) + scale_fill_brewer(type = "qual", guide = NULL) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) + labs(title = paste0("Hours above ", aqi_threshhold * 2," AQI"), y = "Mean hours per year", x = NULL, fill = NULL) aqi_data |> ggplot() + geom_violin(aes(x = City, y = AQI, fill = City)) + scale_fill_brewer(type = "qual", guide = NULL) + coord_cartesian(ylim = c(0, 150)) + labs(title = "AQI", y = "AQI", x = NULL, fill = NULL) aqi_data |> group_by(year, month, City) |> summarise(hours_above = sum(AQI >= aqi_threshhold, na.rm = TRUE)) |> group_by(City, month) |> summarise(mean_hours_above = mean(hours_above, na.rm = TRUE)) |> ggplot() + geom_line(aes(x = month, y = mean_hours_above, color = City)) + scale_x_continuous(breaks = seq(1,12,1), expand = expansion(mult = c(0,0))) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) + scale_color_brewer(type = "qual") + labs(title = paste0("Hours with an AQI of greater than or equal to ", aqi_threshhold), y = "Mean days per month", x = "Month", color = "City") ``` ## Transportation ```{r transportation, eval = TRUE, echo = FALSE, results = "show", warning = FALSE, error = TRUE, message = FALSE} # Define commute mode variables commute_vars <- c( commute_total = "B08301_001", drove_alone = "B08301_003", carpooled = "B08301_004", transit = "B08301_010", bicycle = "B08301_018", walked = "B08301_019", work_from_home = "B08301_021" ) # Define vehicle ownership variables vehicle_vars <- c( households_total = "B08201_001", vehicles_0 = "B08201_002", vehicles_1 = "B08201_003", vehicles_2 = "B08201_004", vehicles_3 = "B08201_005", vehicles_4plus = "B08201_006", vehicles_aggregate = "B25046_001" ) # Fetch commute mode data commute_data <- list() for(i in seq_len(nrow(cities))){ city <- cities$city_name[i] state <- cities$State[i] get_acs( geography = "place", variables = commute_vars, state = state, year = 2023 ) |> filter(str_detect(NAME, city)) |> select(NAME, variable, estimate) |> pivot_wider(names_from = variable, values_from = estimate) |> mutate( city = city, pct_drove_alone = drove_alone / commute_total * 100, pct_carpooled = carpooled / commute_total * 100, pct_transit = transit / commute_total * 100, pct_bicycle = bicycle / commute_total * 100, pct_walked = walked / commute_total * 100, pct_work_from_home = work_from_home / commute_total * 100 ) -> commute_data[[city]] } commute_data <- bind_rows(commute_data) # Fetch vehicle ownership data vehicle_data <- list() for(i in seq_len(nrow(cities))){ city <- cities$city_name[i] state <- cities$State[i] get_acs( geography = "place", variables = vehicle_vars, state = state, year = 2023 ) |> filter(str_detect(NAME, city)) |> select(NAME, variable, estimate) |> pivot_wider(names_from = variable, values_from = estimate) |> mutate( city = city, avg_vehicles_per_hh = vehicles_aggregate / households_total, pct_no_vehicle = vehicles_0 / households_total * 100, pct_1_vehicle = vehicles_1 / households_total * 100, pct_2_vehicle = vehicles_2 / households_total * 100, pct_3plus_vehicle = (vehicles_3 + vehicles_4plus) / households_total * 100 ) -> vehicle_data[[city]] } vehicle_data <- bind_rows(vehicle_data) ``` ```{r transportationPlots, eval = TRUE, echo = FALSE, results = "show", warning = FALSE, error = TRUE, message = FALSE} vehicle_data |> ggplot() + geom_col(aes(x = str_remove(city, " city"), y = pct_no_vehicle / 100, fill = city)) + scale_y_continuous(expand = expansion(mult = c(0,0.05)), labels = percent_format()) + scale_fill_brewer(type = "qual", guide = NULL) + labs(title = "Percent of households without a car", y = NULL, x = NULL) vehicle_data |> ggplot() + geom_col(aes(x = str_remove(city, " city"), y = avg_vehicles_per_hh, fill = city)) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) + scale_fill_brewer(type = "qual", guide = NULL) + labs(title = "Average Vehicle per Household", x = NULL, y = NULL) commute_data |> pivot_longer(cols = c("pct_walked","pct_bicycle","pct_transit"), names_to = "transit_type", values_to = "pct") |> ggplot() + geom_col(aes(x = str_remove(city, " city"), y = pct / 100, fill = transit_type)) + scale_y_continuous(expand = expansion(mult = c(0,0.05)), labels = percent_format()) + labs(title = "Commute mode share - non car", y = NULL, x = NULL, fill = "Commute mode") ```