city-compare/city_compare.Rmd

---
title: "City Compare"
output:
  html_document:
    toc: true
    toc_depth: 5
    toc_float:
      collapsed: false
      smooth_scroll: true
editor_options:
  chunk_output_type: console
---

# Input Data & Configuration

## Libraries

```{r libs, eval = TRUE, echo = FALSE, results = "show", warning = FALSE, error = TRUE, message = FALSE}
date()
rm(list=ls())
library(tidyverse)
library(tidycensus)
library(sf)
library(openmeteo)
library(maps)
library(scales)

```

## API keys
```{r api_keys, eval = TRUE, echo = FALSE, results = "show", warning = FALSE, error = TRUE, message = FALSE}
# load census api key
census_api_key(key = substr(read_file(file = "api_keys/census_api_key"), 1, 40))

```

## Date ranges
```{r date_range, eval = TRUE, echo = FALSE, results = "show", warning = FALSE, error = TRUE, message = FALSE}
date_start <- "2014-01-01"
date_end <- "2024-12-31"

```
## Cities to compare

```{r cities, eval = TRUE, echo = FALSE, results = "show", warning = FALSE, error = TRUE, message = FALSE}
cities <- read_csv(file = "cities.csv")
cities <- cities |>
  mutate(city_name = paste0(City, " ", Type))

```

# Data

## Census data
```{r census, eval = TRUE, echo = FALSE, results = "hide", warning = FALSE, error = TRUE, message = FALSE}
all_cbsas <- get_acs(
  geography = "cbsa",
  variables = "B01003_001",
  year = 2023,
  geometry = TRUE
)
populations <- list(NULL)
for(i in seq_len(nrow(cities))){
  city <- cities$City[i]
  state <- cities$State[i]

  populations[[city]] <- all_cbsas |>
    filter(
      str_detect(NAME, city) &
      str_detect(NAME, paste0(", .*", state))  # State appears after the comma
    )
}
populations <- bind_rows(populations)

city_center <- populations |>
  st_centroid() |>
  st_transform(4326) |>
  (\(x) mutate(x,
    lon = st_coordinates(x)[,1],
    lat = st_coordinates(x)[,2]
  ))() |>
  st_drop_geometry() |>
  select(lat, lon)

cities <- bind_cols(cities, populations, city_center)
cities <- cities |> mutate(
  density = estimate/as.double(st_area(geometry))*1000000
)
```

```{r plotPop, eval = TRUE, echo = FALSE, results = "show", warning = FALSE, error = TRUE, message = FALSE}
ggplot(cities) +
  geom_col(aes(x = City,
               y = estimate,
               fill = City)) +
  labs(title = "City Population - Metro Areas",
       x = NULL,
       y = NULL) +
  scale_y_continuous(label = unit_format(unit = "K", scale = 1e-3, sep = ""), expand = expansion(mult = c(0,0.05))) +
  scale_fill_brewer(type = "qual", guide = NULL)

ggplot(cities) +
  geom_col(aes(x = City,
               y = density,
               fill = City)) +
  labs(title = "City Density - Metro Areas",
       x = NULL,
       y = "people/km^2") +
  scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
  scale_fill_brewer(type = "qual", guide = NULL)
```

## Map cities
```{r cities_map, eval = TRUE, echo = FALSE, results = "show", warning = FALSE, error = TRUE, message = FALSE}
ggplot(data = cities) +
  geom_polygon(data = map_data(map = "state"),
               aes(long, lat, group = group),
               fill = "white", colour = "grey50") +
  geom_point(aes(x = lon,
                 y = lat,
                 fill = City),
             shape = 21,
             color = "black",
             size = 4) +
  scale_fill_brewer(type = "qual")


```

## weather
```{r weather, eval = TRUE, echo = FALSE, results = "show", warning = FALSE, error = TRUE, message = FALSE}
weather <- list(NULL)
for(city in cities$City){
  city_info <- cities |> filter(City == city)
  city_run <- weather_history(
    location = c(city_info |> pull(lat), city_info |> pull(lon)),
    start = date_start,
    end = date_end,
    daily = c("apparent_temperature_max", "apparent_temperature_min", "precipitation_hours"),
    response_units = list(temperature_unit = "fahrenheit")
  )
  city_run$city <- city
  weather[[city]] <- city_run
}

weather <- bind_rows(weather)

weather |>
  mutate(year = year(ymd(date)),
         month = month(ymd(date))) |>
  group_by(year, city) |>
  summarise(days_above_80 = sum(daily_apparent_temperature_max > 80)) |>
  group_by(city) |>
  summarise(median_days_above_80 = median(days_above_80)) |>
  ggplot() +
  geom_col(aes(x = city,
               y = median_days_above_80,
               fill = city)) +
  scale_fill_brewer(type = "qual", guide = NULL) +
  scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
  labs(title = "Days above 80°F (apparent temperature)",
       y = "Median days per year",
       x = NULL,
       fill = NULL)

weather |>
  pivot_longer(cols = starts_with("daily"),
               names_to = "max_min",
               values_to = "temperature") |>
  filter(max_min %in% c("daily_apparent_temperature_min", "daily_apparent_temperature_max")) |>
ggplot() +
  geom_violin(aes(x = city,
                  y = temperature,
                  fill = max_min)) +
  scale_fill_manual(labels = c("daily max", "daily min"),
                    values = c("firebrick", "dodgerblue")) +
  labs(title = "Apparent Temperature",
       y = "°F",
       x = NULL,
       fill = NULL)

weather |>
  mutate(year = year(ymd(date)),
         month = month(ymd(date))) |>
  group_by(year, city) |>
  summarise(days_above_4hr = sum(daily_precipitation_hours > 4)) |>
  group_by(city) |>
  summarise(median_days_above_4hr = median(days_above_4hr)) |>
  ggplot() +
  geom_col(aes(x = city,
               y = median_days_above_4hr,
               fill = city)) +
  scale_fill_brewer(type = "qual", guide = NULL) +
  scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
  labs(title = "Days with more than 4 hrs of rain",
       y = "Median days per year",
       x = NULL,
       fill = NULL)

weather |>
  mutate(year = year(ymd(date)),
         month = month(ymd(date))) |>
  group_by(year, month, city) |>
  summarise(days_above_4hr = sum(daily_precipitation_hours > 4)) |>
  group_by(city, month) |>
  summarise(median_days_above_4hr = median(days_above_4hr)) |>
  ggplot() +
  scale_x_continuous(breaks = seq(1,12,1), expand = expansion(mult = c(0,0))) +
  scale_y_continuous(expand = expansion(mult = c(0,0))) +
  geom_line(aes(x = month,
               y = median_days_above_4hr,
               color = city),
            size = 2) +
  scale_color_brewer(type = "qual") +
  labs(title = "Days with more than 4 hrs of rain",
       y = "Median days per month",
       x = "Month",
       color = "City")

```


## Air Quality
```{r airQualityData, eval = TRUE, echo = FALSE, results = "hide", warning = FALSE, error = TRUE, message = FALSE}
# download data
# Create data directory if it doesn't exist
if (!dir.exists("data")) {
  dir.create("data")
}

# Define years
years <- seq(year(ymd(date_start)), year(ymd(date_end)), 1)

# Initialize empty list to store dataframes
aqi_list <- list()

# Download and process files for each year
for (year in years) {
  # Construct URL
  url <- paste0("https://aqs.epa.gov/aqsweb/airdata/daily_aqi_by_cbsa_", year, ".zip")

  # Define local file paths
  zip_file <- file.path("data", paste0("daily_aqi_by_cbsa_", year, ".zip"))
  csv_file <- file.path("data", paste0("daily_aqi_by_cbsa_", year, ".csv"))

  # Download zip if it doesn't exist
  if (file.exists(zip_file)) {
    cat(paste0("Zip file for year ", year, " already exists. Skipping download.\n"))
  } else {
    cat(paste0("Downloading data for year ", year, "...\n"))

    tryCatch({
      download.file(url, zip_file, mode = "wb")
      cat(paste0("Successfully downloaded data for year ", year, "\n"))
    }, error = function(e) {
      cat(paste0("Error downloading data for year ", year, ": ", e$message, "\n"))
      next
    })
  }

  # Extract zip if CSV doesn't exist
  if (file.exists(zip_file)) {
    if (file.exists(csv_file)) {
      cat(paste0("CSV for year ", year, " already extracted. Skipping extraction.\n"))
    } else {
      cat(paste0("Extracting zip for year ", year, "...\n"))
      tryCatch({
        unzip(zip_file, exdir = "data")
        cat(paste0("Successfully extracted data for year ", year, "\n"))
      }, error = function(e) {
        cat(paste0("Error extracting data for year ", year, ": ", e$message, "\n"))
        next
      })
    }
  }

  # Read CSV if it exists
  if (file.exists(csv_file)) {
    cat(paste0("Reading CSV for year ", year, "...\n"))
    tryCatch({
      aqi_year <- read.csv(csv_file)
      aqi_list[[as.character(year)]] <- aqi_year
      cat(paste0("Successfully read ", nrow(aqi_year), " rows for year ", year, "\n"))
    }, error = function(e) {
      cat(paste0("Error reading CSV for year ", year, ": ", e$message, "\n"))
    })
  }
}

# Combine all dataframes
aqi_data <- bind_rows(aqi_list)
aqi_data <- aqi_data |>
  filter(CBSA.Code %in% (cities |> pull(`CBSA-Code`))) |>
  mutate(year = year(ymd(Date)),
         month = month(ymd(Date))) |>
  left_join(cities, by = join_by("CBSA.Code" == "CBSA-Code"))
```

```{r airQualityPlots, eval = TRUE, echo = FALSE, results = "show", warning = FALSE, error = TRUE, message = FALSE}
aqi_threshhold <- 50

aqi_data |>
  group_by(year, City) |>
  summarise(hours_above = sum(AQI >= aqi_threshhold, na.rm = TRUE)) |>
  group_by(City) |>
  summarise(mean_hours_above = mean(hours_above, na.rm = TRUE)) |>
  ggplot() +
  geom_col(aes(x = City,
               y = mean_hours_above,
               fill = City)) +
  scale_fill_brewer(type = "qual", guide = NULL) +
  scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
  labs(title = "Hours above 50 AQI",
       y = "Mean hours per year",
       x = NULL,
       fill = NULL)

aqi_data |>
  group_by(year, City) |>
  summarise(hours_above = sum(AQI >= 2*aqi_threshhold, na.rm = TRUE)) |>
  group_by(City) |>
  summarise(mean_hours_above = mean(hours_above, na.rm = TRUE)) |>
  ggplot() +
  geom_col(aes(x = City,
               y = mean_hours_above,
               fill = City)) +
  scale_fill_brewer(type = "qual", guide = NULL) +
    scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
  labs(title = paste0("Hours above ", aqi_threshhold * 2," AQI"),
       y = "Mean hours per year",
       x = NULL,
       fill = NULL)

aqi_data |>
  ggplot() +
  geom_violin(aes(x = City,
                  y = AQI,
                  fill = City)) +
  scale_fill_brewer(type = "qual", guide = NULL) +
  coord_cartesian(ylim = c(0, 150)) +
  labs(title = "AQI",
       y = "AQI",
       x = NULL,
       fill = NULL)

aqi_data |>
  group_by(year, month, City) |>
  summarise(hours_above = sum(AQI >= aqi_threshhold, na.rm = TRUE)) |>
  group_by(City, month) |>
  summarise(mean_hours_above = mean(hours_above, na.rm = TRUE)) |>
  ggplot() +
  geom_line(aes(x = month,
               y = mean_hours_above,
               color = City)) +
  scale_x_continuous(breaks = seq(1,12,1), expand = expansion(mult = c(0,0))) +
    scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
  scale_color_brewer(type = "qual") +
  labs(title = paste0("Hours with an AQI of greater than or equal to ", aqi_threshhold),
       y = "Mean days per month",
       x = "Month",
       color = "City")

```

## Transportation
```{r transportation, eval = TRUE, echo = FALSE, results = "show", warning = FALSE, error = TRUE, message = FALSE}
# Define commute mode variables
commute_vars <- c(
  commute_total       = "B08301_001",
  drove_alone         = "B08301_003",
  carpooled           = "B08301_004",
  transit             = "B08301_010",
  bicycle             = "B08301_018",
  walked              = "B08301_019",
  work_from_home      = "B08301_021"
)

# Define vehicle ownership variables
vehicle_vars <- c(
  households_total    = "B08201_001",
  vehicles_0          = "B08201_002",
  vehicles_1          = "B08201_003",
  vehicles_2          = "B08201_004",
  vehicles_3          = "B08201_005",
  vehicles_4plus      = "B08201_006",
  vehicles_aggregate  = "B25046_001"
)

# Fetch commute mode data
commute_data <- list()
for(i in seq_len(nrow(cities))){
  city <- cities$city_name[i]
  state <- cities$State[i]

  get_acs(
    geography = "place",
    variables = commute_vars,
    state = state,
    year = 2023
  ) |>
    filter(str_detect(NAME, city)) |>
    select(NAME, variable, estimate) |>
    pivot_wider(names_from = variable, values_from = estimate) |>
    mutate(
      city = city,
      pct_drove_alone    = drove_alone / commute_total * 100,
      pct_carpooled      = carpooled / commute_total * 100,
      pct_transit        = transit / commute_total * 100,
      pct_bicycle        = bicycle / commute_total * 100,
      pct_walked         = walked / commute_total * 100,
      pct_work_from_home = work_from_home / commute_total * 100
    ) -> commute_data[[city]]
}

commute_data <- bind_rows(commute_data)

# Fetch vehicle ownership data
vehicle_data <- list()
for(i in seq_len(nrow(cities))){
  city <- cities$city_name[i]
  state <- cities$State[i]

  get_acs(
    geography = "place",
    variables = vehicle_vars,
    state = state,
    year = 2023
  ) |>
    filter(str_detect(NAME, city)) |>
    select(NAME, variable, estimate) |>
    pivot_wider(names_from = variable, values_from = estimate) |>
    mutate(
      city = city,
      avg_vehicles_per_hh = vehicles_aggregate / households_total,
      pct_no_vehicle      = vehicles_0 / households_total * 100,
      pct_1_vehicle       = vehicles_1 / households_total * 100,
      pct_2_vehicle       = vehicles_2 / households_total * 100,
      pct_3plus_vehicle   = (vehicles_3 + vehicles_4plus) / households_total * 100
    ) -> vehicle_data[[city]]
}

vehicle_data <- bind_rows(vehicle_data)
```


```{r transportationPlots, eval = TRUE, echo = FALSE, results = "show", warning = FALSE, error = TRUE, message = FALSE}
vehicle_data |>
  ggplot() +
  geom_col(aes(x = str_remove(city, " city"),
               y = pct_no_vehicle / 100,
               fill = city)) +
    scale_y_continuous(expand = expansion(mult = c(0,0.05)), labels = percent_format()) +
  scale_fill_brewer(type = "qual", guide = NULL) +
  labs(title = "Percent of households without a car",
       y = NULL,
       x = NULL)

vehicle_data |>
  ggplot() +
  geom_col(aes(x = str_remove(city, " city"),
               y = avg_vehicles_per_hh,
               fill = city)) +
    scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
  scale_fill_brewer(type = "qual", guide = NULL) +
  labs(title = "Average Vehicle per Household",
       x = NULL,
       y = NULL)

commute_data |>
  pivot_longer(cols = c("pct_walked","pct_bicycle","pct_transit"),
               names_to = "transit_type",
               values_to = "pct") |>
  ggplot() +
  geom_col(aes(x = str_remove(city, " city"),
               y = pct / 100,
               fill = transit_type)) +
    scale_y_continuous(expand = expansion(mult = c(0,0.05)), labels = percent_format()) +
  labs(title = "Commute mode share - non car",
       y = NULL,
       x = NULL,
       fill = "Commute mode")

```