diff --git a/.gitignore b/.gitignore index bf41ddf..bf6283c 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ #exclude API keys api_keys/* +#exclude data +data/* diff --git a/cities.csv b/cities.csv index 776e14a..71a8a24 100644 --- a/cities.csv +++ b/cities.csv @@ -1,8 +1,8 @@ -City,Type,State,Country -Madison,city,WI,United States -Bellingham,city,WA,United States -Portland,city,OR,United States -Port Angeles,city,WA,United States -Boston,city,MA,United States -Boise City,city,ID,United States -Salt Lake City,city,UT,United States +City,Type,State,Country,CBSA-Code +Madison,city,WI,United States,31540 +Bellingham,city,WA,United States,13380 +Portland,city,OR,United States,38900 +Port Angeles,city,WA,United States,38820 +Boston,city,MA,United States,14460 +Boise City,city,ID,United States,14260 +Salt Lake City,city,UT,United States,41620 diff --git a/city-compare.Rproj b/city-compare.Rproj index 8e3c2eb..be80382 100644 --- a/city-compare.Rproj +++ b/city-compare.Rproj @@ -11,3 +11,5 @@ Encoding: UTF-8 RnwWeave: Sweave LaTeX: pdfLaTeX + +BuildType: Makefile diff --git a/city_compare.Rmd b/city_compare.Rmd index a42ae6d..02dd6a3 100644 --- a/city_compare.Rmd +++ b/city_compare.Rmd @@ -36,7 +36,7 @@ census_api_key(key = substr(read_file(file = "api_keys/census_api_key"), 1, 40)) ## Date ranges ```{r date_range, eval = TRUE, echo = TRUE, results = "show", warning = FALSE, error = TRUE, message = FALSE} -date_start <- "2010-01-01" +date_start <- "2014-01-01" date_end <- "2024-12-31" ``` @@ -44,42 +44,42 @@ date_end <- "2024-12-31" ```{r cities, eval = TRUE, echo = TRUE, results = "show", warning = FALSE, error = TRUE, message = FALSE} cities <- read_csv(file = "cities.csv") -cities <- cities %>% +cities <- cities |> mutate(city_name = paste0(City, " ", Type)) ``` -# Get data +# Data ## Census data ```{r census, eval = TRUE, echo = TRUE, results = "show", warning = FALSE, error = TRUE, message = FALSE} populations <- list(NULL) for(city in cities$city_name){ - state <- cities %>% filter(city_name == city) %>% pull(State) + state <- cities |> filter(city_name == city) |> pull(State) populations[[city]] <- get_acs( geography = "place", variables = "B01003_001", state = state, year = 2023, geometry = TRUE - ) %>% + ) |> filter(str_detect(NAME, city)) } populations <- bind_rows(populations) -city_center <- populations %>% - st_centroid() %>% - st_transform(4326) %>% # Convert to WGS84 (standard lat/lon) +city_center <- populations |> + st_centroid() |> + st_transform(4326) %>% mutate( lon = st_coordinates(.)[,1], lat = st_coordinates(.)[,2] - ) %>% - st_drop_geometry() %>% + ) |> + st_drop_geometry() |> select(lat, lon) cities <- bind_cols(cities, populations, city_center) -cities <- cities %>% mutate( +cities <- cities |> mutate( density = estimate/as.double(st_area(geometry))*1000000 ) @@ -120,12 +120,12 @@ ggplot(data = cities) + ```{r weather, eval = TRUE, echo = TRUE, results = "show", warning = FALSE, error = TRUE, message = FALSE} weather <- list(NULL) for(city in cities$City){ - city_info <- cities %>% filter(City == city) + city_info <- cities |> filter(City == city) city_run <- weather_history( - location = c(city_info %>% pull(lat), city_info %>% pull(lon)), + location = c(city_info |> pull(lat), city_info |> pull(lon)), start = date_start, end = date_end, - daily = c("apparent_temperature_max", "apparent_temperature_min"), + daily = c("apparent_temperature_max", "apparent_temperature_min", "precipitation_hours"), response_units = list(temperature_unit = "fahrenheit") ) city_run$city <- city @@ -134,23 +134,27 @@ for(city in cities$City){ weather <- bind_rows(weather) -weather_summary <- weather %>% +weather |> mutate(year = year(ymd(date)), - month = month(ymd(date))) %>% - group_by(year, city) %>% - summarise(days_above_80 = sum(daily_apparent_temperature_max > 80)) %>% - group_by(city) %>% - summarise(median_days_above_80 = median(days_above_80)) - -ggplot(data = weather_summary) + + month = month(ymd(date))) |> + group_by(year, city) |> + summarise(days_above_80 = sum(daily_apparent_temperature_max > 80)) |> + group_by(city) |> + summarise(median_days_above_80 = median(days_above_80)) |> + ggplot() + geom_col(aes(x = city, y = median_days_above_80)) + - labs(title = "Days above 80°F", + labs(title = "Days above 80°F (apparent temperature)", y = "Median days per year", x = "City", fill = NULL) -ggplot(data = weather %>% pivot_longer(cols = starts_with("daily"), names_to = "max_min", values_to = "temperature") %>% filter(max_min %in% c("daily_apparent_temperature_min", "daily_apparent_temperature_max"))) + +weather |> + pivot_longer(cols = starts_with("daily"), + names_to = "max_min", + values_to = "temperature") |> + filter(max_min %in% c("daily_apparent_temperature_min", "daily_apparent_temperature_max")) |> +ggplot() + geom_violin(aes(x = city, y = temperature, fill = max_min)) + @@ -160,5 +164,167 @@ ggplot(data = weather %>% pivot_longer(cols = starts_with("daily"), names_to = " y = "°F", x = "City", fill = NULL) + +weather |> + mutate(year = year(ymd(date)), + month = month(ymd(date))) |> + group_by(year, city) |> + summarise(days_above_4hr = sum(daily_precipitation_hours > 4)) |> + group_by(city) |> + summarise(median_days_above_4hr = median(days_above_4hr)) |> + ggplot() + + geom_col(aes(x = city, + y = median_days_above_4hr)) + + labs(title = "Days with more than 4 hrs of rain", + y = "Median days per year", + x = "City", + fill = NULL) + +weather |> + mutate(year = year(ymd(date)), + month = month(ymd(date))) |> + group_by(year, month, city) |> + summarise(days_above_4hr = sum(daily_precipitation_hours > 4)) |> + group_by(city, month) |> + summarise(median_days_above_4hr = median(days_above_4hr)) |> + ggplot() + + geom_line(aes(x = month, + y = median_days_above_4hr, + color = city)) + + labs(title = "Days with more than 4 hrs of rain", + y = "Median days per month", + x = "Month", + color = "City") + ``` + +## Air Quality +```{r air_quality, eval = TRUE, echo = TRUE, results = "show", warning = FALSE, error = TRUE, message = FALSE} +# download data +# Create data directory if it doesn't exist +if (!dir.exists("data")) { + dir.create("data") +} + +# Define years +years <- seq(year(ymd(date_start)), year(ymd(date_end)), 1) + +# Initialize empty list to store dataframes +aqi_list <- list() + +# Download and process files for each year +for (year in years) { + # Construct URL + url <- paste0("https://aqs.epa.gov/aqsweb/airdata/daily_aqi_by_cbsa_", year, ".zip") + + # Define local file paths + zip_file <- file.path("data", paste0("daily_aqi_by_cbsa_", year, ".zip")) + csv_file <- file.path("data", paste0("daily_aqi_by_cbsa_", year, ".csv")) + + # Download zip if it doesn't exist + if (file.exists(zip_file)) { + cat(paste0("Zip file for year ", year, " already exists. Skipping download.\n")) + } else { + cat(paste0("Downloading data for year ", year, "...\n")) + + tryCatch({ + download.file(url, zip_file, mode = "wb") + cat(paste0("Successfully downloaded data for year ", year, "\n")) + }, error = function(e) { + cat(paste0("Error downloading data for year ", year, ": ", e$message, "\n")) + next + }) + } + + # Extract zip if CSV doesn't exist + if (file.exists(zip_file)) { + if (file.exists(csv_file)) { + cat(paste0("CSV for year ", year, " already extracted. Skipping extraction.\n")) + } else { + cat(paste0("Extracting zip for year ", year, "...\n")) + tryCatch({ + unzip(zip_file, exdir = "data") + cat(paste0("Successfully extracted data for year ", year, "\n")) + }, error = function(e) { + cat(paste0("Error extracting data for year ", year, ": ", e$message, "\n")) + next + }) + } + } + + # Read CSV if it exists + if (file.exists(csv_file)) { + cat(paste0("Reading CSV for year ", year, "...\n")) + tryCatch({ + aqi_year <- read.csv(csv_file) + aqi_list[[as.character(year)]] <- aqi_year + cat(paste0("Successfully read ", nrow(aqi_year), " rows for year ", year, "\n")) + }, error = function(e) { + cat(paste0("Error reading CSV for year ", year, ": ", e$message, "\n")) + }) + } +} + +# Combine all dataframes +aqi_data <- bind_rows(aqi_list) +aqi_data <- aqi_data |> + filter(CBSA.Code %in% (cities |> pull(`CBSA-Code`))) |> + mutate(year = year(ymd(Date)), + month = month(ymd(Date))) + +aqi_threshhold <- 50 + +aqi_data |> + group_by(year, CBSA) |> + summarise(hours_above = sum(AQI >= aqi_threshhold, na.rm = TRUE)) |> + group_by(CBSA) |> + summarise(mean_hours_above = mean(hours_above, na.rm = TRUE)) |> + ggplot() + + geom_col(aes(x = CBSA, + y = mean_hours_above)) + + labs(title = "Hours above 50 AQI", + y = "Mean hours per year", + x = "City", + fill = NULL) + +aqi_data |> + group_by(year, CBSA) |> + summarise(hours_above = sum(AQI >= 2*aqi_threshhold, na.rm = TRUE)) |> + group_by(CBSA) |> + summarise(mean_hours_above = mean(hours_above, na.rm = TRUE)) |> + ggplot() + + geom_col(aes(x = CBSA, + y = mean_hours_above)) + + labs(title = paste0("Hours above ", aqi_threshhold * 2," AQI"), + y = "Mean hours per year", + x = "City", + fill = NULL) + +aqi_data |> + ggplot() + + geom_violin(aes(x = CBSA, + y = AQI)) + + labs(title = "AQI", + y = "AQI", + x = "City", + fill = NULL) + + coord_cartesian(ylim = (c(0,500))) + +aqi_data |> + group_by(year, month, CBSA) |> + summarise(hours_above = sum(AQI >= aqi_threshhold, na.rm = TRUE)) |> + group_by(CBSA, month) |> + summarise(mean_hours_above = mean(hours_above, na.rm = TRUE)) |> + ggplot() + + geom_line(aes(x = month, + y = mean_hours_above, + color = CBSA)) + + scale_x_continuous(breaks = seq(1,12,1)) + + scale_color_brewer(type = "qual") + + labs(title = paste0("Hours with an AQI of greater than or equal to ", aqi_threshhold), + y = "Mean days per month", + x = "Month", + color = "City") + +```