
Editable script

Here is a script you can use if you would prefer to have all the code for the 7DADM and site impairment steps in a single script rather than in an installable package. Just click the clipboard icon in the top right corner of this code chunk to copy the code, open a new R script in RStudio, paste the code in, and save the script wherever you would like to store it on your computer.

# load necessary libraries
library(readr)
library(dplyr)
library(lubridate)
# the runner package must also be installed; it is called below as
# runner::mean_run(), so it does not need to be attached with library()

# to handle data from a new water year, just change these file paths
fall_data_loc = "data/2021_fall/4_final_csv/"
sum_data_loc = "data/2022_summer/4_final_csv/"
lookup_loc = "lookup_tables/"
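# for example, for the next water year these might become (illustrative,
# assuming the same directory naming convention):
# fall_data_loc = "data/2022_fall/4_final_csv/"
# sum_data_loc = "data/2023_summer/4_final_csv/"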

# read in site-group lookup table
# - For each site, which WQS group is it in?
site_group = read_csv(paste0(lookup_loc, "site_group.csv")) %>%
  # removes any completely empty rows (in case this data frame is written
  # in Excel in the future and Excel adds extra empty lines)
  filter_all(any_vars(!is.na(.)))

# read in group-WQS lookup table
# - For each group, how does the WQS change over the water year?
group_wqs = read_csv(paste0(lookup_loc, "group_wqs.csv")) %>%
  # removes any completely empty rows (in case this data frame is written
  # in Excel in the future and Excel adds extra empty lines)
  filter_all(any_vars(!is.na(.)))
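
# the joins later in this script assume these column layouts (inferred from
# the code below; adjust if your lookup tables differ):
# - site_group.csv: a `site` column plus a `grp_type` column assigning each
#   site to a WQS group
# - group_wqs.csv: `grp_type`, `start_date`, `end_date` (as "mm/dd" strings),
#   and `wqs` columns giving each group's standard across the water year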

# read in quality-controlled data ----------------------

cat("Reading in QC'd data...", fill = TRUE)
# assumes all data files are csv format, not xls or xlsx
# read in fall deployment files
# fall filename format: "AndersonLower_2022_fall_FINAL.csv"
data_files_fall = list.files(path = fall_data_loc, pattern = '\\.csv$')
# read in summer deployment files
# summer filename format: "AndersonLower_2023_sum_FINAL.csv"
data_files_sum = list.files(path = sum_data_loc, pattern = '\\.csv$')
# combine fall and summer filenames into one list so we just need one loop
data_files = c(data_files_fall, data_files_sum)

# loop through each file, read it in, process it
all_data <- NULL # will hold the combined data from all files

file_no = 0
for(this_file in data_files){ # for each file,
  # update file_no counter
  file_no = file_no + 1
  #this_file = data_files[1] # uncomment to troubleshoot within loop

  # extract metadata (sitename and deployment season) from filename
  filename_parts = strsplit(this_file, '[_.]')[[1]]
  sitename = filename_parts[1]
  deploy_season = paste(filename_parts[3], filename_parts[2])
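  # e.g. "AndersonLower_2022_fall_FINAL.csv" splits into
  # c("AndersonLower", "2022", "fall", "FINAL", "csv"),
  # so sitename = "AndersonLower" and deploy_season = "fall 2022"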

  # all the fall files are first in the list, so we can use this to tell
  # from file_no whether the current file is a summer or fall deployment
  # and that tells us in which directory we can find this file
  if(file_no > length(data_files_fall)){
    file_dir = sum_data_loc
  } else {
    file_dir = fall_data_loc
  }

  # read in the data (this_file) from the right directory (file_dir)
  this_data = readr::read_csv(file = paste0(file_dir, this_file),
                              skip = 1, # skip header line
                              # read only the four columns we need:
                              # 2, 3, 10, and 13 (date, time, water temp,
                              # and the QC indicator UseForCalc)
                              col_select = c(2, 3, 10, 13),
                              col_names = FALSE, # don't try to name columns from a row of the file
                              show_col_types = FALSE) %>% # suppresses print message
    data.frame() %>% # get rid of annoying attribute information
    # with col_names = FALSE, read_csv names columns by position (X2, X3, ...),
    # so rename the four we kept
    dplyr::rename(
      StartDate = X2,
      StartTime = X3,
      WaterTemp = X10,
      UseForCalc = X13) %>%
    # the next several lines convert the character-format date and time into
    # an R POSIXct object: lubridate::mdy() tells R the date string is in
    # month/day/year order, and as.POSIXct() combines it with the time string
    dplyr::mutate(
      StartDate = lubridate::mdy(StartDate),
      StartTime = as.character(StartTime),
      SiteName = sitename,
      DeploySeason = deploy_season) %>%
    dplyr::mutate(datetime = as.POSIXct(
      paste(StartDate, StartTime),
      format = "%Y-%m-%d %H:%M:%S",
      tz = "America/Los_Angeles")) %>%
    # note the tz argument: as.Date() on a POSIXct defaults to UTC, which
    # would shift late-evening readings onto the next day
    dplyr::mutate(Date = base::as.Date(datetime, tz = "America/Los_Angeles")) %>%
    # keep only the data with UseForCalc = 1 (data that passed QC)
    dplyr::filter(UseForCalc == 1)

  # add this file's data to the combined data frame
  all_data <- dplyr::bind_rows(all_data, this_data)
  # print some basic info to the R Console to update us on what was read in
  cat(paste0(" - Site ", sitename, " Deployment ", deploy_season,
             " Dates ", min(this_data$Date), " to ", max(this_data$Date)),
      fill = TRUE)
} # end for-loop
cat("Done reading in QC'd data.", fill = TRUE)

# compute 7DADM for this data ---------------------------

cat("Computing 7DADM...", fill = TRUE)
temp_stats = all_data %>%
  # compute daily max water temp by site and date
  dplyr::group_by(SiteName, Date)  %>%
  dplyr::summarize(DailyMax = max(WaterTemp, na.rm = TRUE), .groups = 'drop') %>%
  # add in any missing SiteName-Date combinations so we can tell which data is
  # for consecutive dates
  dplyr::right_join(data.frame(SiteName = rep(unique(all_data$SiteName),
                                              each = as.integer(max(all_data$Date)-min(all_data$Date))+1),
                               Date = rep(seq(min(all_data$Date), max(all_data$Date), 1),
                                          dplyr::n_distinct(all_data$SiteName))),
                    by = dplyr::join_by(SiteName, Date)) %>%
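  # (equivalently, tidyr::complete(SiteName, Date = seq(min(all_data$Date),
  # max(all_data$Date), 1)) could build the same complete grid in one step,
  # if you prefer tidyr)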
  # sort by SiteName first, then by Date within SiteName
  dplyr::arrange(SiteName, Date) %>%
  # compute 7DADM: the running mean of DailyMax over a 7-day window (k = 7)
  # centered on each date (lag = -3 shifts the window to span 3 days before
  # through 3 days after); na_pad = TRUE returns NA for windows that extend
  # past the data, and na_rm = FALSE makes the mean NA if any day is missing
  dplyr::group_by(SiteName) %>%
  dplyr::mutate(sevenDADM = runner::mean_run(x = DailyMax,
                                             k = 7, lag = -3,
                                             idx = Date,
                                             na_pad = TRUE, na_rm = FALSE)) %>%
  dplyr::ungroup()

cat("Done computing 7DADM.", fill = TRUE)

# use site-group lookup table to determine which WQS group each site is in -----

temp_stats = dplyr::left_join(temp_stats, site_group,
                              by = dplyr::join_by(SiteName == site))

# use group-WQS lookup table to determine what the WQS is for each group ------
# over the course of the water year, and determine when exceedances happen
# match_group_to_WQS(group_wqs)
temp_stats = temp_stats %>%
  # Day is a zero-padded "mm/dd" string, so the range comparison below
  # works lexicographically (e.g. "07/04" > "06/15")
  dplyr::mutate(Day = format(Date, "%m/%d")) %>%
  dplyr::left_join(group_wqs, by = "grp_type", relationship = "many-to-many") %>%
  # keep only the WQS row whose date range contains this Day
  dplyr::mutate(dates_match = (Day > start_date & Day < end_date)) %>%
  dplyr::filter(dates_match) %>%
  dplyr::rename(WQS = wqs) %>%
  dplyr::select(SiteName, Date, Day, sevenDADM, WQS) %>%
  dplyr::mutate(Exceedance = (sevenDADM > WQS))
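
# optional check (illustrative): count exceedance days per site
# dplyr::count(dplyr::filter(temp_stats, Exceedance), SiteName)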


# generate a table of site impairments ---------------------
# site_impairments = site_impairments_table(wq_data)

site_impairments = data.frame(
  Site = character(),
  Impaired = character(),
  Dates = character())

# format Dates to have date ranges for consecutive dates,
# e.g. 7/18-7/20 instead of 7/18, 7/19, 7/20
for(site in unique(temp_stats$SiteName)){
  # cat(site, fill = TRUE) # uncomment to troubleshoot within loop
  exceedance_dates = dplyr::filter(temp_stats, SiteName == site, Exceedance)$Date
  diffs = diff(exceedance_dates)
  # flag the site as impaired when there is at least one gap of more than
  # 7 days between consecutive exceedance dates
  if(length(diffs) >= 1 && sum(diffs > 7) > 0){
    # split the exceedance dates into runs of consecutive days
    consec_dates = split(format(exceedance_dates, "%m/%d"),
                         cumsum(c(1, diffs != 1)))
    # collapse each run into a "first-last" range; single days stay as-is
    date_list = sapply(consec_dates, function(x){
      if(length(x) > 1) paste(x[1], tail(x, n = 1), sep = "-") else x
    })
    site_impairments = dplyr::add_row(site_impairments,
                                      Site = site,
                                      Impaired = "Yes",
                                      Dates = paste(date_list, collapse = ", "))
  } else {
    site_impairments = dplyr::add_row(site_impairments,
                                      Site = site,
                                      Impaired = "No",
                                      Dates = NA)
  }
}
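
# the impairment table is left in memory; to inspect or save it
# (illustrative file name, adjust to your project):
# print(site_impairments)
# readr::write_csv(site_impairments, "site_impairments.csv")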

Demo using dataQCtools

In development