Skip to contents

Editable script

Here is a script you can use if you would prefer to have all the code for the QC plot step in a single script rather than in an installable package. Click the clipboard icon in the top right corner of the code chunk below to copy the code. Then open a new R script in RStudio, paste the code into it, and save the script wherever you would like to store it on your computer.

# load necessary libraries
library(dplyr)
library(tidyr)
library(ggplot2)
library(plotly)
library(htmlwidgets)
library(lubridate)

# directory containing cropped data to be plotted
cropped_loc <- "data/2022_summer/2_cropped_csv/"
# directory where QC plots will be stored
qc_plots_loc <- "data/2022_summer/3_qc_plots/"

# get a list of all the cropped files
# `pattern` is a regular expression, so the dot is escaped and the match is
# anchored to the end of the name; otherwise names such as "x.csv.bak" or
# "notes_csv.txt" would also be picked up
filenames <- list.files(path = cropped_loc, pattern = "\\.csv$")
# this is now a list of all filenames; we haven't read in the data yet, but
# make sure this lists all the cropped files you want to plot
# (print() is used so the list is also shown when the script is run with source())
print(filenames)

# Read all the cropped data files and combine them into one data frame
# (`Combined`) for easier plotting. Each file contributes its first three
# columns plus metadata parsed from its filename.
cat("Reading in cropped data files...", fill = TRUE)

# fail early with a clear message instead of an obscure error further down
if (length(filenames) == 0) {
  stop("No cropped .csv files found in ", cropped_loc, call. = FALSE)
}

# collect each file's data in a pre-allocated list and bind once at the end;
# growing a data frame inside the loop copies all previous rows every time
all_reads <- vector("list", length(filenames))

for (i in seq_along(filenames)) {
  selection <- filenames[i]
  #selection <- filenames[1] # for troubleshooting the for-loop
  cat(paste0("Reading file ", i, " of ", length(filenames), ": ", selection), fill = TRUE)
  # split the filename string everywhere there is an underscore or period
  # so that we can get the following metadata from it:
  #  site name, media (air/water), deployment season, and deployment year
  info_from_filename <- unlist(strsplit(selection, split = "[_.]"))[1:4]
  sitename <- info_from_filename[1]
  media <- info_from_filename[2] # air or water; assumes this info is the second chunk after splitting the filename
  deploy_season <- info_from_filename[3]
  # %in% never returns NA, so a filename with too few chunks also triggers
  # this error rather than "missing value where TRUE/FALSE needed"
  if (!deploy_season %in% c("sum", "fall")) {
    stop("filename does not say sum or fall.")
  }
  # two-digit year in the filename, e.g. "22" -> 2022
  deploy_year <- 2000 + as.integer(info_from_filename[4]) # or we can format as character

  all_reads[[i]] <- read.csv(
    file = paste0(cropped_loc, selection),
    as.is = TRUE, skip = 1, fill = TRUE, header = FALSE
  ) %>% ## Reads the selected datafile; the first line is skipped, so columns are named V1, V2, ...
    dplyr::select(1:3) %>% # select the first 3 columns (remove air temp from Hoh River sites)
    # add the metadata as variables/columns
    dplyr::mutate(sitename = sitename,
                  media = media,
                  deploy_season = deploy_season,
                  deploy_year = deploy_year)
} # filenames loop

# stack the data from every file into the combined data frame
Combined <- dplyr::bind_rows(all_reads)
cat("Done reading in cropped data.", fill = TRUE)

cat("Computing temperature differences...", fill = TRUE)
# only compute AWMaxDiff and AirRange if there is air data for at least one site
has_air <- "air" %in% Combined$media

# Daily max/min per site and date, with one column per statistic and media
# (dailymin_air, dailymin_water, dailymax_air, dailymax_water as available).
daily_wide <- Combined %>%
  dplyr::rename(row = V1, datetime = V2, temp = V3) %>%  # Rename the first three variables that came from the csv file
  dplyr::mutate(datetime = lubridate::ymd_hms(datetime),  # change format of datetime column
                date = lubridate::date(datetime)) %>%     # extract just the date
  dplyr::group_by(sitename, date, media) %>%  # compute daily max/min by site, date, and media (air/water)
  # .groups = "drop" returns an ungrouped result, which avoids the regrouping
  # message and keeps the later per-site filtering from running group by group
  dplyr::summarize(dailymax = max(temp, na.rm = TRUE),
                   dailymin = min(temp, na.rm = TRUE),
                   .groups = "drop") %>%
  # pivot dataframe to add air or water to column name; this is necessary to calculate stats
  # this puts air and water on the same rows so we can just subtract columns below
  tidyr::pivot_wider(names_from = media,
                     values_from = c(dailymin, dailymax))

# air-based statistics need the *_air columns, which only exist when at least
# one site has air temperature
if (has_air) {
  daily_wide <- daily_wide %>%
    dplyr::mutate(AWMaxDiff = dailymax_air - dailymax_water,
                  AirRange = dailymax_air - dailymin_air)
}

Combined <- daily_wide %>%
  dplyr::mutate(WaterRange = dailymax_water - dailymin_water) %>%
  # pivot longer for plotting purposes later; to plot on same graph, need "calc" column ("grouping variable")
  # every column except the identifiers is pivoted; selecting a name range
  # such as dailymin_air:WaterRange silently drops a column when water is the
  # first media encountered, because pivot_wider orders columns by appearance
  tidyr::pivot_longer(cols = -c(sitename, date), names_to = "calc", values_to = "value")

# set some plotting parameters
sites <- unique(Combined$sitename)
# named colour vectors: names must match the values of the "calc" column
range.colors <- c(AirRange = "blue", WaterRange = "black")
maxdiff.colors <- c(dailymax_air = "blue", dailymax_water = "black", AWMaxDiff = "purple")

# make sure the output directory exists; saving the plots fails otherwise
if (!dir.exists(qc_plots_loc)) {
  dir.create(qc_plots_loc, recursive = TRUE)
}

# loop through sites to plot all graphs for all sites
# for each site this writes a static .png and an interactive .html version of
# (1) the daily air/water temperature ranges and
# (2) the daily max air/water temperatures and their difference
for (i in seq_along(sites)) {
  s <- sites[i]
  cat(paste0("Making QC plots for site ", i, " of ", length(sites), ": ", s), fill = TRUE)
  # s <- sites[1] # uncomment if you want to troubleshoot this loop
  # filter the data to just this site
  this.site.combined <- dplyr::filter(Combined, sitename == s)

  # daily temperature range plot, with a red horizontal reference line at 3
  rangeplot <- ggplot2::ggplot(this.site.combined %>%
                                 dplyr::filter(calc %in% c("AirRange", "WaterRange")),
                               ggplot2::aes(x = date, y = value, color = calc)) +
    ggplot2::geom_line(na.rm = TRUE) +
    ggplot2::geom_point(na.rm = TRUE) +
    ggplot2::labs(title = paste0(s, " Air and Water Temperature ranges"),
                  x = "Date", y = "Temperature (C)",
                  color = "Media") +
    ggplot2::scale_color_manual(values = range.colors) +
    ggplot2::geom_hline(yintercept = 3, linewidth = 0.3, color = "red")

  ggplot2::ggsave(paste0(qc_plots_loc, s, "_AirWaterTempRange.png"), rangeplot,
                  width = 11, height = 8.5, units = "in")

  # daily max temperature and air-water difference plot, with a red
  # horizontal reference line at 20
  maxdiffplot <- ggplot2::ggplot(this.site.combined %>%
                                   dplyr::filter(calc %in% c("dailymax_air", "dailymax_water", "AWMaxDiff")),
                                 ggplot2::aes(x = date, y = value, color = calc)) +
    ggplot2::geom_line(na.rm = TRUE) +
    ggplot2::labs(title = paste0(s, " Max Air and Water Temperature and Difference"),
                  x = "Date", y = "Temperature (C)",
                  color = "Media") +
    ggplot2::scale_color_manual(values = maxdiff.colors) +
    ggplot2::geom_hline(yintercept = 20, linewidth = 0.3, color = "red")

  ggplot2::ggsave(paste0(qc_plots_loc, s, "_MaxDiffAirWaterTemp.png"),
                  maxdiffplot, width = 11, height = 8.5, units = "in")

  # interactive (plotly) versions of both plots, so values can be traced
  maxdifplotly <- plotly::ggplotly(maxdiffplot)
  rangeplotly <- plotly::ggplotly(rangeplot)

  htmlwidgets::saveWidget(maxdifplotly, paste0(qc_plots_loc, s, "_MaxDiffAirWaterTemp.html"))
  htmlwidgets::saveWidget(rangeplotly, paste0(qc_plots_loc, s, "_AirWaterTempRange.html"))
}
cat("Done.", fill = TRUE)

Demo using dataQCtools::make_qc_plots_of_cropped_data()

In development