Skip to contents

Overview

Often sensors and equipment record data for some time before they are actually deployed in the field. As a result, they may record spurious data for a few minutes or hours between when they are first powered on or set up and when they are placed in the environment they are meant to be monitoring. Similarly, when equipment is retrieved from the field and brought to a lab to download data, the equipment often keeps recording data in transit. Data cropping is the process of removing the irrelevant data recorded before deployment and after retrieval.

Editable script

Here is a script you can use if you would prefer to have all the cropping code in a single script rather than an installable package format. Just click the clipboard icon in the top right corner of this code chunk to copy the code, then open a new R script in RStudio and paste it there, and save the script wherever you would like to store it on your computer.

See the demo in the next section for a step-by-step guide on how this script works.

# load necessary libraries
library(readxl)
# all the following six libraries can be loaded either in a single line using
# library(tidyverse)
# or separately as below
library(lubridate) # for handling dates
library(stringr)
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)

base_loc <- "data/2022_summer/"

# directory containing your raw data
rawdata_loc <- paste0(base_loc, "1_raw_csv/")

# directory where your cropped data will be (or currently is) stored
cropped_loc <- paste0(base_loc, "2_cropped_csv/")

# directory where plots of cropped and raw data will be stored
croppedplots_loc <- paste0(base_loc, "2_cropped_plots/")

# name of your LDRTimes file that has the lookup table with deployment/retrieval dates
ldrtimes_fn <- "LDRTimes_summer22.xlsx"
# in our case, the LDR file is in the same folder as the raw data
ldrtimes_loc <- rawdata_loc

# check that R can find your raw data files
# Get all temperature data filenames
# note: * is called a glob, short for global
# IMPORTANT: filenames must be in the format sitename_medium_deployseason_deployyear.csv
#            e.g. NolanLower_air_sum_23.csv
csv_files = list.files(path = rawdata_loc, pattern = '*csv')
# this is now a list of all filenames; we haven't read in the data yet, but
# make sure this lists all the raw files you want to crop
csv_files

# read in LDR file and take a look at it
ldrtimes = readxl::read_xlsx(paste0(ldrtimes_loc, ldrtimes_fn))

# once you're sure that file paths are working and your ldrtimes looks right,
# crop the files!

i = 0
for(this.file in csv_files){
  i = i + 1
  #this.file = csv_files[1] # uncomment to troubleshoot within loop
  cat(paste0("Reading file ", i, " of ", length(csv_files), ": ", this.file), fill = TRUE)

  # extract metadata from the filename
  filename.parts = stringr::str_split_1(this.file, '[_.]')
  csv.site = filename.parts[1]
  csv.media = filename.parts[2]
  csv.season = filename.parts[3]
  csv.year = filename.parts[4]

  # convert the character-format datetime to an R POSIXct object
  # ymd_hm is the format the character string is in initially; it tells R
  # how to read and interpret the character string
  # sometimes R reads in the datetime format as mdy_hms and sometimes mdy_hm.
  # This tryCatch handles either hh:mm:ss or hh:mm format in csv files
  this.data =  tryCatch(
    {
      readr::read_csv(paste0(rawdata_loc, this.file),
                      skip = 2, # skip the first two lines of the file
                      col_select = 1:3, # read only the first three columns of data
                      col_names = FALSE, # don't try to name columns from a row of the file
                      show_col_types = FALSE) %>% # suppresses print message
        dplyr::rename("row.num" = X1,
                      "datetime" = X2,
                      "temperature" = X3) %>%
        dplyr::mutate(datetime = lubridate::mdy_hms(datetime)) #for datetime in hh:mm:ss
    },
    warning = function(cond) { #if datetime isn't in hh:mm:ss, will now try hh:mm format
      readr::read_csv(paste0(rawdata_loc, this.file),
                      skip = 2, # skip the first two lines of the file
                      col_select = 1:3, # read only the first three columns of data
                      col_names = FALSE, # don't try to name columns from a row of the file
                      show_col_types = FALSE) %>% # suppresses print message
        dplyr::rename("row.num" = X1,
                      "datetime" = X2,
                      "temperature" = X3) %>%
        dplyr::mutate(datetime = lubridate::mdy_hm(datetime)) #for datetime in hh:mm
    }
  )

  # crop the data
  deploy.retrieval = ldrtimes %>%
    # select the row(s) of ldrtimes that match this datafile
    # should be exactly one row, but if there are no rows or multiple rows that
    # match, this step will pull that many rows
    dplyr::filter(site == csv.site, deploy_season == csv.season,
                  deploy_year == csv.year, media == csv.media) %>%
    # keep just the deploy_time and retrieval_time variables/columns
    dplyr::select(deploy_time, retrieval_time)

  if(nrow(deploy.retrieval) == 0){
    stop("no rows of ldrtimes matched this csv file.")
  }
  if(nrow(deploy.retrieval) > 1){
    stop("multiple rows of ldrtimes matched this csv file.")
  }

  deploy = deploy.retrieval$deploy_time
  retrieval = deploy.retrieval$retrieval_time

  if(retrieval > deploy) {
    cropped.data = dplyr::filter(this.data,
                                 datetime > deploy,
                                 datetime < retrieval)

  } # if(retrieval > deploy)

  # write cropped csv files to cropped folder
  readr::write_csv(cropped.data,
                   file=paste0(cropped_loc,
                               stringr::str_split_i(this.file, "[.]", 1), "_cropped.csv"))

  #Create a dataframe of the raw and cropped data
  cropvraw <- dplyr::left_join(this.data, cropped.data, by=c("row.num", "datetime")) %>%
    dplyr::rename(raw.temp = temperature.x,
                  cropped.temp = temperature.y) %>%#rename temperature from each file
    #create new column of data type (raw or cropped for plotting in ggplot)
    tidyr::pivot_longer(cols = raw.temp:cropped.temp,
                        names_to="type", values_to="temp")

  cropvraw.plot <- ggplot2::ggplot(cropvraw,
                                   ggplot2::aes(x = datetime,
                                                y = temp,
                                                color = type)) +
    ggplot2::geom_line(na.rm=TRUE) +
    ggplot2::geom_point(na.rm=TRUE) +
    ggplot2::labs(title = paste0(" Raw versus Cropped data"),
                  x = "Date", y = "Temperature (C)")+
    ggplot2::theme(axis.text = ggplot2::element_text(colour = "black", size = (12)))

  ggplot2::ggsave(paste0(croppedplots_loc, csv.site, "_rawvscroppeddata.png"),
                  cropvraw.plot,
                  width = 11, height = 8.5, units = "in")

}; cat("Done.", fill = TRUE)

Tutorial (script)

First we’ll load the libraries we’ll use:

# load necessary libraries
library(readxl)
# all the following six libraries can be loaded either in a single line using
# library(tidyverse)
# or separately as below
library(lubridate) # for handling dates
#> 
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#> 
#>     date, intersect, setdiff, union
library(stringr)
library(readr)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)

The inputs you need to give the code:

  • the folder containing all the (“raw”) data files that you want to crop
  • the folder containing the lookup table of your deployment and retrieval times, and what the name of this file is
  • the folder to which the code should save all the cropped data files
  • the folder to which the code should save all the diagnostic plots it makes of the data it cropped

This code currently expects that all the raw data files are in comma-separated value (.csv) format and that the lookup table is a .xlsx file. You can edit the script for your context.

In our case, the raw and cropped folders are both subfolders of another folder, so we define the file path (base_loc) to that parent folder and then use it to tell R where it can find the raw and cropped folders (rawdata_loc and cropped_loc):

base_loc <- "data/2022_summer/"

# directory containing your raw data
rawdata_loc <- paste0(base_loc, "1_raw_csv/")

# directory where your cropped data will be (or currently is) stored
cropped_loc <- paste0(base_loc, "2_cropped_csv/")

# directory where plots of cropped and raw data will be stored
croppedplots_loc <- paste0(base_loc, "2_cropped_plots/")

# name of your LDRTimes file that has the lookup table with deployment/retrieval dates
ldrtimes_fn <- "LDRTimes_summer22.xlsx"

# in our case, the LDR file is in the same folder as the raw data
ldrtimes_loc <- rawdata_loc

Next, we use R to get a list of the filenames of the raw data files, and we call this list csv_files:

# check that R can find your raw data files
# Get all temperature data filenames
# note: * is called a glob, short for global
# IMPORTANT: filenames must be in the format sitename_medium_deployseason_deployyear.csv
#            e.g. NolanLower_air_sum_23.csv
csv_files = list.files(path = rawdata_loc, pattern = '*csv')

Let’s take a look at the list to make sure that it looks right and contains all the files we want to crop:

csv_files

We also need to read in the lookup table of deployment and retrieval times, or ldrtimes:

ldrtimes = readxl::read_xlsx(paste0(ldrtimes_loc, ldrtimes_fn))

Next we crop the files! For-loops have the format for(xxx in yyy){code}. They run the code inside the curly braces for each value xxx of yyy. For instance, for(i in 1:5){print(i)} will print the numbers 1, 2, 3, 4, 5. Here, we are using a for-loop to read in and process each file. After this code chunk, we’ll break down what this for-loop is doing by running the code inside of it for just a single file. (Tip: this is a great way to understand what a for-loop is doing, and also a great way to debug a for-loop if you are trying to write or modify one.)

i = 0
for(this.file in csv_files){
  i = i + 1
  #this.file = csv_files[1] # uncomment to troubleshoot within loop
  cat(paste0("Reading file ", i, " of ", length(csv_files), ": ", this.file), fill = TRUE)

  # extract metadata from the filename
  filename.parts = stringr::str_split_1(this.file, '[_.]')
  csv.site = filename.parts[1]
  csv.media = filename.parts[2]
  csv.season = filename.parts[3]
  csv.year = filename.parts[4]

  # convert the character-format datetime to an R POSIXct object
  # ymd_hm is the format the character string is in initially; it tells R
  # how to read and interpret the character string
  # sometimes R reads in the datetime format as mdy_hms and sometimes mdy_hm.
  # This tryCatch handles either hh:mm:ss or hh:mm format in csv files
  this.data =  tryCatch(
    {
      readr::read_csv(paste0(rawdata_loc, this.file),
                      skip = 2, # skip the first two lines of the file
                      col_select = 1:3, # read only the first three columns of data
                      col_names = FALSE, # don't try to name columns from a row of the file
                      show_col_types = FALSE) %>% # suppresses print message
        dplyr::rename("row.num" = X1,
                      "datetime" = X2,
                      "temperature" = X3) %>%
        dplyr::mutate(datetime = lubridate::mdy_hms(datetime)) #for datetime in hh:mm:ss
    },
    warning = function(cond) { #if datetime isn't in hh:mm:ss, will now try hh:mm format
      readr::read_csv(paste0(rawdata_loc, this.file),
                      skip = 2, # skip the first two lines of the file
                      col_select = 1:3, # read only the first three columns of data
                      col_names = FALSE, # don't try to name columns from a row of the file
                      show_col_types = FALSE) %>% # suppresses print message
        dplyr::rename("row.num" = X1,
                      "datetime" = X2,
                      "temperature" = X3) %>%
        dplyr::mutate(datetime = lubridate::mdy_hm(datetime)) #for datetime in hh:mm
    }
  )

  # crop the data
  deploy.retrieval = ldrtimes %>%
    # select the row(s) of ldrtimes that match this datafile
    # should be exactly one row, but if there are no rows or multiple rows that
    # match, this step will pull that many rows
    dplyr::filter(site == csv.site, deploy_season == csv.season,
                  deploy_year == csv.year, media == csv.media) %>%
    # keep just the deploy_time and retrieval_time variables/columns
    dplyr::select(deploy_time, retrieval_time)

  if(nrow(deploy.retrieval) == 0){
    stop("no rows of ldrtimes matched this csv file.")
  }
  if(nrow(deploy.retrieval) > 1){
    stop("multiple rows of ldrtimes matched this csv file.")
  }

  deploy = deploy.retrieval$deploy_time
  retrieval = deploy.retrieval$retrieval_time

  if(retrieval > deploy) {
    cropped.data = dplyr::filter(this.data,
                                 datetime > deploy,
                                 datetime < retrieval)

  } # if(retrieval > deploy)

  # write cropped csv files to cropped folder
  readr::write_csv(cropped.data,
                   file=paste0(cropped_loc,
                               stringr::str_split_i(this.file, "[.]", 1), "_cropped.csv"))

  #Create a dataframe of the raw and cropped data
  cropvraw <- dplyr::left_join(this.data, cropped.data, by=c("row.num", "datetime")) %>%
    dplyr::rename(raw.temp = temperature.x,
                  cropped.temp = temperature.y) %>%#rename temperature from each file
    #create new column of data type (raw or cropped for plotting in ggplot)
    tidyr::pivot_longer(cols = raw.temp:cropped.temp,
                        names_to="type", values_to="temp")

  cropvraw.plot <- ggplot2::ggplot(cropvraw,
                                   ggplot2::aes(x = datetime,
                                                y = temp,
                                                color = type)) +
    ggplot2::geom_line(na.rm=TRUE) +
    ggplot2::geom_point(na.rm=TRUE) +
    ggplot2::labs(title = paste0(" Raw versus Cropped data"),
                  x = "Date", y = "Temperature (C)")+
    ggplot2::theme(axis.text = ggplot2::element_text(colour = "black", size = (12)))

  ggplot2::ggsave(paste0(croppedplots_loc, csv.site, "_rawvscroppeddata.png"),
                  cropvraw.plot,
                  width = 11, height = 8.5, units = "in")

}; cat("Done.", fill = TRUE)

Let’s consider the first file in the list:

this.file = csv_files[1]; i = 1

The cat function prints messages to the screen. The paste0 function allows us to glue together text and values. This is just printing out which file is being read in so that you have some progress updates as the code runs.

cat(paste0("Reading file ", i, " of ", length(csv_files), ": ", this.file), fill = TRUE)

The next part splits up the filename to extract the metadata it contains. This code assumes your files are called sitename_medium_deployseason_deployyear.csv, where sitename is the name for the sampling location, the medium whose temperature is being measured is either air or water, and deployseason and deployyear are the season (fall or sum) and two-digit year when the first data point in this file was collected. For example, you might have a file called NolanLower_air_sum_23.csv which contains air temperature data for a season starting in summer 2023 from a location called Nolan Lower.

str_split_1(this.file, '[_.]') splits the filename into pieces wherever it contains an underscore or a period. The code after that names some of these pieces of information, and we will add them to the dataset later.

# extract metadata from the filename
filename.parts = stringr::str_split_1(this.file, '[_.]')
csv.site = filename.parts[1]
csv.media = filename.parts[2]
csv.season = filename.parts[3]
csv.year = filename.parts[4]
# convert the character-format datetime to an R POSIXct object
# ymd_hm is the format the character string is in initially; it tells R
# how to read and interpret the character string
# sometimes R reads in the datetime format as mdy_hms and sometimes mdy_hm.
# This tryCatch handles either hh:mm:ss or hh:mm format in csv files
this.data =  tryCatch(
  {
    readr::read_csv(paste0(rawdata_loc, this.file),
                    skip = 2, # skip the first two lines of the file
                    col_select = 1:3, # read only the first three columns of data
                    col_names = FALSE, # don't try to name columns from a row of the file
                    show_col_types = FALSE) %>% # suppresses print message
      dplyr::rename("row.num" = X1,
                    "datetime" = X2,
                    "temperature" = X3) %>%
      dplyr::mutate(datetime = lubridate::mdy_hms(datetime)) #for datetime in hh:mm:ss
  },
  warning = function(cond) { #if datetime isn't in hh:mm:ss, will now try hh:mm format
    readr::read_csv(paste0(rawdata_loc, this.file),
                    skip = 2, # skip the first two lines of the file
                    col_select = 1:3, # read only the first three columns of data
                    col_names = FALSE, # don't try to name columns from a row of the file
                    show_col_types = FALSE) %>% # suppresses print message
      dplyr::rename("row.num" = X1,
                    "datetime" = X2,
                    "temperature" = X3) %>%
      dplyr::mutate(datetime = lubridate::mdy_hm(datetime)) #for datetime in hh:mm
  }
)

# crop the data
deploy.retrieval = ldrtimes %>%
  # select the row(s) of ldrtimes that match this datafile
  # should be exactly one row, but if there are no rows or multiple rows that
  # match, this step will pull that many rows
  dplyr::filter(site == csv.site, deploy_season == csv.season,
                deploy_year == csv.year, media == csv.media) %>%
  # keep just the deploy_time and retrieval_time variables/columns
  dplyr::select(deploy_time, retrieval_time)

if(nrow(deploy.retrieval) == 0){
  stop("no rows of ldrtimes matched this csv file.")
}
if(nrow(deploy.retrieval) > 1){
  stop("multiple rows of ldrtimes matched this csv file.")
}

deploy = deploy.retrieval$deploy_time
retrieval = deploy.retrieval$retrieval_time

if(retrieval > deploy) {
  cropped.data = dplyr::filter(this.data,
                               datetime > deploy,
                               datetime < retrieval)

} # if(retrieval > deploy)

# write cropped csv files to cropped folder
readr::write_csv(cropped.data,
                 file=paste0(cropped_loc,
                             stringr::str_split_i(this.file, "[.]", 1), "_cropped.csv"))

#Create a dataframe of the raw and cropped data
cropvraw <- dplyr::left_join(this.data, cropped.data, by=c("row.num", "datetime")) %>%
  dplyr::rename(raw.temp = temperature.x,
                cropped.temp = temperature.y) %>%#rename temperature from each file
  #create new column of data type (raw or cropped for plotting in ggplot)
  tidyr::pivot_longer(cols = raw.temp:cropped.temp,
                      names_to="type", values_to="temp")

cropvraw.plot <- ggplot2::ggplot(cropvraw,
                                 ggplot2::aes(x = datetime,
                                              y = temp,
                                              color = type)) +
  ggplot2::geom_line(na.rm=TRUE) +
  ggplot2::geom_point(na.rm=TRUE) +
  ggplot2::labs(title = paste0(" Raw versus Cropped data"),
                x = "Date", y = "Temperature (C)")+
  ggplot2::theme(axis.text = ggplot2::element_text(colour = "black", size = (12)))

ggplot2::ggsave(paste0(croppedplots_loc, csv.site, "_rawvscroppeddata.png"),
                cropvraw.plot,
                width = 11, height = 8.5, units = "in")

Demo using dataQCtools::crop_raw_data()

In development