Manage Sample Inventories

Rich Lab for Molecular Health

Author

Alicia M. Rich, Ph.D.

Published

Thursday, September 25, 2025

Modified

Thursday, September 25, 2025

1 Setup

Show the code
# Declare where this document lives relative to the project root so that
# subsequent here() calls build paths from that root.
here::i_am("scripts/sample_inventory.qmd")
# Run the project-level _common.R script.
source(here::here("_common.R"))
# Run the companion R script for this document.
source(here("scripts/sample_inventory.R"))
# NOTE(review): second i_am() call, this time naming the .R script. It may be
# the first line of the sourced script shown inline -- confirm it is needed.
i_am("scripts/sample_inventory.R")


# Column type specifications for each sample type's sample_inventory.csv,
# looked up by sample type name and passed to read_csv() as `col_types`.
# Built inside local() so the collector shorthands below do not leak into the
# calling environment.
inventory_csv_cols <- local({
  # One collector of each kind, reused for every column of that type.
  date_type     <- col_date()
  datetime_type <- col_datetime()
  integer_type  <- col_integer()
  number_type   <- col_number()

  isolate_spec <- cols(
    collection_date            = date_type,
    extraction_date            = date_type,
    libprep_date               = date_type,
    libprep_run                = integer_type,
    seq_date                   = date_type,
    collection_LiCl_M          = number_type,
    extract_run                = integer_type,
    extraction_sample_input_ul = number_type,
    extraction_elution_ul      = number_type,
    extraction_elution_temp    = number_type,
    extract_ng_ul              = number_type,
    libprep_extract_ul         = number_type,
    libprep_extract_h20        = number_type,
    lib_ng_ul_1                = number_type,
    lib_ng_ul_2                = number_type,
    lib_ul_added               = number_type,
    seq_depth                  = number_type,
    seq_qscore_mean            = number_type,
    seq_length_mean            = number_type
  )

  fecal_spec <- cols(
    seq_run                    = number_type,
    subject_mass               = number_type,
    subject_forearm            = number_type,
    subject_antibiotic         = number_type,
    subject_antidiarrheal      = number_type,
    subject_fiber              = number_type,
    subject_probiotic          = number_type,
    subject_steroid            = number_type,
    subject_bristol            = number_type,
    collection_box             = number_type,
    collection_date            = date_type,
    collection_datetime        = datetime_type,
    collection_latitude        = number_type,
    collection_longitude       = number_type,
    collection_pellets         = number_type,
    extract_box                = number_type,
    extract_run                = number_type,
    extraction_date            = date_type,
    extraction_sample_input_ul = number_type,
    extraction_elution_ul      = number_type,
    extraction_elution_temp    = number_type,
    extract_ng_ul              = number_type,
    libprep_date               = date_type,
    libprep_run                = number_type,
    libprep_barcode            = number_type,
    libprep_extract_ul         = number_type,
    libprep_extract_h20        = number_type,
    lib_ng_ul_1                = number_type,
    lib_ng_ul_2                = number_type,
    lib_ul_added               = number_type,
    seq_date                   = date_type,
    seq_depth                  = number_type,
    seq_qscore_mean            = number_type,
    seq_length_mean            = number_type
  )

  list(isolates = isolate_spec, fecals = fecal_spec)
})

# Column-name sets used with select(any_of(...)) to carve the full sample
# inventory into its per-stage tables. Element order is the column order of
# the resulting table.
#
# The field runs shared between tables (collection/subject, extraction,
# library prep, sequencing) are written once and spliced in, so the tables
# cannot drift apart. Each name appears at most once per set: the earlier
# hand-written lists repeated "extract_run" (controls, extracts) and
# "libprep_run" (libraries); each repeat is dropped here and the first
# position is kept.
inventory_cols <- local({
  # Collection and subject metadata recorded once per sample.
  collection_fields <- c(
    "collection_stabilizer",
    "collection_by",
    "collection_source",
    "collection_site",
    "subject",
    "subject_genus",
    "subject_species",
    "subject_confirmed",
    "subject_sex",
    "subject_age_category",
    "subject_repro_status",
    "subject_mass",
    "subject_mass_confirmed",
    "subject_forearm",
    "collection_latitude",
    "collection_longitude",
    "collection_net_length",
    "collection_pellets",
    "subject_diet",
    "subject_antibiotic",
    "subject_antidiarrheal",
    "subject_fiber",
    "subject_probiotic",
    "subject_steroid",
    "subject_bristol",
    "subject_holding",
    "collection_pair_access",
    "collection_antibiotic",
    "collection_medium",
    "collection_line",
    "collection_LiCl_M",
    "collection_note"
  )

  # Extraction details (everything after the extract identifiers and date).
  extraction_fields <- c(
    "extraction_by",
    "extraction_target",
    "extraction_kit",
    "extraction_protocol",
    "extraction_sample_input_ul",
    "extraction_elution_ul",
    "extraction_elution_temp",
    "extract_ng_ul",
    "extraction_note"
  )

  # Library prep details (everything after the run identifier and date).
  libprep_fields <- c(
    "libprep_protocol",
    "libprep_kit",
    "libprep_extract_ul",
    "libprep_extract_h20",
    "lib_ng_ul_1",
    "lib_ng_ul_2",
    "lib_ul_added",
    "libprep_note",
    "libprep_barcode"
  )

  # Sequencing details (everything after the run identifiers and date).
  seq_fields <- c(
    "seq_by",
    "seq_device",
    "seq_position",
    "seq_flowcell",
    "seq_flowcell_id",
    "seq_flongle",
    "seq_depth",
    "seq_qscore_mean",
    "seq_length_mean"
  )

  list(
    ids = c(
      "sampleID",
      "extractID",
      "extract_run",
      "libraryID",
      "libprep_run",
      "seq_run",
      "seq_run_id"
    ),
    controls = c(
      "extractID",
      "extract_run",
      "status",
      "sampleset",
      "extraction_date",
      "libprep_date",
      "seq_date",
      "extract_box",
      extraction_fields,
      "libprep_run",
      libprep_fields,
      "seq_run_id",
      seq_fields
    ),
    compilation = c(
      "sampleID",
      "extractID",
      "libraryID",
      "seq_run",
      "status",
      "sampleset",
      "collection_date",
      "extraction_date",
      "libprep_date",
      "seq_date",
      collection_fields,
      "extract_box",
      "extract_run",
      extraction_fields,
      "libprep_run",
      libprep_fields,
      "seq_run_id",
      seq_fields
    ),
    samples = c(
      "sampleID",
      "status",
      "sampleset",
      "collection_date",
      collection_fields
    ),
    extracts = c(
      "sampleID",
      "extractID",
      "extract_run",
      "extract_box",
      "extraction_date",
      extraction_fields
    ),
    libraries = c(
      "sampleID",
      "extractID",
      "libraryID",
      "libprep_run",
      "libprep_date",
      libprep_fields,
      "seq_run",
      "seq_run_id",
      "seq_date",
      seq_fields
    )
  )
})
# Keep a dated backup of a file before the caller overwrites it.
#
# Arguments:
#   dir_path       Directory holding the file, relative to the project root.
#   filename       File name without its extension.
#   file_extension Extension including the leading dot (e.g. ".csv").
#
# Creates `dir_path` when it is missing. When the file already exists it is
# copied to <dir_path>/version_archive/<filename>_archive_<today><ext>,
# replacing any archive copy already written today. Each step is reported
# with print().
version_control_check <- function(dir_path, filename, file_extension) {
  target_dir <- here(dir_path)
  if (dir_exists(target_dir)) {
    print(sprintf("%s directory exists.", dir_path))
  } else {
    dir_create(target_dir)
  }

  archive_dir  <- here(paste0(dir_path, "/version_archive"))
  current_file <- here(paste0(dir_path, "/", filename, file_extension))
  archive_file <- here(paste0(
    dir_path,
    "/version_archive/", filename, "_archive_", ymd(today()), file_extension
  ))

  if (!file_exists(current_file)) {
    print("no previous file version exists")
  } else {
    if (dir_exists(archive_dir)) {
      print("version_archive directory already exists")
    } else {
      dir_create(archive_dir)
      print("version_archive directory created")
    }
    # The current file is copied, not removed; the caller overwrites it next.
    file_copy(current_file, archive_file, overwrite = TRUE)
    print("previous file version moved to archive")
  }
}

# Copy a file from the project tree into the matching dashboards/ subfolder.
#
# Arguments:
#   dir_path       Source directory, relative to the project root.
#   dash_path      Destination subdirectory beneath "dashboards/".
#   filename       File name without its extension.
#   file_extension Extension including the leading dot (e.g. ".RData").
#
# The destination directory is created when missing and an existing copy of
# the file there is overwritten. Each step is reported with print().
dashboard_transfer <- function(dir_path, dash_path, filename, file_extension) {
  full_name <- paste0(filename, file_extension)
  dash_dir  <- here(paste0("dashboards/", dash_path))

  if (dir_exists(dash_dir)) {
    print("Dashboard subdirectory already exists")
  } else {
    dir_create(dash_dir)
    print("Dashboard subdirectory created")
  }

  source_file <- here(paste0(dir_path, "/", full_name))
  dash_file   <- here(paste0("dashboards/", dash_path, "/", full_name))
  file_copy(source_file, dash_file, overwrite = TRUE)

  print("File available for dashboard use now.")
}


# Read a CSV using its guessed encoding and return it with every character
# column marked/converted as UTF-8.
#
# Arguments:
#   path      Path to the CSV file.
#   col_types Column specification forwarded to readr::read_csv().
#
# The encoding is the top guess from readr::guess_encoding() over at most
# 5000 lines; when no guess is available the file is read as UTF-8.
read_csv_utf8 <- function(path, col_types = readr::cols()) {
  top_guess <- readr::guess_encoding(path, n_max = 5000)$encoding[1]
  encoding  <- if (is.na(top_guess)) "UTF-8" else top_guess

  parsed <- readr::read_csv(
    path,
    col_types = col_types,
    locale    = readr::locale(encoding = encoding)
  )

  dplyr::mutate(parsed, dplyr::across(where(is.character), enc2utf8))
}

# Repair mangled micro-unit text (e.g. "ng/µL") in every character column.
#
# Arguments:
#   df  A data frame; non-character columns are returned untouched.
#
# Returns `df` with each character column NFKC-normalised and its micro-unit
# spellings canonicalised to the micro sign U+00B5.
#
# Changes from the earlier version of this function:
#   * NFKC runs first and folds the micro sign U+00B5 into Greek mu U+03BC,
#     so the literal "µ" / "\u00B5" patterns could never match and are gone.
#   * The entry labelled "mis-decoded CP1252" was one of those dead patterns.
#     It now targets the usual CP1252 mojibake of a UTF-8 micro sign, "Â"
#     followed by the micro/mu character.
#   * U+7121 was listed twice (once escaped, once literal); it is listed once.
#   * U+FFFD is no longer turned into a micro sign everywhere, only where it
#     sits in front of a unit symbol, as the original inline note intended.
fix_units_glitches <- function(df) {
  dplyr::mutate(df, dplyr::across(
    where(is.character),
    \(x) {
      # 1) Compatibility-normalise. Side effect relied on below: every micro
      #    sign (U+00B5) becomes Greek mu (U+03BC).
      x <- stringi::stri_trans_general(x, "NFKC")
      # 2) Canonicalise micro variants to the micro sign U+00B5. Patterns
      #    are applied in order, so the mojibake pair must come before the
      #    bare Greek mu.
      x <- stringr::str_replace_all(
        x,
        c(
          "\u00C2\u03BC" = "\u00B5",   # "Âµ": UTF-8 micro sign read as CP1252
          "\u7121"       = "\u00B5L",  # mis-decoded "µL" (presumably read as Big5)
          "\u03BC"       = "\u00B5"    # Greek mu (incl. NFKC output) -> micro sign
        )
      )
      # 3) Replacement character -> micro sign, but only directly in front of
      #    a stand-alone unit symbol and not inside a word ("5 \uFFFDL",
      #    "\uFFFDg"), so lost characters elsewhere are left visible.
      x <- stringr::str_replace_all(
        x,
        "(?<!\\p{L})\uFFFD(?=(?:L|l|g|m|M|mol)\\b)",
        "\u00B5"
      )
      # 4) Fix common ASCII fallbacks (uL -> µL) when used as a unit
      x <- stringr::str_replace_all(x, "(?<=[/\\s])uL\\b", "\u00B5L")
      # 5) As a last resort, ANY single non-ASCII char used like a micro
      #    between a slash and an L (e.g. "/\uFFFDL") -> "/µL"
      x <- stringr::str_replace_all(x, "(?<=/)\\P{ASCII}(?=L\\b)", "\u00B5")
      # 6) Tidy spacing variants ("/ µL" -> "/µL")
      x <- stringr::str_replace_all(x, "/\\s*\u00B5L\\b", "/\u00B5L")
      x
    }
  ))
}

2 Import Data

Set params$sample_type to one of isolates, fecals, or environmental so that the code below imports and updates the matching inventory files.
The input file should contain the columns listed below. (I recommend adding any missing columns as empty columns so that the code runs reproducibly.)
  • status
  • sampleID
  • sampleset
  • extractID
  • extract_run
  • libraryID
  • libprep_run
  • seq_run
  • seq_run_id
  • collection_date
  • collection_stabilizer
  • collection_by
  • collection_source
  • collection_site
  • extract_box
  • extract_run
  • extraction_date
  • extraction_by
  • extraction_target
  • extraction_kit
  • extraction_protocol
  • extraction_sample_input_ul
  • extraction_elution_ul
  • extraction_elution_temp
  • extract_ng_ul
  • extraction_note
  • libprep_date
  • libprep_run
  • libprep_protocol
  • libprep_kit
  • libprep_extract_ul
  • libprep_extract_h20
  • lib_ng_ul_1
  • lib_ng_ul_2
  • lib_ul_added
  • libprep_note
  • libprep_barcode
  • seq_run_id
  • seq_date
  • seq_by
  • seq_device
  • seq_position
  • seq_flowcell
  • seq_flowcell_id
  • seq_flongle
  • seq_depth
  • seq_qscore_mean
  • seq_length_mean
  • subject
  • subject_genus
  • subject_species
  • subject_confirmed
  • subject_sex
  • subject_age_category
  • subject_repro_status
  • collection_note
  • subject_mass
  • subject_mass_confirmed
  • subject_forearm
  • collection_latitude
  • collection_longitude
  • collection_net_length
  • collection_pellets
  • subject_diet
  • subject_antibiotic
  • subject_antidiarrheal
  • subject_fiber
  • subject_probiotic
  • subject_steroid
  • subject_bristol
  • subject_holding
  • collection_pair_access
  • collection_datetime
  • collection_antibiotic
  • collection_medium
  • collection_line
  • collection_LiCl_M
Warning: Date Formatting in Excel

Excel sometimes silently reformats dates in CSV files, so make sure every date has been switched back to yyyy-mm-dd format before reading the file with the code below.

Show the code
# Read the working inventory for the selected sample type and recompute each
# sample's `status` from how far any of its rows has progressed.
#
# Status is decided per sampleID:
#   control    - the sampleID contains "NTC"
#   sequenced  - at least one row has a seq_run
#   libprepped - no seq_run, but at least one row has a libraryID
#   extracted  - no libraryID, but at least one row has an extractID
#   collected  - none of the above
#
# The stage tests use any() over the whole sample rather than the first row
# after sorting: arrange() sorts by extractID first, so a sample whose
# earliest extract or library was never taken further used to be labelled
# with that row's stage even when a later one had been sequenced.
#
# NOTE(review): select(any_of(inventory_cols$compilation)) drops every column
# not named there (e.g. collection_datetime and collection_box, which the
# fecals column spec parses), and this table is later written back over
# sample_inventory.csv -- confirm that dropping them is intended.
sample_inventory <- read_csv(
  here(paste0("inventories/", params$sample_type, "/sample_inventory.csv")),
  col_types = inventory_csv_cols[[params$sample_type]]
) %>%
  group_by(sampleID) %>%
  # Kept so that rows tied on the final sort below stay ordered by seq_run.
  arrange(extractID, libraryID, seq_run) %>%
  mutate(status = case_when(
    str_detect(sampleID, "NTC") ~ "control",
    any(!is.na(seq_run))        ~ "sequenced",
    any(!is.na(libraryID))      ~ "libprepped",
    any(!is.na(extractID))      ~ "extracted",
    TRUE                        ~ "collected"
  )) %>%
  ungroup() %>%
  arrange(sampleset, sampleID, extractID, libraryID) %>%
  select(any_of(inventory_cols$compilation)) %>%
  distinct()

3 Organize Inventory List

Show the code
# Split the compiled inventory into one de-duplicated table per stage.
# Controls (status == "control") go only into `controls`; every other table
# is built from the non-control rows.
inventory <- list(
  # Every non-control row with the full column set, in a stable sort order.
  compilation = sample_inventory %>%
    filter(status != "control") %>%
    select(any_of(inventory_cols$compilation)) %>%
    distinct() %>%
    arrange(sampleset, sampleID, extractID, libraryID, seq_run),
  # One row per distinct combination of sample-level fields.
  samples = sample_inventory %>%
    filter(status != "control") %>%
    select(any_of(inventory_cols$samples)) %>%
    distinct(),
  # Only rows that actually have an extract.
  extracts = sample_inventory %>%
    filter(status != "control") %>%
    select(any_of(inventory_cols$extracts)) %>%
    filter(!is.na(extractID)) %>%
    distinct(),
  # Only rows that actually have a library.
  libraries = sample_inventory %>%
    filter(status != "control") %>%
    select(any_of(inventory_cols$libraries)) %>%
    filter(!is.na(libraryID)) %>%
    distinct(),
  controls = sample_inventory %>%
    filter(status == "control") %>%
    select(any_of(inventory_cols$controls)) %>%
    distinct()
)

4 Export Updated Files

Show the code
# Archive the existing inventory.RData (if any) before it is replaced.
version_control_check(
  dir_path       = file.path("inventories", params$sample_type),
  filename       = "inventory",
  file_extension = ".RData"
)
[1] "inventories/isolates directory exists."
[1] "version_archive directory already exists"
[1] "previous file version moved to archive"
Show the code
# Write the list of per-stage tables.
save(
  inventory,
  file = here("inventories", params$sample_type, "inventory.RData")
)

# Archive the existing sample_inventory.RData (if any) before it is replaced.
version_control_check(
  dir_path       = file.path("inventories", params$sample_type),
  filename       = "sample_inventory",
  file_extension = ".RData"
)
[1] "inventories/isolates directory exists."
[1] "version_archive directory already exists"
[1] "previous file version moved to archive"
Show the code
# Write the compiled sample inventory table.
save(
  sample_inventory,
  file = here("inventories", params$sample_type, "sample_inventory.RData")
)


# Archive the existing sample_inventory.csv (if any) before it is replaced.
version_control_check(
  dir_path       = file.path("inventories", params$sample_type),
  filename       = "sample_inventory",
  file_extension = ".csv"
)
[1] "inventories/isolates directory exists."
[1] "version_archive directory already exists"
[1] "previous file version moved to archive"
Show the code
# Overwrite the working CSV with the updated table; NA is written as an
# empty field.
write_csv(
  sample_inventory,
  here("inventories", params$sample_type, "sample_inventory.csv"),
  na = ""
)


# Copy inventory.RData into dashboards/inventories/<sample_type>/.
dashboard_transfer(
  dir_path       = file.path("inventories", params$sample_type),
  dash_path      = file.path("inventories", params$sample_type),
  filename       = "inventory",
  file_extension = ".RData"
)
[1] "Dashboard subdirectory already exists"
[1] "File available for dashboard use now."

5 Download Data

Use the links below to access the current GitHub repository version of our inventories.