Manage Sample Inventories

Rich Lab for Molecular Health

Author

Alicia M. Rich, Ph.D.

Published

Thursday, September 25, 2025

Modified

Thursday, September 25, 2025

1 Setup

Show the code

# Declare where this document lives relative to the project root so that
# later here() calls resolve paths from that root.
here::i_am("scripts/sample_inventory.qmd")
# Run the shared project setup script.
# NOTE(review): presumably this attaches the packages that are called
# unqualified below (here, readr, dplyr, fs, lubridate, ...) - confirm.
source(here::here("_common.R"))
# Load the column lists and helper functions used by this document.
source(here("scripts/sample_inventory.R"))

Expand to view the functions and lists loaded in the setup chunk of this document.

# Declare this script's location relative to the project root.
# NOTE(review): i_am() is called unqualified, so the `here` package must
# already be attached when this script is sourced - confirm.
i_am("scripts/sample_inventory.R")


# Column type specifications used as `col_types` when reading a sample
# inventory CSV, keyed by sample type (`isolates`, `fecals`).
# Columns that are not named here are presumably left to readr's default
# type guessing - confirm.
# NOTE(review): there is no entry for "environmental"; indexing this list
# with that sample type yields NULL - confirm that fallback is intended.
inventory_csv_cols <- list(
  # Isolates: dates, integer run numbers, and numeric measurements.
  isolates = cols(
    collection_date            = col_date(),
    extraction_date            = col_date(),
    libprep_date               = col_date(),
    libprep_run                = col_integer(),
    seq_date                   = col_date(),
    collection_LiCl_M          = col_number(),
    extract_run                = col_integer(),
    extraction_sample_input_ul = col_number(),
    extraction_elution_ul      = col_number(),
    extraction_elution_temp    = col_number(),
    extract_ng_ul              = col_number(),
    libprep_extract_ul         = col_number(),
    libprep_extract_h20        = col_number(),
    lib_ng_ul_1                = col_number(),
    lib_ng_ul_2                = col_number(),
    lib_ul_added               = col_number(),
    seq_depth                  = col_number(),
    seq_qscore_mean            = col_number(),
    seq_length_mean            = col_number()
  ),
  # Fecals: adds subject- and field-collection columns.
  # NOTE(review): run numbers (`extract_run`, `libprep_run`) are typed with
  # col_number() here but col_integer() in `isolates` - confirm the
  # difference is intended.
  fecals = cols(
    seq_run                   = col_number(),
    subject_mass              = col_number(),
    subject_forearm           = col_number(),
    subject_antibiotic        = col_number(),
    subject_antidiarrheal     = col_number(),
    subject_fiber             = col_number(),
    subject_probiotic         = col_number(),
    subject_steroid           = col_number(),
    subject_bristol           = col_number(),
    collection_box            = col_number(),
    collection_date           = col_date(),
    collection_datetime       = col_datetime(),
    collection_latitude       = col_number(),
    collection_longitude      = col_number(),
    collection_pellets        = col_number(),
    extract_box               = col_number(),
    extract_run               = col_number(),
    extraction_date           = col_date(),
    extraction_sample_input_ul= col_number(),
    extraction_elution_ul     = col_number(),
    extraction_elution_temp   = col_number(),
    extract_ng_ul             = col_number(),
    libprep_date              = col_date(),
    libprep_run               = col_number(),
    libprep_barcode           = col_number(),
    libprep_extract_ul        = col_number(),
    libprep_extract_h20       = col_number(),
    lib_ng_ul_1               = col_number(),
    lib_ng_ul_2               = col_number(),
    lib_ul_added              = col_number(),
    seq_date                  = col_date(),
    seq_depth                 = col_number(),
    seq_qscore_mean           = col_number(),
    seq_length_mean           = col_number()
  )
)

# Ordered column-name vectors used with `select(any_of(...))` to build each
# inventory table. The order of names within a vector is the column order
# of the resulting table; names missing from the data are skipped by
# `any_of()`.
#
#   ids         - identifier columns linking samples, extracts, libraries,
#                 and sequencing runs
#   controls    - columns kept for control rows
#   compilation - every column kept in the combined inventory
#   samples     - sample- and subject-level collection columns
#   extracts    - extraction-level columns
#   libraries   - library-prep and sequencing-level columns
#
# Each name appears at most once per vector; a repeated name adds nothing
# to a selection and only makes the lists harder to audit.
#
# NOTE(review): `collection_datetime` and `collection_box` are typed in
# `inventory_csv_cols$fecals` but are not listed in any vector here, so a
# `select(any_of(inventory_cols$compilation))` drops them - confirm that
# is intended.
inventory_cols <- list(
  ids = c(
    "sampleID",
    "extractID",
    "extract_run",
    "libraryID",
    "libprep_run",
    "seq_run",
    "seq_run_id"
  ),
  controls = c(
    "extractID",
    "extract_run",
    "status",
    "sampleset",
    "extraction_date",
    "libprep_date",
    "seq_date",
    "extract_box",
    "extraction_by",
    "extraction_target",
    "extraction_kit",
    "extraction_protocol",
    "extraction_sample_input_ul",
    "extraction_elution_ul",
    "extraction_elution_temp",
    "extract_ng_ul",
    "extraction_note",
    "libprep_run",
    "libprep_protocol",
    "libprep_kit",
    "libprep_extract_ul",
    "libprep_extract_h20",
    "lib_ng_ul_1",
    "lib_ng_ul_2",
    "lib_ul_added",
    "libprep_note",
    "libprep_barcode",
    "seq_run_id",
    "seq_by",
    "seq_device",
    "seq_position",
    "seq_flowcell",
    "seq_flowcell_id",
    "seq_flongle",
    "seq_depth",
    "seq_qscore_mean",
    "seq_length_mean"
  ),
  compilation = c(
    "sampleID",
    "extractID",
    "libraryID",
    "seq_run",
    "status",
    "sampleset",
    "collection_date",
    "extraction_date",
    "libprep_date",
    "seq_date",
    "collection_stabilizer",
    "collection_by",
    "collection_source",
    "collection_site",
    "subject",
    "subject_genus",
    "subject_species",
    "subject_confirmed",
    "subject_sex",
    "subject_age_category",
    "subject_repro_status",
    "subject_mass",
    "subject_mass_confirmed",
    "subject_forearm",
    "collection_latitude",
    "collection_longitude",
    "collection_net_length",
    "collection_pellets",
    "subject_diet",
    "subject_antibiotic",
    "subject_antidiarrheal",
    "subject_fiber",
    "subject_probiotic",
    "subject_steroid",
    "subject_bristol",
    "subject_holding",
    "collection_pair_access",
    "collection_antibiotic",
    "collection_medium",
    "collection_line",
    "collection_LiCl_M",
    "collection_note",
    "extract_box",
    "extract_run",
    "extraction_by",
    "extraction_target",
    "extraction_kit",
    "extraction_protocol",
    "extraction_sample_input_ul",
    "extraction_elution_ul",
    "extraction_elution_temp",
    "extract_ng_ul",
    "extraction_note",
    "libprep_run",
    "libprep_protocol",
    "libprep_kit",
    "libprep_extract_ul",
    "libprep_extract_h20",
    "lib_ng_ul_1",
    "lib_ng_ul_2",
    "lib_ul_added",
    "libprep_note",
    "libprep_barcode",
    "seq_run_id",
    "seq_by",
    "seq_device",
    "seq_position",
    "seq_flowcell",
    "seq_flowcell_id",
    "seq_flongle",
    "seq_depth",
    "seq_qscore_mean",
    "seq_length_mean"
  ),
  samples = c(
    "sampleID",
    "status",
    "sampleset",
    "collection_date",
    "collection_stabilizer",
    "collection_by",
    "collection_source",
    "collection_site",
    "subject",
    "subject_genus",
    "subject_species",
    "subject_confirmed",
    "subject_sex",
    "subject_age_category",
    "subject_repro_status",
    "subject_mass",
    "subject_mass_confirmed",
    "subject_forearm",
    "collection_latitude",
    "collection_longitude",
    "collection_net_length",
    "collection_pellets",
    "subject_diet",
    "subject_antibiotic",
    "subject_antidiarrheal",
    "subject_fiber",
    "subject_probiotic",
    "subject_steroid",
    "subject_bristol",
    "subject_holding",
    "collection_pair_access",
    "collection_antibiotic",
    "collection_medium",
    "collection_line",
    "collection_LiCl_M",
    "collection_note"
  ),
  extracts = c(
    "sampleID",
    "extractID",
    "extract_run",
    "extract_box",
    "extraction_date",
    "extraction_by",
    "extraction_target",
    "extraction_kit",
    "extraction_protocol",
    "extraction_sample_input_ul",
    "extraction_elution_ul",
    "extraction_elution_temp",
    "extract_ng_ul",
    "extraction_note"
  ),
  libraries = c(
    "sampleID",
    "extractID",
    "libraryID",
    "libprep_run",
    "libprep_date",
    "libprep_protocol",
    "libprep_kit",
    "libprep_extract_ul",
    "libprep_extract_h20",
    "lib_ng_ul_1",
    "lib_ng_ul_2",
    "lib_ul_added",
    "libprep_note",
    "libprep_barcode",
    "seq_run",
    "seq_run_id",
    "seq_date",
    "seq_by",
    "seq_device",
    "seq_position",
    "seq_flowcell",
    "seq_flowcell_id",
    "seq_flongle",
    "seq_depth",
    "seq_qscore_mean",
    "seq_length_mean"
  )
)

# Copy the current version of a file into a dated archive before the file
# is overwritten.
#
# dir_path       Directory (relative to the project root) holding the file;
#                created if it does not exist.
# filename       File name without its extension.
# file_extension Extension including the leading dot (e.g. ".csv").
#
# If `<dir_path>/<filename><file_extension>` exists, it is copied to
# `<dir_path>/version_archive/<filename>_archive_<today><file_extension>`,
# replacing any archive already made today. Progress is reported with
# print().
version_control_check <- function(dir_path, filename, file_extension) {
  target_dir <- here(dir_path)
  if (dir_exists(target_dir)) {
    print(sprintf("%s directory exists.", dir_path))
  } else {
    dir_create(target_dir)
  }

  current_file <- here(paste0(dir_path, "/", filename, file_extension))
  archive_file <- paste0(
    "/version_archive/", filename, "_archive_", ymd(today()), file_extension
  )

  if (!file_exists(current_file)) {
    print("no previous file version exists")
  } else {
    archive_dir <- here(paste0(dir_path, "/version_archive"))
    if (dir_exists(archive_dir)) {
      print("version_archive directory already exists")
    } else {
      dir_create(archive_dir)
      print("version_archive directory created")
    }
    # The live file is copied, not moved: it stays in place until the
    # caller overwrites it.
    file_copy(
      current_file,
      here(paste0(dir_path, archive_file)),
      overwrite = TRUE
    )
    print("previous file version moved to archive")
  }
}

# Copy a file into the project's `dashboards/` tree.
#
# dir_path       Directory (relative to the project root) holding the file.
# dash_path      Subdirectory under `dashboards/` to copy into; created if
#                it does not exist.
# filename       File name without its extension.
# file_extension Extension including the leading dot (e.g. ".RData").
#
# An existing copy at the destination is overwritten. Progress is reported
# with print().
dashboard_transfer <- function(dir_path, dash_path, filename, file_extension) {
  dash_dir <- here(paste0("dashboards/", dash_path))
  if (dir_exists(dash_dir)) {
    print("Dashboard subdirectory already exists")
  } else {
    dir_create(dash_dir)
    print("Dashboard subdirectory created")
  }

  file_name <- paste0(filename, file_extension)
  file_copy(
    here(paste0(dir_path, "/", file_name)),
    here(paste0("dashboards/", dash_path, "/", file_name)),
    overwrite = TRUE
  )
  print("File available for dashboard use now.")
}


# Read a CSV using its guessed encoding and return character columns as
# UTF-8.
#
# path      Path to the CSV file.
# col_types Column specification passed on to readr::read_csv().
#
# The encoding is taken from the top candidate of readr::guess_encoding()
# over the first 5000 lines; "UTF-8" is used when no candidate is returned.
# Returns the tibble produced by readr::read_csv(), with every character
# column passed through enc2utf8().
read_csv_utf8 <- function(path, col_types = readr::cols()) {
  enc_guess <- readr::guess_encoding(path, n_max = 5000)$encoding[1]
  # Scalar decision, so a plain if/else; also covers a zero-length guess.
  enc_use <- if (length(enc_guess) == 0L || is.na(enc_guess)) {
    "UTF-8"
  } else {
    enc_guess
  }
  readr::read_csv(
    path,
    col_types = col_types,
    locale = readr::locale(encoding = enc_use)
  ) %>%
    dplyr::mutate(dplyr::across(where(is.character), enc2utf8))
}

# Repair mis-encoded micro-unit text (e.g. "ng/µL") in every character
# column of a data frame.
#
# df  A data frame / tibble.
#
# Returns `df` with its character columns rewritten so that the micro
# prefix is always the micro sign U+00B5. Non-character columns are left
# untouched.
fix_units_glitches <- function(df) {
  df %>%
    dplyr::mutate(dplyr::across(
      where(is.character),
      \(x) {
        # 1) Compatibility-normalize. NFKC folds the micro sign U+00B5 into
        #    Greek mu U+03BC, so the patterns below must expect U+03BC.
        x <- stringi::stri_trans_general(x, "NFKC")
        # 2) Canonicalize common micro variants to the micro sign U+00B5.
        #    Rules are applied in order, so the two-character mojibake rule
        #    must come before the bare Greek-mu rule.
        x <- stringr::str_replace_all(
          x,
          c(
            # "Â" + micro: UTF-8 bytes mis-decoded as CP1252. The micro may
            # be U+03BC after step 1, so accept both code points.
            "\u00C2[\u00B5\u03BC]" = "\u00B5",
            # U+7121: presumably a mis-decoded "µL" byte pair, which is why
            # the replacement carries the "L" - confirm.
            "\u7121"      = "\u00B5L",
            "\u03BC"      = "\u00B5",  # Greek mu to micro sign
            # NOTE(review): this replaces EVERY replacement character, not
            # only those in a unit context - confirm that is intended.
            "\uFFFD"      = "\u00B5"
          )
        )
        # 3) Fix common ASCII fallbacks (uL -> µL) when used as a unit
        x <- stringr::str_replace_all(x, "(?<=[/\\s])uL\\b", "\u00B5L")
        # 4) As a last resort, ANY single non-ASCII char between a slash
        #    and an "L" is treated as a micro sign.
        x <- stringr::str_replace_all(x, "(?<=/)\\P{ASCII}(?=L\\b)", "\u00B5")
        # 5) Tidy spacing variants ("/ µL" -> "/µL")
        x <- stringr::str_replace_all(x, "/\\s*\u00B5L\\b", "/\u00B5L")
        x
      }
    ))
}

2 Import Data

Set params$sample_type to one of isolates, fecals, or environmental to import and update the correct inventory files using the code below. The input file should contain the columns in the lists below. (I recommend including any column you have no data for as an empty column, so the code stays reproducible.)

Columns for All Sample Types

status
sampleID
sampleset
extractID
extract_run
libraryID
libprep_run
seq_run
seq_run_id

collection_date
collection_stabilizer
collection_by
collection_source
collection_site

extract_box
extract_run
extraction_date
extraction_by
extraction_target
extraction_kit
extraction_protocol
extraction_sample_input_ul
extraction_elution_ul
extraction_elution_temp
extract_ng_ul
extraction_note

libprep_date
libprep_run
libprep_protocol
libprep_kit
libprep_extract_ul
libprep_extract_h20
lib_ng_ul_1
lib_ng_ul_2
lib_ul_added
libprep_note
libprep_barcode
seq_run_id
seq_date
seq_by
seq_device
seq_position
seq_flowcell
seq_flowcell_id
seq_flongle
seq_depth
seq_qscore_mean
seq_length_mean

Collection Columns for Fecals Only

subject
subject_genus
subject_species
subject_confirmed
subject_sex
subject_age_category
subject_repro_status
collection_note

subject_mass
subject_mass_confirmed
subject_forearm
collection_latitude
collection_longitude
collection_net_length
collection_pellets

subject_diet
subject_antibiotic
subject_antidiarrheal
subject_fiber
subject_probiotic
subject_steroid
subject_bristol
subject_holding
collection_pair_access

collection_datetime

Collection Columns for Isolates Only

collection_antibiotic
collection_medium
collection_line
collection_LiCl_M

Warning: Date Formatting in Excel

Excel sometimes silently reformats dates in CSV files, so make sure every date is back in yyyy-mm-dd format before you read the file with the code below.

Show the code

# Read the sample inventory CSV for the selected sample type and assign
# every row a per-sample processing status.
sample_inventory <- read_csv(
  here(paste0("inventories/", params$sample_type, "/sample_inventory.csv")),
  col_types = inventory_csv_cols[[params$sample_type]]
  ) %>%
  group_by(sampleID) %>%
  arrange(extractID, libraryID, seq_run) %>%
  # Status is the furthest stage reached by ANY row of the sample. Testing
  # only the first row after the sort under-reports a sample whose
  # first-sorted extract or library was never taken further while a later
  # one was.
  mutate(status = case_when(
    str_detect(sampleID, "NTC") ~ "control",
    any(!is.na(seq_run))        ~ "sequenced",
    any(!is.na(libraryID))      ~ "libprepped",
    any(!is.na(extractID))      ~ "extracted",
    TRUE                        ~ "collected"
  )) %>%
  ungroup() %>%
  arrange(sampleset, sampleID, extractID, libraryID) %>%
  # Keep only the compilation columns; any other column in the CSV is
  # dropped here.
  select(any_of(inventory_cols$compilation)) %>%
  distinct()

3 Organize Inventory List

Show the code

# Split the compiled inventory into one table per processing stage.
# Control rows go only into `controls`; every other table is built from
# the non-control rows. Each table keeps the columns named in the matching
# `inventory_cols` vector and has duplicate rows removed.
inventory <- local({
  control_rows <- filter(sample_inventory, status == "control")
  sample_rows  <- filter(sample_inventory, status != "control")

  list(
    compilation = sample_rows %>%
      select(any_of(inventory_cols$compilation)) %>%
      distinct() %>%
      arrange(sampleset, sampleID, extractID, libraryID, seq_run),
    samples = sample_rows %>%
      select(any_of(inventory_cols$samples)) %>%
      distinct(),
    # Rows without an extract ID are not extracts.
    extracts = sample_rows %>%
      select(any_of(inventory_cols$extracts)) %>%
      filter(!is.na(extractID)) %>%
      distinct(),
    # Rows without a library ID are not libraries.
    libraries = sample_rows %>%
      select(any_of(inventory_cols$libraries)) %>%
      filter(!is.na(libraryID)) %>%
      distinct(),
    controls = control_rows %>%
      select(any_of(inventory_cols$controls)) %>%
      distinct()
  )
})

4 Export Updated Files

The chunk below will overwrite the main inventory files for this sample type if they already exist.

Show the code

# Archive the existing inventory.RData (if there is one) before it is
# overwritten.
version_control_check(
  paste0("inventories/", params$sample_type),
  "inventory",
  ".RData"
)

[1] "inventories/isolates directory exists."
[1] "version_archive directory already exists"
[1] "previous file version moved to archive"

Show the code

# Write the `inventory` list to inventory.RData, replacing any existing
# file.
save(
  inventory, 
  file = here(paste0("inventories/", params$sample_type, "/inventory.RData"))
  )

# Archive the existing sample_inventory.RData (if there is one) before it
# is overwritten.
version_control_check(
  paste0("inventories/", params$sample_type),
  "sample_inventory",
  ".RData"
)

[1] "inventories/isolates directory exists."
[1] "version_archive directory already exists"
[1] "previous file version moved to archive"

Show the code

# Write the `sample_inventory` table to sample_inventory.RData, replacing
# any existing file.
save(
  sample_inventory, 
  file = here(paste0("inventories/", params$sample_type, "/sample_inventory.RData"))
  )


# Archive the existing sample_inventory.csv (if there is one) before it is
# overwritten.
version_control_check(
  paste0("inventories/", params$sample_type),
  "sample_inventory",
  ".csv"
)

[1] "inventories/isolates directory exists."
[1] "version_archive directory already exists"
[1] "previous file version moved to archive"

Show the code

# Overwrite sample_inventory.csv with the processed table, writing missing
# values as empty cells. This is the same path the table was read from, so
# the rewritten file contains only the columns kept during import.
write_csv(
  sample_inventory, 
  here(paste0("inventories/", params$sample_type, "/sample_inventory.csv")),
  na = ""
  )


# Copy inventory.RData into the matching folder under dashboards/.
dashboard_transfer(
  paste0("inventories/", params$sample_type),
  paste0("inventories/", params$sample_type),
  "inventory",
  ".RData"
)

[1] "Dashboard subdirectory already exists"
[1] "File available for dashboard use now."

5 Download Data

Use the links below to access the current version of our inventories in the GitHub repo.

Copyright

This document is intended solely for members of the Rich Lab for Molecular Health at UNO. The content, including any data presented herein, is unpublished, private, and not for use, distribution, or publication without explicit written consent from Dr. Alicia M. Rich. Any adaptation or use of this material must acknowledge and cite this unpublished work.