Intro

This script streamlines and standardizes our handling of the steps taken to reach each dataset from an original sampleset. You will export some standardized csv and tsv tables for easy importing and manipulation with our other workflows.

Files Needed

To start, you should have four categories of csv files (I download the first three from working google spreadsheets, and the fourth category is a series of files automatically generated by each MinION sequencing run). Those files and their list of column headers to be used are as follows (note: it is fine to have extra columns, but you must at least have these to run the script as is):

  1. Libraries
    • SequenceID
    • Pipeline
    • LibraryTube
    • LibraryBarcode
    • ExtractID
    • Final_Library_Concentration
    • Volume.Added.to.Pool.(uL)
    • Seq_ID
    • Run_ID
    • LibraryTubeID
  2. Extracts
    • ExtractID
    • ExtractDate
    • ExtractedBy
    • ExtractType
    • ExtractKit
    • SampleID
    • ExtractConcentration
    • ExtractBox
    • ExtractNotes
  3. Samples
    • SampleID
    • SampleSubject
    • SampleDate
    • SampleCollectedBy
    • SampleNotes
  4. Barcode Alignments (1 file per Run_ID)
    • barcode
    • alias
    • type
    • target_unclassified
    • qcquisition_run_id
    • protocol_group_id
    • sample_id
    • flow_cell_id
    • started

Other Configuration Settings

Sampleset in Params

You can use the sampleset setting under params in the header of this script to select which sampleset you will be working with. So long as the same name is used consistently, this should automatically filter for that name (e.g., loris or marmoset).

Sequencing Run Lists

The code in the chunk above also generated a list of formatted codes for each available sequencing run to date, separated by taxa/samplesets (currently just for loris and marmoset). Make sure the end number matches the highest integer we have for that sampleset to date.

Script

Barcode Alignments

barcodes <- imap(seqruns, ~ {
  map(.x, ~ read_table(barcode_alignments[[.x]]) %>% mutate(seqrun = .x)) 
}) %>%
    bind_rows() %>%
                        as_tibble() %>%
                        filter(barcode     != "unclassified") %>%
                        mutate(SeqDateTime  = as_datetime(started)) %>%
                        mutate(SeqDate      = floor_date(SeqDateTime, unit = "day")) %>%
                        mutate(SeqRunID     = str_replace_all(sample_id, "pool1", "PL001")) %>%
                      mutate(LibraryCode    = str_squish(str_trim(seqrun      , "both")),
                             FlowCellSerial = str_squish(str_trim(flow_cell_id, "both"))
                             ) %>%
                      mutate(LibraryBarcode  = as.numeric(str_remove_all(barcode, "16S|barcode0|barcode"))) %>%
                        select(LibraryCode,
                               LibraryBarcode,
                               reads_unclassified = target_unclassified,
                               FlowCellSerial,
                               protocol_group_id,
                               flow_cell_id,
                               SeqRunID,
                               SeqDate,
                               SeqDateTime)

write.table(barcodes, barcode_alignments$compilations[[paste0(params$sampleset)]],
            row.names = F,
            sep = "\t")

Sequencing Runs

seqrun.tbl <- read_csv(path$inventories$seqruns) %>% 
  rename_with(~str_replace_all(., "\\s", "_")) %>%
  mutate(SampleSet       = case_when(
    str_detect(Pooled_Library_Code, "CM")  ~ "marmoset", 
    str_detect(Pooled_Library_Code, "PL")  ~ "loris",
    str_detect(Pooled_Library_Code, "NWR") ~ "bats", .default = "unknown"),
         LibraryCode     = str_to_lower(Pooled_Library_Code),
         LibPrepWorkflow = case_when(
           str_detect(Kit, "LSK") & Pipeline == "16S" ~ "lsk16s",
           Pipeline == "Host mtDNA"                   ~ "lskadaptive",
           str_detect(Kit, "SQK-16S") & Pipeline == "16S" ~ "rapid16s"),
         LibPrepDate     = mdy(Run_Date),
         SeqRunDate      = ymd(str_remove_all(str_trim(Run_ID, "both"), "MIN_16_|MIN_16-|MIN_MT_"))) %>%
  mutate(LibraryCode     = str_replace_all(LibraryCode, "pl00|pl0", "hdz"),
         strands         = 2,
         fragment_type   = if_else(Pipeline == "16S", 3, 1),
         Length          = if_else(Pipeline == "16S", 1500, 10000),
         InputMassStart  = if_else(Pipeline == "16S", 10, 1000),
         TemplateVolPrep = if_else(LibPrepWorkflow == "rapid16s", 15, 47),
         PoolSamples     = if_else(Pipeline == "16S", "yes", "no"),
         InputMassFinal  = 50
         ) %>%
  filter(SampleSet == params$sampleset) %>%
  select(
         SampleSet,
         LibraryCode,
         LibPrepDate,
         LibPrepWorkflow,
         LibPrepKit      = Kit,
         FlowCellSerial  = Flow_Cell_ID,
         FlowCellType    = Flow_Cell_Type,
         FlongleAdapter  = Flongle_Adapter,
         SeqDevice       = Sequencer,
         strands,
         fragment_type,
         Length,
         InputMassStart,
         TemplateVolPrep,
         PoolSamples,
         InputMassFinal)

Sample Records

samples     <- read_csv(path$inventories$collection) %>% 
  rename_with(~str_replace_all(., "\\s", "_")) %>%
                      filter(str_starts(SampleID, "\\w+")) %>% 
                      select(-SampleBox)  %>%
                      mutate(SampleID = str_squish(str_trim(SampleID, "both"))) %>% distinct() %>%
                      mutate(CollectionDate     = mdy(SampleDate),
                             Subject            = str_squish(str_trim(SampleSubject)),
                             .keep = "unused") %>% distinct() %>%
                      mutate(Subj_Certainty = if_else(Subject %in% subject_list, "yes", "no")) %>%
                      mutate(Subject        = str_remove_all(Subject, "\\?"))

DNA Extract Records

We will also join the previous sample records to this table at the end of the chunk.

extracts <- read_csv(path$inventories$extraction) %>% 
  rename_with(~str_replace_all(., "\\s", "_")) %>%
  filter(str_starts(SampleID, "\\w+")) %>%
  mutate(SampleID        = if_else(str_detect(SampleID, "#N/A") | is.na(SampleID), "ExtractControl", SampleID)) %>%
  mutate(SampleID = str_squish(str_trim(SampleID, "both")),
         ExtractID= str_squish(str_trim(ExtractID, "both")),
         ExtractDate       = mdy(ExtractDate)) %>%
  mutate(ExtractConc       = str_remove_all(ExtractConcentration, ">"), .keep = "unused") %>%
  mutate(ExtractConc = if_else(str_detect(ExtractConc, "Higher"), "100", ExtractConc),
         ExtractConc = if_else(str_detect(ExtractConc, "HIGHER"), "100", ExtractConc),
         ExtractConc = if_else(ExtractConc == "LOW", "0", ExtractConc),
         ExtractConc = if_else(ExtractConc == "", NA, ExtractConc)) %>%
  mutate(ExtractConc = round(as.numeric(ExtractConc), 1))  %>% filter(ExtractType == "DNA" | ExtractType == "dna") %>%
  select(-ExtractType) %>%
  right_join(samples) %>% distinct()

Libraries and Combining all Records

compilation <- read_csv(path$inventories$libraries, show_col_types = FALSE, col_types = list(LibraryBarcode = col_character()))  %>% 
  rename_with(~str_replace_all(., "\\s", "_")) %>%
  rename_with(~str_remove_all(., "\\(|\\)")) %>%
  filter(str_starts(SequenceID, "\\w+") & Seq_ID != "#N/A") %>%
  mutate(LibraryCode     = str_to_lower(Seq_ID)) %>%
  mutate(LibraryCode     = str_replace_all(LibraryCode, "pl00|pl0" , "hdz"),
         SampVolPool     = round(as.numeric(Volume_Added_to_Pool_uL), 0),
         LibraryBarcode  = as.numeric(str_remove_all(LibraryBarcode, "16S|barcode0|barcode"))) %>%
  mutate(TotalPoolVol    = sum(SampVolPool), .by = LibraryCode) %>%
  mutate(BeadVol         = TotalPoolVol * 0.6) %>%
  select(SequenceID,
         LibraryCode,
         LibraryTube,
         LibraryBarcode,
         ExtractID,
         SampVolPool,
         TotalPoolVol,
         BeadVol,
         Conc_QC2    = Final_Library_Concentration) %>%
  full_join(extracts, by = join_by(ExtractID)) %>% 
  mutate(SampleID = if_else(is.na(SampleID) & !is.na(ExtractID), "NTC", SampleID)) %>%
  full_join(barcodes, by = join_by(LibraryCode, LibraryBarcode)) %>%
  select(-FlowCellSerial) %>%
  left_join(seqrun.tbl, by = join_by(LibraryCode)) %>% distinct() %>%
  mutate(steps_remaining = case_when(
    is.na(ExtractID) ~ "sample not extracted",
    is.na(SequenceID) ~ "extract not sequenced",
    !is.na(ExtractID) & !is.na(SequenceID) & !is.na(SampleID) ~ "sample extracted and sequenced"
  )) %>%
  relocate(SampleID, ExtractID, SequenceID, steps_remaining) %>%
  arrange(CollectionDate, Subject)

Exporting a Spreadsheet with Records

We will use this spreadsheet for building the metadata table but also for calling up sample info in our protocol apps.

write.table(compilation,
            path$inventories$all_stages,
            row.names = F,
            sep = "\t")

Counting Replicates

count.extracts    <- extracts %>% select(ExtractID, SampleID) %>% distinct() %>% 
  group_by(SampleID)  %>% 
  mutate(n_dna_extracts = n_distinct(ExtractID)) %>% ungroup() %>% select(-ExtractID)

count.libraries <- compilation %>% select(SequenceID, ExtractID, SampleID) %>% distinct() %>% 
  group_by(ExtractID) %>% mutate(n_16s_extract = n_distinct(SequenceID)) %>% ungroup() %>%
  group_by(SampleID)  %>% mutate(n_16s_sample  = n_distinct(SequenceID)) %>% ungroup() %>% select(-SequenceID)

Exporting SampleSheets formatted for Dorado

samplesheet <- compilation %>%
  filter(steps_remaining == "sample extracted and sequenced") %>%
                      mutate(barcode = if_else(LibraryBarcode < 10, 
                                               as.character(str_glue("barcode0", "{LibraryBarcode}")),
                                               as.character(str_glue("barcode" , "{LibraryBarcode}")))) %>%
                      select(flow_cell_id  = FlowCellSerial,
                             experiment_id = protocol_group_id,
                             kit           = LibPrepKit,
                             barcode,
                             alias         = SequenceID,
                             seqrun        = LibraryCode)

write.table(samplesheet, 
          sample_sheets$compilations[[paste0(params$sampleset)]],
          row.names = F,
          quote     = F,
          sep       = ",")

Splitting Samplesheet to Individual Files for Each Run

samplesheet.nested <- samplesheet %>% nest(.by = seqrun) %>%
  deframe()

Next Step

Now you should proceed to the Read Processing workflow to begin basecalling the sequencing run.

---
title: "Sample Inventory"
author: "Alicia Rich"
date: "`r Sys.Date()`"
output:
  html_document:
    theme:
      bslib: true
    toc: true
    toc_depth: 3
    toc_float: true
    df_print: paged
    css: journal.css
    code_download: true
params:
  sampleset: "loris"
  seqrun: "hdz18"
                     
---

```{r setup, include = F}

library(bslib)
library(conflicted)
library(htmltools)
library(htmlwidgets)
library(tidyverse)

source("setup/conflicted.R")
source("setup/knit_engines_simple.R")
source("setup/inputs_read_processing.R")


knitr::opts_chunk$set(
  message       = FALSE,
  warning       = FALSE,
  echo          = TRUE,
  include       = TRUE,
  eval          = TRUE,
  comment       = ""  ,
  df_print      = "kable",
  skimr_digits  = 2        )

knitr::opts_knit$set(
  message       = FALSE,
  warning       = FALSE,
  echo          = TRUE,
  include       = TRUE,
  eval          = TRUE,
  comment       = ""  ,
  df_print      = "kable",
  skimr_digits  = 2        
  )

seqruns <- seqruns %>% keep_at(params$sampleset)  %>% list_flatten(name_spec = "")

global            <- config::get(config = "default")
loris             <- config::get(config = "loris")
barcode_alignments<- config::get(config = "barcode_alignments")
sample_sheets     <- config::get(config = "sample_sheets")
path              <- config::get(config = paste0(params$sampleset))

subject_list <- keep_at(subjects, paste0(params$sampleset)) %>% list_flatten(name_spec = "{inner}")

```


# Intro

This script streamlines and standardizes our handling of the steps taken to reach each dataset from an original sampleset. You will export some standardized csv and tsv tables for easy importing and manipulation with our other workflows.

## Files Needed

To start, you should have four categories of csv files (I download the first three from working google spreadsheets, and the fourth category is a series of files automatically generated by each MinION sequencing run). Those files and their list of column headers to be used are as follows (note: it is fine to have extra columns, but you must at least have these to run the script as is):  

```{r, echo = FALSE}
page_fluid(
    accordion(
      open = FALSE,
      accordion_panel(
        "Show/Hide File List",
        tagList(tagList(
    withTags(
      ol(
        li("Libraries"),
          ul(
            li("SequenceID"),
            li("Pipeline"),
            li("LibraryTube"),
            li("LibraryBarcode"),
            li("ExtractID"),
            li("Final_Library_Concentration"),
            li("Volume.Added.to.Pool.(uL)"),
            li("Seq_ID"),
            li("Run_ID"),
            li("LibraryTubeID")
          ),
        li("Extracts"),
          ul(
            li("ExtractID"),
            li("ExtractDate"),
            li("ExtractedBy"),
            li("ExtractType"),
            li("ExtractKit"),
            li("SampleID"),
            li("ExtractConcentration"),
            li("ExtractBox"),
            li("ExtractNotes")
            ),
        li("Samples"),
          ul(  
            li("SampleID"),
            li("SampleSubject"),
            li("SampleDate"),
            li("SampleCollectedBy"),
            li("SampleNotes")
            ),
        li("Barcode Alignments (1 file per Run_ID)"),
          ul(
            li("barcode"),
            li("alias"),
            li("type"),
            li("target_unclassified"),
            li("qcquisition_run_id"),
            li("protocol_group_id"),
            li("sample_id"),
            li("flow_cell_id"),
            li("started")
            )
        )
      )
    )
    )
    )
    )
    )

```


## Other Configuration Settings

### Sampleset in Params

You can use the sampleset setting under params in the header of this script to select which sampleset you will be working with. So long as the same name is used consistently, this should automatically filter for that name (e.g., loris or marmoset). 

### Sequencing Run Lists

The code in the chunk above also generated a list of formatted codes for each available sequencing run to date, separated by taxa/samplesets (currently just for loris and marmoset). Make sure the end number matches the highest integer we have for that sampleset to date.

# Script

## Barcode Alignments

```{r}
barcodes <- imap(seqruns, ~ {
  map(.x, ~ read_table(barcode_alignments[[.x]]) %>% mutate(seqrun = .x)) 
}) %>%
    bind_rows() %>%
                        as_tibble() %>%
                        filter(barcode     != "unclassified") %>%
                        mutate(SeqDateTime  = as_datetime(started)) %>%
                        mutate(SeqDate      = floor_date(SeqDateTime, unit = "day")) %>%
                        mutate(SeqRunID     = str_replace_all(sample_id, "pool1", "PL001")) %>%
                      mutate(LibraryCode    = str_squish(str_trim(seqrun      , "both")),
                             FlowCellSerial = str_squish(str_trim(flow_cell_id, "both"))
                             ) %>%
                      mutate(LibraryBarcode  = as.numeric(str_remove_all(barcode, "16S|barcode0|barcode"))) %>%
                        select(LibraryCode,
                               LibraryBarcode,
                               reads_unclassified = target_unclassified,
                               FlowCellSerial,
                               protocol_group_id,
                               flow_cell_id,
                               SeqRunID,
                               SeqDate,
                               SeqDateTime)

write.table(barcodes, barcode_alignments$compilations[[paste0(params$sampleset)]],
            row.names = F,
            sep = "\t")
```

## Sequencing Runs

```{r, message=FALSE, warning=FALSE}
seqrun.tbl <- read_csv(path$inventories$seqruns) %>% 
  rename_with(~str_replace_all(., "\\s", "_")) %>%
  mutate(SampleSet       = case_when(
    str_detect(Pooled_Library_Code, "CM")  ~ "marmoset", 
    str_detect(Pooled_Library_Code, "PL")  ~ "loris",
    str_detect(Pooled_Library_Code, "NWR") ~ "bats", .default = "unknown"),
         LibraryCode     = str_to_lower(Pooled_Library_Code),
         LibPrepWorkflow = case_when(
           str_detect(Kit, "LSK") & Pipeline == "16S" ~ "lsk16s",
           Pipeline == "Host mtDNA"                   ~ "lskadaptive",
           str_detect(Kit, "SQK-16S") & Pipeline == "16S" ~ "rapid16s"),
         LibPrepDate     = mdy(Run_Date),
         SeqRunDate      = ymd(str_remove_all(str_trim(Run_ID, "both"), "MIN_16_|MIN_16-|MIN_MT_"))) %>%
  mutate(LibraryCode     = str_replace_all(LibraryCode, "pl00|pl0", "hdz"),
         strands         = 2,
         fragment_type   = if_else(Pipeline == "16S", 3, 1),
         Length          = if_else(Pipeline == "16S", 1500, 10000),
         InputMassStart  = if_else(Pipeline == "16S", 10, 1000),
         TemplateVolPrep = if_else(LibPrepWorkflow == "rapid16s", 15, 47),
         PoolSamples     = if_else(Pipeline == "16S", "yes", "no"),
         InputMassFinal  = 50
         ) %>%
  filter(SampleSet == params$sampleset) %>%
  select(
         SampleSet,
         LibraryCode,
         LibPrepDate,
         LibPrepWorkflow,
         LibPrepKit      = Kit,
         FlowCellSerial  = Flow_Cell_ID,
         FlowCellType    = Flow_Cell_Type,
         FlongleAdapter  = Flongle_Adapter,
         SeqDevice       = Sequencer,
         strands,
         fragment_type,
         Length,
         InputMassStart,
         TemplateVolPrep,
         PoolSamples,
         InputMassFinal)
```


## Sample Records

```{r, warning = FALSE}
samples     <- read_csv(path$inventories$collection) %>% 
  rename_with(~str_replace_all(., "\\s", "_")) %>%
                      filter(str_starts(SampleID, "\\w+")) %>% 
                      select(-SampleBox)  %>%
                      mutate(SampleID = str_squish(str_trim(SampleID, "both"))) %>% distinct() %>%
                      mutate(CollectionDate     = mdy(SampleDate),
                             Subject            = str_squish(str_trim(SampleSubject)),
                             .keep = "unused") %>% distinct() %>%
                      mutate(Subj_Certainty = if_else(Subject %in% subject_list, "yes", "no")) %>%
                      mutate(Subject        = str_remove_all(Subject, "\\?"))
```

## DNA Extract Records

We will also join the previous sample records to this table at the end of the chunk.

```{r}
extracts <- read_csv(path$inventories$extraction) %>% 
  rename_with(~str_replace_all(., "\\s", "_")) %>%
  filter(str_starts(SampleID, "\\w+")) %>%
  mutate(SampleID        = if_else(str_detect(SampleID, "#N/A") | is.na(SampleID), "ExtractControl", SampleID)) %>%
  mutate(SampleID = str_squish(str_trim(SampleID, "both")),
         ExtractID= str_squish(str_trim(ExtractID, "both")),
         ExtractDate       = mdy(ExtractDate)) %>%
  mutate(ExtractConc       = str_remove_all(ExtractConcentration, ">"), .keep = "unused") %>%
  mutate(ExtractConc = if_else(str_detect(ExtractConc, "Higher"), "100", ExtractConc),
         ExtractConc = if_else(str_detect(ExtractConc, "HIGHER"), "100", ExtractConc),
         ExtractConc = if_else(ExtractConc == "LOW", "0", ExtractConc),
         ExtractConc = if_else(ExtractConc == "", NA, ExtractConc)) %>%
  mutate(ExtractConc = round(as.numeric(ExtractConc), 1))  %>% filter(ExtractType == "DNA" | ExtractType == "dna") %>%
  select(-ExtractType) %>%
  right_join(samples) %>% distinct()
```

## Libraries and Combining all Records


```{r}
compilation <- read_csv(path$inventories$libraries, show_col_types = FALSE, col_types = list(LibraryBarcode = col_character()))  %>% 
  rename_with(~str_replace_all(., "\\s", "_")) %>%
  rename_with(~str_remove_all(., "\\(|\\)")) %>%
  filter(str_starts(SequenceID, "\\w+") & Seq_ID != "#N/A") %>%
  mutate(LibraryCode     = str_to_lower(Seq_ID)) %>%
  mutate(LibraryCode     = str_replace_all(LibraryCode, "pl00|pl0" , "hdz"),
         SampVolPool     = round(as.numeric(Volume_Added_to_Pool_uL), 0),
         LibraryBarcode  = as.numeric(str_remove_all(LibraryBarcode, "16S|barcode0|barcode"))) %>%
  mutate(TotalPoolVol    = sum(SampVolPool), .by = LibraryCode) %>%
  mutate(BeadVol         = TotalPoolVol * 0.6) %>%
  select(SequenceID,
         LibraryCode,
         LibraryTube,
         LibraryBarcode,
         ExtractID,
         SampVolPool,
         TotalPoolVol,
         BeadVol,
         Conc_QC2    = Final_Library_Concentration) %>%
  full_join(extracts, by = join_by(ExtractID)) %>% 
  mutate(SampleID = if_else(is.na(SampleID) & !is.na(ExtractID), "NTC", SampleID)) %>%
  full_join(barcodes, by = join_by(LibraryCode, LibraryBarcode)) %>%
  select(-FlowCellSerial) %>%
  left_join(seqrun.tbl, by = join_by(LibraryCode)) %>% distinct() %>%
  mutate(steps_remaining = case_when(
    is.na(ExtractID) ~ "sample not extracted",
    is.na(SequenceID) ~ "extract not sequenced",
    !is.na(ExtractID) & !is.na(SequenceID) & !is.na(SampleID) ~ "sample extracted and sequenced"
  )) %>%
  relocate(SampleID, ExtractID, SequenceID, steps_remaining) %>%
  arrange(CollectionDate, Subject)
```

### Exporting a Spreadsheet with Records

We will use this spreadsheet for building the metadata table but also for calling up sample info in our protocol apps.

```{r, message=FALSE, warning=FALSE}
write.table(compilation,
            path$inventories$all_stages,
            row.names = F,
            sep = "\t")
```


## Counting Replicates

```{r}
count.extracts    <- extracts %>% select(ExtractID, SampleID) %>% distinct() %>% 
  group_by(SampleID)  %>% 
  mutate(n_dna_extracts = n_distinct(ExtractID)) %>% ungroup() %>% select(-ExtractID)

count.libraries <- compilation %>% select(SequenceID, ExtractID, SampleID) %>% distinct() %>% 
  group_by(ExtractID) %>% mutate(n_16s_extract = n_distinct(SequenceID)) %>% ungroup() %>%
  group_by(SampleID)  %>% mutate(n_16s_sample  = n_distinct(SequenceID)) %>% ungroup() %>% select(-SequenceID)
```

## Exporting SampleSheets formatted for Dorado

```{r, message=FALSE, warning=FALSE}
samplesheet <- compilation %>%
  filter(steps_remaining == "sample extracted and sequenced") %>%
                      mutate(barcode = if_else(LibraryBarcode < 10, 
                                               as.character(str_glue("barcode0", "{LibraryBarcode}")),
                                               as.character(str_glue("barcode" , "{LibraryBarcode}")))) %>%
                      select(flow_cell_id  = FlowCellSerial,
                             experiment_id = protocol_group_id,
                             kit           = LibPrepKit,
                             barcode,
                             alias         = SequenceID,
                             seqrun        = LibraryCode)

write.table(samplesheet, 
          sample_sheets$compilations[[paste0(params$sampleset)]],
          row.names = F,
          quote     = F,
          sep       = ",")
```

### Splitting Samplesheet to Individual Files for Each Run


```{r}
samplesheet.nested <- samplesheet %>% nest(.by = seqrun) %>%
  deframe()
```


```{r, include = FALSE}
imap(samplesheet.nested, ~ {
  write.table(.x, (sample_sheets[[.y]]),
          row.names = F,
          quote     = F,
          sep       = ",")
})
```



# Next Step

>Now you should proceed to the Read Processing workflow to begin basecalling the sequencing run.
