rformassspectrometry · Heriette · Oct 17, 2025
diff --git a/R/readQFeaturesFromDIANN.R b/R/readQFeaturesFromDIANN.R
@@ -1,181 +1,207 @@
-##' @title  Read DIA-NN output as a QFeatures objects
-##'
-##' @description
-##'
-##' This function takes the `Report.tsv` output files from DIA-NN and
-##' converts them into a multi-set `QFeatures` object. It is a wrapper
-##' around [readQFeatures()] with default parameters set to match
-##' DIA-NN label-free and plexDIA report files: default `runCol` is
-##' `"File.Name"` and default quantCols` is `"Ms1.Area"`.
-##'
-##' @inheritParams readQFeatures
-##'
-##' @param extractedData A `data.frame` or any object that can be
-##'     coerced to a `data.frame` that contains the data from the
-##'     `*_ms1_extracted.tsv` file generated by DIA-NN. This argument
-##'     is optional and is currently only applicable for mTRAQ
-##'     multiplexed experiments where DIA-NN was run using the
-##'     `plexdia` module (see references).
-##'
-##' @param multiplexing A `character(1)` indicating the type of
-##'     multiplexing used in the experiment. One of `"none"` (default,
-##'     for label-free experiments) or `"mTRAQ"` (for plexDIA
-##'     experiments).
-##'
-##' @param ... Further arguments passed to [readQFeatures()].
-##'
-##' @return An instance of class `QFeatures`. The quantiative data of
-##'     each acquisition run is stored in a separate set as a
-##'     `SummarizedExperiment` object.
-##'
-##' @seealso
-##'
-##' - The `QFeatures` (see [QFeatures()]) class to read about how to
-##'   manipulate the resulting `QFeatures` object.
-##'
-##' - The [readQFeatures()] function which this one depends on.
-##'
-##' @references
-##' Derks, Jason, Andrew Leduc, Georg Wallmann, R. Gray Huffman,
-##' Matthew Willetts, Saad Khan, Harrison Specht, Markus Ralser,
-##' Vadim Demichev, and Nikolai Slavov. 2022. "Increasing the
-##' Throughput of Sensitive Proteomics by plexDIA." Nature
-##' Biotechnology, July.
-##' [Link to article](http://dx.doi.org/10.1038/s41587-022-01389-w)
-##'
-##' @author Laurent Gatto, Christophe Vanderaa
-##'
-##' @importFrom tidyr pivot_wider
-##' @importFrom tidyselect all_of
-##'
-##' @export
-##'
-##' @examples
-##'
-##' x <- read.delim(MsDataHub::benchmarkingDIA.tsv())
-##' x[["File.Name"]] <- x[["Run"]]
-##'
-##' #################################
-##' ## Label-free multi-set case
-##'
-##' ## using default arguments
-##' readQFeaturesFromDIANN(x)
-##'
-##' ## use the precursor identifier as assay rownames
-##' readQFeaturesFromDIANN(x, fnames = "Precursor.Id") |>
-##'     rownames()
-##'
-##' ## with a colData (and default arguments)
-##' cd <- data.frame(sampleInfo = LETTERS[1:24],
-##'                  quantCols = "Ms1.Area",
-##'                  runCol = unique(x[["File.Name"]]))
-##' readQFeaturesFromDIANN(x, colData = cd)
-##'
-##' #################################
-##' ## mTRAQ multi-set case
-##'
-##' x2 <- read.delim(MsDataHub::Report.Derks2022.plexDIA.tsv())
-##' x2[["File.Name"]] <- x2[["Run"]]
-##' readQFeaturesFromDIANN(x2, multiplexing = "mTRAQ")
-readQFeaturesFromDIANN <- function(assayData,
-                                   colData = NULL,
-                                   quantCols = "Ms1.Area",
-                                   runCol = "File.Name",
-                                   multiplexing = c("none", "mTRAQ"),
-                                   extractedData = NULL,
-                                   ecol = NULL,
-                                   verbose = TRUE,
-                                   ...) {
-    multiplexing <- match.arg(multiplexing, several.ok = FALSE)
-    if (multiplexing == "mTRAQ") {
-        if (verbose) message("Pivoting quantiative data.")
-        assayData <- .formatMtraqReportData(assayData, colData,
-                                      quantCols, runCol)
-        quantCols <- assayData[[2]]
-        assayData <- assayData[[1]]
-    } ## else is none
-    ans <- readQFeatures(assayData, colData = colData,
-                         quantCols = quantCols,
-                         runCol = runCol,
-                         ecol = ecol,
-                         ...)
-    if (!is.null(extractedData)) {
-        ans <- .addDiannExtractedData(ans, extractedData)
-    }
-    setQFeaturesType(ans, "bulk")
-}
-
-## (Only for mTRAQ multiplexing!) Internal function that extracts the
-## mTRAQlabels from the peptide sequence, removes the mTRAQ annotation
-## from the precursor ID, identifies constant columns within precursor
-## and puts the quantification data for different mTRAQ labels in
-## separate columns (wide format).
-.formatMtraqReportData <- function(assayData, colData, quantCols, runCol) {
-    assayData$Label <-
-        sub("^.*[Q-](\\d).*$", "\\1", assayData$Modified.Sequence)
-    assayData$Precursor.Id <-
-        gsub("\\(mTRAQ.*?\\)", "(mTRAQ)", assayData$Precursor.Id)
-    ## .checkLabelsInColData(colData, assayData)
-    idCols <- .findPrecursorVariables(assayData,
-                                      precursorId = "Precursor.Id",
-                                      runCol = runCol)
-    ans <- pivot_wider(
-        assayData, id_cols = all_of(idCols),
-        names_from = "Label",
-        values_from = all_of(quantCols)
-    )
-    list(assayData = ans,
-         quantCols = setdiff(colnames(ans), colnames(assayData)))
-}
-
-## Internal function that identifies which variables in the report
-## data are constant within each precursor within each run.
-.findPrecursorVariables <- function(assayData, precursorId, runCol) {
-    precIds <- paste0(assayData[[precursorId]], assayData[[runCol]])
-    nUniqueIds <- length(unique(precIds))
-    nLevels <- sapply(colnames(assayData), function(x) {
-        nrow(unique(assayData[, c(precursorId, runCol, x)]))
-    })
-    names(nLevels)[nLevels == nUniqueIds]
-}
-
-## Internal function that adds the extractedData to a QFeatures
-## object. The functions first converts the extractedData to a
-## SummarizedExperiment objects and subsets the SCE for the set of
-## shared samples. The added assay is automatically linked (using
-## AssayLinks) to the assayData.
-## Developer's note: the function assumes that DIA-NN creates sample
-## names in the extracted data by appending the labels to the run
-## names
-## @param object A QFeatures object containing DIA-NN report data, as
-##     generated by readQFeatures.
-## @param extractedData A data.frame or any object that can be coerced
-##     to a data.frame that contains the data from the
-##     `*_ms1_extracted.tsv` file generated by DIA-NN.
-.addDiannExtractedData <- function(object, extractedData) {
-    quantColPattern <- paste0(unique(object$Label), "$", collapse = "|")
-    quantCols <- grep(quantColPattern, colnames(extractedData))
-    extractedData <- readSummarizedExperiment(
-        extractedData, quantCols = quantCols, fnames = "Precursor.Id"
-    )
-    extractedData <- .keepSharedSamples(extractedData, object)
-    object <- addAssay(object, extractedData, name = "Ms1Extracted")
-    addAssayLink(
-        object,
-        from = grep("Ms1Extracted", names(object), invert = TRUE),
-        to = "Ms1Extracted",
-        varFrom = rep("Precursor.Id", length(names(object)) - 1),
-        varTo = "Precursor.Id"
-    )
-}
-
-## Internal functions that subsets the extractedData to keep only
-.keepSharedSamples <- function(extractedData, object) {
-    cnames <- unique(unlist(colnames(object)))
-    if (any(mis <- !cnames %in% colnames(extractedData)))
-        stop("Some columns present in assayData are not found in ",
-             "extracted data", paste0(cnames[mis], collapse = ", "),
-             "\nAre you sure the two tables were generated from ",
-             "the same experiment?")
-    extractedData[, cnames]
-}
+##' @title  Read DIA-NN output as a QFeatures objects
+##'
+##' @description
+##'
+##' This function takes the `Report.tsv` output files from DIA-NN and
+##' converts them into a multi-set `QFeatures` object. It is a wrapper
+##' around [readQFeatures()] with default parameters set to match
+##' DIA-NN label-free and plexDIA report files: default `runCol` is
+##' `"File.Name"` and default quantCols` is `"Ms1.Area"`.
+##'
+##' @inheritParams readQFeatures
+##'
+##' @param extractedData A `data.frame` or any object that can be
+##'     coerced to a `data.frame` that contains the data from the
+##'     `*_ms1_extracted.tsv` file generated by DIA-NN. This argument
+##'     is optional and is currently only applicable for mTRAQ
+##'     multiplexed experiments where DIA-NN was run using the
+##'     `plexdia` module (see references).
+##'
+##' @param multiplexing A `character(1)` indicating the type of
+##'     multiplexing used in the experiment. One of `"none"` (default,
+##'     for label-free experiments), `"mTRAQ"` (for multiplex experiments
+##'     with mTRAQ labeling) or `"dimethyl"` (for multiplex experiments
+##'     with dimethyl labeling).
+##'
+##' @param ... Further arguments passed to [readQFeatures()].
+##'
+##' @return An instance of class `QFeatures`. The quantiative data of
+##'     each acquisition run is stored in a separate set as a
+##'     `SummarizedExperiment` object.
+##'
+##' @seealso
+##'
+##' - The `QFeatures` (see [QFeatures()]) class to read about how to
+##'   manipulate the resulting `QFeatures` object.
+##'
+##' - The [readQFeatures()] function which this one depends on.
+##'
+##' @references
+##' Derks, Jason, Andrew Leduc, Georg Wallmann, R. Gray Huffman,
+##' Matthew Willetts, Saad Khan, Harrison Specht, Markus Ralser,
+##' Vadim Demichev, and Nikolai Slavov. 2022. "Increasing the
+##' Throughput of Sensitive Proteomics by plexDIA." Nature
+##' Biotechnology, July.
+##' [Link to article](http://dx.doi.org/10.1038/s41587-022-01389-w)
+##'
+##' @author Laurent Gatto, Christophe Vanderaa
+##'
+##' @importFrom tidyr pivot_wider
+##' @importFrom tidyselect all_of
+##'
+##' @export
+##'
+##' @examples
+##'
+##' x <- read.delim(MsDataHub::benchmarkingDIA.tsv())
+##' x[["File.Name"]] <- x[["Run"]]
+##'
+##' #################################
+##' ## Label-free multi-set case
+##'
+##' ## using default arguments
+##' readQFeaturesFromDIANN(x)
+##'
+##' ## use the precursor identifier as assay rownames
+##' readQFeaturesFromDIANN(x, fnames = "Precursor.Id") |>
+##'     rownames()
+##'
+##' ## with a colData (and default arguments)
+##' cd <- data.frame(sampleInfo = LETTERS[1:24],
+##'                  quantCols = "Ms1.Area",
+##'                  runCol = unique(x[["File.Name"]]))
+##' readQFeaturesFromDIANN(x, colData = cd)
+##'
+##' #################################
+##' ## mTRAQ multi-set case
+##'
+##' x2 <- read.delim(MsDataHub::Report.Derks2022.plexDIA.tsv())
+##' x2[["File.Name"]] <- x2[["Run"]]
+##' readQFeaturesFromDIANN(x2, multiplexing = "mTRAQ")
+##' 
+readQFeaturesFromDIANN <- function(assayData,
+                                   colData = NULL,
+                                   quantCols = "Ms1.Area",
+                                   runCol = "File.Name",
+                                   multiplexing = c("none", "mTRAQ", "dimethyl"),
+                                   extractedData = NULL,
+                                   ecol = NULL,
+                                   verbose = TRUE,
+                                   ...) {
+    multiplexing <- match.arg(multiplexing, several.ok = FALSE)
+    if (multiplexing == "mTRAQ") {
+        if (verbose) message("Pivoting quantiative data.")
+        assayData <- .formatMtraqReportData(assayData, colData,
+                                      quantCols, runCol)
+        quantCols <- assayData[[2]]
+        assayData <- assayData[[1]]
+    } else if (multiplexing == "dimethyl"){
+      if (verbose) message("Pivoting quantiative data.")
+      assayData <- .formatDimethylReportData(assayData, colData,
+                                             quantCols, runCol)
+      quantCols <- assayData[[2]]
+      assayData <- assayData[[1]]
+    } ## else is none
+    ans <- readQFeatures(assayData, colData = colData,
+                         quantCols = quantCols,
+                         runCol = runCol,
+                         ecol = ecol,
+                         ...)
+    if (!is.null(extractedData)) {
+        ans <- .addDiannExtractedData(ans, extractedData)
+    }
+    setQFeaturesType(ans, "bulk")
+}
+
+## (Only for mTRAQ multiplexing!) Internal function that extracts the
+## mTRAQlabels from the peptide sequence, removes the mTRAQ annotation
+## from the precursor ID, identifies constant columns within precursor
+## and puts the quantification data for different mTRAQ labels in
+## separate columns (wide format).
+.formatMtraqReportData <- function(assayData, colData, quantCols, runCol) {
+    assayData$Label <-
+        sub("^.*[Q-](\\d).*$", "\\1", assayData$Modified.Sequence)
+    assayData$Precursor.Id <-
+        gsub("\\(mTRAQ.*?\\)", "(mTRAQ)", assayData$Precursor.Id)
+    ## .checkLabelsInColData(colData, assayData)
+    idCols <- .findPrecursorVariables(assayData,
+                                      precursorId = "Precursor.Id",
+                                      runCol = runCol)
+    ans <- pivot_wider(
+        assayData, id_cols = all_of(idCols),
+        names_from = "Label",
+        values_from = all_of(quantCols)
+    )
+    list(assayData = ans,
+         quantCols = setdiff(colnames(ans), colnames(assayData)))
+}
+
+## (Only for dimethyl multiplexing!) Internal function that 
+## identifies constant columns within precursor and puts the 
+## quantification data for different channels in separate
+## columns (wide format).
+.formatDimethylReportData <- function(assayData, colData, quantCols, runCol) {
+  idCols <- .findPrecursorVariables(assayData,
+                                    precursorId = "Precursor.Id",
+                                    runCol = runCol)
+  ans <- pivot_wider(
+    assayData, id_cols = all_of(idCols),
+    names_from = "Channel",
+    values_from = all_of(quantCols)
+  )
+
+  list(assayData = ans,
+       quantCols = sort(setdiff(colnames(ans), colnames(assayData))))
+}
+
+## Internal function that identifies which variables in the report
+## data are constant within each precursor within each run.
+.findPrecursorVariables <- function(assayData, precursorId, runCol) {
+    precIds <- paste0(assayData[[precursorId]], assayData[[runCol]])
+    nUniqueIds <- length(unique(precIds))
+    nLevels <- sapply(colnames(assayData), function(x) {
+        nrow(unique(assayData[, c(precursorId, runCol, x)]))
+    })
+    names(nLevels)[nLevels == nUniqueIds]
+}
+
+## Internal function that adds the extractedData to a QFeatures
+## object. The functions first converts the extractedData to a
+## SummarizedExperiment objects and subsets the SCE for the set of
+## shared samples. The added assay is automatically linked (using
+## AssayLinks) to the assayData.
+## Developer's note: the function assumes that DIA-NN creates sample
+## names in the extracted data by appending the labels to the run
+## names
+## @param object A QFeatures object containing DIA-NN report data, as
+##     generated by readQFeatures.
+## @param extractedData A data.frame or any object that can be coerced
+##     to a data.frame that contains the data from the
+##     `*_ms1_extracted.tsv` file generated by DIA-NN.
+.addDiannExtractedData <- function(object, extractedData) {
+    quantColPattern <- paste0(unique(object$Label), "$", collapse = "|")
+    quantCols <- grep(quantColPattern, colnames(extractedData))
+    extractedData <- readSummarizedExperiment(
+        extractedData, quantCols = quantCols, fnames = "Precursor.Id"
+    )
+    extractedData <- .keepSharedSamples(extractedData, object)
+    object <- addAssay(object, extractedData, name = "Ms1Extracted")
+    addAssayLink(
+        object,
+        from = grep("Ms1Extracted", names(object), invert = TRUE),
+        to = "Ms1Extracted",
+        varFrom = rep("Precursor.Id", length(names(object)) - 1),
+        varTo = "Precursor.Id"
+    )
+}
+
+## Internal functions that subsets the extractedData to keep only
+.keepSharedSamples <- function(extractedData, object) {
+    cnames <- unique(unlist(colnames(object)))
+    if (any(mis <- !cnames %in% colnames(extractedData)))
+        stop("Some columns present in assayData are not found in ",
+             "extracted data", paste0(cnames[mis], collapse = ", "),
+             "\nAre you sure the two tables were generated from ",
+             "the same experiment?")
+    extractedData[, cnames]
+}