BigMindLab · danoguevara · Aug 24, 2025 · Aug 24, 2025 · Aug 24, 2025 · Aug 24, 2025
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -3,3 +3,4 @@
 ^README\.Rmd$
 ^LICENSE\.md$
 ^\.github$
+^data-raw$
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -46,3 +46,6 @@ Suggests:
     tidyr,
     tidyselect
 Roxygen: list(markdown = TRUE)
+Depends: 
+    R (>= 3.5)
+LazyData: true
diff --git a/R/raw_counts.R b/R/raw_counts.R
@@ -0,0 +1,16 @@
+#' GDC TCGA Lung Adenocarcinoma (LUAD) - Raw STAR counts
+#'
+#' A subset of TCGA-LUAD RNA-seq gene-level counts generated by STAR and distributed via UCSC Xena.
+#' Data originate from GDC; only selected samples are included.
+#' Gene identifiers are Ensembl IDs with the version suffix removed (e.g., "ENSG00000141510.15" → "ENSG00000141510").
+#' Sample barcodes were compacted and written with dots instead of dashes (e.g., "TCGA-38-4627-11A" → "TCGA.38.4627.11A").
+#'
+#' @format ## `raw_counts`
+#' A data frame with 60,660 rows and 32 columns:
+#' \describe{
+#'   \item{rownames}{Ensembl gene IDs (GRCh38) with version suffix stripped (no ".##").}
+#'   \item{columns}{TCGA sample IDs written with dots instead of dashes, e.g., `TCGA.38.4627.11A`.}
+#'   \item{values}{Raw STAR gene-level counts, back-transformed (`2^x - 1`) from the log2(count + 1) values in the source file.}
+#' }
+#' @source <https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz>
+"raw_counts"
diff --git a/R/sampledata.R b/R/sampledata.R
@@ -0,0 +1,24 @@
+#' GDC TCGA Lung Adenocarcinoma (LUAD) - Metadata
+#'
+#' Sample information for the subset of TCGA-LUAD RNA-seq gene-level counts generated by STAR and distributed via UCSC Xena.
+#' Data originate from GDC; only selected samples are included.
+#' Sample barcodes were compacted and written with dots instead of dashes (e.g., "TCGA-38-4627-11A" → "TCGA.38.4627.11A").
+#'
+#' @format ## `sampledata`
+#' A data frame with 32 rows and 12 columns:
+#' \describe{
+#' \item{patient_id}{TCGA sample ID written with dots instead of dashes, e.g., \code{TCGA.38.4627.11A}.}
+#' \item{sample_type}{Sample category, e.g., \code{"tumor"} or \code{"normal"}.}
+#' \item{age}{Age (years) at diagnosis/collection.}
+#' \item{race_demographic}{Self-reported race/ethnicity (e.g., \code{"white"}).}
+#' \item{sex}{Biological sex (\code{"male"} / \code{"female"}).}
+#' \item{status}{Vital status at last follow-up (\code{"Alive"} / \code{"Dead"}).}
+#' \item{pathologic_stage}{Overall AJCC pathologic stage, e.g., \code{"Stage IA"}, \code{"Stage IIIA"}.}
+#' \item{pathologic_t}{Primary tumor (T) category, e.g., \code{"T1b"}, \code{"T2"}.}
+#' \item{smoking_status}{Smoking history from TCGA clinical (free text), e.g., \code{"Lifelong Non-smoker"}.}
+#' \item{agents}{Therapeutic agents administered (free text; may be empty).}
+#' \item{treatment_response}{Clinical response to therapy (free text; may be empty).}
+#' \item{treatment_type}{Type of therapy (free text; may be empty).}
+#' }
+#' @source <https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz>
+"sampledata"
diff --git a/data-raw/raw_counts.R b/data-raw/raw_counts.R
@@ -0,0 +1,38 @@
+# STAR Counts - TCGA LUAD -------------------------
+
+# August, 2025
+# Source: https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz
+
+# Builds the packaged `raw_counts` dataset: reads the STAR count table and the
+# reduced sample sheet, moves the Ensembl IDs (version suffix stripped) into the
+# row names, keeps only the samples listed in the sample sheet and converts the
+# values back to counts. Input paths are relative to the working directory.
+
+raw_counts <- read.delim("../TCGA-LUAD.star_counts.tsv.gz", sep = "\t", header = TRUE)
+sampledata <- read.delim("../TCGA-LUAD.samples.reduced.tsv", sep = "\t", header = TRUE)
+
+# Drop a trailing ".<digits>" version from each gene ID; other IDs are unchanged.
+gids <- sub("\\.\\d+$", "", raw_counts$Ensembl_ID)
+raw_counts <- raw_counts[, -1]
+rownames(raw_counts) <- gids
+
+# read.delim() defaults to check.names = TRUE, which turns the dashes of the
+# barcodes in the header into dots; convert the sample sheet IDs the same way.
+sampledata$patient_id <- gsub("-", ".", sampledata$patient_id)
+
+# Fail with a readable message if the sample sheet lists an unknown sample.
+missing_ids <- setdiff(sampledata$patient_id, colnames(raw_counts))
+if (length(missing_ids) > 0) {
+  stop(
+    "Samples missing from the count table: ",
+    paste(missing_ids, collapse = ", "),
+    call. = FALSE
+  )
+}
+raw_counts <- raw_counts[, sampledata$patient_id, drop = FALSE]
+
+# 2^x - 1 inverts a log2(count + 1) transform. The result is only approximately
+# whole in floating point, so round it to get true counts.
+raw_counts <- round(2^raw_counts - 1)
+
+usethis::use_data(raw_counts, compress = "xz", overwrite = TRUE)
diff --git a/data-raw/sampledata.R b/data-raw/sampledata.R
@@ -0,0 +1,13 @@
+# Metadata - TCGA LUAD -------------------------
+
+# August, 2025
+# Source: https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz
+
+# Load the reduced sample sheet (tab-separated, with a header row).
+sampledata <- read.delim("../TCGA-LUAD.samples.reduced.tsv")
+
+# Replace every dash in the sample IDs with a dot.
+sampledata[["patient_id"]] <- gsub("-", ".", sampledata[["patient_id"]], fixed = TRUE)
+
+# Store the table as data/sampledata.rda (xz-compressed, replacing any existing file).
+usethis::use_data(sampledata, overwrite = TRUE, compress = "xz")
diff --git a/data/raw_counts.rda b/data/raw_counts.rda
diff --git a/data/sampledata.rda b/data/sampledata.rda
diff --git a/man/raw_counts.Rd b/man/raw_counts.Rd
diff --git a/man/sampledata.Rd b/man/sampledata.Rd