diff --git a/.Rbuildignore b/.Rbuildignore index b08190c..475c93d 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -3,3 +3,4 @@ ^README\.Rmd$ ^LICENSE\.md$ ^\.github$ +^data-raw$ diff --git a/DESCRIPTION b/DESCRIPTION index 1dfc1a5..10510ef 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -46,3 +46,6 @@ Suggests: tidyr, tidyselect Roxygen: list(markdown = TRUE) +Depends: + R (>= 3.5) +LazyData: true diff --git a/R/raw_counts.R b/R/raw_counts.R new file mode 100644 index 0000000..205423e --- /dev/null +++ b/R/raw_counts.R @@ -0,0 +1,16 @@ +#' GDC TCGA Lung Adenocarcinoma (LUAD) - Raw STAR counts +#' +#' A subset of TCGA-LUAD RNA-seq gene-level counts generated by STAR and distributed via UCSC Xena. +#' Data originate from GDC, only selected samples are included. +#' Gene identifiers are Ensembl IDs with the version suffix removed (e.g., "ENSG00000141510.15" → "ENSG00000141510"). +#' Sample barcodes were compacted and written with dots instead of dashes (e.g., "TCGA-38-4627-11A" → "TCGA.38.4627.11A"). +#' +#' @format ## `raw_counts` +#' A data frame with 60,660 rows and 32 columns: +#' \describe{ +#' \item{rownames}{Ensembl gene IDs (GRCh38) with version suffix stripped (no ".##").} +#' \item{columns}{TCGA sample IDs written with dots instead of dashes, e.g., `TCGA.38.4627.11A`.} +#' \item{values}{Integer raw counts from STAR gene quantification (untransformed).} +#' } +#' @source \url{https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz} +"raw_counts" diff --git a/R/sampledata.R b/R/sampledata.R new file mode 100644 index 0000000..0ed9f7e --- /dev/null +++ b/R/sampledata.R @@ -0,0 +1,24 @@ +#' GDC TCGA Lung Adenocarcinoma (LUAD) - Metadata +#' +#' Samples information of the subset of TCGA-LUAD RNA-seq gene-level counts generated by STAR and distributed via UCSC Xena. +#' Data originate from GDC, only selected samples are included. +#' Sample barcodes were compacted and written with dots instead of dashes (e.g., "TCGA-38-4627-11A" → "TCGA.38.4627.11A").
+#'
+#' @format ## `sampledata`
+#' A data frame with 32 rows and 12 columns:
+#' \describe{
+#' \item{patient_id}{TCGA sample ID written with dots instead of dashes, e.g., \code{TCGA.38.4627.11A}.}
+#' \item{sample_type}{Sample category, e.g., \code{"tumor"} or \code{"normal"}.}
+#' \item{age}{Age (years) at diagnosis/collection.}
+#' \item{race_demographic}{Self-reported race/ethnicity (e.g., \code{"white"}).}
+#' \item{sex}{Biological sex (\code{"male"} / \code{"female"}).}
+#' \item{status}{Vital status at last follow-up (\code{"Alive"} / \code{"Dead"}).}
+#' \item{pathologic_stage}{Overall AJCC pathologic stage, e.g., \code{"Stage IA"}, \code{"Stage IIIA"}.}
+#' \item{pathologic_t}{Primary tumor (T) category, e.g., \code{"T1b"}, \code{"T2"}.}
+#' \item{smoking_status}{Smoking history from TCGA clinical (free text), e.g., \code{"Lifelong Non-smoker"}.}
+#' \item{agents}{Therapeutic agents administered (free text; may be empty).}
+#' \item{treatment_response}{Clinical response to therapy (free text; may be empty).}
+#' \item{treatment_type}{Type of therapy (free text; may be empty).}
+#' }
+#' @source \url{https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz}
+"sampledata"
diff --git a/data-raw/raw_counts.R b/data-raw/raw_counts.R
new file mode 100644
index 0000000..4c6ab2d
--- /dev/null
+++ b/data-raw/raw_counts.R
@@ -0,0 +1,30 @@
+# STAR Counts - TCGA LUAD -------------------------
+
+# August, 2025
+# Source: https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz
+
+# Builds the `raw_counts` dataset: a genes x samples data frame of STAR counts
+# limited to the samples listed in the reduced metadata file. Both inputs are
+# read from the parent of the working directory.
+raw_counts <- read.delim("../TCGA-LUAD.star_counts.tsv.gz", sep = "\t", header = TRUE)
+sampledata <- read.csv("../TCGA-LUAD.samples.reduced.tsv", sep = "\t", header = TRUE)
+
+# Row names are the Ensembl IDs without the trailing ".<version>"; the ID
+# column is dropped so that only sample columns remain.
+gids <- sub("\\.\\d+$", "", raw_counts$Ensembl_ID)
+raw_counts <- raw_counts[, -1]
+rownames(raw_counts) <- gids
+
+# Invert log2(count + 1). NOTE(review): the input scale is inferred from this
+# inverse; confirm against the dataset page. round() strips the floating-point
+# residue of the back-transform so the stored values are whole numbers, as the
+# dataset documentation ("Integer raw counts") states.
+raw_counts <- round(2^raw_counts - 1)
+
+# read.delim() sanitises column names by default, turning dashes into dots, so
+# rewrite the metadata IDs the same way before selecting the sample columns in
+# metadata order.
+sampledata$patient_id <- gsub("-", ".", sampledata$patient_id)
+raw_counts <- raw_counts[, sampledata$patient_id]
+
+usethis::use_data(raw_counts, compress = "xz", overwrite = TRUE)
diff --git a/data-raw/sampledata.R b/data-raw/sampledata.R
new file mode 100644
index 0000000..4c7a941
--- /dev/null
+++ b/data-raw/sampledata.R
@@ -0,0 +1,13 @@
+# Metadata - TCGA LUAD -------------------------
+
+# August, 2025
+# Source: https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz
+# NOTE(review): this URL is the counts file, while the script reads
+# TCGA-LUAD.samples.reduced.tsv - confirm and record where that file comes from.
+
+# Builds the `sampledata` dataset from the reduced sample sheet.
+sampledata <- read.csv("../TCGA-LUAD.samples.reduced.tsv", sep = "\t", header = TRUE)
+# Write the IDs with dots instead of dashes, as the dataset documentation describes.
+sampledata$patient_id <- gsub("-", ".", sampledata$patient_id)
+
+usethis::use_data(sampledata, compress = "xz", overwrite = TRUE)
diff --git a/data/raw_counts.rda b/data/raw_counts.rda
new file mode 100644
index 0000000..ea791d6
Binary files /dev/null and b/data/raw_counts.rda differ
diff --git a/data/sampledata.rda b/data/sampledata.rda
new file mode 100644
index 0000000..6e956c6
Binary files /dev/null and b/data/sampledata.rda differ
diff --git a/man/raw_counts.Rd b/man/raw_counts.Rd
new file mode 100644
index 0000000..2880349
--- /dev/null
+++ b/man/raw_counts.Rd
@@ -0,0 +1,30 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/raw_counts.R
+\docType{data}
+\name{raw_counts}
+\alias{raw_counts}
+\title{GDC TCGA Lung Adenocarcinoma (LUAD) - Raw STAR counts}
+\format{
+\subsection{\code{raw_counts}}{
+
+A data frame with 60,660 rows and 32 columns:
+\describe{
+\item{rownames}{Ensembl gene IDs (GRCh38) with version suffix stripped (no ".##").}
+\item{columns}{TCGA sample IDs written with dots instead of dashes, e.g., \code{TCGA.38.4627.11A}.}
+\item{values}{Integer raw counts from STAR gene quantification (untransformed).}
+}
+}
+}
+\source{
+\url{https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz}
+}
+\usage{
+raw_counts
+}
+\description{
+A subset of TCGA-LUAD RNA-seq gene-level counts generated by STAR and distributed via UCSC Xena.
+Data originate from GDC, only selected samples are included.
+Gene identifiers are Ensembl IDs with the version suffix removed (e.g., "ENSG00000141510.15" → "ENSG00000141510"). +Sample barcodes were compacted and written with dots instead of dashes (e.g., "TCGA-38-4627-11A" → "TCGA.38.4627.11A"). +} +\keyword{datasets} diff --git a/man/sampledata.Rd b/man/sampledata.Rd new file mode 100644 index 0000000..fd22613 --- /dev/null +++ b/man/sampledata.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sampledata.R +\docType{data} +\name{sampledata} +\alias{sampledata} +\title{GDC TCGA Lung Adenocarcinoma (LUAD) - Metadata} +\format{ +\subsection{\code{sampledata}}{ + +A data frame with 32 rows and 12 columns: +\describe{ +\item{patient_id}{TCGA sample ID written with dots instead of dashes, e.g., \code{TCGA.38.4627.11A}.} +\item{sample_type}{Sample category, e.g., \code{"tumor"} or \code{"normal"}.} +\item{age}{Age (years) at diagnosis/collection.} +\item{race_demographic}{Self-reported race/ethnicity (e.g., \code{"white"}).} +\item{sex}{Biological sex (\code{"male"} / \code{"female"}).} +\item{status}{Vital status at last follow-up (\code{"Alive"} / \code{"Dead"}).} +\item{pathologic_stage}{Overall AJCC pathologic stage, e.g., \code{"Stage IA"}, \code{"Stage IIIA"}.} +\item{pathologic_t}{Primary tumor (T) category, e.g., \code{"T1b"}, \code{"T2"}.} +\item{smoking_status}{Smoking history from TCGA clinical (free text), e.g., \code{"Lifelong Non-smoker"}.} +\item{agents}{Therapeutic agents administered (free text; may be empty).} +\item{treatment_response}{Clinical response to therapy (free text; may be empty).} +\item{treatment_type}{Type of therapy (free text; may be empty).} +} +} +} +\source{ +\url{https://gdc-hub.s3.us-east-1.amazonaws.com/download/TCGA-LUAD.star_counts.tsv.gz} +} +\usage{ +sampledata +} +\description{ +Samples information of the subset of TCGA-LUAD RNA-seq gene-level counts generated by STAR and distributed via UCSC Xena. 
+Data originate from GDC, only selected samples are included. +Sample barcodes were compacted and written with dots instead of dashes (e.g., "TCGA-38-4627-11A" → "TCGA.38.4627.11A"). +} +\keyword{datasets}