Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@ Depends: R (>= 3.4),
lubridate,
readr,
rio,
tidyr
tidyr,
DescTools,
xts
License: AGPL-3
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.0.2
RoxygenNote: 7.1.0
Suggests:
covr,
testthat (>= 2.1.0)
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ export(sdtm_time_actual)
export(simplify_sdtm_names)
export(strip_attributes)
export(supp_reformat)
export(validate_column_names)
importFrom(dplyr,anti_join)
importFrom(dplyr,group_by_at)
importFrom(dplyr,is_grouped_df)
Expand Down
217 changes: 217 additions & 0 deletions R/detect_column.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
#' Check if dataset is consistent with SDTMIG specifications.
#'
#' @param data The dataset whose columns to check.
#' @param sdtmig_version The version of SDTMIG (e.g. "SDTMIG3.3").
#' @param domain_name The domain specification ("CO", "DM", "SE", "SM", "SV") for checking columns.
#' @return If dataset is not consistent with SDTMIG specifications, give a warning or error and convert class-misspecified columns to specified class.
#' @export
validate_column_names <- function(data, sdtmig_version, domain_name) {

# Check if required columns exist, give an error if they do not exist.
validate_required_column_names <- function(data, sdtmig_version, domain_name) {
domain_spec <- domain_spec_list[[sdtmig_version]][[domain_name]]
required_cols <- domain_spec$`Variable Name`[domain_spec$Core %in% "Req"]
missing_requred <- setdiff(required_cols, names(data))
if (length(missing_requred) > 0) {
stop(
"The following required columns are missing: ",
paste(missing_requred, collapse=", ")
)
}
}

validate_required_column_names(data, sdtmig_version, domain_name)

# Check if expected columns exist, give a warning if they do not exist
validate_expected_column_names <- function(data,sdtmig_version, domain_name) {
domain_spec <- domain_spec_list[[sdtmig_version]][[domain_name]]
expected_cols <- domain_spec$`Variable Name`[domain_spec$Core %in% "Exp"]
missing_expected <- setdiff(expected_cols, names(data))
if (length(missing_expected) > 0) {
warning(
"The following expected columns are missing: ",
paste(missing_expected, collapse=", ")
)
}
}

validate_expected_column_names(data, sdtmig_version, domain_name)

# Check if permitted columns exist, no message either way
validate_permitted_column_names <- function(data,sdtmig_version, domain_name) {
domain_spec <- domain_spec_list[[sdtmig_version]][[domain_name]]
permitted_cols <- domain_spec$`Variable Name`[domain_spec$Core %in% "Perm"]
missing_permitted <- setdiff(permitted_cols, names(data))
if (length(missing_permitted) > 0) {
return(NULL)
}
}

validate_permitted_column_names(data,sdtmig_version, domain_name)

# Check if extra columns exist, give an warning if they do exist
validate_extra_column_names <- function(data,sdtmig_version, domain_name) {
domain_spec <- domain_spec_list[[sdtmig_version]][[domain_name]]
all_cols <- domain_spec$`Variable Name`
extra_cols <- setdiff(names(data),all_cols)
if (length(extra_cols) > 0) {
warning(
"The following extra columns do exist: ",
paste(extra_cols, collapse=", ")
)
}
}

validate_extra_column_names(data,sdtmig_version, domain_name)

# Check to see if columns have controlled terminology, give an error if columns do not have controlled terminology.
get_cdisc_controlled_terminology <- function(codelist){
data("terminology")
terminology <- force(terminology)
dat_term <- terminology[terminology$'CDISC Submission Value' %in% codelist,]
dat_term2 <- terminology[terminology$'Codelist Code' %in% dat_term$Code,]
controlled_terminology <- dat_term2$'CDISC Submission Value'
return(controlled_terminology)
}

validate_ctrl_terminology <- function(data,sdtmig_version,domain_name){
domain_spec <- domain_spec_list[[sdtmig_version]][[domain_name]]
term <- unlist(regmatches(domain_spec$'Controlled Terms, Codelist or Format',
gregexpr("(?=\\().*?(?<=\\))",
domain_spec$'Controlled Terms, Codelist or Format', perl=T)))
term2 <- gsub("[()]","",term)
var <- domain_spec[domain_spec$'Controlled Terms, Codelist or Format' %in% term,]$'Variable Name'
data_name <- names(data[names(data) %in% var])

noncontrol_col <- list()

for(i in 1:length(data_name)){
terminology <- get_cdisc_controlled_terminology(term2[i])
if(any(data[[data_name[i]]] %in% terminology) == F){
noncontrol_col[[i]] <- data_name[i]
}
}

noncontrol_col <- unlist(noncontrol_col)
if(length(noncontrol_col)>0){
stop(
"The following columns has entries not consistent with controlled terminology: ",
paste(noncontrol_col, collapse=", ")
)
}
}

validate_ctrl_terminology(data,sdtmig_version,domain_name)

# Check to see if columns are consistent with ISO format, give an error if columns are not consistent.
validate_iso <- function(data,sdtmig_version,domain_name){
domain_spec <- domain_spec_list[[sdtmig_version]][[domain_name]]
data("d.countries")
d_countries <- force(d.countries)
dat_iso <- domain_spec[grep('ISO',domain_spec$'Controlled Terms, Codelist or Format'),]
noniso_date_col <- list()
if('ISO 3166-1 Alpha-3' %in% dat_iso$'Controlled Terms, Codelist or Format'){
country <- dat_iso[dat_iso$'Controlled Terms, Codelist or Format'=='ISO 3166-1 Alpha-3',]
country_var <- country$'Variable Name'
if(any((data[[country_var]] %in% d_countries$a3)==F)){
stop(
"The following columns are not consistent with ISO 3166-1 Alpha-3 standards: ",
paste(country_var,collapse=", ")
)
}
}
if('ISO 8601' %in% dat_iso$'Controlled Terms, Codelist or Format'){
date <- dat_iso[dat_iso$'Controlled Terms, Codelist or Format'=='ISO 8601',]
for(i in date$'Variable Name'){
for(j in data[[i]]){
date_list <- NULL
tmp <- as.data.frame(suppressWarnings(.parseISO8601(j)))
date_list <- rbind(date_list,tmp)
if(any(is.na(date_list$first.time))){
noniso_date_col[[i]] <- i
}
}
}
}
noniso_date_col <- unlist(noniso_date_col)
if(length(noniso_date_col > 0)){
stop(
"The following columns are not consisent with ISO 8601: ",
paste(noniso_date_col, collapse=", ")
)
}
}

validate_iso(data,sdtmig_version,domain_name)

# Check to see if columns have the expected class (numeric or character),
#if not give a warning and convert to one or the other.
#Also, check to see if columns contain NA, give an error if they do.
validate_column_class <- function(data,sdtmig_version,domain_name){
domain_spec <- domain_spec_list[[sdtmig_version]][[domain_name]]
dat1 <- lapply(data,class)
dat1 <- lapply(dat1,function(l) l[[1]])
dat1 <- data.frame(Variable = names(unlist(dat1)), Type = unlist(dat1))
dat1$Variable <- as.character(dat1$Variable)
dat1$Type <- as.character(dat1$Type)
names(domain_spec)[names(domain_spec)=='Variable Name'] <- 'Variable'
domain_spec_type <- select(domain_spec,Variable,Type)
dat1$Type[dat1$Type == 'integer'] <- 'Num'
dat1$Type[dat1$Type == 'numeric'] <- 'Num'
dat1$Type[dat1$Type == 'logical'] <- 'Num'
dat1$Type[dat1$Type == 'character'] <- 'Char'
dat1 <- left_join(dat1,domain_spec_type,by='Variable')
names_num_char <- dat1$Variable[dat1$Type.x == 'Num' & dat1$Type.y== 'Char']
names_num_char <- names_num_char[is.na(names_num_char)==F]

if(length(names_num_char)>0){
warning(
"The following columns of class 'numeric' or 'logical' should be and have been converted to character: ",
paste(names_num_char[is.na(names_num_char)==F], collapse=", ")
)
}

names_char_num <- dat1$Variable[dat1$Type.x == 'Char' & dat1$Type.y== 'Num']
names_char_num <- names_char_num[is.na(names_char_num)==F]

if(length(names_char_num)>0) {
warning(
"The following columns of class 'character' should be and have been converted to numeric: ",
paste(names_char_num[is.na(names_char_num)==F], collapse=", ")
)
}

data_sub <- data

if(length(names_num_char)>0){
for (i in names_num_char){
data_sub[[i]] <- as.character(data[[i]])
}
}

if(length(names_char_num)>0) {
for (j in names_char_num){
data_sub[[j]] <- as.numeric(data[[j]])
}
}

# na_names <- list()
# for(i in names(data)){
# if(any(is.na(data[[i]]))){
# na_names[[i]] <- i
# }
# }
# na_names <- unlist(na_names)
# if(length(na_names) > 0) {
# stop(
# "The following columns in dataset have NA: ",
# paste(na_names, collapse=", ")
# )
# }

data <- data_sub
return(data)
}

validate_column_class(data,sdtmig_version, domain_name)
}
30 changes: 30 additions & 0 deletions data-raw/add_data.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Save the domain specification files into package
library(rio)
setwd('C:/Users/dengj25/Pfizer/AMP/Rsdtm/Branch2/Rsdtm/data-raw/SDTMIG_3.3/Domain_Specifications')
domain_spec_filnames <-
list.files(pattern=".csv$")
names(domain_spec_filnames) <- substr(domain_spec_filnames, 1, 2)
domain_specs <-
lapply(
X=domain_spec_filnames,
FUN=import
)

domain_spec_list <- list(SDTMIG3.3=list(CO=domain_specs$CO,
DM=domain_specs$DM,
SE=domain_specs$SE,
SM=domain_specs$SM,
SV=domain_specs$SV))

# Save the SDTM Terminology file into package
library(readxl)
library(httr)
url1<-'https://evs.nci.nih.gov/ftp1/CDISC/SDTM/SDTM%20Terminology.xls'
GET(url1, write_disk(tf <- tempfile(fileext = ".xls")))
terminology <- read_excel(tf, 2L)

setwd('C:/Users/dengj25/Pfizer/AMP/Rsdtm/Branch2/Rsdtm/R')

# Adding datasets to package
usethis::use_data(domain_spec_list)
usethis::use_data(terminology)
Binary file added data/domain_spec_list.rda
Binary file not shown.
Binary file added data/terminology.rda
Binary file not shown.
21 changes: 21 additions & 0 deletions man/validate_column_names.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading