Cainefm · Cainefm · Dec 25, 2021 · Dec 25, 2021 · Dec 25, 2021
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -2,8 +2,8 @@ Package: MND
 Type: Package
 Title: A package for epidemiology study about Motor neuron disease (MND) in the NEUROGEN
 Version: 0.1.0
-Depends: data.table, SCCS, lubridate
-Imports: data.table, SCCS, lubridate
+Depends: data.table, SCCS, lubridate, readxl, ggplot2, incidence2
+Imports: data.table, SCCS, lubridate, readxl, ggplot2, incidence2
 Suggests: 
 Author: c(person("FAN", "Min", email = "minfan@connect.hku.hk", role =c("aut", "cre")))
 Maintainer: FAN Min <minfan@connect.hku.hk>

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,7 +1,10 @@
 # Generated by roxygen2: do not edit by hand
 
 S3method(print,mndsccs)
+export(clean_4_survival)
 export(get_DT_Exposure_Endpoint)
 export(get_DT_SCCS)
-export(run)
+export(plot_inci)
+export(run_incidence)
+export(run_sccs)
 import(data.table)
diff --git a/R/clean.R b/R/clean.R
@@ -19,7 +19,7 @@ shrink_interval <- function(data,st,ed,gap=1){
     setorder(data,id,st)
     output <- data[,indx:=c(0, cumsum(as.numeric(shift(st,type="lead"))>
                                           cummax(as.numeric(ed)+gap))[-.N]),id
-    ][,.(st=first(st),ed=last(ed)),.(id,indx)]
+    ][,.(st=min(st),ed=max(ed)),.(id,indx)]
     setnames(output,c("st","ed"),c(st,ed))
     return(output)
 }
@@ -32,6 +32,7 @@ shrink_interval <- function(data,st,ed,gap=1){
 #' @param demo the dataset with demographic information, including id, dob, dod, sex, onset_date. Pls check the data shall.
 #' @param ip the dataset with all in-hospitalization records, including id, date of adminssion, date of discharge, type of records (setting, IP, OP, AE). Pls check the data shall.
 #' @param rx the dataset with all prescription records, including id, drug name, date of prescription start and end, type of presciption (IP, OP, AE, Discharge). Pls check the data shall.
+#' @param riluzole_name the name in your database stands for riluzole.
 #' @return a combined data set in data table format. One patients may have multiple rows of records.
 #' @import data.table
 #' @export
@@ -49,12 +50,16 @@ shrink_interval <- function(data,st,ed,gap=1){
 #' 2665:  9903734   F 1955-07-02 2018-11-28 2017-11-28 2018-07-25  2018-11-20 2017-11-27 2017-11-28
 #' 2666:  9903734   F 1955-07-02 2018-11-28 2017-11-28 2018-07-25  2018-11-20 2018-07-30 2018-08-10
 #' 2667:  9903734   F 1955-07-02 2018-11-28 2017-11-28 2018-07-25  2018-11-20 2018-10-31 2018-11-02
-get_DT_Exposure_Endpoint <- function(demo, rx, ip){
-    rx_riluzole<- shrink_interval(rx[grepl('riluzole|riluteck',drug_name,ignore.case = T) &
+get_DT_Exposure_Endpoint <- function(demo, rx, ip,riluzole_name='riluzole|riluteck',...){
+    rx_riluzole<- shrink_interval(rx[grepl(riluzole_name,drug_name,ignore.case = T) &
                                          !setting %in% c("I")],"date_rx_st","date_rx_end")
+    rx_earliest <- rx_riluzole[,.(earliest_rx=min(date_rx_st)),id][,unique(.SD)]
+
     ip_riluzole<- shrink_interval(ip[id %in% demo$id & id %in% rx_riluzole$id],"date_adm","date_dsg",gap = 7)
     rx_ip <- merge(rx_riluzole[,indx:=NULL],
                    ip_riluzole[,indx:=NULL],by="id",allow.cartesian = T)
+
+    rx_ip <- merge(rx_ip,rx_earliest,by="id",all.x=T)
     demo_rx_ip<-merge(demo,rx_ip,by="id")
     return(demo_rx_ip)
 }
@@ -70,9 +75,13 @@ get_DT_Exposure_Endpoint <- function(demo, rx, ip){
 #' @export
 #'
 #' @examples no example
-get_DT_SCCS <- function(data,obst="2001-08-24"){
-    df_mnd <- data[,`:=`(obst=pmax(onset_date,lubridate::ymd(obst),na.rm = T),
-                                obed=pmin(dod,onset_date %m+% years(2),na.rm=T))
+get_DT_SCCS <- function(data,...){
+    df_mnd <- data[,`:=`(obst=pmax(pmin(onset_date %m-% years(1),
+                                        earliest_rx %m-% years(1),na.rm=T),
+                                   lubridate::ymd(obst),na.rm = T),
+                         obed=pmin(dod,
+                                   pmin(onset_date ,
+                                        earliest_rx,na.rm=T) %m+% years(2),obed,na.rm=T))
     ][,`:=`(obst=as.numeric(obst-dob),
             obed=as.numeric(obed-dob),
             event=as.numeric(date_adm-dob),
@@ -81,6 +90,7 @@ get_DT_SCCS <- function(data,obst="2001-08-24"){
     df_mnd <- df_mnd[,`:=`(strx=as.numeric(date_rx_st-dob),
                            edrx=as.numeric(date_rx_end-dob))]
 
+    df_mnd <- df_mnd[strx>=obst & strx<= obed & event >=obst & event <= obed]
     # for more than one rx periods:
     last_rx_time <- unique(df_mnd[,.(id,strx,edrx)])[,last_rx_ed:=as.numeric(shift(edrx,n = 1,fill = NA,type = "lag")),.(id)]
 
@@ -140,3 +150,67 @@ get_DT_SCCS <- function(data,obst="2001-08-24"){
     return(df_mnd)
 }
 
+
+get_subtype <- function(data,icd_subtypes_temp){
+    apply(icd_subtypes_temp,1,function(x) data[,(paste0("subtype.",x[["abbr"]])):=fifelse(grepl(x[["grepl"]],codes),T,F)])
+}
+
+
+
+#' Create a dataset for survival and incidence calculation
+#'
+#' @param dx
+#' @param demo
+#' @param codes_sys can be "icd9", "icd10", or "readcodes"
+#'
+#' @return
+#' @export
+#'
+#' @examples
+clean_4_survival <- function(demo,dx,codes_sys,...){
+
+    if(codes_sys=="icd9"){
+        codes_defined <- "^335.2$|^335.2[01249]"
+    }else if(codes_sys=="icd10"){
+        codes_defined <- "^G12.2"
+    }else if(codes_sys=="readcode"){
+        stop("Don't worry. Fm is working on Readcode now.")
+    }
+
+    icd_subtypes <- as.data.table(readxl::read_excel("data/codes_mnd.xlsx",sheet = "subtype"))
+    icd_subtypes_temp <- icd_subtypes[,.(Dx, abbr, grepl=get(codes_sys))]
+    temp_dx <- copy(dx)
+    setorder(temp_dx,"id","ref_date")
+    temp_dx <- temp_dx[grepl(codes_defined,codes),.SD[ref_date==min(ref_date)],id
+                       ][,unique(.SD)][,.(id,onset_date=ref_date,codes=codes)]
+    temp_dx_codes <- dcast(temp_dx,id+onset_date~.,value.var = "codes",fun.aggregate = function(x) paste(unique(x),collapse = ","))
+    setnames(temp_dx_codes,".","codes")
+
+    get_subtype(temp_dx,icd_subtypes_temp)
+    temp_dx[,codes:=NULL]
+    temp_dx <- dcast(temp_dx,
+                     id+onset_date~.,
+                     value.var = c("subtype.als","subtype.pma","subtype.pbp","subtype.pls","subtype.others"),
+                     fun.aggregate = function(x) any(x))
+    temp_dx <- merge(temp_dx_codes,temp_dx,by=c("id","onset_date"))
+
+
+    df_surv <- merge(temp_dx,
+                     demo[,.(id,sex,dob,dod)],
+                     by="id",all.x=T)
+
+    df_surv[,outcome:=fifelse(!is.na(dod),1,0)]
+    df_surv[,obs.deadline:=fifelse(is.na(dod),ymd('20191231'),dod)]
+    df_surv[,age_adm:=as.numeric((onset_date-dob)/365.25)]
+    df_surv[,age_group:=cut(age_adm, breaks = c(0,6,13,20,48,65,80,Inf),
+                            include.lowest = T,right=FALSE)]
+    df_surv[,age_group_std:=cut(age_adm, breaks = c(seq(0,85,5),+Inf),
+                                include.lowest = T,right=FALSE,labels=c("0-4", "5-9", "10-14", "15-19", "20-24", "25-29", "30-34",
+                                                                        "35-39", "40-44", "45-49", "50-54", "55-59", "60-64", "65-69",
+                                                                        "70-74", "75-79", "80-84", "85+"))]
+    df_surv[,year_onset:=year(onset_date)]
+    df_surv[,time_to_event:=as.numeric(obs.deadline-onset_date)]
+    df_surv <- df_surv[time_to_event>=0 & onset_date>=ymd("1994-01-01")]
+    return(df_surv)
+}
+
diff --git a/R/run.R b/R/run.R
@@ -1,20 +1,28 @@
 
 
 #' sccs for mnd study
-#'
+#' @param demo
+#' @param rx
+#' @param ip
+#' @param target_drugs the drug name of riluzole. In hong kong, there are two type of drug names in HK： riluzole and riluteck
+#' @param obst the defined study observation date. Choose the earliest avaiable date with *Riluzole* in the hospital or approved date by FDA. For instance, 2001-8-24 in Hong Kong.
+#' @param obed the defined study observation date. Default : 2019-12-31
 #' @return
 #' @export
 #'
-#' @examples run()
-run <- function(){
+#' @examples run_sccs()
+run_sccs <- function(demo, rx, ip, target_drugs='riluzole|riluteck',
+                     obst="2001-08-24", obed="2018-12-31", ...){
     message("merge exposure and outcome datasets")
-    dt_combined <- get_DT_Exposure_Endpoint(demo,rx,ip)
-    message("SCCS data cleanning")
-    dt_sccs <- get_DT_SCCS(dt_combined)
+    message(nrow(demo)," in the cohort","\n========================\n\n")
+    dt_combined <- get_DT_Exposure_Endpoint(demo,rx,ip,...)
+    dt_sccs <- get_DT_SCCS(dt_combined,...)
+    message("After cleanning, count of participants for sccs:\n", dt_sccs[,uniqueN(id)],"\n========================\n\n")
     ageq <- floor(seq(20,90,10)*365)
     dt_sccs$id <- as.numeric(dt_sccs$id)
-    dt_sccs$dob_dmy <- as.numeric(format(dt_sccs$dob,"%d%m%Y"))
-#    dob_dmy_model <- dt_sccs[,.(id,event,dob_dmy)][,unique(.SD)][,dob_dmy] # this is the dob for previous SCCS package, with version less than 1.3
+    setorder(dt_sccs,id,event,date_rx_st)
+    dt_sccs$dob_dmy <- as.numeric(format(dt_sccs$dob,"%d%m%Y")) # this is for version 1.4 or above
+    dob_dmy_model <- dt_sccs[,.(id,event,dob_dmy)][,unique(.SD)][,dob_dmy] # this is the dob for previous SCCS package, with version less than 1.3
     result <- sccs(event ~ strx_30b + strx_0a + strx_30a + strx_60a + strx_90a+ strx_120a +
                                    strx_150a +strx_180a +
                                    age + season,
@@ -25,10 +33,12 @@ run <- function(){
                                adrug = list(strx_30b,strx_0a, strx_30a,
                                             strx_60a,strx_90a,strx_120a,
                                             strx_150a, strx_180a),
-                               aedrug = list(edrx_30b,edrx_0a,edrx_30a,edrx_60a,edrx_90a,
-                                             edrx_120a, edrx_150a, edrx_180a),
+                               aedrug = list(edrx_30b,edrx_0a,edrx_30a,
+                                             edrx_60a,edrx_90a,edrx_120a,
+                                             edrx_150a, edrx_180a),
                                dataformat = "stack", agegrp = ageq, seasongrp = c(0103,0105,0109,0111),
-                               dob=dob_dmy,
+                               #    dob=dob_dmy, # for sccs version 1.5 or above
+                               dob=dob_dmy_model, # for sccs version 1.3 only
                                data = as.data.frame(dt_sccs))
     return(result)
 }
@@ -96,3 +106,104 @@ sccs <- function(fml,...){
 print.mndsccs <- function(x){
     print(x$res)
 }
+
+show_digit<- function(x){
+    return(sprintf(as.numeric(x),fmt="%#.2f"))
+}
+
+#' Inci Ci calculation
+#'
+#' @param x
+#'
+#' @return
+#'
+#' @examples
+get_inci_CI <- function(x){
+    temp <- poisson.test(as.numeric(x[["stdN"]]),as.numeric(x[["pop_raw"]]))
+    est <- temp[["estimate"]]*100000
+    est_l <- temp$conf.int[2]*100000
+    est_h <- temp$conf.int[1]*100000
+    est_cb <- paste0(show_digit(est)," (",
+                     show_digit(est_l),"-",
+                     show_digit(est_h),")")
+    return(data.frame(est,est_l,est_h,est_cb))
+}
+
+
+#' run analysis for incidence
+#'
+#' @param demo
+#' @param dx
+#' @param region
+#' @param codes_sys
+#'
+#' @return
+#' @export
+#'
+#' @examples
+run_incidence <- function(demo, dx, region="hk",codes_sys = "icd9"){
+    dx_inci <- clean_4_survival(demo=demo,dx=dx,codes_sys)
+
+    raw_pop <- setDT(read_xlsx("./data/codes_mnd.xlsx", sheet=paste0(region,"_pop")))
+    raw_pop <- melt(raw_pop,id.vars = "Age")
+    setnames(raw_pop,c("variable","value"),c("year_onset","pop_raw"))
+
+    std_pop <- setDT(read_xlsx("./data/codes_mnd.xlsx",sheet="stdpop"))
+    std_pop$Age<-factor(std_pop$Age)
+
+
+    incident_raw <- dx_inci[,.(id,year_onset,age_group_std)][,.N,by=.(age_group_std,year_onset)]
+    setnames(incident_raw,"age_group_std","Age")
+    incident_raw$year_onset <- as.factor(incident_raw$year_onset)
+    setorder(incident_raw,Age,year_onset)
+
+    incident_raw <- merge(incident_raw,
+                         raw_pop,
+                         by=c("Age","year_onset"),all.y=T)
+    incident_raw <- merge(incident_raw,
+                         std_pop[,.(Age,pop_std=`Standard For SEER*Stat`,pop_std_ratio=`WHO std pop`)],
+                         by=c("Age"))
+    incident_raw[is.na(N),N:=0]
+    incident_std <- incident_raw[,std_risk:=N/pop_raw*pop_std_ratio*100000
+                                         ][,.(std_risk=sum(std_risk),pop_raw=sum(pop_raw)),year_onset
+                                           ][,stdN:=round(std_risk*pop_raw/100000)]
+
+
+
+    incident_std <- cbind(incident_std,rbindlist(apply(incident_std,1,get_inci_CI)))
+    incident_std[,`:=`(est=as.numeric(est),
+                       est_l=as.numeric(est_l),
+                       est_h=as.numeric(est_h),
+                       year_onset=factor(year_onset))]
+    return(list(std_inci=incident_std, raw_dt=dx_inci))
+}
+
+#' Plot the figure of incidence
+#'
+#' @param data
+#' @param region
+#'
+#' @return
+#' @export
+#'
+#' @examples
+p_inci <- function(data,region="Hong Kong"){
+    ggplot(data,aes(x=year_onset,y=est,group=1))+
+        geom_line()+theme_light()+
+        xlab("Onset year")+
+        ylab("Age standardized MND incidence \nby year per 100,000 population")+
+        ggtitle(region)+
+        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1,size=18),
+              axis.text.y = element_text(size=18),
+              plot.title = element_text(size=22),
+              axis.title = element_text(size=18)
+              )
+}
+
+
+p_inci_sg <- function(x){
+    iw <- incidence(x, interval = "6 months", date_index = onset_date, groups = sex)
+    plot(iw, fill = "sex", color = "white",border="grey",title = "Hong Kong")
+}
+
+
diff --git a/README.md b/README.md
@@ -71,5 +71,3 @@
     ```r
      run_sccs(demo,rx,ip)
     ```
-
-
diff --git a/data/codes_mnd.xlsx b/data/codes_mnd.xlsx
diff --git a/example.rda b/example.rda
diff --git a/man/clean_4_survival.Rd b/man/clean_4_survival.Rd
diff --git a/man/get_DT_Exposure_Endpoint.Rd b/man/get_DT_Exposure_Endpoint.Rd
diff --git a/man/get_DT_SCCS.Rd b/man/get_DT_SCCS.Rd
diff --git a/man/get_inci_CI.Rd b/man/get_inci_CI.Rd