-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathsequentialTestingApp.Rmd
More file actions
1355 lines (1078 loc) · 47 KB
/
sequentialTestingApp.Rmd
File metadata and controls
1355 lines (1078 loc) · 47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "Experimentation Tools | Sequential Testing"
output:
flexdashboard::flex_dashboard:
orientation: rows
css: styles.css
vertical_layout: scroll
logo: logo-sm.png
favicon: favicon.png
fig_height: 1
navbar:
- { title: "Planning: MDE-Based Calculator", href: "https://sdidev.shinyapps.io/sample-size-calculator/" }
- { title: "Planning: Runtime-Based Calculator", href: "https://sdidev.shinyapps.io/sample-size-calculator-runtime/" }
- { title: "Planning/Analysis: Sequential Testing", href: "https://sdidev.shinyapps.io/sequential-test-calculator/" }
- { title: "Analysis: Test Results Assessment", href: "https://sdidev.shinyapps.io/ABTestAnalysis/" }
- { title: "Analysis: Bayesian Testing", href: "https://sdidev.shinyapps.io/bayesian-ab-app/" }
- { title: "Simulator: A/B Test Result", href: "https://sdidev.shinyapps.io/test-result-simulator/" }
- { title: "Simulator: Experimentation ROI", href: "https://sdidev.shinyapps.io/experimentation-roi/" }
runtime: shiny
---
```{r setup, include=FALSE}
library(ggplot2)
library(shiny)
library(gsDesign)
library(dplyr)
library(tidyr)
library(gt)
```
<script>
$('.navbar-logo').wrap('<a href="https://www.searchdiscovery.com/how-we-help/services/optimization/" target=_blank>');
</script>
```{r gsdesign_calc, include=FALSE}
# Shared reactive state used by every chunk in this dashboard.
global_design <- reactiveValues(fixedn = NULL, seqn = NULL) # All test DESIGN outputs (sample sizes, boundaries, gsDesign object)
global_results <- reactiveValues(z = NULL, rslt = NULL, win = NULL, tgl = 1) # All RESULTS outputs; tgl flips sign to trigger recalculation
reactTalesNum <- reactiveVal() # Turn "tails" radio button output to number from string
# Function can be called to return a sequential design object
createTest <- function(a,b,c,d,e,f,g,h=NULL) {
alph <- 1 - a/100
pwr <- b/100
base <- c/100
nonf <- d/100
cvrB <- base * (1 + e/100)
tls1 <- f
k_checks <- g
upBnd <- 3 #upper boundary exponent value (higher is more conservative, typically use 2 or 3)
lowBnd <- 2 #lower boundary exponent values (higher is more conservative, typically use 2 or 3)
sides <- if (tls1 > 1) "two.sided" else "one.sided"
testType <- if (tls1 > 1) 2 else 4
cvrA <- if (tls1 > 1) base else base * (1 - nonf)
# FIXED SAMPLE
n_fixed <- power.prop.test(
n = NULL,
p1 = cvrA,
p2 = cvrB,
sig.level = alph,
power = pwr,
alternative = sides
)$n
# gsDesign object with evenly spaced checkpoints,
design <- gsDesign(
k = k_checks,
test.type = testType,
alpha = alph / tls1,
sfu = sfPower,
sfupar = upBnd,
sfl = sfPower,
sflpar = lowBnd,
n.fix = n_fixed,
beta = 1 - pwr
)
# MAX SAMPLE PER VARIATION
n_sequential <- tail(design$n.I, 1)
# If updated checkpoints were passed in based on analyses entered, plug them into the analysis here
if (!is.null(h)) {
dr <- data.frame(chkpt = c(1:k_checks)) %>%
mutate(planned_n = chkpt*(1/k_checks*n_sequential),
actual_n = 0
)
for (i in 1:length(h)) {
index <- h[[i]][1]
index_n <- (h[[i]][3] + h[[i]][5])/2
dr[index,3] <- index_n
}
# Use the planned_n for the last value if it's 0
dr <- dr %>%
mutate(working_n = actual_n)
if(dr$working_n[nrow(dr)] == 0){
dr$working_n[nrow(dr)] <- dr$planned_n[nrow(dr)]
}
# Replace the 0s with NA
dr <- dr %>%
mutate(working_n = ifelse(working_n == 0, NA, working_n))
# Add a faux initial row with 0 for working_n
dr <- tibble(chkpt = 0,
planned_n = 0,
actual_n = 0,
working_n = 0) %>%
bind_rows(dr)
# Fill with the values so that we can get the before/after
dr <- dr %>%
fill(working_n, .direction = "down")
# Use lag to bring the "first value" next to the last value
dr <- dr %>%
mutate(starting_n = if_else(working_n == dplyr::lag(working_n, 1), 0, dplyr::lag(working_n, 1))) %>%
# Do counts. This... assumes that all of the actual values for actual_n are unique, which may be a problem
group_by(working_n) %>%
mutate(divisor = seq(1, n())) %>%
ungroup() %>%
# Shift those counts down by one
mutate(divisor = dplyr::lag(divisor, 1)) %>%
# Figure out the base increment
mutate(base_increment = ifelse(dplyr::lag(working_n, 1) != working_n, (working_n - starting_n) / divisor, NA)) %>%
# Go ahead and remove that dummy row
filter(chkpt != 0) %>%
# Do another fill
fill(base_increment, .direction = "up") %>%
# This would need some testing logic-wise, but it's close. We're going to bring
# the "starting value" over as a pseudo-fill
mutate(starting_n = if_else(lead(divisor, 1) == 1, starting_n, working_n)) %>%
# A hack... the last row is going to be NA because of 'lead', so fix that
mutate(starting_n = if_else(is.na(starting_n), if_else(divisor == 1, dplyr::lag(working_n, 1), dplyr::lag(starting_n, 1)),starting_n)) %>%
# Calculate the final value
mutate(revised_n = round(base_increment * divisor + starting_n)) %>%
# Clean up the interim values
select(-working_n, -starting_n, -base_increment, -divisor)
checkpoints <- dr$revised_n
# Generates updated gsDesign object after results have been entered
finalDesign <-
gsDesign(
k = k_checks,
test.type = testType,
alpha = alph / tls1,
beta = 1 - pwr,
sfu = sfPower,
sfupar = upBnd,
sfl = sfPower,
sflpar = lowBnd,
n.fix = n_fixed,
n.I = checkpoints,
maxn.IPlan = design$n.I[design$k]
)
} else {finalDesign <- design}
# Returns the whole test design object (see gsDesign)
return(finalDesign)
}
```
```{r shortcut_display}
# Shortcut loader UI: paste a previously saved configuration string and apply it.
shortcut_prompt <- h5(class = "ql", "Paste configuration shortcut: ")
shortcut_field <- div(class = "ql", textInput("shortcut", NULL, width = "100px"))
load_action <- h5(style = "display:inline-block", actionLink("loadShortcut", label = "Load Shortcut"))
# Placeholder showing the shortcut string for the current configuration
current_config_link <- uiOutput("fixedLink")
div(id = "configLoad", shortcut_prompt, shortcut_field, load_action, current_config_link)
```
```{r shortcut_output, include=FALSE}
# Render the current configuration as a comma-separated "shortcut" string that
# can be pasted back into the loader above to restore this session.
output$fixedLink <- renderUI({
  # Serialize any entered results: values within a checkpoint joined by "|",
  # checkpoints joined by ";". The try() absorbs the 1:length(NULL) error
  # when no results exist yet, leaving the empty string.
  results_to_string <- function () {
    results_string <- ""
    try ({for (i in 1:length(global_results$rslt)) {
      raw_result <- global_results$rslt[i]
      clean_result <- paste(raw_result[[1]], collapse = "|")
      results_string <- if (nchar(results_string) > 1) paste(results_string,clean_result, sep = ";") else clean_result
    }
    })
    return (results_string)
  }
  # Construct the string (field order must match the loadShortcut parser)
  div(class="fixedLinkRow",
      "Current configuration shortcut:",
      paste(
        input$alpha,
        input$pwr,
        input$cvra,
        input$mde,
        input$tls,
        input$nonf,
        input$traff,
        input$checknum,
        input$conversions,
        input$dayNum,
        input$numVars,
        results_to_string(),
        sep = ","
      ))
})
```
``` {r shortcut_observer, include = FALSE}
# Restore a saved configuration from the pasted shortcut string.
# Field order mirrors the serializer in output$fixedLink.
observeEvent(input$loadShortcut, {
  # Parse out string, check to make sure it's got all the necessary inputs
  shortcutList <- unlist(strsplit(input$shortcut,","))
  req(length(shortcutList) > 9)
  # Empty out results so stale entries don't survive the reload
  global_results$rslt <- NULL
  global_results$win <- NULL
  global_results$status <- NULL
  global_design$table$complete$results <- 0
  # Update all the inputs with the string values
  updateSliderInput(session, 'alpha', value = shortcutList[1])
  updateSliderInput(session, 'pwr', value = shortcutList[2])
  updateNumericInput(session, 'cvra', value = shortcutList[3])
  updateNumericInput(session, 'mde', value = shortcutList[4])
  updateNumericInput(session, 'tls', value = shortcutList[5])
  updateNumericInput(session, 'nonf', value = shortcutList[6])
  updateNumericInput(session, 'traff', value = shortcutList[7])
  updateNumericInput(session, 'checknum', value = shortcutList[8])
  updateNumericInput(session, 'conversions', value = shortcutList[9])
  updateNumericInput(session, 'dayNum', value = shortcutList[10])
  updateNumericInput(session, 'numVars', value = shortcutList[11])
  # Update results (behind the scene variables) if present; shortcutList[12]
  # is NA when the shortcut had no results segment, so this safely skips.
  if (!is.na(shortcutList[12])) {
    r1 <- strsplit(shortcutList[12],";", fixed = TRUE) # split up the short cut into checkpoints
    r2 <- r1[[1]] # extract
    r3 <- list()
    r4 <- list()
    for (i in 1:length(r2)) { # for number of saved checkpoints
      r3 <- strsplit(r2[i],"|",fixed = TRUE) # split into values
      r3 <- as.numeric(c(r3[[1]])) # grab checkpoint number
      rl <- paste0("checkpoint",r3[1]) # make a label out of checkpoint number
      r4[[rl]] <- r3 # add label to checkpoint results and add to list
    }
    global_results$rslt <- r4
    global_results$tgl <- global_results$tgl * -1 # sign flip triggers the design observer
  }
})
```
Row {.tabset data-height=370}
-----------------------------------------------------------------------
### Test Configuration
```{r primary_inputs}
# Test Configuration tab — all values entered as whole-number percentages.
# 2 column screen - Column 1: error rates and test directionality
div(class = "input-two-c",
    h5("Confidence Level"),
    div(sliderInput("alpha",
                    label = NULL, value = 95, min = 50, max = 99, step = 1, round = TRUE)),
    h5("Power"),
    div(sliderInput(
      "pwr",
      label = NULL, value = 80, min = 50, max = 99, step = 1, round = TRUE
    )),
    h5("Tails"),
    div(
      radioButtons("tls",
                   NULL,
                   choices = list("1-tail test for superiority" = 1,
                                  "2-tail test for any difference" = 2
                   ),
                   selected = 1)
    )
)
# 2 column screen - Column 2: effect sizes and baseline rate
div(class = "input-two-c",
    h5("Minimum detectable effect"),
    div(
      numericInput(
        "mde",
        label = NULL,
        value = 10,
        min = 0,
        max = 1000,
        step = 1
      )
    ),
    h5("Non-inferiority margin (optional)"),
    div(
      numericInput("nonf",
                   NULL,
                   value = 0,
                   min = 0,
                   max = 50)
    ),
    h5("Base conversion rate"),
    div(
      numericInput(
        "cvra",
        label = NULL,
        value = 10,
        min = 1,
        max = 100,
        step = 1
      )
    )
)
```
### Current Traffic
```{r traffic_inputs}
# Current Traffic tab — raw volumes used to estimate test duration.
# 2 column inputs
div(class = "input-two-c",
    h5("Enter current conversion volume to test area"),
    div(numericInput("conversions", NULL, value=2100, step = 100)),
    h5("Enter current traffic volume to test area"),
    div(numericInput(inputId = "traff",
                     label = NULL,
                     value = 20000,
                     step = 1000)),
    h5("Calculated base conversion rate"),
    div(class = "big", textOutput("currentCvr", inline = TRUE))
)
div(class = "input-two-c",
    h5("How many days of traffic is this?"),
    div(numericInput("dayNum", NULL, value=28, step = 1)),
    h5("Estimated traffic per week"),
    div(class = "big", textOutput("week_traffic", inline = TRUE))
)
```
```{r traffic_outputs}
# Derived displays for the Current Traffic tab.
# Conversion rate implied by the entered conversion and traffic volumes.
output$currentCvr <- renderText({
  cvr_pct <- round(input$conversions / input$traff * 100, 2)
  paste(cvr_pct, "%")
})
# Weekly traffic estimate: daily traffic (rounded) scaled to seven days.
output$week_traffic <- renderText({
  per_day <- round(input$traff / input$dayNum)
  paste(per_day * 7, " visitors")
})
```
### Help
##### Confidence level
This is ($1 - \alpha$) where $\alpha$ is your nominal false positive (type-1) error rate when the true effect size is at the null hypothesis threshold.
##### Power
This is ($1 - \beta$) where $\beta$ is your nominal false negative (type-2) error rate. It represents your likelihood of getting a statistically significant positive result when the true effect size is the Minimum Detectable Effect.
##### Tails
Tails refer to the outside ends of a cumulative probability distribution and essentially reflect the framing of your null hypothesis. Typically 1-tail is the ideal choice for optimization experiments.
##### Minimum Detectable Effect
Also known as the Minimum Effect of Interest, this is the true difference at which Power is guaranteed. So if the true difference in conversion rates is `MDE`, you'll get a true positive `Power`% of the time.
##### Non-inferiority margin
Often, "no practical difference" can be somewhat lower than "0 difference". The non-inferiority margin ($\Delta$) is that margin of indifference. $\alpha$ is guaranteed at $\text{Base Conversion Rate} \times (1 - \Delta)$.
##### Z-score
Our "test statistic". Statistical significance is derived from a p-value ($1-p$) which in turn is derived from a z-score. The z-score more or less represents the number of standard deviations away from the mean a specific observation is. Here, the mean is control conversion rate and the observation is the test conversion rate.
##### Is a sequential test necessary?
Sequential testing helps you run efficient tests where the effect is unknown and can vary widely from expectations. It can help save you from negative effects impacting customers during a prolonged test, and it can reduce test duration when you want to react to a large positive difference. That said, it's a more complicated approach to testing and might not be worth your trouble. **Before** committing to a sequential test, check the expected duration with a fixed design. It's healthy to run tests for 2 weeks to capture a few business cycles. If you don't need more than that, consider just sticking with a fixed design.
##### Shortcuts (saving your work)
Above the input panel, there is a place to copy a shortcut to the current configuration including any results entered to date, or to load a shortcut from a past configuration. We know, a url would be ideal. Alas! Free software!
##### 1-tail vs 2-tail
Selecting 1-tail for a 'superiority test' will produce a **futility boundary**, allowing you to end the test early when test results are poor and a comeback is not likely while preserving power. Selecting 2-tail will produce identical (symmetrical) upper and lower decision boundaries.
##### Using the Power Plot
Use the power plot to help you plan your test configuration. The probabilities of false positives and false negatives at different effect sizes can prove very informative. Try adding a **non-inferiority margin** and see what happens!
##### Timing adjustments
By default, checkpoints are planned at even intervals. However, should your actual checkpoints differ, **boundaries and checkpoints will automatically adjust** whenever you enter results.
* To be explicit, you are not bound to the *timing* of the check-ins, only to the number of check-ins.
##### Early check-ins
You might want to **plan your first check for early** in the test. This could help identify problematic test experiences and prevent loss.
##### Non-binding decision boundary
The **lower** decision boundary for a 1-tail test is what's known as a **non-binding** boundary. This means, when you cross it, you can still choose to continue the test while preserving your type-1 error rate and only improving your power.
##### Binding decision boundary
The **upper** decision boundary in a 1-tail test and both boundaries in a 2-tail test are **binding** boundaries. When your results cross them, you must end the test in order to preserve your type-1 error rate.
##### P-value adjustments
Standard Confidence Interval and p-value calculations will be understated and cannot be taken at face value. **We provide adjusted p-values and confidence intervals** when the upper (efficacy) boundary is crossed.
##### Why use z-scores?
You are likely more accustomed to seeing p-values, which are derived from the z-score. But we don't show the p-value because it cannot be interpreted as it normally would be in a fixed-horizon test. In other words, we're trying to avoid misinterpretation and statistical faux pas.
### Multiple Comparisons
```{r multitest_display}
# TO BE DEVELOPED
#
# Boundary type
#
# Test Type
# selectInput("testType",
# label = NULL,
# choices = c())
#
# Upper boundary sensitivity
# Lower boundary sensitivity
# O’Brien-Fleming, Pocock, or Wang-Tsiatis are normally used with equally-spaced analyses. They
# are used only with one-sided (test.type=1) and symmetric two-sided (test.type=2) designs. We
# will use the CAPTURE example, again with 80% power rather than the default of 90%. Notice
# that this requires specifying beta=.2 in both nBinomial() and gsDesign(). O’Brien-Fleming,
# Pocock, or Wang-Tsiatis (parameter of 0.15) bounds for equally space analyses are generated using
# the parameters sfu and sfupar below. I
div(class = "input-two-c multiple",
h4("Warning: Advanced!"),
#br(),
p("The Results Analysis panel will be rendered inaccurate in the case of multiple testing (calculating p-values for multiple test variations or metrics). We use the ",
tags$a(href="https://en.wikipedia.org/wiki/Holm%E2%80%93Bonferroni_method", "Holm method"),
"to make adjustments to the z-score boundary for each comparison. Only the upper boundary is given which means that for 1-tail, there will be no futility boundary and for 2-tail you just switch the sign when the difference is negative."),
h5("How many additional variants or metrics are you testing?"),
div(numericInput("numVars",
label = NULL, value = 0, min = 1, max = 20, step = 1))
)
div(class = "input-two-c large",
p("When analyzing, you must rank z-scores high to low by their absolute values and apply the new z-score boundaries provided in the table. 'Z-1' applies to the highest z-score and so on."),
gt_output("adjBounds")
)
```
```{r multitest_outputs, include=FALSE}
# Holm-adjusted z-score boundary table for the Multiple Comparisons tab.
# One column per ranked comparison ("Z-1" = largest |z|, tested strictest).
output$adjBounds <- render_gt({
  req(input$numVars > 0)
  req(!is.null(global_design$table$complete$upper))
  df <- data.frame(
    checkpoints = paste("Checkpoint",seq(1,input$checknum)),
    maxSample = global_design$table$complete$samples / 2,
    origZscors = global_design$table$complete$upper)
  comps <- input$numVars + 1
  # Holm step-down: rank i of `comps` comparisons tests at p / (comps - i + 1),
  # so convert each boundary z to a one-sided p, divide, and convert back to z.
  # (seq_len replaces the accidental seq(1:comps) form; same values.)
  for (i in seq_len(comps)) {
    rank <- paste0("Z-",i)
    pVals <- 1-pnorm(global_design$table$complete$upper)
    pVals <- pVals/(comps-i+1)
    zVals <- abs(qnorm(pVals))
    df[rank] <- zVals
  }
  gt(df) %>%
    fmt_number(columns = contains("z"), decimals = 2) %>%
    fmt_number(columns = contains("sample"), decimals = 0) %>%
    cols_width(contains("check") ~ px(90),
               contains("sample") ~ px(80),
               contains("zscor") ~ px(70)
    ) %>%
    cols_label(checkpoints = "",
               maxSample = "Minimum Sample Size Per Variant",
               origZscors = "Old Boundary") %>%
    tab_options(
      container.height = 275,
      container.overflow.x = TRUE,
      container.overflow.y = TRUE)
})
# Banner shown wherever features are hidden under multiple comparisons.
output$multiwarn <- renderUI({
  if (input$numVars > 0) {
    div(id="multiplewarning",
        "Looks like you have configured the test to run with multiple variants or metrics to compare. Many features will not display due to this.")
  }
})
```
Row {data-height=150}
-----------------------------
### Planned analyses (checkpoints)
##### How many times do you want to "peek" and analyze test results? (max 18)
```{r checkpoint_reco_output, include=FALSE}
# Suggest a checkpoint count: roughly one check per week of max expected duration.
output$checkpoint_reco <- renderUI({
  daily_traffic <- input$traff / input$dayNum
  weeks_of_runtime <- round(global_design$seqn / daily_traffic / 7)
  suggestion <- div(
    id = "checkpoint_reco",
    "One checkpoint per week is a good rule of thumb. Based on expected duration,",
    tags$b("we suggest ", weeks_of_runtime, "checkpoints.")
  )
  # Hide the suggestion entirely when multiple comparisons are configured.
  if (input$numVars < 1) suggestion
})
```
```{r checkpoint_input}
# Numeric input for the number of planned interim analyses (2-18, final included).
div(
  div(style="display:inline-block;", numericInput(
    "checknum", label = NULL, min = 2, max = 18, value = 4, step = 1, width = 100
  )),
  div(style="display:inline-block; padding-left: 5px;", "(Including final analysis)"),
  htmlOutput("checkpoint_reco"))
```
```{r test_data_observer, include=FALSE}
# CREATE SEQUENTIAL TEST DESIGN
# Recompute the design whenever a configuration input changes, results are
# submitted (global_results$tgl flips sign as a trigger), or results are cleared.
observeEvent(c(input$alpha, input$pwr, input$cvra, input$nonf, input$mde, input$tls, input$checknum, global_results$tgl, input$clear),{ #}, global_results$rslt),{
  req(input$cvra > 0)
  req(input$mde > 0)
  req(input$nonf >= 0)
  req(input$checknum >= 2)
  global_design$tls <- as.numeric(input$tls)
  tryCatch({
    # If there are results, it's a different function call (boundaries are adjusted)
    if (is.null(global_results$rslt)) {
      testDesign <- createTest(input$alpha, input$pwr, input$cvra, input$nonf, input$mde, global_design$tls, input$checknum)
    } else {
      testDesign <- createTest(input$alpha, input$pwr, input$cvra, input$nonf, input$mde, global_design$tls, input$checknum, global_results$rslt)
    }
    # Take all the design outputs and assign to individual variables used throughout app
    # (n values are doubled: gsDesign works per-variation, the app reports totals)
    global_design$fixedn <- testDesign$n.fix*2
    global_design$seqn <- tail(testDesign$n.I,1)*2
    global_design$fulldesign <- testDesign
    global_design$table$timing <- testDesign$timing
    global_design$table$samples <- testDesign$n.I * 2
    global_design$table$lower <- testDesign$lower$bound
    global_design$table$upper <- testDesign$upper$bound
    # Sequence of deltas based on MDE, spanning -2.5x..2.5x for the plots
    m <- input$mde
    n <- -input$nonf
    negD <- -m+n
    posD <- m-n
    delta_list <- c()
    theta_list <- c()
    if (input$tls == 2) {
      delta_list <- c(negD*2.5, negD*2.25, negD*2, negD*1.75, negD*1.5, negD*1.25)
      theta_list <- seq(-2.5,-1.25,.25)
    }
    delta_list <- c(delta_list,negD*1+n,negD*.75+n,negD*.5+n,negD*.25+n,n,posD*.25+n,posD*.5+n,posD*.75+n,m,posD*1.25+n,posD*1.5+n,posD*1.75+n,posD*2+n,posD*2.25+n,posD*2.5+n)
    global_design$deltas <- delta_list
    theta_list <- c(theta_list,seq(-1,2.5,.25))
    # global_design$deltas <- c(negD*1+n,negD*.75+n,negD*.5+n,negD*.25+n,n,posD*.25+n,posD*.5+n,posD*.75+n,m,posD*1.25+n,posD*1.5+n,posD*1.75+n,posD*2+n,posD*2.25+n,posD*2.5+n)
    # global_design$deltas <- seq(-1,2.5,.25)*(input$mde+input$nonf)
    thetas <- theta_list*testDesign$delta
    global_design$probOb <- gsProbability(d = testDesign, theta = thetas)
    # DF for results outputs
    global_design$table$complete <- data.frame(
      times = testDesign$timing,
      samples = testDesign$n.I * 2,
      days = testDesign$n.I * 2 / (input$traff/input$dayNum),
      lower = testDesign$lower$bound,
      upper = testDesign$upper$bound,
      results = 0
    )
    # Evaluate any results entered
    if (length(global_results$rslt) > 0) {
      # Add any z-scores in results column of design table
      for (i in 1:length(global_results$rslt)) {
        currRslt <- global_results$rslt[i]
        currChk <- currRslt[[1]][1]
        currZ <- currRslt[[1]][6]
        global_design$table$complete[currChk, 6] = currZ
      }
      # Evaluate results outcomes: classify each checkpoint's z against the
      # boundaries and map the (timing, boundary, tails) combo to a decision.
      for (i in 1:length(global_results$rslt)) {
        tls_text <- if (global_design$tls == 1) "_1tail" else "_2tail"
        rslt <- global_results$rslt[i]
        chk <- as.numeric(rslt[[1]][1])
        complete_text <- if (chk == input$checknum) "complete" else "early"
        zscr <- rslt[[1]][6]
        boundary <- if (zscr > testDesign$upper$bound[chk]) "_upper" else if (zscr < testDesign$lower$bound[chk]) "_lower" else "_middle"
        global_results$status <- if (boundary == "_middle" || boundary == "_upper") paste0(complete_text,boundary) else paste0(complete_text,boundary,tls_text)
        global_results$win$checkpoint <- chk
        global_results$win$outcome <- switch(global_results$status,
                                             "complete_upper" = "reject",
                                             "complete_lower_2tail" = "reject",
                                             "complete_lower_1tail" = "inconclusive",
                                             "complete_middle" = "inconclusive",
                                             "early_upper" = "reject",
                                             "early_lower_1tail" = "option",
                                             "early_lower_2tail" = "reject",
                                             "early_middle" = "continue"
        )
        # A binding rejection ends evaluation; later checkpoints are moot.
        if (global_results$win$outcome == "reject") {
          break
        }
      }
    }
  }, error = function(e) {
    showNotification("Whoops! How embarrassing. Seems you triggered an error. Please consider reporting the bug using the link at the bottom of this document.", type="error", duration = 10)
  })
})
```
### Sample size comparison
```{r sample_size_outputs, include=FALSE}
# Sample-size comparison text outputs. Under multiple comparisons the totals
# are halved and reported per variant, since comparisons may share the control.
output$fixedN <- renderText({
  # If there are multiple comparisons, we'll report in terms of per variation sample size since we can't assume the comparisons are separate variants.
  if (input$numVars < 1) {
    result <- format(round(global_design$fixedn), big.mark=",", scientific = FALSE)
  } else {
    val <- global_design$fixedn / 2
    result <- format(round(val), big.mark=",", scientific = FALSE)
    result <- paste(result,"per variant")
  }
  result
})
output$fixedDays <- renderText({
  # If there are multiple comparisons, we'll report in terms of per variation days since we can't assume the comparisons are separate variants.
  if (input$numVars < 1) round(global_design$fixedn/(input$traff/input$dayNum)) else round(global_design$fixedn/2/(input$traff/input$dayNum))
})
output$seqN <- renderText({
  # If there are multiple comparisons, we'll report in terms of per variation sample size since we can't assume the comparisons are separate variants.
  if (input$numVars < 1) {
    result <- format(round(global_design$seqn), big.mark=",", scientific = FALSE)
  } else {
    val <- global_design$seqn / 2
    result <- format(round(val), big.mark=",", scientific = FALSE)
    result <- paste(result,"per variant")
  }
  result
})
output$maxDays <- renderText({
  # If there are multiple comparisons, we'll report in terms of per variation days since we can't assume the comparisons are separate variants.
  if (input$numVars < 1) round(global_design$seqn/(input$traff/input$dayNum)) else round(global_design$seqn/2/(input$traff/input$dayNum)) })
# Percentage increase of the sequential max over the fixed-horizon n.
output$diff <- renderText({
  paste0(round((global_design$seqn/global_design$fixedn-1)*100),"%")
})
```
```{r sample_size_display}
# Static layout for the sample-size comparison text outputs above.
div(
  h5("Fixed-horizon sample size: ",
     strong(textOutput("fixedN", inline = TRUE)),
     " (Estimated ",
     strong(textOutput("fixedDays", inline = TRUE)),
     " days)")
)
div(
  h5("Sequential test maximum sample size: ",
     strong(textOutput("seqN", inline = TRUE)),
     " (Estimated ",
     strong(textOutput("maxDays", inline = TRUE)),
     " days)")
)
div(
  h5("That's a maximum increase of ",
     strong(textOutput("diff", inline = TRUE)),
     " See the Expected Sample Size chart below to see what test duration is expected to be based on different effect sizes.")
)
```
Row {data-height=450}
-----------------------------
### Power analysis
This chart plots the likelihood of crossing a decision boundary at each checkpoint for different effect sizes.
```{r powerplot_output}
# Configure the power plot: cumulative probability of crossing each boundary
# by each checkpoint, across the grid of effect sizes in global_design$deltas.
output$pwrPlot <- renderPlot({
  req(input$numVars == 0)
  probOb <- global_design$probOb
  deltas <- global_design$deltas/100
  # probOb$upper$prob is checkpoint x effect-size; mutate(cumsum(.)) appends
  # cumulative columns before transposing — assumes column order matches
  # deltas (TODO confirm against gsProbability output shape).
  upProb <- data.frame(probOb$upper$prob) %>%
    mutate(cumsum(.)) %>%
    t() %>%
    data.frame()
  colnames(upProb) <- paste0("Upper_Checkpoint",seq(1,ncol(upProb)))
  upProb <- mutate(upProb, "Effects" = deltas)
  loProb <- data.frame(probOb$lower$prob) %>%
    mutate(cumsum(.)) %>%
    t() %>%
    data.frame()
  colnames(loProb) <- paste0("Lower_Checkpoint",seq(1,ncol(loProb)))
  loProb <- mutate(loProb, "Effects" = deltas)
  # Long format: one row per (effect, boundary, checkpoint) for ggplot
  df <- merge(upProb,loProb) %>%
    pivot_longer(cols = !contains("Effects"), names_to = c("Boundary","Checkpoint"), names_pattern = "(.*)_Checkpoint(.*)", values_to = "Probability")
  # NOTE(review): `size` in geom_smooth is deprecated in ggplot2 >= 3.4
  # (use `linewidth`); left as-is to preserve behavior on older versions.
  ggplot(df, aes(x=Effects, y=Probability, color=Boundary, linetype=Checkpoint)) +
    #geom_line() +
    geom_smooth(method="gam", se=FALSE, fullrange=FALSE, size=.75) +
    theme_light() +
    scale_y_continuous(labels = scales::percent_format(accuracy=1), n.breaks = 10) +
    scale_x_continuous(labels = scales::percent, n.breaks = 8)
}, height = 350)
```
```{r powerplot_display}
# Render the power plot together with the multiple-comparisons warning (if any).
div(plotOutput("pwrPlot"),
    uiOutput("multiwarn")
)
```
### Expected sample size by effect size
This chart displays the expected sample size by the actual effect size of the treatment.
```{r durationplot_output, include = FALSE}
# Configure the expected sample size plot: E[N] at each true effect size,
# i.e. sum over checkpoints of (stop probability x sample size at that check).
output$samplePlot <- renderPlot({
  #req(as.numeric(input$tls) < 2)
  req(input$numVars == 0)
  nList <- global_design$table$samples
  probOb <- global_design$probOb
  deltas <- global_design$deltas/100
  nperDay <- round(input$traff/input$dayNum)
  # Combine probabilities of crossing upper and lower boundaries for different effect sizes
  combo_probs <- probOb$upper$prob+probOb$lower$prob
  combo_probs <- head(combo_probs,-1) # Drop last checkpoint
  last_row <- 1-colSums(combo_probs) # New vector for last checkpoint taking 1 - sum of prior checkpoints
  combo_probs <- combo_probs %>%
    rbind(last_row) # Put them together
  # Multiply probabilities by sample sizes at checkpoints
  combo_probs <- round(combo_probs * nList) %>%
    t() %>%
    data.frame() %>%
    mutate(expLength = rowSums(.)) %>%
    mutate(effects = deltas)
  # Make df of expected sample size by effect; spline smooths the curve
  df <- data.frame(Effects = deltas, ExpSample = combo_probs$expLength)
  spline_df <- as.data.frame(spline(df$Effects, df$ExpSample))
  # NOTE(review): `size` in geom_hline is deprecated in ggplot2 >= 3.4
  # (use `linewidth`); left as-is to preserve behavior on older versions.
  ggplot(df, aes(x = Effects, y = ExpSample)) +
    geom_line(data = spline_df, aes(x = x, y = y), color = "cyan3") +
    geom_hline(yintercept = global_design$fixedn, size=.25, linetype = "dashed") +
    annotate(geom = "text", x = 0,
             y = global_design$fixedn * .99,
             label= "Fixed horizon sample size",
             fontface = "bold",
             size = 4.0) +
    labs(y="Expected Sample Size") +
    theme_light() +
    scale_x_continuous(labels = scales::percent, n.breaks = 8) +
    scale_y_continuous(sec.axis = sec_axis(~ ./nperDay, name = "Estimated Duration (Days)"), labels = scales::comma)
}, height = 350)
```
```{r durationplot_display}
plotOutput("samplePlot")
```
Row
-----------------------------
### Enter results
```{r varA_results_inputs}
# Results entry — control arm (conversions, visitors, derived rate).
h3("Control Experience")
div(class='three-c',
    h5('Conversions'),
    div(numericInput("aConversions", label = NULL, value = "200", step = 10)
    )
)
div(class='three-c',
    h5('Visitors'),
    div(numericInput("aTraffic", label = NULL, value = "3000", step = 100)
    )
)
div(class='three-c',
    h5('Conversion rate'),
    div(class = "big", textOutput("aRate", inline = TRUE)
    )
)
```
```{r varB_results_inputs}
# Results entry — treatment arm (conversions, visitors, derived rate).
h3("Test Experience")
div(class='three-c',
    h5('Conversions'),
    div(numericInput("bConversions", label = NULL, value = "270", step = 10)
    )
)
div(class='three-c',
    h5('Visitors'),
    div(numericInput("bTraffic", label = NULL, value = "3000", step = 100)
    )
)
div(class='three-c',
    h5('Conversion rate'),
    div(class = "big", textOutput("bRate", inline = TRUE)
    )
)
```
```{r zscore_display}
# Derived statistics for the entered results: lift, z-score, % of max sample.
h3("Calculations")
div(class = 'three-c',
    h5('Difference'),
    div(class = "big", textOutput("rateDiff")))
div(class = 'three-c',
    h5('Z-score'),
    div(class = "big", textOutput("zscore")))
div(class = 'three-c',
    h5('% of Sample size'),
    div(class = "big", textOutput("proportion")))
```
```{r checkpoint_selection_input}
# Checkpoint selector + "Add to results" action for the entered results.
#h3("Checkpoint selection")
h5(style="margin:10px 10px 0px 10px;",
   "Which analysis do these results apply to?")
div(class = "chkpt-selector",
    div(id="checkselectipt", htmlOutput("checkSelect")),
    div(id="checkselectbtn", actionLink("addResults", "Add to results")),
    div(id="checkselecttxt", "(added to the selected checkpoint)")
)
```
```{r enter_results_outputs, include=FALSE}
# Configure results outputs except for plots -- those are in their corresponding section
# Checkpoint selector output (choices track the configured checkpoint count)
output$checkSelect <- renderUI({
  selectInput("checkpoint",
              NULL,
              choices = seq(1,input$checknum),
              selected = 1)
})
# Conversion rate calculation outputs
output$aRate <- renderText({paste0(round(input$aConversions / input$aTraffic * 100,2), "%")})
output$bRate <- renderText({paste0(round(input$bConversions / input$bTraffic * 100,2), "%")})
# Conversion difference (relative lift of treatment over control, in %)
output$rateDiff <- renderText({
  req(input$aConversions > 1)
  req(input$bConversions > 1)
  req(input$aTraffic > 10)
  req(input$bTraffic > 10)
  diff <-
    round(((input$bConversions / input$bTraffic) / (input$aConversions / input$aTraffic) -
             1
    ) * 100, 2)
  paste0(diff, "%")
})
# Updates zscore calculation when results inputs change
observeEvent(c(input$aConversions, input$bConversions, input$aTraffic, input$bTraffic),{
  req(input$aConversions > 1)
  req(input$bConversions > 1)
  req(input$aTraffic > input$aConversions)
  req(input$bTraffic > input$bConversions)
  a <- input$aConversions
  b <- input$bConversions
  c <- input$aTraffic
  d <- input$bTraffic
  e <- if (input$nonf > 0) input$nonf/100 else 0
  # Calculate a z-score via gsDesign::testBinomial; delta0 shifts the null by
  # the non-inferiority margin applied to the control rate, and the leading
  # minus flips sign so positive z favors the treatment — TODO confirm
  # sign convention against ?gsDesign::testBinomial.
  global_results$z <- round(-testBinomial(x1=a, x2=b, n1=c, n2=d, delta0 = a/c*e), digits = 2)
})
# Zscore output
output$zscore <- renderText({
  req(global_results$z != 0)
  global_results$z})
# Pct to sample output (combined arms vs. the sequential maximum)
output$proportion <- renderText({
  req(input$aConversions > 1)
  req(input$bConversions > 1)
  req(input$aTraffic > 10)
  req(input$bTraffic > 10)
  nprop <- (round(100*(input$aTraffic + input$bTraffic)/global_design$seqn))
  paste0(nprop,"%")
})
# CTA action adds current inputs to reactive results list
observeEvent(input$addResults, {
inchk <- as.numeric(input$checkpoint)
rownum <- paste0("checkpoint",inchk)
zadd <- global_results$z
rsltList <- c(inchk,input$aConversions,input$aTraffic,input$bConversions,input$bTraffic,zadd)
# Validate results data