diff --git a/ui/scripts/R/gendata.R b/ui/scripts/R/gendata.R
new file mode 100644
index 0000000..af6053a
--- /dev/null
+++ b/ui/scripts/R/gendata.R
@@ -0,0 +1,32 @@
+#' Generate unique test data files
+#'
+#' Creates \code{n} delimited text files with \code{write.table()} defaults
+#' (space-separated, despite the \code{.tab} extension). Rows are taken from
+#' \code{data}, recycled with \code{rep_len()} when more rows are requested
+#' than \code{data} has. Every file gets a different number of rows, which is
+#' what makes the file contents (and therefore their hashes) distinct.
+#'
+#' @param files Directory to write the files into (created if it doesn't exist).
+#' @param n Number of files to generate. Defaults to \code{1}.
+#' @param size Base number of rows. If \code{NULL} (default), file \code{i}
+#'   gets \code{i} rows (legacy behaviour); otherwise file \code{i} gets
+#'   \code{size + i - 1} rows.
+#' @param data Source data frame. Defaults to \code{Theoph}.
+#' @return Invisibly returns \code{NULL}. Called for its side effect of writing
+#'   \code{Theoph_n_1.tab}, \code{Theoph_n_2.tab}, \ldots,
+#'   \code{Theoph_n_<n>.tab} into \code{files}.
+# TODO: rename `files` to `path` and check that it is one single path too
+gendata <- function(files, n = 1, size = NULL, data = Theoph) {
+  dir.create(files, recursive = TRUE, showWarnings = FALSE)
+  for (i in seq_len(n)) {
+    # One extra row per file keeps every file unique even when `size` is fixed.
+    nrows <- if (is.null(size)) i else size + i - 1L
+    rows <- rep_len(seq_len(nrow(data)), nrows)
+    write.table(
+      data[rows, ],
+      file = file.path(files, sprintf("Theoph_n_%d.tab", i)),
+      eol = "\n"
+    )
+  }
+  invisible(NULL)
+}
diff --git a/ui/scripts/R/run.R b/ui/scripts/R/run.R
new file mode 100644
index 0000000..c9455c0
--- /dev/null
+++ b/ui/scripts/R/run.R
@@ -0,0 +1,72 @@
+#' Source a script and capture echoed output in a file
+#'
+#' Runs [source()] while redirecting standard output and messages to `out`.
+#' If `out` already exists, a warning is issued and the file is overwritten
+#' (truncated).
+#'
+#' @param script Character scalar. Path to the script passed to [source()].
+#' @param out Character scalar. Output file path. Defaults to `paste0(script, ".out")`.
+#' @param echo Logical. Passed to [source()]. Defaults to `TRUE`.
+#' @param max.deparse.length Numeric/integer. Passed to [source()].
+#'   Use `Inf` to avoid `"[TRUNCATED]"` markers.
+#' @param split Logical. Passed to [sink()] for standard output.
+#'   If `TRUE`, output is written to both console and file.
+#' @param ... Additional arguments passed to [source()].
+#'
+#' @return Invisibly returns the value from [source()].
+#'
+#' @examples
+#' \dontrun{
+#' source_to_file("script.R", out = "script.R.out")
+#' source_to_file("script.R", out = "script.R.out", split = TRUE)
+#' }
+source_to_file <- function(
+  script,
+  out = paste0(script, ".out"),
+  echo = TRUE,
+  max.deparse.length = Inf,
+  split = FALSE,
+  ...
+) {
+  if (file.exists(out)) {
+    warning(
+      sprintf("'%s' already exists and will be overwritten.", out),
+      call. = FALSE
+    )
+  }
+
+  con <- file(out, open = "wt") # truncate/overwrite
+  out_n <- sink.number()
+  msg_n <- sink.number(type = "message")
+
+  # Register cleanup before diverting anything, so a failure in sink() below
+  # cannot leak the connection or leave output diverted.
+  on.exit(
+    {
+      # The message sink is a single slot, not a stack: sink.number() reports
+      # the connection number in use (2 is stderr), so compare with `!=`.
+      if (sink.number(type = "message") != msg_n) {
+        if (msg_n == 2L) {
+          sink(type = "message")
+        } else {
+          sink(getConnection(msg_n), type = "message")
+        }
+      }
+      while (sink.number() > out_n) {
+        sink()
+      }
+      close(con)
+    },
+    add = TRUE
+  )
+
+  sink(con, split = split)
+  sink(con, type = "message")
+
+  invisible(source(
+    script,
+    echo = echo,
+    max.deparse.length = max.deparse.length,
+    ...
+  ))
+}
diff --git a/ui/scripts/R/tree.R b/ui/scripts/R/tree.R
new file mode 100644
index 0000000..1e58265
--- /dev/null
+++ b/ui/scripts/R/tree.R
@@ -0,0 +1,43 @@
+#' Print a directory tree (hand-rolled variant of `fs::dir_tree()`)
+#'
+#' Lists `path` with [fs::dir_ls()], groups the entries by parent directory
+#' and prints them as an indented tree using fs's box-drawing characters.
+#' Relies on unexported fs helpers (`fs:::box_chars()`, `fs:::pc()`,
+#' `fs:::colourise_fs_path()`), so it may break when fs changes internally.
+#'
+#' @param path Root directory to print. It is looked up verbatim among the
+#'   parent directories of the listing, so pass it in the same form that
+#'   `fs::dir_ls()` returns (e.g. already `fs::path_expand()`-ed).
+#' @param recurse Passed to [fs::dir_ls()].
+#' @param ... Additional arguments passed to [fs::dir_ls()].
+#' @return Invisibly, the paths returned by [fs::dir_ls()].
+fs_manual_dir_tree <- function(path = ".", recurse = TRUE, ...) {
+  files <- fs::dir_ls(path, recurse = recurse, ...)
+  by_dir <- split(files, fs::path_dir(files))
+  ch <- fs:::box_chars()
+
+  # Coloured path with the directory part stripped, i.e. just the file name.
+  get_coloured_name <- function(x) {
+    coloured <- fs:::colourise_fs_path(x)
+    sub(x, fs::path_file(x), coloured, fixed = TRUE)
+  }
+
+  # Print the children of `x`, then recurse into each child.
+  print_leaf <- function(x, indent) {
+    leafs <- by_dir[[x]]
+    for (i in seq_along(leafs)) {
+      is_last <- i == length(leafs)
+      # The connector is three box characters plus a space, so children are
+      # indented by 4 columns: blank under the last entry, a bar otherwise.
+      connector <- fs:::pc(if (is_last) ch$l else ch$j, ch$h, ch$h, " ")
+      child_indent <- if (is_last) "    " else fs:::pc(ch$v, "   ")
+      cat(indent, connector, get_coloured_name(leafs[[i]]), "\n", sep = "")
+      print_leaf(leafs[[i]], paste0(indent, child_indent))
+    }
+  }
+
+  cat(fs:::colourise_fs_path(path), "\n", sep = "")
+  # print_leaf(fs::path_expand(path), "")
+  print_leaf(path, "")
+  invisible(files)
+}
diff --git a/ui/scripts/add_01.R b/ui/scripts/add_01.R
new file mode 100755
index 0000000..2274151
--- /dev/null
+++ b/ui/scripts/add_01.R
@@ -0,0 +1,154 @@
+#' Walk through `dvs init` and `dvs add` for a single project (projectA).
+
+if (!exists("dvs_workspace")) {
+  dvs_workspace <- getwd()
+}
+source(file.path(dvs_workspace, "ui/scripts/R", "tree.R"), echo = TRUE)
+
+dvs_workspace
+withr::with_dir(
+  dvs_workspace,
+  system2(
+    "just",
+    "install-cli"
+  )
+)
+
+system2("dvs")
+
+message("dvs repository where storage directory is different location")
+proj_root <- file.path(tempfile(), "projectA")
+proj_root
+dir.create(proj_root, recursive = TRUE)
+setwd(proj_root)
+dir.create(file.path(proj_root, ".git/"))
+message("define global storage directory")
+storage_directory <- file.path(tempdir(), "dvs_data_directory")
+list(
+  project_root = proj_root,
+  storage_root = storage_directory
+)
+
+message("storage directory is an ancestor to the project(s)")
+
+message(
+  "open visual studio code in `tempdir()` (session constant)",
+  " to have an overview over all file changes"
+)
+browseURL(url = tempdir(), browser = "code")
+
+message("dvs with a storage directory provided")
+system2(
+  "dvs",
+  c("init", storage_directory)
+)
+
+cat("# DVS repository\n", file = file.path(proj_root, "README.md"))
+
+# fs::dir_tree(
+#   tempdir()
+# ) |>
+#   print() |>
+#   fs::path_rel(file.path(tempdir())) |>
+#   fs_manual_dir_tree()
+
+fs::dir_tree(
+  tempdir()
+)
+tempdir() |> unclass()
+tempdir() |> fs::path_expand() |> unclass()
+
+fs_manual_dir_tree(
+  # tempdir()
+  tempdir() |> fs::path_expand()
+)
+
+
+#'
+#' Let's add something
+#'
+
+fs::dir_create(proj_root, "data")
+
+write.table(
+  file = file.path(proj_root, "data", "theoph_head.tab"),
+  head(Theoph, 15),
+  eol = "\n"
+)
+
+fs::dir_tree(
+  tempdir()
+)
+
+message("data/theoph_head.tab:\n")
+
+readLines(
+  file.path(proj_root, "data", "theoph_head.tab")
+) |>
+  cat(sep = "\n")
+
+system2(
+  "dvs",
+  c("add", "--help")
+)
+
+readLines(
+  file.path(proj_root, "dvs.toml")
+) |>
+  cat(sep = "\n")
+
+system2(
+  "dvs",
+  c(
+    "add",
+    file.path(proj_root, "data", "theoph_head.tab"),
+    "--message",
+    r"("added head of theoph in tab format")"
+  )
+)
+# COMMENT: The backtrace needs to be removed.
+
+fs::dir_ls(proj_root, recurse = TRUE) |>
+  fs::path_rel(start = fs::path(proj_root, ".."))
+
+
+fs::dir_ls(storage_directory, recurse = TRUE) |>
+  fs::path_rel(start = fs::path(storage_directory, ".."))
+
+readLines(fs::path(storage_directory, "audit.log.jsonl")) |>
+  cat(sep = "\n")
+
+# Let's add the same file again
+
+system2(
+  "dvs",
+  c(
+    "add",
+    file.path(proj_root, "data", "theoph_head.tab"),
+    "--message",
+    r"("ssadded head of theoph in tab format")"
+  )
+)
+file.edit(fs::path(storage_directory, "audit.log.jsonl"))
+readLines(fs::path(storage_directory, "audit.log.jsonl")) |>
+  cat(sep = "\n")
+#' The audit log has two entries for files that have the exact same hash, but
+#' the time is different.
+#'
+#'
+
+write.table(
+  file = file.path(proj_root, "data", "theoph_head3.tab"),
+  head(Theoph, 12),
+  eol = "\n"
+)
+
+system2(
+  "dvs",
+  c(
+    "add",
+    file.path(proj_root, "data", "theoph_head3.tab"),
+    "--message",
+    r"("added head of theoph in tab format")"
+  )
+)
diff --git a/ui/scripts/add_02.R b/ui/scripts/add_02.R
new file mode 100755
index 0000000..59e3ecc
--- /dev/null
+++ b/ui/scripts/add_02.R
@@ -0,0 +1,192 @@
+#' We will make two DVS repositories, choose a common storage directory, but then
+#' we will store two files that are distinct, and try to store them again in the same
+#' repo, and then inspect the storage directory under these actions/events.
+#'
+
+if (!exists("dvs_workspace")) {
+  dvs_workspace <- getwd()
+}
+source(file.path(dvs_workspace, "ui/scripts/R", "tree.R"), echo = TRUE)
+
+dvs_workspace
+withr::with_dir(
+  dvs_workspace,
+  system2(
+    "just",
+    "install-cli"
+  )
+)
+
+system2("dvs")
+
+message("dvs repository where storage directory is different location")
+proj_root_a <- file.path(tempfile(), "projectA")
+proj_root_a
+dir.create(proj_root_a, recursive = TRUE)
+dir.create(file.path(proj_root_a, ".git/"))
+message("define global storage directory")
+storage_directory <- file.path(tempdir(), "dvs_data_directory")
+storage_directory
+
+message("dvs repository for project A with a storage directory provided")
+setwd(proj_root_a)
+system2(
+  "dvs",
+  c("init", storage_directory)
+)
+
+
+proj_root_b <- file.path(tempfile(), "projectB")
+dir.create(proj_root_b, recursive = TRUE)
+dir.create(file.path(proj_root_b, ".git/"))
+
+message("dvs repository for project B with a storage directory provided")
+setwd(proj_root_b)
+system2(
+  "dvs",
+  c("init", storage_directory)
+)
+
+# create data directories
+
+fs::dir_create(proj_root_a, "data")
+fs::dir_create(proj_root_b, "data")
+
+# store one data file in the two projects
+
+write.table(
+  file = file.path(proj_root_a, "data", "theoph_head_15.tab"),
+  head(Theoph, 15),
+  eol = "\n"
+)
+write.table(
+  file = file.path(proj_root_b, "data", "theoph_head_15.tab"),
+  head(Theoph, 15),
+  eol = "\n"
+)
+
+# store another file in the two projects
+
+write.table(
+  file = file.path(proj_root_a, "data", "theoph_head_23.tab"),
+  head(Theoph, 23),
+  eol = "\n"
+)
+write.table(
+  file = file.path(proj_root_b, "data", "theoph_head_23.tab"),
+  head(Theoph, 23),
+  eol = "\n"
+)
+
+
+# add the two files (15/23) to project A
+
+setwd(proj_root_a)
+system2(
+  "dvs",
+  c(
+    "add",
+    file.path(proj_root_a, "data", "theoph_head_23.tab"),
+    "--message",
+    r"("added head(23) of theoph in tab format")"
+  )
+)
+system2(
+  "dvs",
+  c(
+    "add",
+    file.path(proj_root_a, "data", "theoph_head_15.tab"),
+    "--message",
+    r"("added head(15) of theoph in tab format")"
+  )
+)
+
+# audit log in common storage
+
+fs::dir_tree(
+  tempdir(),
+  recurse = TRUE,
+  all = TRUE
+  # invert = TRUE,
+  # glob = "*.git"
+)
+
+readLines(fs::path(storage_directory, "audit.log.jsonl")) |>
+  cat(sep = "\n")
+
+
+# add the two files to project B
+
+setwd(proj_root_b)
+system2(
+  "dvs",
+  c(
+    "add",
+    file.path(proj_root_b, "data", "theoph_head_23.tab"),
+    "--message",
+    r"("added head(23) of theoph in tab format")"
+  )
+)
+system2(
+  "dvs",
+  c(
+    "add",
+    file.path(proj_root_b, "data", "theoph_head_15.tab"),
+    "--message",
+    r"("added head(15) of theoph in tab format")"
+  )
+)
+
+fs::dir_tree(
+  tempdir(),
+  recurse = TRUE,
+  all = TRUE,
+  invert = TRUE,
+  glob = "*.git"
+)
+
+
+readLines(fs::path(storage_directory, "audit.log.jsonl")) |>
+  cat(sep = "\n")
+
+#' Add the Theoph(15) file, with the same message twice.
+#'
+
+setwd(proj_root_a)
+system2(
+  "dvs",
+  c(
+    "add",
+    file.path(proj_root_a, "data", "theoph_head_15.tab"),
+    "--message",
+    r"("added head(15) of theoph in tab format")"
+  )
+)
+setwd(proj_root_b)
+system2(
+  "dvs",
+  c(
+    "add",
+    file.path(proj_root_b, "data", "theoph_head_15.tab"),
+    "--message",
+    r"("added head(15) of theoph in tab format")"
+  )
+)
+
+fs::dir_tree(
+  tempdir(),
+  recurse = TRUE,
+  all = TRUE,
+  glob = "*/.git",
+  invert = TRUE
+)
+
+readLines(fs::path(storage_directory, "audit.log.jsonl")) |>
+  cat(sep = "\n")
+
+
+message(
+  "open visual studio code in `tempdir()` (session constant)",
+  " to have an overview over all file changes"
+)
+browseURL(url = tempdir(), browser = "code")
diff --git a/ui/scripts/add_03.sh b/ui/scripts/add_03.sh
new file mode 100755
index 0000000..ef12769
--- /dev/null
+++ b/ui/scripts/add_03.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+set -euo pipefail
+LOGFILE="${0##*/}"
+LOGFILE="${LOGFILE%.*}.log" # remove last extension
+exec > >(tee "$LOGFILE") 2>&1
+
+run() { echo "$ $*"; "$@"; echo; }
+
+TMPDIR="$(mktemp -d)"
+# trap 'rm -rf "$TMPDIR"' EXIT # remove everything when exit
+
+cd "$TMPDIR"
+mkdir projectA
+mkdir storage
+
+STORAGE="$TMPDIR/storage"
+PROJECT_A="$TMPDIR/projectA"
+
+run cd "$PROJECT_A"
+run git init
+run mkdir data
+run dvs init "$STORAGE"
+# echo "$PWD"
+# run ls -la
+run tree "$TMPDIR"
+run R -q -e 'write.csv(head(Theoph), "data/theoph-head.csv")'
+run dvs add data/theoph-head.csv --message "adding file within-tree"
+run cat "$STORAGE"/audit.log.jsonl
+
+# adding file outside of project tree
+
+run R -q --vanilla -e "write.csv(head(Theoph), \"$TMPDIR/theoph-head.csv\")"
+
+run dvs add "$TMPDIR"/theoph-head.csv --message "addding a file out-of-tree"
+
+# NOTE(review): reached only if the out-of-tree add succeeded; the script then always exits non-zero - confirm this is intended.
+exit 1
diff --git a/ui/scripts/batch_par_add.sh b/ui/scripts/batch_par_add.sh
new file mode 100755
index 0000000..08c83b9
--- /dev/null
+++ b/ui/scripts/batch_par_add.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+set -euo pipefail
+LOGFILE="${0##*/}"
+LOGFILE="${LOGFILE%.*}.log"
+exec > >(tee "$LOGFILE") 2>&1
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+
+# Archetypes:
+#   n      size  description
+#   10     1e6   few large files (big genomics tables)
+#   100    1e5   moderate count, moderate size (typical clinical datasets)
+#   1000   1e4   many small-medium files (batch simulation outputs)
+#   10000  1e2   lots of tiny files (parameter sweeps / configs)
+
+CONFIGS=(
+  "10 1e6"
+  "100 1e5"
+  "1000 1e4"
+  "10000 1e2"
+)
+
+echo "==========================================="
+echo " Batch benchmark: par_add.sh"
+echo "==========================================="
+echo
+
+for cfg in "${CONFIGS[@]}"; do
+  read -r n size <<< "$cfg"
+  echo "###############################################"
+  echo "### n=$n size=$size"
+  echo "###############################################"
+  echo
+  bash "$SCRIPT_DIR/par_add.sh" "$n" "$size"
+  echo
+done
+
+# collect all summaries at the end; a log without a summary must not abort the loop (pipefail + set -e)
+echo ""
+echo "=================================================="
+echo " Combined Summary"
+echo "=================================================="
+for cfg in "${CONFIGS[@]}"; do
+  read -r n size <<< "$cfg"
+  logfile="par_add_n${n}_s${size}.log"
+  if [ -f "$logfile" ]; then
+    grep -A100 "Timing Summary" "$logfile" | head -20 || true
+    echo
+  fi
+done
diff --git a/ui/scripts/init.R b/ui/scripts/init.R
new file mode 100755
index 0000000..7acbd5a
--- /dev/null
+++ b/ui/scripts/init.R
@@ -0,0 +1,56 @@
+#' Walk through `dvs init`, with and without `--no-compression`.
+
+if (!exists("dvs_workspace")) {
+  dvs_workspace <- getwd()
+}
+
+dvs_workspace
+# Run `just` from the workspace: the working directory may have been changed
+# by an earlier script in the same session.
+withr::with_dir(
+  dvs_workspace,
+  system2("just", "install-cli")
+)
+
+system2("dvs")
+
+proj_root <- file.path(tempfile(), "projectA")
+proj_root
+dir.create(proj_root, recursive = TRUE)
+
+setwd(proj_root)
+
+system2(
+  "dvs", c("init", "--help")
+)
+
+system2(
+  "dvs", c("init", proj_root)
+)
+
+message("Missing a .git folder, as otherwise `dvs init` does not work")
+
+dir.create(file.path(proj_root, ".git/"))
+
+system2(
+  "dvs", c("init", proj_root)
+)
+# IDEA: Absolute path should be printed, at least in the R package
+
+fs::dir_tree(proj_root)
+
+readLines(file.path(proj_root, "dvs.toml")) |> cat(sep = "\n")
+
+message("now, recreate a project with --no-compression")
+
+proj_root <- file.path(tempfile(), "projectA")
+proj_root
+dir.create(proj_root, recursive = TRUE)
+
+setwd(proj_root)
+dir.create(file.path(proj_root, ".git/"))
+
+system2(
+  "dvs", c("init", proj_root, "--no-compression")
+)
+fs::dir_tree(proj_root)
+
+readLines(file.path(proj_root, "dvs.toml")) |> cat(sep = "\n")
diff --git a/ui/scripts/par_add.sh b/ui/scripts/par_add.sh
new file mode 100755
index 0000000..020fff4
--- /dev/null
+++ b/ui/scripts/par_add.sh
@@ -0,0 +1,168 @@
+#!/usr/bin/env bash
+set -euo pipefail
+LOGFILE="${0##*/}"
+LOGFILE="${LOGFILE%.*}" # remove last extension
+# if called with args, include them in log name
+if [ $# -ge 1 ]; then
+  LOGFILE="${LOGFILE}_n${1}_s${2:-1e6}.log"
+else
+  LOGFILE="${LOGFILE}.log"
+fi
+exec > >(tee "$LOGFILE") 2>&1
+
+run() { echo "$ $*"; "$@"; echo; }
+tic() { TIC_START=$(perl -MTime::HiRes=time -e 'printf "%.3f", time'); }
+toc() {
+  local end elapsed
+  end=$(perl -MTime::HiRes=time -e 'printf "%.3f", time')
+  elapsed=$(echo "$end - $TIC_START" | bc)
+  echo "elapsed: ${elapsed}s"; echo
+  TIMINGS+=("$elapsed")
+}
+TIMINGS=()
+LABELS=()
+
+N="${1:-100}"
+SIZE="${2:-1e6}"
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+
+TMPDIR="$(mktemp -d)"
+trap 'rm -rf "$TMPDIR"' EXIT
+
+cd "$TMPDIR"
+mkdir projectA
+mkdir storage
+
+STORAGE="$TMPDIR/storage"
+PROJECT_A="$TMPDIR/projectA"
+
+run cd "$PROJECT_A"
+run git init
+run mkdir data
+
+# generate datasets into 4 subdirectories
+run R -q -e "source('$SCRIPT_DIR/R/gendata.R'); gendata(files='data/add_par_print/', n=$N, size=$SIZE); gendata(files='data/add_seq_print/', n=$N, size=$SIZE); gendata(files='data/add_par_quiet/', n=$N, size=$SIZE); gendata(files='data/add_seq_quiet/', n=$N, size=$SIZE)"
+
+run dvs init "$STORAGE"
+run tree "$TMPDIR"
+
+# parallel add — with printing
+LABELS+=("add: parallel, with printing")
+echo "=== dvs add (parallel, with printing) ==="
+tic
+run dvs add data/add_par_print/Theoph_n_*.tab --message "parallel add with printing"
+toc
+
+run cat "$STORAGE"/audit.log.jsonl
+run tree "$TMPDIR"
+
+# show .dvs metadata for the first 10 files (or all of them when N < 10)
+echo "=== .dvs metadata for first 10 added files ==="
+for (( i = 1; i <= N && i <= 10; i++ )); do
+  f=".dvs/data/add_par_print/Theoph_n_${i}.tab.dvs"
+  echo "--- $f ---"
+  cat "$f"
+  echo
+done
+
+# sequential add — with printing
+LABELS+=("add: sequential, with printing")
+echo "=== dvs add (sequential, with printing) ==="
+tic
+run env DVS_NUM_THREADS=1 dvs add data/add_seq_print/Theoph_n_*.tab --message "sequential add with printing"
+toc
+
+# parallel add — without printing
+LABELS+=("add: parallel, no printing")
+echo "=== dvs add (parallel, no printing) ==="
+tic
+dvs add data/add_par_quiet/Theoph_n_*.tab --message "parallel add no printing" > /dev/null 2>&1
+toc
+
+# sequential add — without printing
+LABELS+=("add: sequential, no printing")
+echo "=== dvs add (sequential, no printing) ==="
+tic
+DVS_NUM_THREADS=1 dvs add data/add_seq_quiet/Theoph_n_*.tab --message "sequential add no printing" > /dev/null 2>&1
+toc
+
+# --- status benchmarks ---
+
+# parallel status — with printing
+LABELS+=("status: parallel, with printing")
+echo "=== dvs status (parallel, with printing) ==="
+tic
+run dvs status
+toc
+
+# parallel status — without printing
+LABELS+=("status: parallel, no printing")
+echo "=== dvs status (parallel, no printing) ==="
+tic
+dvs status > /dev/null 2>&1
+toc
+
+# sequential status — with printing
+LABELS+=("status: sequential, with printing")
+echo "=== dvs status (sequential, with printing) ==="
+tic
+run env DVS_NUM_THREADS=1 dvs status
+toc
+
+# sequential status — without printing
+LABELS+=("status: sequential, no printing")
+echo "=== dvs status (sequential, no printing) ==="
+tic
+DVS_NUM_THREADS=1 dvs status > /dev/null 2>&1
+toc
+
+# --- get benchmarks ---
+
+# delete local data files so get has something to fetch
+run rm data/add_par_print/Theoph_n_*.tab
+
+# parallel get — with printing
+LABELS+=("get: parallel, with printing")
+echo "=== dvs get (parallel, with printing) ==="
+tic
+run dvs get --glob "data/add_par_print/Theoph_n_*.tab"
+toc
+
+rm data/add_par_print/Theoph_n_*.tab
+
+# parallel get — without printing
+LABELS+=("get: parallel, no printing")
+echo "=== dvs get (parallel, no printing) ==="
+tic
+dvs get --glob "data/add_par_print/Theoph_n_*.tab" > /dev/null 2>&1
+toc
+
+rm data/add_par_print/Theoph_n_*.tab
+
+# sequential get — with printing
+LABELS+=("get: sequential, with printing")
+echo "=== dvs get (sequential, with printing) ==="
+tic
+run env DVS_NUM_THREADS=1 dvs get --glob "data/add_par_print/Theoph_n_*.tab"
+toc
+
+rm data/add_par_print/Theoph_n_*.tab
+
+# sequential get — without printing
+LABELS+=("get: sequential, no printing")
+echo "=== dvs get (sequential, no printing) ==="
+tic
+DVS_NUM_THREADS=1 dvs get --glob "data/add_par_print/Theoph_n_*.tab" > /dev/null 2>&1
+toc
+
+# summary
+echo "==========================================="
+echo " Timing Summary ($N files, $SIZE rows each)"
+echo "==========================================="
+for i in "${!LABELS[@]}"; do
+  printf " %-40s %ss\n" "${LABELS[$i]}" "${TIMINGS[$i]}"
+done
+echo
+
+exit 0