Skip to content

storage duplicating same files in different projects #44

@CGMossa

Description

@CGMossa

As already discussed: when two projects point to a common storage directory and the same data is added to both, the audit log ends up with duplicate entries. However, because the data hashes to the same value, there is only one shared entry in the storage directory itself.

Setup phase

Details

> if (!exists("dvs_workspace")) {
+   dvs_workspace <- getwd()
+ }

> source(file.path(dvs_workspace, "ui/scripts/R", "tree.R"), echo = TRUE)

> fs_manual_dir_tree <- function(path = ".", recurse = TRUE, ...) {
+   files <- fs::dir_ls(path, recurse = recurse, ...)
+   by_dir <- split(files, f .... [TRUNCATED] 

> dvs_workspace
[1] "/Users/elea/Documents/a2ai_github/dvs2"

> withr::with_dir(
+   dvs_workspace,
+   system2(
+     "just",
+     "install-cli",
+   )
+ )
cargo install --force --locked --path=dvs-cli 

The following initializes the two projects:

Details

> proj_root_a <- file.path(tempfile(), "projectA")

> proj_root_a
[1] "/var/folders/_x/bq8vb1b156sgl363l71by61h0000gn/T//Rtmp4Fdbri/file143597d2ac1/projectA"

> dir.create(proj_root_a, recursive = TRUE)

> dir.create(file.path(proj_root_a, ".git/"))

> message("define global storage directory")
define global storage directory

> storage_directory <- file.path(tempdir(), "dvs_data_directory")

> storage_directory
[1] "/var/folders/_x/bq8vb1b156sgl363l71by61h0000gn/T//Rtmp4Fdbri/dvs_data_directory"

> message("dvs repository for project A with a storage directory provided")
dvs repository for project A with a storage directory provided

> setwd(proj_root_a)

> system2(
+   "dvs",
+   c("init", storage_directory)
+ )
DVS Initialized

> proj_root_b <- file.path(tempfile(), "projectB")

> dir.create(proj_root_b, recursive = TRUE)

> dir.create(file.path(proj_root_b, ".git/"))

> message("dvs repository for project B with a storage directory provided")
dvs repository for project B with a storage directory provided

> setwd(proj_root_b)

> system2(
+   "dvs",
+   c("init", storage_directory)
+ )
DVS Initialized

> # create data directories
> 
> fs::dir_create(proj_root_a, "data")

> fs::dir_create(proj_root_b, "data")

Add and store two distinct data files in each of the two projects (the file contents are identical across the projects).

Details

> # store one data file in the two projects
> 
> write.table(
+   file = file.path(proj_root_a, "data", "theoph_head_15.tab"),
+   head(Theoph, 15),
+   eol = "\n"
+ )

> write.table(
+   file = file.path(proj_root_b, "data", "theoph_head_15.tab"),
+   head(Theoph, 15),
+   eol = "\n"
+ )

> # store another file in the two projects
> 
> write.table(
+   file = file.path(proj_root_a, "data", "theoph_head_23.tab"),
+   head(Theoph, 23),
+   eol = "\n"
+ )

> write.table(
+   file = file.path(proj_root_b, "data", "theoph_head_23.tab"),
+   head(Theoph, 23),
+   eol = "\n"
+ )

> # add two 15/23 files to projects a and b
> 
> setwd(proj_root_a)

> system2(
+   "dvs",
+   c(
+     "add",
+     file.path(proj_root_a, "data", "theoph_head_23.tab"),
+     "--message",
+     r"("added head(23) of theoph in tab format")"
+   )
+ )
Added: data/theoph_head_23.tab

> system2(
+   "dvs",
+   c(
+     "add",
+     file.path(proj_root_a, "data", "theoph_head_15.tab"),
+     "--message",
+     r"("added head(15) of theoph in tab format")"
+   )
+ )
Added: data/theoph_head_15.tab

Let's inspect the whole tree and the audit log:

> fs::dir_tree(
+   tempdir(),
+   recurse = TRUE,
+   all = TRUE,
+   # invert = TRUE,
+   # glob = "*.git"
+ )
/var/folders/_x/bq8vb1b156sgl363l71by61h0000gn/T//Rtmp4Fdbri
├── dvs_data_directory
│   ├── 07
│   │   └── 53308ffe7cb4bcefa85def0a4692da383510e3a5e1755c8cfea6558f595e32
│   ├── audit.log.jsonl
│   └── d1
│       └── aa2ed583c752d5212e4ac18cc3e2005048b92ce5e1fa43521c2046f5995a81
├── file143591981e527
│   └── projectB
│       ├── .dvs
│       ├── .git
│       ├── data
│       │   ├── theoph_head_15.tab
│       │   └── theoph_head_23.tab
│       └── dvs.toml
└── file143597d2ac1
    └── projectA
        ├── .dvs
        │   ├── .cache
        │   │   └── dvs.db
        │   ├── .gitignore
        │   └── data
        │       ├── theoph_head_15.tab.dvs
        │       └── theoph_head_23.tab.dvs
        ├── .git
        ├── data
        │   ├── .gitignore
        │   ├── theoph_head_15.tab
        │   └── theoph_head_23.tab
        └── dvs.toml

> readLines(fs::path(storage_directory, "audit.log.jsonl")) |>
+   cat(sep = "\n")
{"operation_id":"ff3d0908-10a4-47d0-879a-05fba46bebe9","timestamp":1771932025,"user":"elea","file":{"path":"data/theoph_head_23.tab","hashes":{"blake3":"0753308ffe7cb4bcefa85def0a4692da383510e3a5e1755c8cfea6558f595e32","md5":"d407dfbc21154b4efc341e3ae8d03ca4"}},"action":"add"}
{"operation_id":"058f42b6-6cb1-48e4-a578-42a1d8c053c8","timestamp":1771932025,"user":"elea","file":{"path":"data/theoph_head_15.tab","hashes":{"blake3":"d1aa2ed583c752d5212e4ac18cc3e2005048b92ce5e1fa43521c2046f5995a81","md5":"2be6986c70a0208f6071c25d5d1a3353"}},"action":"add"}

> # <continue> add the two files to project B
> 
> setwd(proj_root_b)

> system2(
+   "dvs",
+   c(
+     "add",
+     file.path(proj_root_b, "data", "theoph_head_23.tab"),
+     "--message",
+     r"("added head(23) of theoph in tab format")"
+   )
+ )
Added: data/theoph_head_23.tab

> system2(
+   "dvs",
+   c(
+     "add",
+     file.path(proj_root_b, "data", "theoph_head_15.tab"),
+     "--message",
+     r"("added head(15) of theoph in tab format")"
+   )
+ )
Added: data/theoph_head_15.tab

> fs::dir_tree(
+   tempdir(),
+   recurse = TRUE,
+   all = TRUE,
+   invert = TRUE,
+   glob = "*.git"
+ )
/var/folders/_x/bq8vb1b156sgl363l71by61h0000gn/T//Rtmp4Fdbri
├── dvs_data_directory
│   ├── 07
│   │   └── 53308ffe7cb4bcefa85def0a4692da383510e3a5e1755c8cfea6558f595e32
│   ├── audit.log.jsonl
│   └── d1
│       └── aa2ed583c752d5212e4ac18cc3e2005048b92ce5e1fa43521c2046f5995a81
├── file143591981e527
│   └── projectB
│       ├── .dvs
│       │   ├── .cache
│       │   │   └── dvs.db
│       │   ├── .gitignore
│       │   └── data
│       │       ├── theoph_head_15.tab.dvs
│       │       └── theoph_head_23.tab.dvs
│       ├── data
│       │   ├── .gitignore
│       │   ├── theoph_head_15.tab
│       │   └── theoph_head_23.tab
│       └── dvs.toml
└── file143597d2ac1
    └── projectA
        ├── .dvs
        │   ├── .cache
        │   │   └── dvs.db
        │   ├── .gitignore
        │   └── data
        │       ├── theoph_head_15.tab.dvs
        │       └── theoph_head_23.tab.dvs
        ├── data
        │   ├── .gitignore
        │   ├── theoph_head_15.tab
        │   └── theoph_head_23.tab
        └── dvs.toml

> readLines(fs::path(storage_directory, "audit.log.jsonl")) |>
+   cat(sep = "\n")
{"operation_id":"ff3d0908-10a4-47d0-879a-05fba46bebe9","timestamp":1771932025,"user":"elea","file":{"path":"data/theoph_head_23.tab","hashes":{"blake3":"0753308ffe7cb4bcefa85def0a4692da383510e3a5e1755c8cfea6558f595e32","md5":"d407dfbc21154b4efc341e3ae8d03ca4"}},"action":"add"}
{"operation_id":"058f42b6-6cb1-48e4-a578-42a1d8c053c8","timestamp":1771932025,"user":"elea","file":{"path":"data/theoph_head_15.tab","hashes":{"blake3":"d1aa2ed583c752d5212e4ac18cc3e2005048b92ce5e1fa43521c2046f5995a81","md5":"2be6986c70a0208f6071c25d5d1a3353"}},"action":"add"}
{"operation_id":"6f95227d-b568-444d-a51d-8bc27e92304a","timestamp":1771932025,"user":"elea","file":{"path":"data/theoph_head_23.tab","hashes":{"blake3":"0753308ffe7cb4bcefa85def0a4692da383510e3a5e1755c8cfea6558f595e32","md5":"d407dfbc21154b4efc341e3ae8d03ca4"}},"action":"add"}
{"operation_id":"ed55f391-a77a-42e3-971f-c4f66694aa57","timestamp":1771932025,"user":"elea","file":{"path":"data/theoph_head_15.tab","hashes":{"blake3":"d1aa2ed583c752d5212e4ac18cc3e2005048b92ce5e1fa43521c2046f5995a81","md5":"2be6986c70a0208f6071c25d5d1a3353"}},"action":"add"}

> #' Add the Theoph(15) file, with the same message twice.
> #'
> 
> setwd(proj_root_a)

> system2(
+   "dvs",
+   c(
+     "add",
+     file.path(proj_root_a, "data", "theoph_head_15.tab"),
+     "--message",
+     r"("added head(23) of theoph in tab format")"
+   )
+ )
Added: data/theoph_head_15.tab

> setwd(proj_root_b)

> system2(
+   "dvs",
+   c(
+     "add",
+     file.path(proj_root_b, "data", "theoph_head_15.tab"),
+     "--message",
+     r"("added head(15) of theoph in tab format")"
+   )
+ )
Added: data/theoph_head_15.tab

> fs::dir_tree(
+   tempdir(),
+   recurse = TRUE,
+   all = TRUE,
+   glob = "*/.git",
+   invert = TRUE
+ )
/var/folders/_x/bq8vb1b156sgl363l71by61h0000gn/T//Rtmp4Fdbri
├── dvs_data_directory
│   ├── 07
│   │   └── 53308ffe7cb4bcefa85def0a4692da383510e3a5e1755c8cfea6558f595e32
│   ├── audit.log.jsonl
│   └── d1
│       └── aa2ed583c752d5212e4ac18cc3e2005048b92ce5e1fa43521c2046f5995a81
├── file143591981e527
│   └── projectB
│       ├── .dvs
│       │   ├── .cache
│       │   │   └── dvs.db
│       │   ├── .gitignore
│       │   └── data
│       │       ├── theoph_head_15.tab.dvs
│       │       └── theoph_head_23.tab.dvs
│       ├── data
│       │   ├── .gitignore
│       │   ├── theoph_head_15.tab
│       │   └── theoph_head_23.tab
│       └── dvs.toml
└── file143597d2ac1
    └── projectA
        ├── .dvs
        │   ├── .cache
        │   │   └── dvs.db
        │   ├── .gitignore
        │   └── data
        │       ├── theoph_head_15.tab.dvs
        │       └── theoph_head_23.tab.dvs
        ├── data
        │   ├── .gitignore
        │   ├── theoph_head_15.tab
        │   └── theoph_head_23.tab
        └── dvs.toml

> readLines(fs::path(storage_directory, "audit.log.jsonl")) |>
+   cat(sep = "\n")
{"operation_id":"ff3d0908-10a4-47d0-879a-05fba46bebe9","timestamp":1771932025,"user":"elea","file":{"path":"data/theoph_head_23.tab","hashes":{"blake3":"0753308ffe7cb4bcefa85def0a4692da383510e3a5e1755c8cfea6558f595e32","md5":"d407dfbc21154b4efc341e3ae8d03ca4"}},"action":"add"}
{"operation_id":"058f42b6-6cb1-48e4-a578-42a1d8c053c8","timestamp":1771932025,"user":"elea","file":{"path":"data/theoph_head_15.tab","hashes":{"blake3":"d1aa2ed583c752d5212e4ac18cc3e2005048b92ce5e1fa43521c2046f5995a81","md5":"2be6986c70a0208f6071c25d5d1a3353"}},"action":"add"}
{"operation_id":"6f95227d-b568-444d-a51d-8bc27e92304a","timestamp":1771932025,"user":"elea","file":{"path":"data/theoph_head_23.tab","hashes":{"blake3":"0753308ffe7cb4bcefa85def0a4692da383510e3a5e1755c8cfea6558f595e32","md5":"d407dfbc21154b4efc341e3ae8d03ca4"}},"action":"add"}
{"operation_id":"ed55f391-a77a-42e3-971f-c4f66694aa57","timestamp":1771932025,"user":"elea","file":{"path":"data/theoph_head_15.tab","hashes":{"blake3":"d1aa2ed583c752d5212e4ac18cc3e2005048b92ce5e1fa43521c2046f5995a81","md5":"2be6986c70a0208f6071c25d5d1a3353"}},"action":"add"}

A convenient way to open VS Code:

> message(
+   "open visual studio code in `tempdir()` (session constant)",
+   " to have an overview over all file changes"
+ )
open visual studio code in `tempdir()` (session constant) to have an overview over all file changes

> browseURL(url = tempdir(), browser = "code")
>

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions