From 01145f337786fa0b44dac9c5713c9d5d6e4693eb Mon Sep 17 00:00:00 2001
From: simlecode <69969590+simlecode@users.noreply.github.com>
Date: Tue, 29 Jul 2025 09:59:21 +0800
Subject: [PATCH 1/2] opt: add ChunkParams
---
chunk.go | 63 ++++++++++++++++++++++--------------------
cmd/graphsplit/main.go | 27 ++++++++++++++++--
extra_file.go | 2 +-
utils.go | 16 ++++-------
4 files changed, 64 insertions(+), 44 deletions(-)
diff --git a/chunk.go b/chunk.go
index 509aa26..2d097a8 100644
--- a/chunk.go
+++ b/chunk.go
@@ -154,34 +154,37 @@ func ErrCallback() GraphBuildCallback {
return &errCallback{}
}
-func Chunk(ctx context.Context,
- expectSliceSize int64,
- parentPath,
- targetPath,
- carDir,
- graphName string,
- parallel int,
- cb GraphBuildCallback,
- ef *ExtraFile,
- randomRenameSourceFile bool,
- randomSelectFile bool,
-) error {
+type ChunkParams struct { // ChunkParams bundles the arguments consumed by Chunk and BuildIpldGraph.
+ ExpectSliceSize int64 // must be non-zero; Chunk subtracts Ef.sliceSize from it to get the per-part size
+ ParentPath string // Chunk falls back to TargetPath when this is empty
+ TargetPath string // input path handed to GetGraphCount
+ CarDir string // car-dir from the CLI, which requires the directory to exist
+ GraphName string // base name passed to GenGraphName for every slice
+ Parallel int // must be greater than 0
+ Cb GraphBuildCallback // BuildIpldGraph reports each graph through OnSuccess / OnError
+ Ef *ExtraFile // dereferenced by Chunk (sliceSize, getFiles), so it must be non-nil
+ RandomRenameSourceFile bool // when true, files are passed through tryRenameFileName
+ RandomSelectFile bool // CLI flag random-select-file ("random select file to chunk")
+ SkipFilename bool // CLI flag skip-filename ("manifest csv detail not contain filename")
+}
+
+func Chunk(ctx context.Context, params *ChunkParams) error {
var cumuSize int64 = 0
graphSliceCount := 0
graphFiles := make([]Finfo, 0)
- if expectSliceSize == 0 {
+ if params.ExpectSliceSize == 0 {
return fmt.Errorf("slice size has been set as 0")
}
- if parallel <= 0 {
+ if params.Parallel <= 0 {
return fmt.Errorf("parallel has to be greater than 0")
}
- if parentPath == "" {
- parentPath = targetPath
+ if params.ParentPath == "" {
+ params.ParentPath = params.TargetPath
}
- partSliceSize := expectSliceSize - ef.sliceSize
- args := []string{targetPath}
- sliceTotal := GetGraphCount(args, expectSliceSize)
+ partSliceSize := params.ExpectSliceSize - params.Ef.sliceSize
+ args := []string{params.TargetPath}
+ sliceTotal := GetGraphCount(args, params.ExpectSliceSize)
if sliceTotal == 0 {
log.Warn("Empty folder or file!")
return nil
@@ -197,7 +200,7 @@ func Chunk(ctx context.Context,
for _, item := range allFiles {
item := item
- if randomRenameSourceFile {
+ if params.RandomRenameSourceFile {
item = tryRenameFileName([]Finfo{item})[0]
}
// log.Infof("name: %s", item.Name)
@@ -210,9 +213,9 @@ func Chunk(ctx context.Context,
cumuSize += fileSize
graphFiles = append(graphFiles, item)
// todo build ipld from graphFiles
- BuildIpldGraph(ctx, append(ef.getFiles(), graphFiles...), GenGraphName(graphName, graphSliceCount, sliceTotal), parentPath, carDir, parallel, cb, expectSliceSize, ef)
+ BuildIpldGraph(ctx, append(params.Ef.getFiles(), graphFiles...), GenGraphName(params.GraphName, graphSliceCount, sliceTotal), params)
log.Infof("cumu-size: %d", cumuSize)
- log.Infof("%s", GenGraphName(graphName, graphSliceCount, sliceTotal))
+ log.Infof("%s", GenGraphName(params.GraphName, graphSliceCount, sliceTotal))
log.Infof("=================")
cumuSize = 0
graphFiles = make([]Finfo, 0)
@@ -234,16 +237,16 @@ func Chunk(ctx context.Context,
SeekStart: seekStart,
SeekEnd: seekEnd,
}
- if randomRenameSourceFile {
+ if params.RandomRenameSourceFile {
graphFiles = append(graphFiles, tryRenameFileName([]Finfo{fi})...)
} else {
graphFiles = append(graphFiles, fi)
}
fileSliceCount++
// todo build ipld from graphFiles
- BuildIpldGraph(ctx, append(ef.getFiles(), graphFiles...), GenGraphName(graphName, graphSliceCount, sliceTotal), parentPath, carDir, parallel, cb, expectSliceSize, ef)
+ BuildIpldGraph(ctx, append(params.Ef.getFiles(), graphFiles...), GenGraphName(params.GraphName, graphSliceCount, sliceTotal), params)
log.Infof("cumu-size: %d", cumuSize+firstCut)
- log.Infof("%s", GenGraphName(graphName, graphSliceCount, sliceTotal))
+ log.Infof("%s", GenGraphName(params.GraphName, graphSliceCount, sliceTotal))
log.Infof("=================")
cumuSize = 0
graphFiles = make([]Finfo, 0)
@@ -264,7 +267,7 @@ func Chunk(ctx context.Context,
SeekStart: seekStart,
SeekEnd: seekEnd,
}
- if randomRenameSourceFile {
+ if params.RandomRenameSourceFile {
graphFiles = append(graphFiles, tryRenameFileName([]Finfo{fi})...)
} else {
graphFiles = append(graphFiles, fi)
@@ -273,9 +276,9 @@ func Chunk(ctx context.Context,
fileSliceCount++
if seekEnd-seekStart == partSliceSize-1 {
// todo build ipld from graphFiles
- BuildIpldGraph(ctx, append(ef.getFiles(), graphFiles...), GenGraphName(graphName, graphSliceCount, sliceTotal), parentPath, carDir, parallel, cb, expectSliceSize, ef)
+ BuildIpldGraph(ctx, append(params.Ef.getFiles(), graphFiles...), GenGraphName(params.GraphName, graphSliceCount, sliceTotal), params)
log.Infof("cumu-size: %d", partSliceSize)
- log.Infof("%s", GenGraphName(graphName, graphSliceCount, sliceTotal))
+ log.Infof("%s", GenGraphName(params.GraphName, graphSliceCount, sliceTotal))
log.Infof("=================")
cumuSize = 0
graphFiles = make([]Finfo, 0)
@@ -286,9 +289,9 @@ func Chunk(ctx context.Context,
}
if cumuSize > 0 {
// todo build ipld from graphFiles
- BuildIpldGraph(ctx, append(ef.getFiles(), graphFiles...), GenGraphName(graphName, graphSliceCount, sliceTotal), parentPath, carDir, parallel, cb, expectSliceSize, ef)
+ BuildIpldGraph(ctx, append(params.Ef.getFiles(), graphFiles...), GenGraphName(params.GraphName, graphSliceCount, sliceTotal), params)
log.Infof("cumu-size: %d", cumuSize)
- log.Infof("%s", GenGraphName(graphName, graphSliceCount, sliceTotal))
+ log.Infof("%s", GenGraphName(params.GraphName, graphSliceCount, sliceTotal))
log.Infof("=================")
}
return nil
diff --git a/cmd/graphsplit/main.go b/cmd/graphsplit/main.go
index 8352a73..fe64256 100644
--- a/cmd/graphsplit/main.go
+++ b/cmd/graphsplit/main.go
@@ -100,6 +100,11 @@ var chunkCmd = &cli.Command{
Usage: "random select file to chunk",
Value: true,
},
+ &cli.BoolFlag{
+ Name: "skip-filename",
+ Usage: "manifest csv detail not contain filename",
+ Value: true,
+ },
},
ArgsUsage: "",
Action: func(c *cli.Context) error {
@@ -110,6 +115,7 @@ var chunkCmd = &cli.Command{
graphName := c.String("graph-name")
randomRenameSourceFile := c.Bool("random-rename-source-file")
randomSelectFile := c.Bool("random-select-file")
+ skipFilename := c.Bool("skip-filename")
if !graphsplit.ExistDir(carDir) {
return fmt.Errorf("the path of car-dir does not exist")
}
@@ -151,7 +157,8 @@ var chunkCmd = &cli.Command{
return fmt.Errorf("slice size %d + extra file slice size %d exceeds 32 GiB", sliceSize, extraFileSliceSize)
}
log.Infof("extra file slice size: %d, random rename source file: %v, random select file: %v", extraFileSliceSize, randomRenameSourceFile, randomSelectFile)
- rf, err := graphsplit.NewRealFile(strings.TrimSuffix(cfg.ExtraFilePath, "/"), int64(extraFileSliceSize), int64(sliceSize), randomRenameSourceFile)
+ log.Infof("skip filename: %v", skipFilename)
+ ef, err := graphsplit.NewExtraFile(strings.TrimSuffix(cfg.ExtraFilePath, "/"), int64(extraFileSliceSize), int64(sliceSize), randomRenameSourceFile)
if err != nil {
return err
}
@@ -166,15 +173,29 @@ var chunkCmd = &cli.Command{
cb = graphsplit.ErrCallback()
}
+ params := graphsplit.ChunkParams{
+ ExpectSliceSize: int64(sliceSize),
+ ParentPath: parentPath,
+ TargetPath: targetPath,
+ CarDir: carDir,
+ GraphName: graphName,
+ Parallel: int(parallel),
+ Cb: cb,
+ Ef: ef,
+ RandomRenameSourceFile: randomRenameSourceFile,
+ RandomSelectFile: randomSelectFile,
+ SkipFilename: skipFilename,
+ }
+
loop := c.Bool("loop")
fmt.Println("loop: ", loop)
if !loop {
fmt.Println("chunking once...")
- return graphsplit.Chunk(ctx, int64(sliceSize), parentPath, targetPath, carDir, graphName, int(parallel), cb, rf, randomRenameSourceFile, randomSelectFile)
+ return graphsplit.Chunk(ctx, ¶ms)
}
fmt.Println("loop chunking...")
for {
- err = graphsplit.Chunk(ctx, int64(sliceSize), parentPath, targetPath, carDir, graphName, int(parallel), cb, rf, randomRenameSourceFile, randomSelectFile)
+ err = graphsplit.Chunk(ctx, ¶ms)
if err != nil {
return fmt.Errorf("failed to chunk: %v", err)
}
diff --git a/extra_file.go b/extra_file.go
index b0db8b3..13b32c0 100644
--- a/extra_file.go
+++ b/extra_file.go
@@ -15,7 +15,7 @@ type ExtraFile struct {
pieceRawSize int64
}
-func NewRealFile(path string, sliceSize int64, pieceRawSize int64, randomRenameSourceFile bool) (*ExtraFile, error) {
+func NewExtraFile(path string, sliceSize int64, pieceRawSize int64, randomRenameSourceFile bool) (*ExtraFile, error) {
rf := &ExtraFile{path: path, sliceSize: sliceSize, pieceRawSize: pieceRawSize}
if path != "" {
finfo, err := os.Stat(path)
diff --git a/utils.go b/utils.go
index d9f608e..4f47e58 100644
--- a/utils.go
+++ b/utils.go
@@ -135,21 +135,17 @@ func (b *FSBuilder) getNodeByLink(ln *ipld.Link) (fn fsNode, err error) {
func BuildIpldGraph(ctx context.Context,
fileList []Finfo,
- graphName,
- parentPath,
- carDir string,
- parallel int,
- cb GraphBuildCallback,
- sliceSize int64,
- ef *ExtraFile,
+ graphName string,
+ params *ChunkParams,
) {
- buf, payloadCid, fsDetail, err := buildIpldGraph(ctx, fileList, parentPath, parallel, sliceSize, ef)
+ buf, payloadCid, fsDetail, err := buildIpldGraph(ctx, fileList, params.ParentPath, params.Parallel,
+ params.ExpectSliceSize, params.Ef)
if err != nil {
// log.Fatal(err)
- cb.OnError(err)
+ params.Cb.OnError(err)
return
}
- cb.OnSuccess(buf, graphName, payloadCid, fsDetail)
+ params.Cb.OnSuccess(buf, graphName, payloadCid, fsDetail)
}
func buildIpldGraph(ctx context.Context,
From 2ced23b136760490b42b1989080a18a06ad47701 Mon Sep 17 00:00:00 2001
From: simlecode <69969590+simlecode@users.noreply.github.com>
Date: Tue, 29 Jul 2025 15:09:18 +0800
Subject: [PATCH 2/2] feat: allow manifest detail to skip file names
---
.gitignore | 1 +
cmd/graphsplit/main.go | 1 -
utils.go | 28 +++++++++++++++++++++++++++-
3 files changed, 28 insertions(+), 2 deletions(-)
diff --git a/.gitignore b/.gitignore
index baa1b5a..7e46f3b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,6 +20,7 @@
.idea
.vscode
cars2
+cars3
config.toml
datas
datas2
diff --git a/cmd/graphsplit/main.go b/cmd/graphsplit/main.go
index fe64256..00a1083 100644
--- a/cmd/graphsplit/main.go
+++ b/cmd/graphsplit/main.go
@@ -103,7 +103,6 @@ var chunkCmd = &cli.Command{
&cli.BoolFlag{
Name: "skip-filename",
Usage: "manifest csv detail not contain filename",
- Value: true,
},
},
ArgsUsage: "",
diff --git a/utils.go b/utils.go
index 4f47e58..91dc0dd 100644
--- a/utils.go
+++ b/utils.go
@@ -9,6 +9,7 @@ import (
"math/rand"
"os"
"path"
+ "path/filepath"
"regexp"
"runtime"
"strings"
@@ -139,7 +140,7 @@ func BuildIpldGraph(ctx context.Context,
params *ChunkParams,
) {
buf, payloadCid, fsDetail, err := buildIpldGraph(ctx, fileList, params.ParentPath, params.Parallel,
- params.ExpectSliceSize, params.Ef)
+ params.ExpectSliceSize, params.Ef, params.SkipFilename)
if err != nil {
// log.Fatal(err)
params.Cb.OnError(err)
@@ -154,6 +155,7 @@ func buildIpldGraph(ctx context.Context,
parallel int,
sliceSize int64,
ef *ExtraFile,
+ skipFilename bool,
) (*Buffer, string, string, error) {
bs2 := bstore.NewBlockstore(dss.MutexWrap(datastore.NewMapDatastore()))
dagServ := dag.NewDAGService(blockservice.New(bs2, offline.Exchange(bs2)))
@@ -326,6 +328,30 @@ func buildIpldGraph(ctx context.Context,
}
log.Info("++++++++++++ finished to build ipld +++++++++++++")
+ if skipFilename {
+ type dirEntry struct {
+ Path string `json:"path"`
+ }
+ // Start non-nil so an empty result marshals as [] instead of null.
+ dirs := make([]dirEntry, 0)
+ seen := make(map[string]struct{})
+ for _, f := range sfis {
+ dir := filepath.Dir(f.Path)
+ // filepath.Dir never returns ""; "." and "/" name no parent directory.
+ if dir == "." || dir == "/" {
+ continue
+ }
+ if _, ok := seen[dir]; !ok {
+ seen[dir] = struct{}{}
+ dirs = append(dirs, dirEntry{Path: dir})
+ }
+ }
+ fileInfo, err = json.Marshal(dirs)
+ if err != nil {
+ return nil, "", "", err
+ }
+ }
+
return buf, rootNode.Cid().String(), string(fileInfo), nil
}