From 01145f337786fa0b44dac9c5713c9d5d6e4693eb Mon Sep 17 00:00:00 2001 From: simlecode <69969590+simlecode@users.noreply.github.com> Date: Tue, 29 Jul 2025 09:59:21 +0800 Subject: [PATCH 1/2] opt: add ChunkParams --- chunk.go | 63 ++++++++++++++++++++++-------------------- cmd/graphsplit/main.go | 27 ++++++++++++++++-- extra_file.go | 2 +- utils.go | 16 ++++------- 4 files changed, 64 insertions(+), 44 deletions(-) diff --git a/chunk.go b/chunk.go index 509aa26..2d097a8 100644 --- a/chunk.go +++ b/chunk.go @@ -154,34 +154,37 @@ func ErrCallback() GraphBuildCallback { return &errCallback{} } -func Chunk(ctx context.Context, - expectSliceSize int64, - parentPath, - targetPath, - carDir, - graphName string, - parallel int, - cb GraphBuildCallback, - ef *ExtraFile, - randomRenameSourceFile bool, - randomSelectFile bool, -) error { +type ChunkParams struct { + ExpectSliceSize int64 + ParentPath string + TargetPath string + CarDir string + GraphName string + Parallel int + Cb GraphBuildCallback + Ef *ExtraFile + RandomRenameSourceFile bool + RandomSelectFile bool + SkipFilename bool +} + +func Chunk(ctx context.Context, params *ChunkParams) error { var cumuSize int64 = 0 graphSliceCount := 0 graphFiles := make([]Finfo, 0) - if expectSliceSize == 0 { + if params.ExpectSliceSize == 0 { return fmt.Errorf("slice size has been set as 0") } - if parallel <= 0 { + if params.Parallel <= 0 { return fmt.Errorf("parallel has to be greater than 0") } - if parentPath == "" { - parentPath = targetPath + if params.ParentPath == "" { + params.ParentPath = params.TargetPath } - partSliceSize := expectSliceSize - ef.sliceSize - args := []string{targetPath} - sliceTotal := GetGraphCount(args, expectSliceSize) + partSliceSize := params.ExpectSliceSize - params.Ef.sliceSize + args := []string{params.TargetPath} + sliceTotal := GetGraphCount(args, params.ExpectSliceSize) if sliceTotal == 0 { log.Warn("Empty folder or file!") return nil @@ -197,7 +200,7 @@ func Chunk(ctx context.Context, for 
_, item := range allFiles { item := item - if randomRenameSourceFile { + if params.RandomRenameSourceFile { item = tryRenameFileName([]Finfo{item})[0] } // log.Infof("name: %s", item.Name) @@ -210,9 +213,9 @@ func Chunk(ctx context.Context, cumuSize += fileSize graphFiles = append(graphFiles, item) // todo build ipld from graphFiles - BuildIpldGraph(ctx, append(ef.getFiles(), graphFiles...), GenGraphName(graphName, graphSliceCount, sliceTotal), parentPath, carDir, parallel, cb, expectSliceSize, ef) + BuildIpldGraph(ctx, append(params.Ef.getFiles(), graphFiles...), GenGraphName(params.GraphName, graphSliceCount, sliceTotal), params) log.Infof("cumu-size: %d", cumuSize) - log.Infof("%s", GenGraphName(graphName, graphSliceCount, sliceTotal)) + log.Infof("%s", GenGraphName(params.GraphName, graphSliceCount, sliceTotal)) log.Infof("=================") cumuSize = 0 graphFiles = make([]Finfo, 0) @@ -234,16 +237,16 @@ func Chunk(ctx context.Context, SeekStart: seekStart, SeekEnd: seekEnd, } - if randomRenameSourceFile { + if params.RandomRenameSourceFile { graphFiles = append(graphFiles, tryRenameFileName([]Finfo{fi})...) 
} else { graphFiles = append(graphFiles, fi) } fileSliceCount++ // todo build ipld from graphFiles - BuildIpldGraph(ctx, append(ef.getFiles(), graphFiles...), GenGraphName(graphName, graphSliceCount, sliceTotal), parentPath, carDir, parallel, cb, expectSliceSize, ef) + BuildIpldGraph(ctx, append(params.Ef.getFiles(), graphFiles...), GenGraphName(params.GraphName, graphSliceCount, sliceTotal), params) log.Infof("cumu-size: %d", cumuSize+firstCut) - log.Infof("%s", GenGraphName(graphName, graphSliceCount, sliceTotal)) + log.Infof("%s", GenGraphName(params.GraphName, graphSliceCount, sliceTotal)) log.Infof("=================") cumuSize = 0 graphFiles = make([]Finfo, 0) @@ -264,7 +267,7 @@ func Chunk(ctx context.Context, SeekStart: seekStart, SeekEnd: seekEnd, } - if randomRenameSourceFile { + if params.RandomRenameSourceFile { graphFiles = append(graphFiles, tryRenameFileName([]Finfo{fi})...) } else { graphFiles = append(graphFiles, fi) @@ -273,9 +276,9 @@ func Chunk(ctx context.Context, fileSliceCount++ if seekEnd-seekStart == partSliceSize-1 { // todo build ipld from graphFiles - BuildIpldGraph(ctx, append(ef.getFiles(), graphFiles...), GenGraphName(graphName, graphSliceCount, sliceTotal), parentPath, carDir, parallel, cb, expectSliceSize, ef) + BuildIpldGraph(ctx, append(params.Ef.getFiles(), graphFiles...), GenGraphName(params.GraphName, graphSliceCount, sliceTotal), params) log.Infof("cumu-size: %d", partSliceSize) - log.Infof("%s", GenGraphName(graphName, graphSliceCount, sliceTotal)) + log.Infof("%s", GenGraphName(params.GraphName, graphSliceCount, sliceTotal)) log.Infof("=================") cumuSize = 0 graphFiles = make([]Finfo, 0) @@ -286,9 +289,9 @@ func Chunk(ctx context.Context, } if cumuSize > 0 { // todo build ipld from graphFiles - BuildIpldGraph(ctx, append(ef.getFiles(), graphFiles...), GenGraphName(graphName, graphSliceCount, sliceTotal), parentPath, carDir, parallel, cb, expectSliceSize, ef) + BuildIpldGraph(ctx, append(params.Ef.getFiles(), 
graphFiles...), GenGraphName(params.GraphName, graphSliceCount, sliceTotal), params) log.Infof("cumu-size: %d", cumuSize) - log.Infof("%s", GenGraphName(graphName, graphSliceCount, sliceTotal)) + log.Infof("%s", GenGraphName(params.GraphName, graphSliceCount, sliceTotal)) log.Infof("=================") } return nil diff --git a/cmd/graphsplit/main.go b/cmd/graphsplit/main.go index 8352a73..fe64256 100644 --- a/cmd/graphsplit/main.go +++ b/cmd/graphsplit/main.go @@ -100,6 +100,11 @@ var chunkCmd = &cli.Command{ Usage: "random select file to chunk", Value: true, }, + &cli.BoolFlag{ + Name: "skip-filename", + Usage: "manifest csv detail not contain filename", + Value: true, + }, }, ArgsUsage: "", Action: func(c *cli.Context) error { @@ -110,6 +115,7 @@ var chunkCmd = &cli.Command{ graphName := c.String("graph-name") randomRenameSourceFile := c.Bool("random-rename-source-file") randomSelectFile := c.Bool("random-select-file") + skipFilename := c.Bool("skip-filename") if !graphsplit.ExistDir(carDir) { return fmt.Errorf("the path of car-dir does not exist") } @@ -151,7 +157,8 @@ var chunkCmd = &cli.Command{ return fmt.Errorf("slice size %d + extra file slice size %d exceeds 32 GiB", sliceSize, extraFileSliceSize) } log.Infof("extra file slice size: %d, random rename source file: %v, random select file: %v", extraFileSliceSize, randomRenameSourceFile, randomSelectFile) - rf, err := graphsplit.NewRealFile(strings.TrimSuffix(cfg.ExtraFilePath, "/"), int64(extraFileSliceSize), int64(sliceSize), randomRenameSourceFile) + log.Infof("skip filename: %v", skipFilename) + ef, err := graphsplit.NewExtraFile(strings.TrimSuffix(cfg.ExtraFilePath, "/"), int64(extraFileSliceSize), int64(sliceSize), randomRenameSourceFile) if err != nil { return err } @@ -166,15 +173,29 @@ var chunkCmd = &cli.Command{ cb = graphsplit.ErrCallback() } + params := graphsplit.ChunkParams{ + ExpectSliceSize: int64(sliceSize), + ParentPath: parentPath, + TargetPath: targetPath, + CarDir: carDir, + GraphName: 
graphName, + Parallel: int(parallel), + Cb: cb, + Ef: ef, + RandomRenameSourceFile: randomRenameSourceFile, + RandomSelectFile: randomSelectFile, + SkipFilename: skipFilename, + } + loop := c.Bool("loop") fmt.Println("loop: ", loop) if !loop { fmt.Println("chunking once...") - return graphsplit.Chunk(ctx, int64(sliceSize), parentPath, targetPath, carDir, graphName, int(parallel), cb, rf, randomRenameSourceFile, randomSelectFile) + return graphsplit.Chunk(ctx, &params) } fmt.Println("loop chunking...") for { - err = graphsplit.Chunk(ctx, int64(sliceSize), parentPath, targetPath, carDir, graphName, int(parallel), cb, rf, randomRenameSourceFile, randomSelectFile) + err = graphsplit.Chunk(ctx, &params) if err != nil { return fmt.Errorf("failed to chunk: %v", err) } diff --git a/extra_file.go b/extra_file.go index b0db8b3..13b32c0 100644 --- a/extra_file.go +++ b/extra_file.go @@ -15,7 +15,7 @@ type ExtraFile struct { pieceRawSize int64 } -func NewRealFile(path string, sliceSize int64, pieceRawSize int64, randomRenameSourceFile bool) (*ExtraFile, error) { +func NewExtraFile(path string, sliceSize int64, pieceRawSize int64, randomRenameSourceFile bool) (*ExtraFile, error) { rf := &ExtraFile{path: path, sliceSize: sliceSize, pieceRawSize: pieceRawSize} if path != "" { finfo, err := os.Stat(path) diff --git a/utils.go b/utils.go index d9f608e..4f47e58 100644 --- a/utils.go +++ b/utils.go @@ -135,21 +135,17 @@ func (b *FSBuilder) getNodeByLink(ln *ipld.Link) (fn fsNode, err error) { func BuildIpldGraph(ctx context.Context, fileList []Finfo, - graphName, - parentPath, - carDir string, - parallel int, - cb GraphBuildCallback, - sliceSize int64, - ef *ExtraFile, + graphName string, + params *ChunkParams, ) { - buf, payloadCid, fsDetail, err := buildIpldGraph(ctx, fileList, params.ParentPath, parallel, sliceSize, ef) + buf, payloadCid, fsDetail, err := buildIpldGraph(ctx, fileList, params.ParentPath, params.Parallel, + params.ExpectSliceSize, params.Ef) if err != nil { // log.Fatal(err) - 
cb.OnError(err) + params.Cb.OnError(err) return } - cb.OnSuccess(buf, graphName, payloadCid, fsDetail) + params.Cb.OnSuccess(buf, graphName, payloadCid, fsDetail) } func buildIpldGraph(ctx context.Context, From 2ced23b136760490b42b1989080a18a06ad47701 Mon Sep 17 00:00:00 2001 From: simlecode <69969590+simlecode@users.noreply.github.com> Date: Tue, 29 Jul 2025 15:09:18 +0800 Subject: [PATCH 2/2] feat: manifest detail skip file name --- .gitignore | 1 + cmd/graphsplit/main.go | 1 - utils.go | 28 +++++++++++++++++++++++++++- 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index baa1b5a..7e46f3b 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ .idea .vscode cars2 +cars3 config.toml datas datas2 diff --git a/cmd/graphsplit/main.go b/cmd/graphsplit/main.go index fe64256..00a1083 100644 --- a/cmd/graphsplit/main.go +++ b/cmd/graphsplit/main.go @@ -103,7 +103,6 @@ var chunkCmd = &cli.Command{ &cli.BoolFlag{ Name: "skip-filename", Usage: "manifest csv detail not contain filename", - Value: true, }, }, ArgsUsage: "", diff --git a/utils.go b/utils.go index 4f47e58..91dc0dd 100644 --- a/utils.go +++ b/utils.go @@ -9,6 +9,7 @@ import ( "math/rand" "os" "path" + "path/filepath" "regexp" "runtime" "strings" @@ -139,7 +140,7 @@ func BuildIpldGraph(ctx context.Context, params *ChunkParams, ) { buf, payloadCid, fsDetail, err := buildIpldGraph(ctx, fileList, params.ParentPath, params.Parallel, - params.ExpectSliceSize, params.Ef) + params.ExpectSliceSize, params.Ef, params.SkipFilename) if err != nil { // log.Fatal(err) params.Cb.OnError(err) @@ -154,6 +155,7 @@ func buildIpldGraph(ctx context.Context, parallel int, sliceSize int64, ef *ExtraFile, + skipFilename bool, ) (*Buffer, string, string, error) { bs2 := bstore.NewBlockstore(dss.MutexWrap(datastore.NewMapDatastore())) dagServ := dag.NewDAGService(blockservice.New(bs2, offline.Exchange(bs2))) @@ -326,6 +328,30 @@ func buildIpldGraph(ctx context.Context, } log.Info("++++++++++++ 
finished to build ipld +++++++++++++") + if skipFilename { + type path struct { + Path string `json:"path"` + } + + var list []path + seen := make(map[string]struct{}) + for _, f := range sfis { + dir := filepath.Dir(f.Path) + if dir == "" || dir == "." || dir == "/" { + continue + } + if _, ok := seen[dir]; !ok { + seen[dir] = struct{}{} + list = append(list, path{Path: dir}) + } + } + + fileInfo, err = json.Marshal(list) + if err != nil { + return nil, "", "", err + } + } + return buf, rootNode.Cid().String(), string(fileInfo), nil }