Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,4 @@ cars2
config.toml
datas
datas2
datas3
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,16 @@ payload_cid,filename,piece_cid,piece_size,detail
ba...,graph-slice-name.car,baga...,16646144,inner-structure-json
```

Config:

[example](https://github.com/ipfs-force-community/go-graphsplit/blob/main/config/example.toml)

config 包含三个字段:

* SliceSize:每个 piece 中源文件部分的大小,单位为字节,默认是 18GiB(19327352832)
* ExtraFilePath:指向存储了图片、视频等额外文件的目录
* ExtraFileSizeInOnePiece:每个 piece 中包含的图片、视频等额外文件的总大小,例如:500MiB(SliceSize 与该值之和不能超过 32GiB)

Import car file to IPFS:
```sh
ipfs dag import /path/to/car-dir/car-file
Expand Down
40 changes: 26 additions & 14 deletions chunk.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,11 +154,20 @@ func ErrCallback() GraphBuildCallback {
return &errCallback{}
}

func Chunk(ctx context.Context, sliceSize int64, parentPath, targetPath, carDir, graphName string, parallel int, cb GraphBuildCallback) error {
func Chunk(ctx context.Context,
expectSliceSize int64,
parentPath,
targetPath,
carDir,
graphName string,
parallel int,
cb GraphBuildCallback,
ef *ExtraFile,
) error {
var cumuSize int64 = 0
graphSliceCount := 0
graphFiles := make([]Finfo, 0)
if sliceSize == 0 {
if expectSliceSize == 0 {
return fmt.Errorf("slice size has been set as 0")
}
if parallel <= 0 {
Expand All @@ -168,36 +177,39 @@ func Chunk(ctx context.Context, sliceSize int64, parentPath, targetPath, carDir,
parentPath = targetPath
}

partSliceSize := expectSliceSize - ef.sliceSize
args := []string{targetPath}
sliceTotal := GetGraphCount(args, sliceSize)
sliceTotal := GetGraphCount(args, expectSliceSize)
if sliceTotal == 0 {
log.Warn("Empty folder or file!")
return nil
}
files := GetFileListAsync(args)
for item := range files {
item := tryRenameFileName([]Finfo{item})[0]
// log.Infof("name: %s", item.Name)
fileSize := item.Info.Size()
switch {
case cumuSize+fileSize < sliceSize:
case cumuSize+fileSize < partSliceSize:
cumuSize += fileSize
graphFiles = append(graphFiles, item)
case cumuSize+fileSize == sliceSize:
case cumuSize+fileSize == partSliceSize:
cumuSize += fileSize
graphFiles = append(graphFiles, item)
// todo build ipld from graphFiles
BuildIpldGraph(ctx, graphFiles, GenGraphName(graphName, graphSliceCount, sliceTotal), parentPath, carDir, parallel, cb, sliceSize)
BuildIpldGraph(ctx, append(ef.getFiles(), graphFiles...), GenGraphName(graphName, graphSliceCount, sliceTotal), parentPath, carDir, parallel, cb, expectSliceSize, ef)
log.Infof("cumu-size: %d", cumuSize)
log.Infof("%s", GenGraphName(graphName, graphSliceCount, sliceTotal))
log.Infof("=================")
cumuSize = 0
graphFiles = make([]Finfo, 0)
graphSliceCount++
case cumuSize+fileSize > sliceSize:
case cumuSize+fileSize > partSliceSize:
fileSliceCount := 0
// need to split item to fit graph slice
//
// first cut
firstCut := sliceSize - cumuSize
firstCut := partSliceSize - cumuSize
var seekStart int64 = 0
var seekEnd int64 = seekStart + firstCut - 1
log.Infof("first cut %d, seek start at %d, end at %d", firstCut, seekStart, seekEnd)
Expand All @@ -211,7 +223,7 @@ func Chunk(ctx context.Context, sliceSize int64, parentPath, targetPath, carDir,
})
fileSliceCount++
// todo build ipld from graphFiles
BuildIpldGraph(ctx, graphFiles, GenGraphName(graphName, graphSliceCount, sliceTotal), parentPath, carDir, parallel, cb, sliceSize)
BuildIpldGraph(ctx, append(ef.getFiles(), graphFiles...), GenGraphName(graphName, graphSliceCount, sliceTotal), parentPath, carDir, parallel, cb, expectSliceSize, ef)
log.Infof("cumu-size: %d", cumuSize+firstCut)
log.Infof("%s", GenGraphName(graphName, graphSliceCount, sliceTotal))
log.Infof("=================")
Expand All @@ -220,7 +232,7 @@ func Chunk(ctx context.Context, sliceSize int64, parentPath, targetPath, carDir,
graphSliceCount++
for seekEnd < fileSize-1 {
seekStart = seekEnd + 1
seekEnd = seekStart + sliceSize - 1
seekEnd = seekStart + partSliceSize - 1
if seekEnd >= fileSize-1 {
seekEnd = fileSize - 1
}
Expand All @@ -235,10 +247,10 @@ func Chunk(ctx context.Context, sliceSize int64, parentPath, targetPath, carDir,
SeekEnd: seekEnd,
})
fileSliceCount++
if seekEnd-seekStart == sliceSize-1 {
if seekEnd-seekStart == partSliceSize-1 {
// todo build ipld from graphFiles
BuildIpldGraph(ctx, graphFiles, GenGraphName(graphName, graphSliceCount, sliceTotal), parentPath, carDir, parallel, cb, sliceSize)
log.Infof("cumu-size: %d", sliceSize)
BuildIpldGraph(ctx, append(ef.getFiles(), graphFiles...), GenGraphName(graphName, graphSliceCount, sliceTotal), parentPath, carDir, parallel, cb, expectSliceSize, ef)
log.Infof("cumu-size: %d", partSliceSize)
log.Infof("%s", GenGraphName(graphName, graphSliceCount, sliceTotal))
log.Infof("=================")
cumuSize = 0
Expand All @@ -250,7 +262,7 @@ func Chunk(ctx context.Context, sliceSize int64, parentPath, targetPath, carDir,
}
if cumuSize > 0 {
// todo build ipld from graphFiles
BuildIpldGraph(ctx, graphFiles, GenGraphName(graphName, graphSliceCount, sliceTotal), parentPath, carDir, parallel, cb, sliceSize)
BuildIpldGraph(ctx, append(ef.getFiles(), graphFiles...), GenGraphName(graphName, graphSliceCount, sliceTotal), parentPath, carDir, parallel, cb, expectSliceSize, ef)
log.Infof("cumu-size: %d", cumuSize)
log.Infof("%s", GenGraphName(graphName, graphSliceCount, sliceTotal))
log.Infof("=================")
Expand Down
34 changes: 28 additions & 6 deletions cmd/graphsplit/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@ import (
"context"
"fmt"
"os"
"strings"
"time"

"github.com/docker/go-units"
"github.com/filedrive-team/go-graphsplit"
"github.com/filedrive-team/go-graphsplit/config"
"github.com/filedrive-team/go-graphsplit/dataset"
Expand Down Expand Up @@ -66,7 +68,7 @@ var chunkCmd = &cli.Command{
},
&cli.BoolFlag{
Name: "calc-commp",
Value: false,
Value: true,
Usage: "create a mainfest.csv in car-dir to save mapping of data-cids, slice names, piece-cids and piece-sizes",
},
&cli.BoolFlag{
Expand Down Expand Up @@ -104,12 +106,13 @@ var chunkCmd = &cli.Command{
if cfgPath == "" {
return fmt.Errorf("config file path is required")
}
log.Infoln("config file path: ", cfgPath)

cfg, err := config.LoadConfig(cfgPath)
if err != nil {
return fmt.Errorf("failed to load config file: %v", err)
return fmt.Errorf("failed to load config file(%s): %v", cfgPath, err)
}
log.Infof("config file: %+v", cfg)

log.Infof("old slice size: %d", cfg.SliceSize)
cfg.SliceSize++
sliceSize := cfg.SliceSize
Expand All @@ -122,7 +125,26 @@ var chunkCmd = &cli.Command{
return fmt.Errorf("failed to save config file: %v", err)
}

targetPath := c.Args().First()
var extraFileSliceSize int64
if len(cfg.ExtraFilePath) != 0 {
if cfg.ExtraFileSizeInOnePiece == "" {
return fmt.Errorf("extra file size in one piece is required when extra file path is set")
}
extraFileSliceSize, err = units.RAMInBytes(cfg.ExtraFileSizeInOnePiece)
if err != nil {
return fmt.Errorf("failed to parse real file size: %v", err)
}
}
if sliceSize+int(extraFileSliceSize) > 32*graphsplit.Gib {
return fmt.Errorf("slice size %d + extra file slice size %d exceeds 32 GiB", sliceSize, extraFileSliceSize)
}
log.Infof("extra file slice size: %d", extraFileSliceSize)
rf, err := graphsplit.NewRealFile(strings.TrimSuffix(cfg.ExtraFilePath, "/"), int64(extraFileSliceSize), int64(sliceSize))
if err != nil {
return err
}

targetPath := strings.TrimSuffix(c.Args().First(), "/")
var cb graphsplit.GraphBuildCallback
if c.Bool("calc-commp") {
cb = graphsplit.CommPCallback(carDir, c.Bool("rename"), c.Bool("add-padding"))
Expand All @@ -136,11 +158,11 @@ var chunkCmd = &cli.Command{
fmt.Println("loop: ", loop)
if !loop {
fmt.Println("chunking once...")
return graphsplit.Chunk(ctx, int64(sliceSize), parentPath, targetPath, carDir, graphName, int(parallel), cb)
return graphsplit.Chunk(ctx, int64(sliceSize), parentPath, targetPath, carDir, graphName, int(parallel), cb, rf)
}
fmt.Println("loop chunking...")
for {
err = graphsplit.Chunk(ctx, int64(sliceSize), parentPath, targetPath, carDir, graphName, int(parallel), cb)
err = graphsplit.Chunk(ctx, int64(sliceSize), parentPath, targetPath, carDir, graphName, int(parallel), cb, rf)
if err != nil {
return fmt.Errorf("failed to chunk: %v", err)
}
Expand Down
64 changes: 62 additions & 2 deletions config/config.go
Original file line number Diff line number Diff line change
@@ -1,18 +1,26 @@
package config

import (
"bytes"
"fmt"
"os"
"reflect"
"strings"

"github.com/BurntSushi/toml"
)

// Config holds the chunking options loaded from the TOML config file.
// The `comment` struct tags are read via reflection by
// generateTOMLWithComments to emit an annotated example config.
type Config struct {
	// Target size, in bytes, of each slice (the source-file portion of a piece).
	SliceSize int `toml:"SliceSize" comment:"SliceSize, the size of each slice in bytes, default is 18G"`
	// Directory that holds extra files (images, videos, ...) to mix into each piece.
	// Empty disables the extra-file feature.
	ExtraFilePath string `toml:"ExtraFilePath" comment:"ExtraFilePath extra file path, 指向存储了图片、视频等文件的目录"`
	// Human-readable size (e.g. "500Mib") of extra files packed into one piece.
	ExtraFileSizeInOnePiece string `toml:"ExtraFileSizeInOnePiece" comment:"ExtraFileSizeInOnePiece 每个 piece 文件包含图片和视频等文件的大小, 例如:500Mib"`
}

// NewConfig returns a Config populated with the defaults: an 18 GiB slice
// size and the extra-file feature disabled (both extra-file fields empty).
func NewConfig() *Config {
	cfg := &Config{}
	cfg.SliceSize = 19327352832 // 18 GiB
	cfg.ExtraFilePath = ""
	cfg.ExtraFileSizeInOnePiece = ""
	return cfg
}

Expand Down Expand Up @@ -46,3 +54,55 @@ func (c *Config) SaveConfig(filePath string) error {
}
return nil
}

// generateTOMLWithComments encodes data as TOML and inserts, before each key,
// the comment found in the corresponding struct field's `comment` tag, plus a
// short header. data may be a struct or a pointer to one; only fields carrying
// both a `toml` and a `comment` tag receive a comment line.
func generateTOMLWithComments(data any) (string, error) {
	// Step 1: Encode struct to TOML.
	var buf bytes.Buffer
	if err := toml.NewEncoder(&buf).Encode(data); err != nil {
		return "", fmt.Errorf("failed to encode TOML: %v", err)
	}
	tomlLines := strings.Split(buf.String(), "\n")

	// Step 2: Get field comments using reflection, keyed by TOML key name.
	comments := make(map[string]string)
	val := reflect.ValueOf(data)
	if val.Kind() == reflect.Ptr {
		val = val.Elem()
	}
	typ := val.Type()

	for i := 0; i < typ.NumField(); i++ {
		field := typ.Field(i)
		// Strip tag options such as ",omitempty": only the key part names the
		// emitted TOML key. A tag of "-" excludes the field from output.
		tomlTag, _, _ := strings.Cut(field.Tag.Get("toml"), ",")
		commentTag := field.Tag.Get("comment")
		if tomlTag != "" && tomlTag != "-" && commentTag != "" {
			comments[tomlTag] = commentTag
		}
	}

	// Step 3: Insert comments before corresponding TOML keys.
	var result []string
	for _, line := range tomlLines {
		// Skip empty lines.
		if strings.TrimSpace(line) == "" {
			result = append(result, line)
			continue
		}

		// Check if the line starts with a known TOML key.
		for key, comment := range comments {
			if strings.HasPrefix(strings.TrimSpace(line), key+" =") {
				result = append(result, fmt.Sprintf("# %s", comment))
				break // a line holds exactly one key; stop after the match
			}
		}
		result = append(result, line)
	}

	// Add a header comment.
	header := []string{
		"# 配置文件",
		"# 自动生成,包含字段说明",
		"",
	}
	return strings.Join(append(header, result...), "\n"), nil
}
4 changes: 3 additions & 1 deletion config/config_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package config

import (
"os"
"testing"

"github.com/gozelle/testify/require"
Expand All @@ -9,8 +10,9 @@ import (
func TestConfig(t *testing.T) {
cfg := NewConfig()

err := cfg.SaveConfig("example.toml")
data, err := generateTOMLWithComments(cfg)
require.NoError(t, err)
require.NoError(t, os.WriteFile("example.toml", []byte(data), 0644))

loadedCfg, err := LoadConfig("example.toml")
require.NoError(t, err)
Expand Down
8 changes: 8 additions & 0 deletions config/example.toml
Original file line number Diff line number Diff line change
@@ -1 +1,9 @@
# 配置文件
# 自动生成,包含字段说明

# SliceSize, the size of each slice in bytes, default is 18G
SliceSize = 19327352832
# ExtraFilePath extra file path, 指向存储了图片、视频等文件的目录
ExtraFilePath = ""
# ExtraFileSizeInOnePiece 每个 piece 文件包含图片和视频等文件的大小, 例如:500Mib
ExtraFileSizeInOnePiece = ""
64 changes: 64 additions & 0 deletions extra_file.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
package graphsplit

import (
"fmt"
"os"
)

// Gib is one gibibyte (2^30 bytes).
const Gib = 1024 * 1024 * 1024

// ExtraFile hands out batches of files from a directory (via getFiles) so
// each generated piece can be padded with extra content. It cycles through
// the directory listing round-robin, remembering its position between calls.
type ExtraFile struct {
	path string // directory holding the extra files; empty disables the feature
	// files is the cached directory listing collected once by walk.
	files []Finfo
	// idx is the next position in files to hand out; wraps modulo len(files).
	idx int
	// sliceSize is the byte budget of extra files to pack per piece.
	sliceSize int64
	// pieceRawSize is the size of the non-extra part of a piece; getFiles uses
	// it to keep extra files + raw content within 32*Gib.
	pieceRawSize int64
}

// NewRealFile builds an ExtraFile rooted at path. sliceSize is the byte
// budget of extra files to pack into each piece; pieceRawSize is the size of
// the regular (non-extra) portion of a piece. An empty path is allowed and
// yields an ExtraFile whose getFiles always returns nil. A non-empty path
// must name an existing directory, whose listing is collected eagerly.
func NewRealFile(path string, sliceSize int64, pieceRawSize int64) (*ExtraFile, error) {
	rf := &ExtraFile{path: path, sliceSize: sliceSize, pieceRawSize: pieceRawSize}
	if path != "" {
		finfo, err := os.Stat(path)
		if err != nil {
			// Wrap with context so the caller sees which path failed to stat.
			return nil, fmt.Errorf("stat extra file path %q: %w", path, err)
		}
		if !finfo.IsDir() {
			return nil, fmt.Errorf("the path %s is not a directory", path)
		}
		// Cache the directory listing once; getFiles round-robins over it.
		rf.walk()
	}

	return rf, nil
}

// walk collects every file under rf.path into rf.files, applying
// tryRenameFileName to the full listing before storing it.
func (rf *ExtraFile) walk() {
	var collected []Finfo
	for f := range GetFileListAsync([]string{rf.path}) {
		collected = append(collected, f)
	}
	rf.files = tryRenameFileName(collected)
}

// getFiles returns the next batch of extra files, resuming the round-robin
// scan at rf.idx. It accumulates files until the rf.sliceSize budget is met,
// skipping any file that would push batch + rf.pieceRawSize past 32*Gib, and
// stops early after one full pass over rf.files. Returns nil when no files
// were collected by walk.
func (rf *ExtraFile) getFiles() []Finfo {
	n := len(rf.files)
	if n == 0 {
		return nil
	}

	var (
		picked []Finfo
		used   int64
	)
	start := rf.idx
	for used < rf.sliceSize {
		f := rf.files[rf.idx]
		if size := f.Info.Size(); used+size+rf.pieceRawSize <= 32*Gib {
			used += size
			picked = append(picked, f)
		}
		rf.idx = (rf.idx + 1) % n

		// One complete cycle without filling the budget: give up.
		if rf.idx == start {
			break
		}
	}

	return picked
}
Loading