From 1f189bc390517f79b0d9c95c1d3fc3617641815e Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 20 Nov 2025 17:32:31 +0800 Subject: [PATCH] Add EROFS flatten device support (fsmerge feature) So that hundreds of sub-blobs (container image layers) can be merged into one block device to avoid having too many block devices plugging into a VM. Closes: https://github.com/containerd/nerdbox/issues/30 Signed-off-by: Gao Xiang --- internal/erofs/vmdk.go | 106 ++++++++++++++++++++++++++++++++ internal/shim/task/mount.go | 59 ++++++++++++++++-- internal/vm/libkrun/instance.go | 6 +- internal/vm/libkrun/krun.go | 12 ++++ internal/vm/vm.go | 7 +++ 5 files changed, 183 insertions(+), 7 deletions(-) create mode 100644 internal/erofs/vmdk.go diff --git a/internal/erofs/vmdk.go b/internal/erofs/vmdk.go new file mode 100644 index 0000000..ac959d7 --- /dev/null +++ b/internal/erofs/vmdk.go @@ -0,0 +1,106 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package erofs + +import ( + "fmt" + "io" + "os" +) + +const ( + max2GbExtentSectors = 0x80000000 >> 9 + sectorsPerTrack = 63 + numberHeads = 16 + subformat = "twoGbMaxExtentFlat" + adapterType = "ide" + hwVersion = "4" +) + +// vmdkDescAddExtent writes extent lines to the writer. 
+// Each extent line follows the format: RW SECTORS FLAT "FILENAME" OFFSET
+func vmdkDescAddExtent(w io.Writer, sectors uint64, filename string, offset uint64) error {
+	for sectors > 0 {
+		count := min(sectors, max2GbExtentSectors) // one line covers at most max2GbExtentSectors
+		if _, err := fmt.Fprintf(w, "RW %d FLAT \"%s\" %d\n", count, filename, offset); err != nil {
+			return err
+		}
+		offset += count
+		sectors -= count
+	}
+	return nil
+}
+
+// DumpVMDKDescriptor writes to w a flat VMDK descriptor that concatenates devices in order.
+func DumpVMDKDescriptor(w io.Writer, cid uint32, devices []string) error {
+	parentCID := uint32(0xffffffff)
+	_, err := fmt.Fprintf(w, `# Disk DescriptorFile
+version=1
+CID=%08x
+parentCID=%08x
+createType="%s"
+
+# Extent description
+`, cid, parentCID, subformat)
+	if err != nil {
+		return err
+	}
+
+	var totalSectors uint64
+	for _, d := range devices {
+		fi, err := os.Stat(d)
+		if err != nil {
+			return err
+		}
+		sectors := uint64(fi.Size()) >> 9 // 512-byte sectors; a trailing partial sector is dropped
+		if err := vmdkDescAddExtent(w, sectors, d, 0); err != nil {
+			return err
+		}
+		totalSectors += sectors
+	}
+
+	cylinders := (totalSectors + sectorsPerTrack*numberHeads - 1) / (sectorsPerTrack * numberHeads)
+	_, err = fmt.Fprintf(w, `
+
+# The Disk Data Base
+#DDB
+
+ddb.virtualHWVersion = "%s"
+ddb.geometry.cylinders = "%d"
+ddb.geometry.heads = "%d"
+ddb.geometry.sectors = "%d"
+ddb.adapterType = "%s"
+`, hwVersion, cylinders, numberHeads, sectorsPerTrack, adapterType)
+	return err
+}
+
+// DumpVMDKDescriptorToFile writes the descriptor to vmdkdesc, deleting the file again on failure.
+func DumpVMDKDescriptorToFile(vmdkdesc string, cid uint32, devices []string) error {
+	f, err := os.Create(vmdkdesc)
+	if err != nil {
+		return err
+	}
+	err = DumpVMDKDescriptor(f, cid, devices)
+	if cerr := f.Close(); err == nil {
+		err = cerr // a failed Close can hide a write that never reached the file
+	}
+	if err != nil {
+		// Best effort: callers reuse an existing descriptor, so a partial one must not survive.
+		_ = os.Remove(vmdkdesc)
+	}
+	return err
+}
diff --git a/internal/shim/task/mount.go b/internal/shim/task/mount.go
index e436951..2af2361 100644
--- a/internal/shim/task/mount.go
+++ b/internal/shim/task/mount.go
@@ -19,12 +19,15 @@ package task
 import (
 	"context"
 	"fmt"
+	"os"
+	"path/filepath"
 	"strings"
 
 	"github.com/containerd/containerd/api/types"
 	"github.com/containerd/errdefs"
 	"github.com/containerd/log"
 
+
"github.com/containerd/nerdbox/internal/erofs" "github.com/containerd/nerdbox/internal/vm" ) @@ -32,6 +35,7 @@ type diskOptions struct { name string source string readOnly bool + vmdk bool } // transformMounts does not perform any local mounts but transforms @@ -44,25 +48,63 @@ func transformMounts(ctx context.Context, vmi vm.Instance, id string, ms []*type err error ) + log.G(ctx).Trace("transformMounts", ms) for _, m := range ms { switch m.Type { case "erofs": + disk := fmt.Sprintf("disk-%d-%s", disks, id) // virtiofs implementation has a limit of 36 characters for the tag if len(disk) > 36 { disk = disk[:36] } - addDisks = append(addDisks, diskOptions{ - name: disk, - source: m.Source, - readOnly: true, - }) + + var Options []string + + devices := []string{m.Source} + for _, o := range m.Options { + if d, f := strings.CutPrefix(o, "device="); f { + devices = append(devices, d) + continue + } + Options = append(Options, o) + } + + if len(devices) > 1 { + // generate VMDK desc for the EROFS flattened fs if it does not exist + mergedfsPath := filepath.Dir(m.Source) + "/merged_fs.vmdk" + if _, err := os.Stat(mergedfsPath); err != nil { + if !os.IsNotExist(err) { + log.G(ctx).Warnf("failed to stat %v: %v", mergedfsPath, err) + return nil, errdefs.ErrNotImplemented + } + err = erofs.DumpVMDKDescriptorToFile(mergedfsPath, 0xfffffffe, devices) + if err != nil { + log.G(ctx).Warnf("failed to generate %v: %v", mergedfsPath, err) + return nil, errdefs.ErrNotImplemented + } + } + addDisks = append(addDisks, diskOptions{ + name: disk, + source: mergedfsPath, + readOnly: true, + vmdk: true, + }) + } else { + addDisks = append(addDisks, diskOptions{ + name: disk, + source: m.Source, + readOnly: true, + vmdk: false, + }) + } am = append(am, &types.Mount{ Type: "erofs", Source: fmt.Sprintf("/dev/vd%c", disks), Target: m.Target, - Options: filterOptions(m.Options), + Options: filterOptions(Options), }) + disks++ case "ext4": disk := fmt.Sprintf("disk-%d-%s", disks, id) @@ -75,6 
+117,7 @@ func transformMounts(ctx context.Context, vmi vm.Instance, id string, ms []*type name: disk, source: m.Source, readOnly: false, + vmdk: false, }) am = append(am, &types.Mount{ Type: "ext4", @@ -127,6 +170,10 @@ func transformMounts(ctx context.Context, vmi vm.Instance, id string, ms []*type if do.readOnly { opts = append(opts, vm.WithReadOnly()) } + if do.vmdk { + opts = append(opts, vm.WithVmdk()) + } + if err := vmi.AddDisk(ctx, do.name, do.source, opts...); err != nil { return nil, err } diff --git a/internal/vm/libkrun/instance.go b/internal/vm/libkrun/instance.go index ece29ef..d5f8e33 100644 --- a/internal/vm/libkrun/instance.go +++ b/internal/vm/libkrun/instance.go @@ -172,7 +172,11 @@ func (v *vmInstance) AddDisk(ctx context.Context, blockID, mountPath string, opt o(&mc) } - if err := v.vmc.AddDisk(blockID, mountPath, mc.Readonly); err != nil { + var dskFmt uint32 = 0 + if mc.Vmdk { + dskFmt = 2 + } + if err := v.vmc.AddDisk2(blockID, mountPath, dskFmt, mc.Readonly); err != nil { return fmt.Errorf("failed to add disk at '%s': %w", mountPath, err) } diff --git a/internal/vm/libkrun/krun.go b/internal/vm/libkrun/krun.go index 07e4da5..48f5760 100644 --- a/internal/vm/libkrun/krun.go +++ b/internal/vm/libkrun/krun.go @@ -156,6 +156,17 @@ func (vmc *vmcontext) AddDisk(blockID, path string, readonly bool) error { return nil } +func (vmc *vmcontext) AddDisk2(blockID, path string, diskFmt uint32, readonly bool) error { + if vmc.lib.AddDisk2 == nil { + return fmt.Errorf("libkrun not loaded") + } + ret := vmc.lib.AddDisk2(vmc.ctxID, blockID, path, diskFmt, readonly) + if ret != 0 { + return fmt.Errorf("krun_add_disk2 failed: %d", ret) + } + return nil +} + func (vmc *vmcontext) AddNIC(endpoint string, mac net.HardwareAddr, mode vm.NetworkMode, features, flags uint32) error { if vmc.lib.AddNetUnixgram == nil || vmc.lib.AddNetUnixstream == nil { return fmt.Errorf("libkrun not loaded") @@ -243,6 +254,7 @@ type libkrun struct { SetGvproxyPath func(ctxID 
uint32, path string) int32 `C:"krun_set_gvproxy_path"` SetNetMac func(ctxID uint32, mac []uint8) int32 `C:"krun_set_net_mac"` AddDisk func(ctxID uint32, blockId, path string, readonly bool) int32 `C:"krun_add_disk"` + AddDisk2 func(ctxID uint32, blockId, path string, diskFmt uint32, readonly bool) int32 `C:"krun_add_disk2"` AddNetUnixstream func(ctxID uint32, path string, fd int, mac []uint8, features, flags uint32) int32 `C:"krun_add_net_unixstream"` AddNetUnixgram func(ctxID uint32, path string, fd int, mac []uint8, features, flags uint32) int32 `C:"krun_add_net_unixgram"` diff --git a/internal/vm/vm.go b/internal/vm/vm.go index d84b25a..2f6b614 100644 --- a/internal/vm/vm.go +++ b/internal/vm/vm.go @@ -49,6 +49,7 @@ func WithInitArgs(args ...string) StartOpt { type MountConfig struct { Readonly bool + Vmdk bool } type MountOpt func(*MountConfig) @@ -75,3 +76,9 @@ func WithReadOnly() MountOpt { o.Readonly = true } } + +func WithVmdk() MountOpt { + return func(o *MountConfig) { + o.Vmdk = true + } +}