Skip to content

Commit 01a3dc6

Browse files
committed
feat: add device controller
1 parent d6a831e commit 01a3dc6

File tree

17 files changed

+126
-473
lines changed

17 files changed

+126
-473
lines changed

cmd/hypervisor/main.go

Lines changed: 3 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -36,28 +36,11 @@ func main() {
3636
klog.Fatalf("Failed to start device manager: %v", err)
3737
}
3838
defer mgr.Stop()
39-
4039
klog.Info("Device manager started")
4140

42-
// Discover devices
4341
devices := mgr.GetDevices()
44-
klog.Infof("Discovered %d devices", len(devices))
45-
4642
if len(devices) == 0 {
47-
klog.Warning("No devices discovered, waiting...")
48-
time.Sleep(2 * time.Second)
49-
devices = mgr.GetDevices()
50-
if len(devices) == 0 {
51-
klog.Fatalf("No devices available")
52-
}
53-
}
54-
55-
// Register default pool
56-
deviceUUIDs := make([]string, 0, len(devices))
57-
for _, d := range devices {
58-
deviceUUIDs = append(deviceUUIDs, d.UUID)
59-
klog.Infof("Device: UUID=%s, Vendor=%s, Model=%s, Memory=%d GB",
60-
d.UUID, d.Vendor, d.Model, d.TotalMemory/(1024*1024*1024))
43+
klog.Fatalf("No devices found")
6144
}
6245

6346
// Parse isolation mode
@@ -75,31 +58,13 @@ func main() {
7558
klog.Fatalf("Invalid isolation mode: %s", *isolationMode)
7659
}
7760

78-
pool := &device.DevicePool{
79-
Vendor: devices[0].Vendor,
80-
IsolationMode: mode,
81-
DeviceUUIDs: deviceUUIDs,
82-
AcceleratorLib: *acceleratorLibPath,
83-
}
84-
85-
if err := mgr.RegisterPool(pool); err != nil {
86-
klog.Fatalf("Failed to register pool: %v", err)
87-
}
88-
klog.Infof("Registered devices: %s with %d devices, isolation mode: %s", devices[0].Vendor, len(deviceUUIDs), mode)
89-
90-
// TODO: 2. If k8s mode, listen Pods from kubelet socket and build a map
91-
// TODO: 3. Extensible Device Plugin, to read config yaml of pool and
92-
// TODO: 4. Report GPU CR to API server, if DRA enabled, report ResourceSlice
93-
// TODO: 5. Build shm handle or ivshmem device for soft isolation mode for
94-
// limiter and hard isolation mode, manage shm lifecycle
95-
// TODO: 6. Expose HTTP APIs for watch worker pod status, or create workers process,
96-
// manage workers lifecycle in VM mode
61+
klog.Infof("Registered devices: %s with %d devices, isolation mode: %s", devices[0].Vendor, len(devices), mode)
9762

9863
// Wait for interrupt signal
9964
sigCh := make(chan os.Signal, 1)
10065
signal.Notify(sigCh, os.Interrupt, syscall.SIGTERM)
10166

102-
klog.Info("Hypervisor running, press Ctrl+C to stop")
67+
klog.Info("Hypervisor running")
10368
<-sigCh
10469
klog.Info("Shutting down...")
10570
}
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
package integration
2+
3+
import (
4+
"context"
5+
6+
"github.com/NexusGPU/tensor-fusion/internal/hypervisor/device"
7+
)
8+
9+
// Framework is the device-facing API that is passed to a Backend through
// Backend.Start. Every method takes a context as its first argument and
// reports failure through its trailing error result.
// NOTE(review): only the signatures are visible here; the per-method notes
// below are read from names and types — confirm against the implementation.
type Framework interface {
10+
// AllocateDevice takes a device allocation request and returns the
// corresponding allocation response.
AllocateDevice(ctx context.Context, request *device.DeviceAllocateRequest) (*device.DeviceAllocateResponse, error)
11+
12+
// ListDevices returns a slice of device descriptions.
ListDevices(ctx context.Context) ([]*device.DeviceInfo, error)
13+
14+
// DevicesUpdates returns a receive-only channel that carries slices of
// device descriptions.
// NOTE(review): presumably each message is a refreshed device list and the
// channel stops when ctx is cancelled — confirm.
DevicesUpdates(ctx context.Context) (<-chan []*device.DeviceInfo, error)
15+
16+
// GetDevice returns a single device description for deviceUUID.
// NOTE(review): behavior for an unknown UUID is not visible here.
GetDevice(ctx context.Context, deviceUUID string) (*device.DeviceInfo, error)
17+
18+
// GetDeviceAllocations returns the allocation records for deviceUUID.
GetDeviceAllocations(ctx context.Context, deviceUUID string) ([]*device.DeviceAllocation, error)
19+
20+
// GetDeviceAllocationUpdates returns a receive-only channel that carries
// slices of allocation records, selected by deviceUUID and allocationID.
// NOTE(review): how the two identifiers combine as a filter is not visible
// here — confirm.
GetDeviceAllocationUpdates(ctx context.Context, deviceUUID string, allocationID string) (<-chan []*device.DeviceAllocation, error)
21+
}
22+
23+
// Backend is the interface the hypervisor uses to interact with the
// underlying infrastructure.
24+
type Backend interface {
25+
// Start receives the Framework and a string-keyed parameter map.
// NOTE(review): presumably this initializes the backend; the accepted
// params keys are not visible here — confirm.
Start(ctx context.Context, framework Framework, params map[string]string) error
26+
27+
// ListAndWatchWorkers gets GPU workers from the workload orchestration
// platform.
// NOTE(review): the result is a plain []string, not a channel, so how the
// "watch" part delivers later changes is not visible here — confirm.
28+
ListAndWatchWorkers(ctx context.Context) ([]string, error)
29+
30+
// ReportDevices reports devices to the backend orchestration and O&M
// platform.
// NOTE(review): presumably the strings are device UUIDs — confirm.
31+
ReportDevices(ctx context.Context, devices []string) error
32+
33+
// GetWorkerProcessMap links workers to the actual running process list on
// the OS.
// NOTE(review): presumably keyed by worker UID with process identifiers as
// values — confirm.
34+
GetWorkerProcessMap(ctx context.Context) (map[string][]string, error)
35+
36+
// StartWorker spawns the worker process identified by workerUID on the OS.
37+
StartWorker(ctx context.Context, workerUID string) error
38+
39+
// StopWorker stops the worker process identified by workerUID on the OS.
40+
StopWorker(ctx context.Context, workerUID string) error
41+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
package kubernetes
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
package kubernetes
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
package kubernetes
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
package kubernetes
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
package kubernetes
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
package kubernetes
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
package singlenode
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
package singlenode

0 commit comments

Comments
 (0)