@@ -19,7 +19,6 @@ import (
19
19
"github.com/stacklok/toolhive/pkg/logger"
20
20
"github.com/stacklok/toolhive/pkg/process"
21
21
"github.com/stacklok/toolhive/pkg/state"
22
- "github.com/stacklok/toolhive/pkg/transport/proxy"
23
22
"github.com/stacklok/toolhive/pkg/workloads/types"
24
23
)
25
24
@@ -114,6 +113,7 @@ type workloadStatusFile struct {
114
113
115
114
// GetWorkload retrieves the status of a workload by its name.
116
115
func (f * fileStatusManager ) GetWorkload (ctx context.Context , workloadName string ) (core.Workload , error ) {
116
+ var pid int
117
117
result := core.Workload {Name : workloadName }
118
118
fileFound := false
119
119
@@ -152,6 +152,8 @@ func (f *fileStatusManager) GetWorkload(ctx context.Context, workloadName string
152
152
}
153
153
}
154
154
155
+ pid = statusFile .ProcessID
156
+
155
157
return nil
156
158
})
157
159
if err != nil {
@@ -172,7 +174,7 @@ func (f *fileStatusManager) GetWorkload(ctx context.Context, workloadName string
172
174
173
175
// If workload is running, validate against runtime
174
176
if result .Status == rt .WorkloadStatusRunning {
175
- return f .validateRunningWorkload (ctx , workloadName , result )
177
+ return f .validateRunningWorkload (ctx , workloadName , result , pid )
176
178
}
177
179
178
180
// Return file data
@@ -197,7 +199,7 @@ func (f *fileStatusManager) ListWorkloads(ctx context.Context, listAll bool, lab
197
199
}
198
200
199
201
// Get workloads from files
200
- fileWorkloads , err := f .getWorkloadsFromFiles ()
202
+ fileWorkloadsWithPID , err := f .getWorkloadsFromFiles ()
201
203
if err != nil {
202
204
return nil , fmt .Errorf ("failed to get workloads from files: %w" , err )
203
205
}
@@ -206,7 +208,7 @@ func (f *fileStatusManager) ListWorkloads(ctx context.Context, listAll bool, lab
206
208
// There's currently an import cycle between this package and the runconfig package
207
209
208
210
// Create a map of runtime workloads by name for easy lookup
209
- workloadMap := f .mergeRuntimeAndFileWorkloads (ctx , runtimeContainers , fileWorkloads )
211
+ workloadMap := f .mergeRuntimeAndFileWorkloads (ctx , runtimeContainers , fileWorkloadsWithPID )
210
212
211
213
// Convert map to slice and apply filters
212
214
var workloads []core.Workload
@@ -553,8 +555,14 @@ func (f *fileStatusManager) getWorkloadFromRuntime(ctx context.Context, workload
553
555
return types .WorkloadFromContainerInfo (& info )
554
556
}
555
557
558
+ // workloadWithPID holds a workload and its associated PID for internal processing
559
+ type workloadWithPID struct {
560
+ workload core.Workload
561
+ pid int
562
+ }
563
+
556
564
// getWorkloadsFromFiles retrieves all workloads from status files.
557
- func (f * fileStatusManager ) getWorkloadsFromFiles () (map [string ]core. Workload , error ) {
565
+ func (f * fileStatusManager ) getWorkloadsFromFiles () (map [string ]workloadWithPID , error ) {
558
566
// Ensure base directory exists
559
567
if err := f .ensureBaseDir (); err != nil {
560
568
return nil , fmt .Errorf ("failed to ensure base directory: %w" , err )
@@ -566,7 +574,7 @@ func (f *fileStatusManager) getWorkloadsFromFiles() (map[string]core.Workload, e
566
574
return nil , fmt .Errorf ("failed to list status files: %w" , err )
567
575
}
568
576
569
- workloads := make (map [string ]core. Workload )
577
+ workloads := make (map [string ]workloadWithPID )
570
578
ctx := context .Background () // Create context for file locking
571
579
572
580
for _ , file := range files {
@@ -614,13 +622,15 @@ func (f *fileStatusManager) getWorkloadsFromFiles() (map[string]core.Workload, e
614
622
}
615
623
616
624
// Check if PID migration is needed
625
+ pid := statusFile .ProcessID
617
626
if statusFile .Status == rt .WorkloadStatusRunning && statusFile .ProcessID == 0 {
618
627
// Try PID migration - the migration function will handle cases
619
628
// where container info is not available gracefully
620
629
if migratedPID , wasMigrated := f .migratePIDFromFile (workloadName , nil ); wasMigrated {
621
630
// Update the status file with the migrated PID
622
631
statusFile .ProcessID = migratedPID
623
632
statusFile .UpdatedAt = time .Now ()
633
+ pid = migratedPID
624
634
if err := f .writeStatusFile (statusFilePath , * statusFile ); err != nil {
625
635
logger .Warnf ("failed to write migrated PID for workload %s: %v" , workloadName , err )
626
636
} else {
@@ -629,7 +639,10 @@ func (f *fileStatusManager) getWorkloadsFromFiles() (map[string]core.Workload, e
629
639
}
630
640
}
631
641
632
- workloads [workloadName ] = workload
642
+ workloads [workloadName ] = workloadWithPID {
643
+ workload : workload ,
644
+ pid : pid ,
645
+ }
633
646
return nil
634
647
})
635
648
@@ -647,7 +660,7 @@ func (f *fileStatusManager) getWorkloadsFromFiles() (map[string]core.Workload, e
647
660
// validateRunningWorkload validates that a workload marked as running in the file
648
661
// is actually running in the runtime and has a healthy proxy process if applicable.
649
662
func (f * fileStatusManager ) validateRunningWorkload (
650
- ctx context.Context , workloadName string , result core.Workload ,
663
+ ctx context.Context , workloadName string , result core.Workload , pid int ,
651
664
) (core.Workload , error ) {
652
665
// For remote workloads, we don't need to validate against the container runtime
653
666
// since they don't have containers
@@ -667,7 +680,7 @@ func (f *fileStatusManager) validateRunningWorkload(
667
680
}
668
681
669
682
// Check if proxy process is running when workload is running
670
- if unhealthyWorkload , isUnhealthy := f .checkProxyHealth (ctx , workloadName , result , containerInfo ); isUnhealthy {
683
+ if unhealthyWorkload , isUnhealthy := f .isProxyUnhealthy (ctx , workloadName , result , containerInfo , pid ); isUnhealthy {
671
684
return unhealthyWorkload , nil
672
685
}
673
686
@@ -722,19 +735,21 @@ func (f *fileStatusManager) handleRuntimeMissing(
722
735
return fileWorkload , nil
723
736
}
724
737
725
- // checkProxyHealth checks if the proxy process is running for the workload.
738
+ // isProxyUnhealthy checks if the proxy process is running for the workload.
726
739
// Returns (unhealthyWorkload, true) if proxy is not running, (emptyWorkload, false) if proxy is healthy or not applicable.
727
- func (f * fileStatusManager ) checkProxyHealth (
728
- ctx context.Context , workloadName string , result core.Workload , containerInfo rt.ContainerInfo ,
740
+ func (f * fileStatusManager ) isProxyUnhealthy (
741
+ ctx context.Context , workloadName string , result core.Workload , containerInfo rt.ContainerInfo , pid int ,
729
742
) (core.Workload , bool ) {
730
743
// Use original container labels (before filtering) to get base name
731
744
baseName := labels .GetContainerBaseName (containerInfo .Labels )
732
745
if baseName == "" {
733
746
return core.Workload {}, false // No proxy check needed
734
747
}
735
748
736
- proxyRunning := proxy .IsRunning (baseName )
737
- if proxyRunning {
749
+ proxyRunning , err := process .FindProcess (pid )
750
+ if err != nil {
751
+ logger .Warnf ("unable to find process %d: %v" , pid , err )
752
+ } else if proxyRunning {
738
753
return core.Workload {}, false // Proxy is healthy
739
754
}
740
755
@@ -775,7 +790,7 @@ func (*fileStatusManager) mergeHealthyWorkloadData(containerInfo rt.ContainerInf
775
790
// validateWorkloadInList validates a workload during list operations, similar to validateRunningWorkload
776
791
// but with different error handling to avoid disrupting the entire list operation.
777
792
func (f * fileStatusManager ) validateWorkloadInList (
778
- ctx context.Context , workloadName string , fileWorkload core.Workload , containerInfo rt.ContainerInfo ,
793
+ ctx context.Context , workloadName string , fileWorkload core.Workload , containerInfo rt.ContainerInfo , pid int ,
779
794
) (core.Workload , error ) {
780
795
// Only validate if file shows running status
781
796
if fileWorkload .Status != rt .WorkloadStatusRunning {
@@ -797,7 +812,7 @@ func (f *fileStatusManager) validateWorkloadInList(
797
812
}
798
813
799
814
// Check if proxy process is running when workload is running
800
- if unhealthyWorkload , isUnhealthy := f .checkProxyHealth (ctx , workloadName , fileWorkload , containerInfo ); isUnhealthy {
815
+ if unhealthyWorkload , isUnhealthy := f .isProxyUnhealthy (ctx , workloadName , fileWorkload , containerInfo , pid ); isUnhealthy {
801
816
return unhealthyWorkload , nil
802
817
}
803
818
@@ -809,7 +824,7 @@ func (f *fileStatusManager) validateWorkloadInList(
809
824
func (f * fileStatusManager ) mergeRuntimeAndFileWorkloads (
810
825
ctx context.Context ,
811
826
runtimeContainers []rt.ContainerInfo ,
812
- fileWorkloads map [string ]core. Workload ,
827
+ fileWorkloadsWithPID map [string ]workloadWithPID ,
813
828
) map [string ]core.Workload {
814
829
runtimeWorkloadMap := make (map [string ]rt.ContainerInfo )
815
830
for _ , container := range runtimeContainers {
@@ -840,15 +855,17 @@ func (f *fileStatusManager) mergeRuntimeAndFileWorkloads(
840
855
}
841
856
842
857
// Then, merge with file workloads, validating running workloads
843
- for name , fileWorkload := range fileWorkloads {
858
+ for name , fileWorkloadWithPID := range fileWorkloadsWithPID {
859
+ fileWorkload := fileWorkloadWithPID .workload
860
+ pid := fileWorkloadWithPID .pid
844
861
845
862
if fileWorkload .Remote { // Remote workloads are not managed by the container runtime
846
863
workloadMap [name ] = fileWorkload
847
864
continue
848
865
}
849
866
if runtimeContainer , exists := runtimeWorkloadMap [name ]; exists {
850
867
// Validate running workloads similar to GetWorkload
851
- validatedWorkload , err := f .validateWorkloadInList (ctx , name , fileWorkload , runtimeContainer )
868
+ validatedWorkload , err := f .validateWorkloadInList (ctx , name , fileWorkload , runtimeContainer , pid )
852
869
if err != nil {
853
870
logger .Warnf ("failed to validate workload %s in list: %v" , name , err )
854
871
// Fall back to basic merge without validation
@@ -870,8 +887,9 @@ func (f *fileStatusManager) mergeRuntimeAndFileWorkloads(
870
887
if err != nil {
871
888
logger .Warnf ("failed to handle missing runtime for workload %s: %v" , name , err )
872
889
workloadMap [name ] = fileWorkload
890
+ } else {
891
+ workloadMap [name ] = updatedWorkload
873
892
}
874
- workloadMap [name ] = updatedWorkload
875
893
}
876
894
}
877
895
return workloadMap
0 commit comments