Skip to content

Commit c958f00

Browse files
committed
Merge release-20250820.0-33-g1941bc68e (automated)
2 parents f51dbb4 + 1941bc6 commit c958f00

File tree

13 files changed

+107
-46
lines changed

13 files changed

+107
-46
lines changed

pkg/sentry/kernel/fs_context.go

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,20 @@ func NewFSContext(root, cwd vfs.VirtualDentry, umask uint) *FSContext {
6161
return &f
6262
}
6363

64+
// destroy destroys the FSContext.
65+
//
66+
// Preconditions: f must have no refcount.
67+
func (f *FSContext) destroy(ctx context.Context) {
68+
// Hold f.mu so that we don't race with RootDirectory() and
69+
// WorkingDirectory().
70+
f.mu.Lock()
71+
defer f.mu.Unlock()
72+
f.root.DecRef(ctx)
73+
f.root = vfs.VirtualDentry{}
74+
f.cwd.DecRef(ctx)
75+
f.cwd = vfs.VirtualDentry{}
76+
}
77+
6478
// DecRef implements RefCounter.DecRef.
6579
//
6680
// When f reaches zero references, DecRef will be called on both root and cwd
@@ -71,15 +85,7 @@ func NewFSContext(root, cwd vfs.VirtualDentry, umask uint) *FSContext {
7185
// proc files or other mechanisms.
7286
func (f *FSContext) DecRef(ctx context.Context) {
7387
f.FSContextRefs.DecRef(func() {
74-
// Hold f.mu so that we don't race with RootDirectory() and
75-
// WorkingDirectory().
76-
f.mu.Lock()
77-
defer f.mu.Unlock()
78-
79-
f.root.DecRef(ctx)
80-
f.root = vfs.VirtualDentry{}
81-
f.cwd.DecRef(ctx)
82-
f.cwd = vfs.VirtualDentry{}
88+
f.destroy(ctx)
8389
})
8490
}
8591

@@ -202,7 +208,7 @@ func (f *FSContext) checkAndPreventSharingOutsideTG(tg *ThreadGroup) bool {
202208

203209
tgCount := int64(0)
204210
tg.ForEachTask(func(t *Task) bool {
205-
if t.fsContext == f {
211+
if t.FSContext() == f {
206212
tgCount++
207213
}
208214
return true
@@ -233,3 +239,20 @@ func (f *FSContext) share() bool {
233239
f.IncRef()
234240
return true
235241
}
242+
243+
// unshareFromTask removes the FSContext f from the given Task t and replaces it with newF.
244+
// It returns a bool indicating whether f needs to be destroyed.
245+
246+
// This func operates without compromising a concurrent checkAndPreventSharingOutsideTG(): t's
247+
// association with f is severed atomically by holding f.mu, allowing the concurrent func to
248+
// correctly ascribe extra ref counts to tasks outside of t's thread group.
249+
//
250+
// Preconditions: The caller must be on the task goroutine and must hold t.mu.
251+
func (f *FSContext) unshareFromTask(t *Task, newF *FSContext) bool {
252+
f.mu.Lock()
253+
defer f.mu.Unlock()
254+
t.fsContext.Store(newF)
255+
destroy := false
256+
f.FSContextRefs.DecRef(func() { destroy = true })
257+
return destroy
258+
}

pkg/sentry/kernel/kernel.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1993,9 +1993,9 @@ func (k *Kernel) ReplaceFSContextRoots(ctx context.Context, oldRoot vfs.VirtualD
19931993
k.tasks.mu.RLock()
19941994
oldRootDecRefs := 0
19951995
k.tasks.forEachTaskLocked(func(t *Task) {
1996-
t.mu.Lock()
1996+
t.mu.Lock() // To prevent t's task goroutine from unsharing its fsContext from under us.
19971997
defer t.mu.Unlock()
1998-
if fsc := t.fsContext; fsc != nil {
1998+
if fsc := t.FSContext(); fsc != nil {
19991999
fsc.mu.Lock()
20002000
defer fsc.mu.Unlock()
20012001
if fsc.root == oldRoot {

pkg/sentry/kernel/kernel_state.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,16 @@ func (t *Task) loadSeccomp(_ context.Context, seccompData *taskSeccomp) {
6767
t.seccomp.Store(seccompData)
6868
}
6969

70+
// saveFsContext is invoked by stateify.
71+
func (t *Task) saveFsContext() *FSContext {
72+
return t.fsContext.Load()
73+
}
74+
75+
// loadFsContext is invoked by stateify.
76+
func (t *Task) loadFsContext(_ context.Context, fsContext *FSContext) {
77+
t.fsContext.Store(fsContext)
78+
}
79+
7080
// saveAppCPUClockLast is invoked by stateify.
7181
func (tg *ThreadGroup) saveAppCPUClockLast() *Task {
7282
return tg.appCPUClockLast.Load()

pkg/sentry/kernel/kernel_state_autogen.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1506,6 +1506,9 @@ func (t *Task) beforeSave() {}
15061506
// +checklocksignore
15071507
func (t *Task) StateSave(stateSinkObject state.Sink) {
15081508
t.beforeSave()
1509+
var fsContextValue *FSContext
1510+
fsContextValue = t.saveFsContext()
1511+
stateSinkObject.SaveValue(27, fsContextValue)
15091512
var vforkParentValue *Task
15101513
vforkParentValue = t.saveVforkParent()
15111514
stateSinkObject.SaveValue(29, vforkParentValue)
@@ -1542,7 +1545,6 @@ func (t *Task) StateSave(stateSinkObject state.Sink) {
15421545
stateSinkObject.Save(24, &t.k)
15431546
stateSinkObject.Save(25, &t.containerID)
15441547
stateSinkObject.Save(26, &t.image)
1545-
stateSinkObject.Save(27, &t.fsContext)
15461548
stateSinkObject.Save(28, &t.fdTable)
15471549
stateSinkObject.Save(30, &t.exitState)
15481550
stateSinkObject.Save(31, &t.exitTracerNotified)
@@ -1617,7 +1619,6 @@ func (t *Task) StateLoad(ctx context.Context, stateSourceObject state.Source) {
16171619
stateSourceObject.Load(24, &t.k)
16181620
stateSourceObject.Load(25, &t.containerID)
16191621
stateSourceObject.Load(26, &t.image)
1620-
stateSourceObject.Load(27, &t.fsContext)
16211622
stateSourceObject.Load(28, &t.fdTable)
16221623
stateSourceObject.Load(30, &t.exitState)
16231624
stateSourceObject.Load(31, &t.exitTracerNotified)
@@ -1661,6 +1662,7 @@ func (t *Task) StateLoad(ctx context.Context, stateSourceObject state.Source) {
16611662
stateSourceObject.Load(71, &t.Origin)
16621663
stateSourceObject.Load(72, &t.onDestroyAction)
16631664
stateSourceObject.Load(73, &t.execveCredsMutexOwner)
1665+
stateSourceObject.LoadValue(27, new(*FSContext), func(y any) { t.loadFsContext(ctx, y.(*FSContext)) })
16641666
stateSourceObject.LoadValue(29, new(*Task), func(y any) { t.loadVforkParent(ctx, y.(*Task)) })
16651667
stateSourceObject.LoadValue(35, new(*Task), func(y any) { t.loadPtraceTracer(ctx, y.(*Task)) })
16661668
stateSourceObject.LoadValue(52, new(*taskSeccomp), func(y any) { t.loadSeccomp(ctx, y.(*taskSeccomp)) })

pkg/sentry/kernel/task.go

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -291,8 +291,9 @@ type Task struct {
291291

292292
// fsContext is the task's filesystem context.
293293
//
294-
// fsContext is protected by mu, and is owned by the task goroutine.
295-
fsContext *FSContext
294+
// fsContext is protected by mu, and is owned by the task goroutine. It can be read without
295+
// synchronization using FSContext().
296+
fsContext atomic.Pointer[FSContext] `state:".(*FSContext)"`
296297

297298
// fdTable is the task's file descriptor table.
298299
//
@@ -750,7 +751,7 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock {
750751
func (t *Task) IsChrooted() bool {
751752
realRoot := t.mountNamespace.Root(t)
752753
defer realRoot.DecRef(t)
753-
root := t.fsContext.RootDirectory()
754+
root := t.FSContext().RootDirectory()
754755
defer root.DecRef(t)
755756
return root != realRoot
756757
}
@@ -766,10 +767,11 @@ func (t *Task) TaskImage() *TaskImage {
766767
// FSContext returns t's FSContext. FSContext does not take an additional
767768
// reference on the returned FSContext.
768769
//
769-
// Precondition: The caller must be running on the task goroutine, or t.mu must
770-
// be locked.
770+
// Precondition: No synchronization is needed for just read access. If the caller instead intends to
771+
// modify the contents of the FSContext, it must either be running on the task goroutine or have
772+
// t.mu locked.
771773
func (t *Task) FSContext() *FSContext {
772-
return t.fsContext
774+
return t.fsContext.Load()
773775
}
774776

775777
// FDTable returns t's FDTable. FDMTable does not take an additional reference

pkg/sentry/kernel/task_clone.go

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -192,9 +192,9 @@ func (t *Task) Clone(args *linux.CloneArgs) (ThreadID, *SyscallControl, error) {
192192

193193
var fsContext *FSContext
194194
if args.Flags&linux.CLONE_FS == 0 || args.Flags&linux.CLONE_NEWNS != 0 {
195-
fsContext = t.fsContext.Fork()
195+
fsContext = t.FSContext().Fork()
196196
} else {
197-
fsContext = t.fsContext
197+
fsContext = t.FSContext()
198198
if !fsContext.share() {
199199
// Linux fails clone with EAGAIN if there is a concurrent execve, see kernel/fork.c:copy_fs().
200200
return 0, nil, linuxerr.EAGAIN
@@ -518,7 +518,7 @@ func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {
518518
!t.Credentials().HasCapability(linux.CAP_SYS_ADMIN) {
519519
return linuxerr.EPERM
520520
}
521-
oldFSContext := t.fsContext
521+
oldFSContext := t.FSContext()
522522
// The current task has to be an exclusive owner of its fs context.
523523
if oldFSContext.ReadRefs() != 1 {
524524
return linuxerr.EINVAL
@@ -535,7 +535,7 @@ func (t *Task) Setns(fd *vfs.FileDescription, flags int32) error {
535535
ns.IncRef()
536536
t.mu.Lock()
537537
t.mountNamespace = ns
538-
t.fsContext = fsContext
538+
t.fsContext.Store(fsContext)
539539
t.mu.Unlock()
540540
oldNS.DecRef(t)
541541
oldFSContext.DecRef(t)
@@ -671,7 +671,6 @@ func (t *Task) Unshare(flags int32) error {
671671
defer cu.Clean()
672672
t.mu.Lock()
673673
defer t.mu.Unlock()
674-
// Can't defer unlock: DecRefs must occur without holding t.mu.
675674
if flags&linux.CLONE_NEWUTS != 0 {
676675
if !haveCapSysAdmin {
677676
return linuxerr.EPERM
@@ -701,16 +700,21 @@ func (t *Task) Unshare(flags int32) error {
701700
cu.Add(func() { oldFDTable.DecRef(t) })
702701
}
703702
if flags&linux.CLONE_FS != 0 || flags&linux.CLONE_NEWNS != 0 {
704-
oldFSContext := t.fsContext
705-
t.fsContext = oldFSContext.Fork()
706-
cu.Add(func() { oldFSContext.DecRef(t) })
703+
oldFSContext := t.FSContext()
704+
// unshareFromTask() lowers the old fs context's ref count, but its for us to
705+
// destroy it if there are no other references.
706+
if oldFSContext.unshareFromTask(t, oldFSContext.Fork()) {
707+
// destroy() requires t.mu to not be held, hence the deferral.
708+
cu.Add(func() { oldFSContext.destroy(t) })
709+
}
707710
}
708711
if flags&linux.CLONE_NEWNS != 0 {
709712
if !haveCapSysAdmin {
710713
return linuxerr.EPERM
711714
}
712715
oldMountNS := t.mountNamespace
713-
mntns, err := t.k.vfs.CloneMountNamespace(t, creds.UserNamespace, oldMountNS, &t.fsContext.root, &t.fsContext.cwd, t.k)
716+
fsContext := t.FSContext()
717+
mntns, err := t.k.vfs.CloneMountNamespace(t, creds.UserNamespace, oldMountNS, &fsContext.root, &fsContext.cwd, t.k)
714718
if err != nil {
715719
return err
716720
}

pkg/sentry/kernel/task_context.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ func (t *Task) contextValue(key any, isTaskGoroutine bool) any {
9696
t.mu.Lock()
9797
defer t.mu.Unlock()
9898
}
99-
return t.fsContext.RootDirectory()
99+
return t.FSContext().RootDirectory()
100100
case vfs.CtxMountNamespace:
101101
if !isTaskGoroutine {
102102
t.mu.Lock()

pkg/sentry/kernel/task_exec.go

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -144,15 +144,16 @@ func (r *runExecveAfterExecveCredsLock) execute(t *Task) taskRunState {
144144
return (*runInterrupt)(nil)
145145
}
146146

147-
root := t.FSContext().RootDirectory()
147+
fsContext := t.FSContext()
148+
root := fsContext.RootDirectory()
148149
defer root.DecRef(t)
149-
wd := t.FSContext().WorkingDirectory()
150+
wd := fsContext.WorkingDirectory()
150151
defer wd.DecRef(t)
151152

152153
// Cannot gain privileges if the ptracer's capabilities prevent it.
153154
stopPrivGain := t.shouldStopPrivGainDueToPtracerLocked()
154155
// Cannot gain privileges if the FSContext is shared outside of our thread group.
155-
if t.fsContext.checkAndPreventSharingOutsideTG(t.tg) {
156+
if fsContext.checkAndPreventSharingOutsideTG(t.tg) {
156157
stopPrivGain = true
157158
}
158159

@@ -612,8 +613,8 @@ func (t *Task) shouldStopPrivGainDueToPtracerLocked() bool {
612613

613614
// Releases the mutex that serializes an execve(2) with a PTRACE_ATTACH and allows t.fsContext to
614615
// be shared again via clone(2). The caller must have called execveCredsMutexStartLock() and
615-
// fsContext.CheckAndPreventSharingOutsideTG() earlier.
616+
// fsContext.checkAndPreventSharingOutsideTG() earlier.
616617
func (t *Task) releaseExecveCredsLocks() {
617618
t.execveCredsMutexUnlock()
618-
t.fsContext.allowSharing()
619+
t.FSContext().allowSharing()
619620
}

pkg/sentry/kernel/task_exit.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,7 @@ func (*runExitMain) execute(t *Task) taskRunState {
298298
// Releasing the MM unblocks a blocked CLONE_VFORK parent.
299299
t.unstopVforkParent()
300300

301-
t.fsContext.DecRef(t)
301+
t.FSContext().DecRef(t)
302302
t.fdTable.DecRef(t)
303303

304304
// Detach task from all cgroups. This must happen before potentially the

pkg/sentry/kernel/task_start.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,6 @@ func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, error)
159159
signalMask: atomicbitops.FromUint64(uint64(cfg.SignalMask)),
160160
signalStack: linux.SignalStack{Flags: linux.SS_DISABLE},
161161
image: *image,
162-
fsContext: cfg.FSContext,
163162
fdTable: cfg.FDTable,
164163
k: cfg.Kernel,
165164
ptraceTracees: make(map[*Task]struct{}),
@@ -183,6 +182,7 @@ func (ts *TaskSet) newTask(ctx context.Context, cfg *TaskConfig) (*Task, error)
183182
}
184183
t.netns = cfg.NetworkNamespace
185184
t.creds.Store(cfg.Credentials)
185+
t.fsContext.Store(cfg.FSContext)
186186
t.endStopCond.L = &t.tg.signalHandlers.mu
187187
// We don't construct t.blockingTimer until Task.run(); see that function
188188
// for justification.

0 commit comments

Comments
 (0)