Skip to content

Issue: Command Execution Fails When /tmp Directory is Deleted #400

@hsmustard

Description

@hsmustard

现象 (Phenomenon)

删除沙箱容器内的 /tmp 目录后,通过 API 无法再执行任何命令,包括前台命令和后台命令。所有命令执行请求都会失败。

症状 (Symptoms):

  • API 调用 /command/run 和 /command/run_background 均返回错误
  • 错误信息:failed to get stdlog descriptor 或 failed to get combined output descriptor
  • 即使使用简单的 echo 命令也无法执行
  • 容器需要重启才能恢复

复现步骤 (Reproduction Steps):

# 1. 启动沙箱容器
docker run opensandbox/code-interpreter

# 2. 删除 /tmp 目录
rm -rf /tmp

# 3. 尝试通过 API 执行命令
curl -X POST http://localhost:port/command/run \
  -H "Content-Type: application/json" \
  -d '{"command": "echo test"}'

# 4. API 返回 500 Internal Server Error

根本原因 (Root Cause)

系统对命令输出日志的处理完全依赖于 /tmp 目录:

  1. 命令执行初始化阶段 (components/execd/pkg/runtime/command.go:79-93)

    func (c *Controller) runCommand(ctx context.Context, request *ExecuteCodeRequest) error {
        // ...
        stdout, stderr, err := c.stdLogDescriptor(session)  // ← 必需 /tmp
        if err != nil {
            return fmt.Errorf("failed to get stdlog descriptor: %w", err)
        }
  2. 临时文件创建失败 (components/execd/pkg/runtime/command_common.go:63-78)

    func (c *Controller) stdLogDescriptor(session string) (io.WriteCloser, io.WriteCloser, error) {
        stdout, err := os.OpenFile(c.stdoutFileName(session), os.O_RDWR|os.O_CREATE|os.O_TRUNC, os.ModePerm)
        if err != nil {
            return nil, nil, err  // ← /tmp 不存在时失败
        }
  3. 硬编码的日志路径 (components/execd/pkg/runtime/command_common.go:81-92)

    // stdoutFileName (quoted pre-fix code) builds the stdout log path as
    // "<session>.stdout" directly under os.TempDir().
    func (c *Controller) stdoutFileName(session string) string {
        return filepath.Join(os.TempDir(), session+".stdout")  // ← hard-coded location, cannot be changed
    }

问题本质: os.OpenFile() 在目录不存在时无法创建文件,因为 os.O_CREATE 标志只创建文件,不创建目录。系统没有任何回退机制或自动恢复能力。


解决方案 (Solution)

实现三层防御机制:

  1. 自动创建日志目录
  2. 提供多个回退路径
  3. 可配置的日志存储位置

步骤 1:添加标志配置

修改 components/execd/pkg/flag/flags.go

package flag

import "time"

var (
	// Existing flags...
	JupyterServerHost string
	JupyterServerToken string
	ServerPort int
	ServerLogLevel int
	ServerAccessToken string
	ApiGracefulShutdownTimeout time.Duration

	// CommandLogDir is the newly added, configurable directory for command
	// log files.
	// NOTE(review): the getLogDir shown in this proposal reads the
	// EXECD_LOG_DIR environment variable, not this flag — confirm where the
	// flag is wired in.
	CommandLogDir string

	// AutoCreateLogDir is the newly added switch for automatic creation of
	// the log directory.
	// NOTE(review): no code in this proposal consults it; directory creation
	// is unconditional — confirm intended use.
	AutoCreateLogDir bool
)

步骤 2:优化命令日志处理

修改 components/execd/pkg/runtime/command_common.go

package runtime

import (
	"bufio"
	"bytes"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"sync"
	"time"
)

// tailStdPipe streams appended log data until the process finishes.
//
// Every 100ms it hands any newly completed lines of file to onExecute. When
// done is closed (or receives a value) it performs one final read that also
// flushes a trailing line without a terminator, then returns.
func (c *Controller) tailStdPipe(file string, onExecute func(text string), done <-chan struct{}) {
	lastPos := int64(0)
	ticker := time.NewTicker(100 * time.Millisecond)
	defer ticker.Stop()

	mutex := &sync.Mutex{}
	for {
		select {
		case <-done:
			c.readFromPos(mutex, file, lastPos, onExecute, true)
			return
		case <-ticker.C:
			// readFromPos returns -1 when it could not take the lock; that
			// sentinel is not a file offset and must not replace lastPos.
			if newPos := c.readFromPos(mutex, file, lastPos, onExecute, false); newPos >= 0 {
				lastPos = newPos
			}
		}
	}
}

// getCommandKernel looks up the command execution context stored under
// sessionID while holding the controller's read lock. The map's zero value
// (nil) is returned when nothing is registered for the session.
func (c *Controller) getCommandKernel(sessionID string) *commandKernel {
	c.mu.RLock()
	kernel := c.commandClientMap[sessionID]
	c.mu.RUnlock()

	return kernel
}

// storeCommandKernel registers a command execution context.
//
// It stores kernel under sessionID in commandClientMap while holding the
// controller's write lock, replacing any entry already present for that key.
func (c *Controller) storeCommandKernel(sessionID string, kernel *commandKernel) {
	c.mu.Lock()
	// Deferred so the lock is released even if the map write panics.
	defer c.mu.Unlock()

	c.commandClientMap[sessionID] = kernel
}

// stdLogDescriptor creates the files that capture a command's stdout and
// stderr for the given session.
//
// The parent directory of each file is created if missing, and both files are
// created or truncated. It returns the stdout writer, the stderr writer, and
// an error wrapping the underlying cause; on error both writers are nil and
// no file handle is left open.
func (c *Controller) stdLogDescriptor(session string) (io.WriteCloser, io.WriteCloser, error) {
	stdoutPath := c.stdoutFileName(session)
	stderrPath := c.stderrFileName(session)

	// Each path is resolved by its own getLogDir() lookup, so the two parent
	// directories are not guaranteed to be the same: ensure both exist.
	for _, logDir := range []string{filepath.Dir(stdoutPath), filepath.Dir(stderrPath)} {
		if err := os.MkdirAll(logDir, 0755); err != nil {
			return nil, nil, fmt.Errorf("failed to create log directory %s: %w", logDir, err)
		}
	}

	stdout, err := os.OpenFile(stdoutPath, os.O_RDWR|os.O_CREATE|os.O_TRUNC, os.ModePerm)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to create stdout log file %s: %w", stdoutPath, err)
	}

	stderr, err := os.OpenFile(stderrPath, os.O_RDWR|os.O_CREATE|os.O_TRUNC, os.ModePerm)
	if err != nil {
		// Release the already opened stdout file explicitly; the open error
		// is the one worth reporting, so a close failure is ignored.
		_ = stdout.Close()
		return nil, nil, fmt.Errorf("failed to create stderr log file %s: %w", stderrPath, err)
	}

	return stdout, stderr, nil
}

// combinedOutputDescriptor creates the file that captures the combined
// output of a background command for the given session.
//
// The parent directory is created if missing and the file is created or
// truncated. On failure the returned writer is a true nil interface value and
// the error wraps the underlying cause together with the offending path.
func (c *Controller) combinedOutputDescriptor(session string) (io.WriteCloser, error) {
	filePath := c.combinedOutputFileName(session)

	logDir := filepath.Dir(filePath)
	if err := os.MkdirAll(logDir, 0755); err != nil {
		return nil, fmt.Errorf("failed to create log directory %s: %w", logDir, err)
	}

	output, err := os.OpenFile(filePath, os.O_RDWR|os.O_CREATE|os.O_TRUNC, os.ModePerm)
	if err != nil {
		// Returning os.OpenFile's results directly would wrap a nil *os.File
		// in a non-nil io.WriteCloser; return an untyped nil instead.
		return nil, fmt.Errorf("failed to create combined output log file %s: %w", filePath, err)
	}
	return output, nil
}

// getLogDir selects the directory that holds per-session command logs.
//
// Candidates are tried in order and the first usable one is returned:
//  1. $EXECD_LOG_DIR, returned as-is when non-empty (not probed here; the
//     descriptor helpers create it on demand and report failures).
//  2. os.TempDir(), when it exists and is writable.
//  3. "<working directory>/.tmp", created if missing.
//  4. /var/tmp
//  5. /dev/shm
//  6. "<home>/.opensandbox_logs", created if missing.
//
// If none is usable, the relative path ".opensandbox_logs" is returned as a
// last resort.
//
// The result is recomputed on every call, and each probe creates and removes
// a file, so it can differ between calls if the filesystem changes.
// NOTE(review): the CommandLogDir flag proposed in flags.go is not consulted
// here — confirm whether it should take precedence over the environment.
func (c *Controller) getLogDir() string {
	// 1. Explicit configuration wins; read the variable only once.
	if dir := os.Getenv("EXECD_LOG_DIR"); dir != "" {
		return dir
	}

	// 2. Standard temporary directory.
	if tmpDir := os.TempDir(); isDirWritable(tmpDir) {
		return tmpDir
	}

	// ensureWritable creates dir if needed and then verifies it is writable.
	// MkdirAll succeeding is not enough: it also returns nil for an existing
	// directory that this process cannot write to.
	ensureWritable := func(dir string) bool {
		return os.MkdirAll(dir, 0755) == nil && isDirWritable(dir)
	}

	// 3. ".tmp" under the current working directory.
	if wd, err := os.Getwd(); err == nil {
		if fallback := filepath.Join(wd, ".tmp"); ensureWritable(fallback) {
			return fallback
		}
	}

	// 4-5. Well-known Linux locations; these are used only if they already
	// exist and are writable.
	for _, dir := range []string{"/var/tmp", "/dev/shm"} {
		if isDirWritable(dir) {
			return dir
		}
	}

	// 6. A private directory under the user's home.
	if home, err := os.UserHomeDir(); err == nil {
		if logsDir := filepath.Join(home, ".opensandbox_logs"); ensureWritable(logsDir) {
			return logsDir
		}
	}

	// Last resort: a relative path, resolved against the working directory
	// when the caller creates it.
	return ".opensandbox_logs"
}

// isDirWritable reports whether dir exists, is a directory, and allows this
// process to create files in it.
//
// Writability is checked by creating and immediately removing a probe file.
// The probe is made with os.CreateTemp, which picks a unique name and never
// opens an existing file, so a pre-existing file or symlink with a guessable
// name in a shared directory cannot be truncated, and concurrent probes do
// not collide.
func isDirWritable(dir string) bool {
	info, err := os.Stat(dir)
	if err != nil || !info.IsDir() {
		return false
	}

	probe, err := os.CreateTemp(dir, ".write_test_*")
	if err != nil {
		return false
	}
	probePath := probe.Name()
	// Cleanup is best effort: the directory has already proven writable, and
	// a leftover empty probe file is harmless.
	_ = probe.Close()
	_ = os.Remove(probePath)
	return true
}

// stdoutFileName returns the path of the session's stdout log:
// "<session>.stdout" inside the directory chosen by getLogDir (rather than a
// fixed os.TempDir()).
func (c *Controller) stdoutFileName(session string) string {
	logDir := c.getLogDir()
	fileName := session + ".stdout"
	return filepath.Join(logDir, fileName)
}

// stderrFileName returns the path of the session's stderr log:
// "<session>.stderr" inside the directory chosen by getLogDir (rather than a
// fixed os.TempDir()).
func (c *Controller) stderrFileName(session string) string {
	logDir := c.getLogDir()
	fileName := session + ".stderr"
	return filepath.Join(logDir, fileName)
}

// combinedOutputFileName returns the path of the session's combined output
// log: "<session>.output" inside the directory chosen by getLogDir (rather
// than a fixed os.TempDir()).
func (c *Controller) combinedOutputFileName(session string) string {
	logDir := c.getLogDir()
	fileName := session + ".output"
	return filepath.Join(logDir, fileName)
}

// readFromPos streams new content from a file starting at startPos.
//
// The file is read byte by byte from startPos to EOF. Each run of bytes
// terminated by '\n' or '\r' is passed to onExecute without the terminator;
// empty lines are skipped. A trailing run without a terminator is passed to
// onExecute only when flushIncomplete is true.
//
// It returns the offset at which the next call should resume:
//   - -1 if mutex is already held (nothing was read);
//   - startPos (clamped to 0) if the file cannot be opened or positioned;
//   - the start of the trailing incomplete line when flushIncomplete is false
//     and such a line exists, so it is re-read once complete;
//   - otherwise the offset just past the last byte consumed.
//
// The filepath parameter shadows the path/filepath package inside this
// function; the package is not used here.
func (c *Controller) readFromPos(mutex *sync.Mutex, filepath string, startPos int64, onExecute func(string), flushIncomplete bool) int64 {
	if !mutex.TryLock() {
		return -1
	}
	defer mutex.Unlock()

	// A negative offset (e.g. a stored -1 sentinel) is not seekable; start
	// from the beginning instead of failing silently.
	if startPos < 0 {
		startPos = 0
	}

	file, err := os.Open(filepath)
	if err != nil {
		return startPos
	}
	defer file.Close()

	// If positioning fails, reading would start at the wrong offset and
	// replay old output; report no progress so the caller retries.
	if _, err := file.Seek(startPos, io.SeekStart); err != nil {
		return startPos
	}

	reader := bufio.NewReader(file)
	var line bytes.Buffer
	currentPos := startPos

	for {
		b, err := reader.ReadByte()
		if err != nil {
			// At EOF, emit the unterminated tail only when asked to;
			// otherwise it is left for the next read.
			if err == io.EOF && flushIncomplete && line.Len() > 0 {
				onExecute(line.String())
				line.Reset()
			}
			break
		}
		currentPos++

		// '\n' and '\r' both end a line; the terminator itself is dropped.
		if b == '\n' || b == '\r' {
			if line.Len() > 0 {
				onExecute(line.String())
				line.Reset()
			}
			continue
		}

		line.WriteByte(b)
	}

	// The pending bytes are contiguous and end at currentPos, so rewinding
	// by their count lands on the first byte of the incomplete line.
	if !flushIncomplete && line.Len() > 0 {
		return currentPos - int64(line.Len())
	}
	// currentPos counts exactly the bytes consumed, so no second Seek (which
	// could fail and yield 0) is needed to learn the end offset.
	return currentPos
}

步骤 3:改进命令执行错误处理

修改 components/execd/pkg/runtime/command.go(前台命令部分):

// runCommand executes shell commands and streams their output.
// Improved in this proposal: clearer error handling and logging when the
// per-session log files cannot be created.
//
// On log-setup failure it logs the error, calls request.Hooks.OnExecuteInit
// and request.Hooks.OnExecuteError (EName "LogSetupError"), and returns the
// error from stdLogDescriptor. The remainder of the function is elided in
// this excerpt.
func (c *Controller) runCommand(ctx context.Context, request *ExecuteCodeRequest) error {
	session := c.newContextID()

	// signal.Notify without a signal list relays all incoming signals to the
	// channel. Deferred calls run in reverse order, so signal.Reset() runs
	// before the channel is closed.
	signals := make(chan os.Signal, 1)
	defer close(signals)
	signal.Notify(signals)
	defer signal.Reset()

	stdout, stderr, err := c.stdLogDescriptor(session)
	if err != nil {
		// NOTE(review): getLogDir() is evaluated again here, so the directory
		// logged may differ from the one that actually failed; err already
		// carries the failing path.
		log.Error("Failed to create log descriptors for session %s: %v. Using log directory: %s",
			session, err, c.getLogDir())
		request.Hooks.OnExecuteInit(session)
		request.Hooks.OnExecuteError(&execute.ErrorOutput{
			EName:  "LogSetupError",
			EValue: err.Error(),
		})
		return err
	}
	defer stdout.Close()
	defer stderr.Close()

	// NOTE(review): these paths are recomputed through getLogDir() instead of
	// being taken from the files opened above — confirm they cannot diverge.
	stdoutPath := c.stdoutFileName(session)
	stderrPath := c.stderrFileName(session)

	// ... remaining existing code unchanged ...
}

修改 components/execd/pkg/runtime/command.go(后台命令部分):

// runBackgroundCommand executes shell commands in detached mode.
// Improved in this proposal: clearer error handling and logging when the
// combined output file cannot be created.
//
// request.Hooks.OnExecuteInit is called with the new session ID before any
// file is opened. On output-setup failure it calls cancel, logs the error,
// stores a non-running commandKernel carrying the error message, marks the
// command finished, and returns a wrapped error. The remainder of the
// function is elided in this excerpt.
func (c *Controller) runBackgroundCommand(ctx context.Context, cancel context.CancelFunc, request *ExecuteCodeRequest) error {
	session := c.newContextID()
	request.Hooks.OnExecuteInit(session)

	pipe, err := c.combinedOutputDescriptor(session)
	if err != nil {
		cancel()
		// NOTE(review): getLogDir() is evaluated again here, so the directory
		// logged may differ from the one that actually failed.
		log.Error("Failed to create output descriptor for session %s: %v. Using log directory: %s",
			session, err, c.getLogDir())
		// Register a placeholder kernel (pid -1, not running) that records
		// the failure message under this session.
		kernel := &commandKernel{
			pid:          -1,
			running:      false,
			content:      request.Code,
			isBackground: true,
			errMsg:       fmt.Sprintf("Failed to setup logging: %v", err),
		}
		c.storeCommandKernel(session, kernel)
		// presumably 255 is recorded as the exit code — verify against
		// markCommandFinished.
		c.markCommandFinished(session, 255, kernel.errMsg)
		return fmt.Errorf("failed to setup command output: %w", err)
	}
	// Both names refer to the same combined output file.
	stdoutPath := c.combinedOutputFileName(session)
	stderrPath := c.combinedOutputFileName(session)

	// ... remaining existing code unchanged ...
}

步骤 4:更新 bootstrap.sh

修改 components/execd/bootstrap.sh

#!/bin/bash

# ... existing code ...

EXECD="${EXECD:=/opt/opensandbox/execd}"
LOG_DIR="${EXECD_LOG_DIR:=/tmp/opensandbox-execd}"

# Make sure a usable log directory exists before execd starts.
# `mkdir -p` also succeeds for a directory that already exists but cannot be
# written to (e.g. on a read-only filesystem), so writability is checked too.
if ! { mkdir -p "$LOG_DIR" 2>/dev/null && [ -w "$LOG_DIR" ]; }; then
    echo "warning: failed to create primary log dir: $LOG_DIR" >&2
    # Fall back to the home directory; if HOME is unset, use /var/tmp rather
    # than ending up with a path at the filesystem root.
    LOG_DIR="${HOME:-/var/tmp}/.opensandbox_logs"
    if ! { mkdir -p "$LOG_DIR" && [ -w "$LOG_DIR" ]; }; then
        echo "ERROR: Failed to create any log directory" >&2
        exit 1
    fi
fi

# Export the directory that was actually selected so execd uses the same one.
export EXECD_LOG_DIR="$LOG_DIR"
echo "Log directory initialized: $EXECD_LOG_DIR"

# ... remaining existing code unchanged ...

优点 (Benefits)

✅ 自动处理 /tmp 目录缺失情况
✅ 提供多个回退路径,适应各种环境
✅ 可配置的日志存储位置
✅ 命令执行更加可靠
✅ 支持容器化和无根环境
✅ 业界最佳实践(Django、Flask 等均采用此方案)


影响范围 (Impact)

  • 修改文件数:4 个
  • 修改行数:~150 行
  • 向后兼容性:完全兼容,无破坏性变更
  • 测试覆盖:可添加单元测试
  • 部署风险:低,完全是防守性增强

相关代码位置 (Related Code)

  • components/execd/pkg/runtime/command.go - 命令执行入口
  • components/execd/pkg/runtime/command_common.go - 日志文件处理
  • components/execd/pkg/flag/flags.go - 配置选项
  • components/execd/bootstrap.sh - 容器启动脚本

附加建议 (Additional Suggestions)

  1. 添加单元测试确保回退机制工作正常
  2. 添加日志记录实际使用的日志目录
  3. 监控告警检测频繁的目录创建失败
  4. 文档更新说明 EXECD_LOG_DIR 环境变量用途

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions