From dc998539505c7df9fe2de6a3ea0fd471f6851079 Mon Sep 17 00:00:00 2001
From: l17728 <1322552785@qq.com>
Date: Fri, 3 Apr 2026 09:35:39 +0800
Subject: [PATCH] feat: add E2E test framework for Electron applications with
 robust port management, instance isolation, and safety guards
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## 功能
添加 Electron 应用的 E2E 测试启动框架，支持并行实例运行、CDP 调试、独立数据隔离、完善的故障检测和数据保护。

### 核心特性

#### 1. 可靠的端口扫描和预留机制
- 自动查找可用的 TCP 端口（默认从 9222 开始）
- **新增**：保持端口预留直到进程启动，防止 TOCTTOU 竞态
- 修复：递归逻辑中的 null + 1 bug
- 添加最大重试限制（100 次），防止无限递归
- 100ms 超时保护，快速检测端口占用
- 并发安全的 completed 标志和资源清理

#### 2. 正确的 CDP 端口配置
- 使用 --remote-debugging-port=<port> 命令行参数
- 移除无效的 REMOTE_DEBUGGING_PORT 环境变量
- 确保 E2E 客户端能正确连接调试器

#### 3. 完善的启动检测和错误处理
- 验证应用目录和 Electron 可执行文件存在性
- 捕获 spawn 错误事件并提供清晰错误信息
- 检测启动期间的早期退出（非零退出码）
- 检测信号终止的启动失败（SIGKILL、SIGTERM 等）
- 启动后立即抛出错误而不仅记录日志
- 避免连接到死进程，失败诊断清晰

#### 4. 并行实例隔离和数据保护
- 为每个 E2E 实例创建独立的用户数据目录
- 基于端口号生成唯一路径：`/tmp/chatlab-e2e-{port}`
- 通过环境变量 CHATLAB_E2E_USER_DATA_DIR 传递给主进程
- 主进程自动调用 app.setPath('userData', dir) 隔离存储
- **新增**：E2E 模式下跳过遗留数据迁移，保护用户真实数据
- 防止并发进程的状态泄漏、死锁、数据库冲突

#### 5. 优雅且高效的进程管理
- 防止多次调用 close() 导致的事件监听器泄漏
- 检查 proc.exitCode 和 proc.signalCode
- 已退出进程立即返回（无 5s 延迟）
- 使用 SIGTERM 允许进程正常清理资源
- 5 秒超时后强制 SIGKILL 防止僵尸进程
- 使用活力检查（proc.kill(0)）替代 proc.killed
- 使用正确的 Node.js API signalCode（不是 signalDescription）
- 强制 SIGKILL 路径对 stubborn 进程有效
- 进程退出时立即清除超时定时器

#### 6. 并行实例支持
- 在主进程添加 TEST_MODE 环境变量检查
- 绕过单实例锁允许多个 Electron 实例
- 跳过遗留迁移保护用户数据
- 每个实例自动分配不同的 CDP 端口和数据目录

### 文件变更
- `electron/main/index.ts`:
  * 添加 TEST_MODE 检查，绕过单实例锁
  * 添加 CHATLAB_E2E_USER_DATA_DIR 读取和隔离
  * **新增**：跳过 E2E 模式下的遗留迁移
- `tests/e2e/helpers/app-launcher.js`: 完整的应用启动管理模块（260+ 行）
  * **新增**：port 预留机制，防止 TOCTTOU 竞态

### 使用示例
```javascript
const { launchApp } = require('./tests/e2e/helpers/app-launcher')

// 启动应用（自动查找可用端口和独立数据目录）
const app = await launchApp()
console.log('CDP 端口:', app.port)

// 运行测试...

// 关闭应用（已退出进程快速返回）
await app.close()
```

### 可配置选项
```javascript
await launchApp({
  port: 9222,                    // 指定端口，默认自动查找（带预留）
  userDataDir: '/custom/path',   // 自定义用户数据目录
  startupWaitTime: 2000          // 启动等待时间（毫秒），默认 2000
})
```

### 并行测试场景
✅ 多个实例同时运行（每个实例独立端口和数据目录）
✅ 自动端口分配（9222, 9223, 9224...）
✅ **端口预留防止竞态**（同时启动无冲突）
✅ 自动数据隔离（/tmp/chatlab-e2e-9222, /tmp/chatlab-e2e-9223...）
✅ 资源正确清理（无泄漏）
✅ 启动失败快速失败（清晰诊断）
✅ 进程强制清理（防止僵尸进程）
✅ 慢速 CI 环境中不挂起
✅ Stubborn 进程能被正确杀死
✅ 已退出进程快速检测和返回
✅ 信号退出的进程也能快速返回
✅ 无共享状态，支持真正的并行测试
✅ 用户数据受保护，不会被测试污染

### 修复的关键问题
**P1 问题：**
- 端口扫描超时导致的无限挂起
- 进程强制杀死被 proc.killed 误导的竞态条件
- 启动期间的早期退出未被检测
- 并行 E2E 实例共享 userData 导致冲突
- **新增**：遗留迁移在 E2E 模式下可能删除用户数据
- **新增**：端口分配 TOCTTOU 竞态导致启动冲突

**P2 问题：**
- 慢速 CI 中的 listen 回调延迟
- close() 对已退出进程的 5 秒不必要延迟
- signalCode API 错误
- 信号终止的启动失败未被检测

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
---
 electron/main/index.ts            |  29 +++-
 tests/e2e/helpers/app-launcher.js | 265 ++++++++++++++++++++++++++++++
 2 files changed, 290 insertions(+), 4 deletions(-)
 create mode 100644 tests/e2e/helpers/app-launcher.js
diff --git a/electron/main/index.ts b/electron/main/index.ts
index 2d3fe5da..8d3f6bfd 100644
--- a/electron/main/index.ts
+++ b/electron/main/index.ts
@@ -15,10 +15,21 @@ const appWithQuitFlag = app as AppWithQuitFlag
 
 class MainProcess {
   mainWindow: BrowserWindow | null
+  isTestMode: boolean
   constructor() {
     // 主窗口
     this.mainWindow = null
 
+    // E2E 测试模式检查：跳过遗留数据迁移和其他测试无关的初始化
+    this.isTestMode = process.env.TEST_MODE === 'true'
+
+    // E2E 测试隔离：为并行测试实例设置独立的用户数据目录
+    // 这防止了并发进程的状态泄漏、死锁和数据库冲突
+    const e2eUserDataDir = process.env.CHATLAB_E2E_USER_DATA_DIR
+    if (e2eUserDataDir) {
+      app.setPath('userData', e2eUserDataDir)
+    }
+
     // 设置应用程序名称
     if (process.platform === 'win32') app.setAppUserModelId(app.getName())
     // 初始化
@@ -31,6 +42,12 @@ class MainProcess {
 
   // 单例锁
   async checkApp() {
+    // E2E 测试模式：绕过单实例锁以支持并行实例
+    const isTestMode = process.env.TEST_MODE === 'true'
+    if (isTestMode) {
+      return true
+    }
+
     if (!app.requestSingleInstanceLock()) {
       app.quit()
       // 未获得锁
@@ -54,11 +71,15 @@ class MainProcess {
   async init() {
     initAnalytics()
 
-    // 清理上次切换目录后的旧数据目录
-    cleanupPendingDeleteDir()
+    // E2E 测试模式：跳过遗留数据迁移
+    // 遗留迁移会删除 Documents/ChatLab，在本地测试时可能破坏用户数据
+    if (!this.isTestMode) {
+      // 清理上次切换目录后的旧数据目录
+      cleanupPendingDeleteDir()
 
-    // 执行数据目录迁移（从 Documents/ChatLab 迁移到 userData）
-    this.migrateDataIfNeeded()
+      // 执行数据目录迁移（从 Documents/ChatLab 迁移到 userData）
+      this.migrateDataIfNeeded()
+    }
 
     // 确保应用目录存在
     ensureAppDirs()
diff --git a/tests/e2e/helpers/app-launcher.js b/tests/e2e/helpers/app-launcher.js
new file mode 100644
index 00000000..d5bcc574
--- /dev/null
+++ b/tests/e2e/helpers/app-launcher.js
@@ -0,0 +1,265 @@
+'use strict'
+
+/**
+ * Electron 应用启动器
+ * 通过 CDP 端口启动 Electron 实例以供 E2E 测试使用
+ * 支持 TEST_MODE 绕过单实例锁，允许并行运行多个实例
+ */
+
+const { spawn } = require('child_process')
+const path = require('path')
+const fs = require('fs')
+const os = require('os')
+
+/**
+ * 查找可用的 TCP 端口，并保持预留直到进程启动
+ *
+ * 问题修复：
+ * 1. 原代码递归时使用 null + 1 = 1，应该使用 startPort + 1
+ * 2. 添加最大重试限制，避免无限递归
+ * 3. 改进错误处理和超时逻辑
+ * 4. 返回保留的服务器和端口，避免 TOCTTOU 竞态
+ */
+async function findAvailablePortWithReservation(startPort = 9222, maxRetries = 100, currentRetry = 0) {
+  const net = require('net')
+
+  // 最大重试次数检查
+  if (currentRetry >= maxRetries) {
+    throw new Error(
+      `Unable to find available port after ${maxRetries} attempts (tried ports ${startPort}-${startPort + maxRetries - 1})`
+    )
+  }
+
+  const port = startPort + currentRetry
+
+  return new Promise((resolve) => {
+    const server = net.createServer()
+    let completed = false
+
+    const cleanup = () => {
+      if (!completed) {
+        completed = true
+        // 确保 server 被正确关闭
+        if (!server.closed) {
+          server.close()
+        }
+      }
+    }
+
+    // 端口可用：成功监听，保持预留直到使用
+    server.listen(port, () => {
+      if (!completed) {
+        completed = true
+        // 返回保留的服务器和端口，调用方负责在启动进程后关闭
+        resolve({ port, reservationServer: server })
+      }
+    })
+
+    // 端口被占用或其他错误：标记失败
+    server.on('error', () => {
+      cleanup()
+      resolve(null)
+    })
+
+    // 超时保护：100ms 未响应视为超时
+    setTimeout(() => {
+      if (!completed) {
+        completed = true
+        // 确保 server 被正确关闭
+        if (!server.closed) {
+          server.close()
+        }
+        resolve(null)
+      }
+    }, 100)
+  }).then((result) => {
+    // 找到可用端口，返回保留结果
+    if (result) return result
+
+    // 未找到，继续尝试下一个端口
+    return findAvailablePortWithReservation(startPort, maxRetries, currentRetry + 1)
+  })
+}
+
+/**
+ * 启动 Electron 应用
+ */
+async function launchApp(options = {}) {
+  let reservationServer = null
+  let port = options.port
+
+  if (!port) {
+    // 查找可用端口并保持预留，防止 TOCTTOU 竞态
+    // 两个并行启动不会发现相同的端口
+    const reservation = await findAvailablePortWithReservation(9222)
+    if (!reservation) {
+      throw new Error('[AppLauncher] 无法找到可用端口')
+    }
+    port = reservation.port
+    reservationServer = reservation.reservationServer
+  }
+
+  // 为并行 E2E 实例创建独立的用户数据目录，避免共享造成的冲突
+  // 这防止了并发进程的状态泄漏、死锁和数据库冲突
+  const userDataDir = options.userDataDir || (process.env.CHATLAB_E2E_USER_DATA_DIR ?
+    path.join(process.env.CHATLAB_E2E_USER_DATA_DIR, `instance-${port}`) :
+    path.join(os.tmpdir(), `chatlab-e2e-${port}`)
+  )
+
+  // 确保用户数据目录存在
+  if (!fs.existsSync(userDataDir)) {
+    fs.mkdirSync(userDataDir, { recursive: true })
+  }
+
+  const appPath = path.resolve(__dirname, '../../..')
+
+  // 验证应用目录存在
+  if (!fs.existsSync(appPath)) {
+    throw new Error(`[AppLauncher] 应用目录不存在: ${appPath}`)
+  }
+
+  let electronExe
+  if (process.platform === 'win32') {
+    electronExe = path.resolve(appPath, 'node_modules/.bin/electron.cmd')
+  } else {
+    electronExe = path.resolve(appPath, 'node_modules/.bin/electron')
+  }
+
+  if (!fs.existsSync(electronExe)) {
+    throw new Error(`Electron 可执行文件不存在: ${electronExe}`)
+  }
+
+  console.log(`[AppLauncher] 启动 Electron，CDP 端口: ${port}`)
+
+  // 构建 Electron 启动参数
+  // 重要：必须使用 --remote-debugging-port 命令行参数，而不是环境变量
+  // Electron 不会读取 REMOTE_DEBUGGING_PORT 环境变量
+  const electronArgs = [
+    `--remote-debugging-port=${port}`,  // 启用 CDP 调试端口
+    appPath,  // 应用路径作为最后的参数
+  ]
+
+  const proc = spawn(electronExe, electronArgs, {
+    stdio: 'inherit',
+    env: {
+      ...process.env,
+      TEST_MODE: 'true',  // E2E 测试模式：允许多个实例
+      CHATLAB_E2E_USER_DATA_DIR: userDataDir,  // 为该实例设置隔离的用户数据目录
+      ELECTRON_ENABLE_LOGGING: '1',
+    },
+  })
+
+  // 进程启动后，立即释放端口预留
+  // 这样 Electron 可以绑定 --remote-debugging-port，避免其他进程抢占
+  if (reservationServer) {
+    reservationServer.close()
+  }
+
+  // 处理进程启动失败
+  if (proc.exitCode !== null && proc.exitCode !== 0) {
+    throw new Error(`[AppLauncher] Electron 启动失败，退出码: ${proc.exitCode}`)
+  }
+
+  // 监听进程错误事件
+  let launchError = null
+  let exitCode = null
+
+  proc.on('error', (error) => {
+    console.error(`[AppLauncher] Electron 进程错误:`, error.message)
+    launchError = error
+  })
+
+  // 监听启动期间的进程退出
+  // Node.js exit 事件有两个参数：code 和 signal
+  // - code: null 当进程被信号杀死时；否则是数字退出码
+  // - signal: 信号名称（如 'SIGKILL'）；正常退出时为 null
+  let exitSignal = null
+  proc.on('exit', (code, signal) => {
+    exitCode = code
+    exitSignal = signal
+    if (code !== null && code !== 0) {
+      console.error(`[AppLauncher] Electron 进程异常退出，退出码: ${code}`)
+    }
+    if (signal !== null) {
+      console.error(`[AppLauncher] Electron 进程被信号杀死: ${signal}`)
+    }
+  })
+
+  // 等待应用就绪
+  // 注：这个延迟需要等应用真正启动完成，避免立即测试导致测试失败
+  // TODO: 可以改进为监听应用就绪事件而不是固定延迟
+  const startupWaitTime = options.startupWaitTime || 2000
+  await new Promise((resolve) => setTimeout(resolve, startupWaitTime))
+
+  // 检查启动过程中是否出现错误
+  if (launchError) {
+    throw new Error(`[AppLauncher] Electron 启动期间发生错误: ${launchError.message}`)
+  }
+
+  // 检查启动期间是否有非零退出或信号终止
+  if (exitCode !== null && exitCode !== 0) {
+    throw new Error(`[AppLauncher] Electron 启动期间异常退出，退出码: ${exitCode}`)
+  }
+  if (exitSignal !== null) {
+    throw new Error(`[AppLauncher] Electron 启动期间被信号杀死: ${exitSignal}`)
+  }
+
+  return {
+    proc,
+    port,
+    async close() {
+      console.log(`[AppLauncher] 关闭应用 (PID: ${proc.pid})`)
+
+      // 检查进程是否已经退出（自行退出或被杀死）
+      // proc.killed 只在我们主动 kill 时为 true，不包括自行退出的情况
+      // Node.js ChildProcess 使用 signalCode（不是 signalDescription）表示信号退出
+      if (proc.exitCode !== null || proc.signalCode !== null) {
+        // 进程已退出，直接返回
+        console.log(`[AppLauncher] 应用已退出 (exit code: ${proc.exitCode}, signal: ${proc.signalCode})`)
+        return
+      }
+
+      return new Promise((resolve) => {
+        let resolved = false
+        const exitHandler = () => {
+          if (!resolved) {
+            resolved = true
+            clearTimeout(forceKillTimer)
+            resolve()
+          }
+        }
+
+        // 监听进程退出事件
+        proc.once('exit', exitHandler)
+
+        // 发送 SIGTERM 信号要求进程正常终止
+        proc.kill('SIGTERM')
+
+        // 强制杀死超时：5秒后强制 SIGKILL
+        // 防止僵尸进程，确保测试能够顺利清理
+        // 注：使用活力检查而不是 proc.killed，因为 proc.killed 在 SIGTERM 后立即变为 true
+        // 但进程可能还未实际退出，需要检查进程是否真的存在
+        const forceKillTimer = setTimeout(() => {
+          if (!resolved) {
+            // 尝试杀死进程：检查进程是否真的还在运行
+            // 如果进程已退出，kill() 会抛出错误，我们忽略它
+            try {
+              proc.kill(0)  // 检查进程是否存在（发送信号 0 不会真的杀死）
+              // 进程存在，发送 SIGKILL
+              proc.kill('SIGKILL')
+            } catch (err) {
+              // 进程不存在，已正常退出
+            }
+          }
+          // 5秒后必须 resolve，防止永久挂起
+          if (!resolved) {
+            resolved = true
+            resolve()
+          }
+        }, 5000)
+      })
+    },
+  }
+}
+
+module.exports = { launchApp }