From 43245d8d4bf92751947b2e9aac18deec9b1fff67 Mon Sep 17 00:00:00 2001 From: v-duanyan Date: Tue, 17 Mar 2026 16:26:34 +0800 Subject: [PATCH 1/3] feat(tgb): add tgb site support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for tgb.cn (淘股吧): - post: export article content with markdown format - post-comments: export comments with parent-child relationship support - user-posts: list user posts All commands use cookie-based authentication strategy. --- src/clis/tgb/post-comments.yaml | 151 ++++++++++++++++++++++++++++++++ src/clis/tgb/post.yaml | 131 +++++++++++++++++++++++++++ src/clis/tgb/user-posts.yaml | 123 ++++++++++++++++++++++++++ 3 files changed, 405 insertions(+) create mode 100644 src/clis/tgb/post-comments.yaml create mode 100644 src/clis/tgb/post.yaml create mode 100644 src/clis/tgb/user-posts.yaml diff --git a/src/clis/tgb/post-comments.yaml b/src/clis/tgb/post-comments.yaml new file mode 100644 index 0000000..d595827 --- /dev/null +++ b/src/clis/tgb/post-comments.yaml @@ -0,0 +1,151 @@ +site: tgb +name: post-comments +description: 抓取淘股吧帖子分页评论 +domain: tgb.cn +strategy: cookie +browser: true + +args: + url: + type: str + required: true + description: 帖子URL(支持自动提取topicID和跳转页码) + page: + type: int + default: 1 + description: 起始页码,默认1 + max_pages: + type: int + default: 1 + description: 最大抓取页数,默认1 + limit: + type: int + default: 50 + description: 每页最大评论数 + +pipeline: + - navigate: ${{ args.url }} + - evaluate: | + (async () => { + // 从URL提取topicCode + const url = window.location.href; + const urlMatch = url.match(/\/a\/([a-zA-Z0-9]+)(?:-(\d+))?/); + if (!urlMatch) { + return { error: '无法从URL提取topicID' }; + } + + const topicCode = urlMatch[1]; + const currentPage = parseInt(urlMatch[2]) || 1; + const startPage = ${{ args.page }} || currentPage; + const maxPages = ${{ args.max_pages }} || 1; + const limitPerPage = ${{ args.limit }} || 50; + + const allComments = []; + let pagesScanned = 0; + + // 抓取多页评论 + for (let pageNum = startPage; pageNum < startPage + maxPages; pageNum++) { + const pageUrl = `https://www.tgb.cn/a/${topicCode}-${pageNum}?type=Z`; + + try { + const res = await fetch(pageUrl, { + credentials: 'include', + headers: { + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'zh-CN,zh;q=0.9', + } + }); + + if (!res.ok) { + console.error(`Page ${pageNum} fetch failed: ${res.status}`); + continue; + } + + const html = await res.text(); + + if (!html.includes('gioMsg_R_')) { + console.error(`Page ${pageNum} has no comments`); + break; + } + + // 提取所有评论 + const commentRegex = /
]*subject=['"]([^'"]*)['"][^>]*userID=['"]([^'"]*)['"][^>]*userName=['"]([^'"]*)['"]\s*><\/div>/g; + let match; + const pageComments = []; + + while ((match = commentRegex.exec(html)) !== null) { + const commentId = match[1]; + const content = match[2] + .replace(/</g, '<') + .replace(/>/g, '>') + .replace(/&/g, '&') + .replace(/"/g, '"') + .replace(/'/g, "'"); + const userId = match[3]; + const userName = match[4]; + + pageComments.push({ + comment_id: commentId, + user_id: userId, + user_name: userName, + content: content, + page: pageNum, + time: '', + floor: '', + parent_id: '' + }); + } + + // 提取每条评论的额外信息 + for (const comment of pageComments) { + const commentId = comment.comment_id; + const commentMarker = `id="gioMsg_R_${commentId}"`; + const idx = html.indexOf(commentMarker); + if (idx === -1) continue; + + // 提取评论块 + const nextCommentMatch = html.substring(idx + 1).match(/id="gioMsg_R_\d+"/); + let blockEnd; + if (nextCommentMatch) { + blockEnd = idx + 1 + html.substring(idx + 1).indexOf(nextCommentMatch[0]); + } else { + blockEnd = Math.min(html.length, idx + 5000); + } + const block = html.substring(idx, blockEnd); + + // 提取时间 + const timeMatch = block.match(/]*pcyclspan[^>]*>(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})<\/span>/); + if (timeMatch) comment.time = timeMatch[1]; + + // 提取楼层 + const floorMatch = block.match(/>(沙发|板凳|地板|第\d+楼) { + // fetch 获取 HTML + const res = await fetch(window.location.href, { + credentials: 'include' + }); + const html = await res.text(); + + // 提取标题 + const titleMatch = html.match(/([^<]+)<\/title>/); + let title = titleMatch ? titleMatch[1].replace(/_淘股吧$/, '').trim() : ''; + + // 从 gioMsg 提取信息 + const gioMatch = html.match(/userName="([^"]+)"/); + const author = gioMatch ? gioMatch[1] : ''; + if (gioMatch && gioMatch[2] && !title) { + title = gioMatch[2]; + } + + // 提取时间、浏览、评论 + const timeMatch = html.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})/); + const publishTime = timeMatch ? timeMatch[1] : ''; + + const viewMatch = html.match(/浏览\s+(\d+)/); + const views = viewMatch ? viewMatch[1] : ''; + + const commentMatch = html.match(/评论\s+(\d+)/); + const comments = commentMatch ? commentMatch[1] : ''; + + // 提取正文内容 - 在 article-content 中 + let contentHtml = ''; + const contentMatch = html.match(/<div class="article-content">([\s\S]*?)<div class="handle-box/); + if (contentMatch) { + contentHtml = contentMatch[1]; + } + + // 需要过滤的关键词 + const skipPatterns = [ + '打赏Ta', + '话题与分类', + '主题股票', + '主题概念', + '声明:', + '分享文章', + '举报', + /^\d+\s+\d+\/\d+\s+\d+\s+\d+/, + /^\d+\s+评论$/, + ]; + + function shouldSkip(line) { + for (const pattern of skipPatterns) { + if (pattern instanceof RegExp) { + if (pattern.test(line)) return true; + } else if (line.includes(pattern)) { + return true; + } + } + return false; + } + + // 解析正文段落 + const paragraphs = []; + if (contentHtml) { + // 将 <br> 转换为换行 + const text = contentHtml + .replace(/<br\s*\/?>/gi, '\n') + .replace(/<[^>]+>/g, '') // 移除其他 HTML 标签 + .replace(/ /g, ' ') + .replace(/&/g, '&'); + + // 按空行分割段落 + const blocks = text.split(/\n{2,}/); + + for (const block of blocks) { + const para = block.replace(/\n+/g, ' ').trim(); + if (shouldSkip(para)) break; + if (para && para.length > 0) { + paragraphs.push(para); + } + } + } + + // 生成 markdown + const mdLines = [ + `# ${title}`, + '', + `> **作者**: ${author || '未知'}`, + `> **发布时间**: ${publishTime || '未知'}`, + `> **浏览**: ${views || '-'} | **评论**: ${comments || '-'}`, + `> **原文**: ${window.location.href}`, + '', + '---', + '' + ]; + + for (const para of paragraphs) { + const clean = para.replace(/\s+/g, ' ').trim(); + if (clean) { + mdLines.push(clean); + mdLines.push(''); + } + } + + return { + title: title, + author: author || '未知', + publish_time: publishTime || '未知', + views: views, + comments: comments, + url: window.location.href, + paragraphs: paragraphs.length, + markdown: mdLines.join('\n') + }; + })() + +columns: [title, author, publish_time, views, comments] diff --git a/src/clis/tgb/user-posts.yaml b/src/clis/tgb/user-posts.yaml new file mode 100644 index 0000000..fc9c0e9 --- /dev/null +++ b/src/clis/tgb/user-posts.yaml @@ -0,0 +1,123 @@ +site: tgb +name: user-posts +description: 获取用户文章列表 +domain: tgb.cn +strategy: cookie +browser: true + +args: + user_id: + type: str + required: true + description: 用户ID + limit: + type: int + default: 20 + description: 获取文章数量,默认20 + +pipeline: + - navigate: "https://www.tgb.cn/user/blog/moreTopic?userID=${{ args.user_id }}" + - evaluate: | + (async () => { + // 优先尝试从已加载的DOM中提取(更快) + const tables = document.querySelectorAll('table'); + const posts = []; + + for (const table of tables) { + const rows = table.querySelectorAll('tr'); + + for (const row of rows) { + const cells = row.querySelectorAll('td'); + if (cells.length < 8) continue; + + const titleCell = cells[1]; + if (!titleCell) continue; + + const link = titleCell.querySelector('a'); + if (!link) continue; + + const href = link.getAttribute('href') || ''; + if (!href.match(/^a\/[a-zA-Z0-9]+/)) continue; + + const url = 'https://www.tgb.cn/' + href; + const title = titleCell.innerText.trim(); + const author = cells[2]?.innerText?.trim() || ''; + const commentTime = cells[3]?.innerText?.trim() || ''; + const publishDate = cells[7]?.innerText?.trim() || ''; + + if (title && url) { + posts.push({ + title: title, + author: author, + comment_time: commentTime, + publish_date: publishDate, + url: url + }); + } + } + } + + // 如果DOM提取不到,尝试fetch获取HTML + if (posts.length === 0) { + try { + const res = await fetch(window.location.href, { + credentials: 'include' + }); + const html = await res.text(); + + // 解析表格行 + const rowRegex = /<tr[^>]*>([\s\S]*?)<\/tr>/g; + let rowMatch; + + while ((rowMatch = rowRegex.exec(html)) !== null) { + const rowHtml = rowMatch[1]; + + // 提取所有单元格 + const cellRegex = /<td[^>]*>([\s\S]*?)<\/td>/g; + const cells = []; + let cellMatch; + while ((cellMatch = cellRegex.exec(rowHtml)) !== null) { + cells.push(cellMatch[1]); + } + + if (cells.length < 8) continue; + + // 提取标题和链接 + const titleMatch = cells[1].match(/<a[^>]*href=["\'](a\/[a-zA-Z0-9]+)["\'][^>]*>([^<]+)/); + if (!titleMatch) continue; + + const href = titleMatch[1]; + const title = titleMatch[2].trim(); + + // 提取其他字段 + const author = cells[2].replace(/<[^>]+>/g, '').trim(); + const commentTime = cells[3].replace(/<[^>]+>/g, '').trim(); + const publishDate = cells[7].replace(/<[^>]+>/g, '').trim(); + + if (title && href) { + posts.push({ + title: title, + author: author, + comment_time: commentTime, + publish_date: publishDate, + url: 'https://www.tgb.cn/' + href + }); + } + } + } catch (e) { + // fetch失败,返回空数组 + } + } + + // 去重 + const seen = new Set(); + const unique = posts.filter(p => { + if (seen.has(p.url)) return false; + seen.add(p.url); + return true; + }); + + return unique.slice(0, ${{ args.limit }}); + })() + +columns: [title, author, comment_time, publish_date, url] From ef24426801d38bae6264579c2d893ba78199193a Mon Sep 17 00:00:00 2001 From: v-duanyan <hxuanliang@163.com> Date: Tue, 17 Mar 2026 17:00:37 +0800 Subject: [PATCH 2/3] =?UTF-8?q?fix(tgb):=20=E4=BF=AE=E5=A4=8D=E5=B8=96?= =?UTF-8?q?=E5=AD=90=E6=AD=A3=E6=96=87=E6=8A=93=E5=8F=96=E9=80=BB=E8=BE=91?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 修正正文选择器:从 article-content 改为 article-text - 修复标题提取:优先从 gioMsg subject 提取,兼容等号前后空格 - 优化 HTML 清理逻辑,正确提取段落内容 - 添加段落数统计 --- src/clis/tgb/post.yaml | 92 +++++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/src/clis/tgb/post.yaml b/src/clis/tgb/post.yaml index ef03d96..cae385c 100644 --- a/src/clis/tgb/post.yaml +++ b/src/clis/tgb/post.yaml @@ -1,6 +1,6 @@ site: tgb name: post -description: 抓取淘股吧帖子正文 +description: 抓取淘股吧帖子正文(极速版) domain: tgb.cn strategy: cookie browser: true @@ -21,74 +21,74 @@ pipeline: }); const html = await res.text(); - // 提取标题 - const titleMatch = html.match(/<title>([^<]+)<\/title>/); - let title = titleMatch ? titleMatch[1].replace(/_淘股吧$/, '').trim() : ''; + // 从 gioMsg 提取主题/标题(优先) + const subjectMatch = html.match(/id="gioMsg"[^>]*subject\s*=\s*"([^"]+)"/); + let title = subjectMatch ? subjectMatch[1].trim() : ''; - // 从 gioMsg 提取信息 - const gioMatch = html.match(/userName="([^"]+)"/); - const author = gioMatch ? gioMatch[1] : ''; - if (gioMatch && gioMatch[2] && !title) { - title = gioMatch[2]; + // 备选:从 title 标签提取 + if (!title) { + const titleMatch = html.match(/<title>([^<]+)<\/title>/); + if (titleMatch) { + title = titleMatch[1].replace(/_.*$/, '').trim(); + } } - // 提取时间、浏览、评论 + // 从 gioMsg 提取作者信息 + const gioMatch = html.match(/id="gioMsg"[^>]*userName="([^"]+)"/); + const author = gioMatch ? gioMatch[1] : ''; + + // 提取时间 - 在 article-data 中 const timeMatch = html.match(/(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2})/); const publishTime = timeMatch ? timeMatch[1] : ''; + // 提取浏览数 const viewMatch = html.match(/浏览\s+(\d+)/); const views = viewMatch ? viewMatch[1] : ''; + // 提取评论数 const commentMatch = html.match(/评论\s+(\d+)/); const comments = commentMatch ? commentMatch[1] : ''; - // 提取正文内容 - 在 article-content 中 + // 提取正文内容 - 在 article-text p_coten 中 let contentHtml = ''; - const contentMatch = html.match(/<div class="article-content">([\s\S]*?)<div class="handle-box/); - if (contentMatch) { - contentHtml = contentMatch[1]; - } - - // 需要过滤的关键词 - const skipPatterns = [ - '打赏Ta', - '话题与分类', - '主题股票', - '主题概念', - '声明:', - '分享文章', - '举报', - /^\d+\s+\d+\/\d+\s+\d+\s+\d+/, - /^\d+\s+评论$/, - ]; - function shouldSkip(line) { - for (const pattern of skipPatterns) { - if (pattern instanceof RegExp) { - if (pattern.test(line)) return true; - } else if (line.includes(pattern)) { - return true; + // 方法1: 匹配 article-text 到 videoImg + const contentMatch1 = html.match(/<div class="article-text[^"]*"[^>]*>([\s\S]*?)<div id="videoImg"/); + if (contentMatch1) { + contentHtml = contentMatch1[1]; + } else { + // 方法2: 匹配 article-text 到其闭合标签 + const contentMatch2 = html.match(/<div class="article-text[^"]*"[^>]*>([\s\S]*?)<\/div>\s*<div style="background:/); + if (contentMatch2) { + contentHtml = contentMatch2[1]; + } else { + // 方法3: 直接查找 article-text 的内容 + const contentMatch3 = html.match(/<div class="article-text[^"]*"[^>]*>([\s\S]*?)<\/div>/); + if (contentMatch3) { + contentHtml = contentMatch3[1]; } } - return false; } - // 解析正文段落 + // 清理 HTML 标签,提取纯文本段落 const paragraphs = []; if (contentHtml) { // 将 <br> 转换为换行 const text = contentHtml .replace(/<br\s*\/?>/gi, '\n') - .replace(/<[^>]+>/g, '') // 移除其他 HTML 标签 + .replace(/<a[^>]*>/gi, '') // 移除链接标签开始 + .replace(/<\/a>/gi, '') // 移除链接标签结束 + .replace(/<[^>]+>/g, '') // 移除其他 HTML 标签 .replace(/ /g, ' ') - .replace(/&/g, '&'); + .replace(/&/g, '&') + .replace(/</g, '<') + .replace(/>/g, '>') + .replace(/"/g, '"'); // 按空行分割段落 - const blocks = text.split(/\n{2,}/); - - for (const block of blocks) { - const para = block.replace(/\n+/g, ' ').trim(); - if (shouldSkip(para)) break; + const lines = text.split('\n'); + for (const line of lines) { + const para = line.trim(); if (para && para.length > 0) { paragraphs.push(para); } @@ -97,7 +97,7 @@ pipeline: // 生成 markdown const mdLines = [ - `# ${title}`, + `# ${title || '无标题'}`, '', `> **作者**: ${author || '未知'}`, `> **发布时间**: ${publishTime || '未知'}`, @@ -117,7 +117,7 @@ pipeline: } return { - title: title, + title: title || '无标题', author: author || '未知', publish_time: publishTime || '未知', views: views, @@ -128,4 +128,4 @@ pipeline: }; })() -columns: [title, author, publish_time, views, comments] +columns: [title, author, publish_time, views, comments, paragraphs] From 2fef4cab911cd795ae36469ead4080207c78e8cd Mon Sep 17 00:00:00 2001 From: v-duanyan <hxuanliang@163.com> Date: Tue, 17 Mar 2026 17:02:09 +0800 Subject: [PATCH 3/3] =?UTF-8?q?chore(tgb):=20=E7=A7=BB=E9=99=A4=E6=8F=8F?= =?UTF-8?q?=E8=BF=B0=E4=B8=AD=E7=9A=84=E6=9E=81=E9=80=9F=E7=89=88=E6=A0=87?= =?UTF-8?q?=E8=AE=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/clis/tgb/post.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/clis/tgb/post.yaml b/src/clis/tgb/post.yaml index cae385c..bc0403e 100644 --- a/src/clis/tgb/post.yaml +++ b/src/clis/tgb/post.yaml @@ -1,6 +1,6 @@ site: tgb name: post -description: 抓取淘股吧帖子正文(极速版) +description: 抓取淘股吧帖子正文 domain: tgb.cn strategy: cookie browser: true