Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions clis/scys/activity.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import { cli, Strategy } from '@jackwener/opencli/registry';
import { extractScysActivity } from './extractors.js';
// Register the `scys activity` command. This block only declares the CLI
// surface (arguments, output columns); the page work itself is delegated to
// extractScysActivity.
cli({
    site: 'scys',
    name: 'activity',
    description: 'Extract SCYS activity landing page structure (tabs, stages, tasks)',
    domain: 'scys.com',
    strategy: Strategy.COOKIE,
    // NOTE(review): presumably leaves navigation to the extractor — confirm
    // against the registry's handling of `navigateBefore`.
    navigateBefore: false,
    args: [
        {
            name: 'url',
            required: true,
            positional: true,
            help: 'Activity landing URL: /activity/landing/:id',
        },
        {
            name: 'wait',
            type: 'int',
            default: 3,
            help: 'Seconds to wait after page load',
        },
    ],
    columns: ['title', 'subtitle', 'tabs', 'stages', 'url'],
    func: async (page, kwargs) => {
        const targetUrl = String(kwargs.url);
        // Falls back to the same value as the declared arg default when
        // `wait` is null or undefined.
        const waitSeconds = Number(kwargs.wait ?? 3);
        return extractScysActivity(page, targetUrl, { waitSeconds });
    },
});
48 changes: 48 additions & 0 deletions clis/scys/article.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import { EmptyResultError } from '@jackwener/opencli/errors';
import { cli, Strategy } from '@jackwener/opencli/registry';
import { extractScysArticle } from './extractors.js';

/**
 * Decide whether a failed article extraction is worth another attempt.
 *
 * An error is retryable when it is an EmptyResultError, or when its message
 * (or, for non-Error values, its string form) matches one of the known
 * transient-failure patterns below (case-insensitive).
 *
 * @param {unknown} error - Value thrown by the extractor.
 * @returns {boolean} True when the caller should retry.
 */
function isRetryableScysArticleError(error) {
    if (error instanceof EmptyResultError) {
        return true;
    }
    const text = error instanceof Error ? error.message : String(error);
    const transientPatterns = [
        /stale page identity/i,
        /Page not found:/i,
        /Article detail page did not hydrate beyond shell content/i,
    ];
    return transientPatterns.some((pattern) => pattern.test(text));
}
// Register the `scys article` command. Extraction is delegated to
// extractScysArticle; this block adds a bounded retry around it for failures
// that isRetryableScysArticleError classifies as transient.
cli({
    site: 'scys',
    name: 'article',
    description: 'Extract SCYS article detail page content and metadata',
    domain: 'scys.com',
    strategy: Strategy.COOKIE,
    navigateBefore: false,
    args: [
        { name: 'url', required: true, positional: true, help: 'Article URL or topic id: /articleDetail/<entityType>/<topicId>' },
        { name: 'wait', type: 'int', default: 5, help: 'Seconds to wait after page load' },
        { name: 'max-length', type: 'int', default: 4000, help: 'Max content length for long text fields' },
    ],
    columns: ['topic_id', 'entity_type', 'title', 'author', 'time', 'tags', 'flags', 'image_count', 'external_link_count', 'content', 'ai_summary', 'url'],
    func: async (page, kwargs) => {
        const url = String(kwargs.url);
        const options = {
            waitSeconds: Number(kwargs.wait ?? 5),
            maxLength: Number(kwargs['max-length'] ?? 4000),
        };
        const maxAttempts = 5;
        // The loop has no exit condition of its own: every iteration either
        // returns the extracted article or throws once the error is
        // non-retryable or the attempt budget is spent.
        for (let attempt = 1; ; attempt += 1) {
            try {
                return await extractScysArticle(page, url, options);
            } catch (error) {
                if (attempt >= maxAttempts || !isRetryableScysArticleError(error)) {
                    throw error;
                }
                // A full window reset is closer to the successful manual re-run path
                // than another probe inside the same browser state.
                try {
                    // `await` accepts both promise and plain return values, and the
                    // try/catch also covers a synchronous throw, so a failed reset
                    // can never replace the retry with an unrelated error.
                    await page.closeWindow?.();
                } catch {
                    // Best-effort: the next attempt proceeds even if the reset failed.
                }
            }
        }
    },
});
71 changes: 71 additions & 0 deletions clis/scys/article.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import { beforeEach, describe, expect, it, vi } from 'vitest';
import { EmptyResultError } from '@jackwener/opencli/errors';

// Create the extractor mock through vi.hoisted so that it can be referenced
// from the vi.mock factory below (see the vitest docs on vi.hoisted/vi.mock
// hoisting).
const { mockExtractScysArticle } = vi.hoisted(() => ({
mockExtractScysArticle: vi.fn(),
}));

// Replace ./extractors.js so the article command calls the mock instead of
// the real extractor.
vi.mock('./extractors.js', () => ({
extractScysArticle: mockExtractScysArticle,
}));

import { getRegistry } from '@jackwener/opencli/registry';
import './article.js';

// Exercises the retry loop of the `scys/article` command: the extractor is
// mocked, and `page.closeWindow` is a spy so window resets can be counted.
describe('scys article command retry', () => {
    const command = getRegistry().get('scys/article');
    const page = {
        closeWindow: vi.fn().mockResolvedValue(undefined),
    };
    const okResult = { topic_id: '14422288551185512', title: 'ok' };

    // Fresh error per call so no test shares a mutated instance.
    const shellOnlyError = () => new EmptyResultError('scys/article', 'Article detail page did not hydrate beyond shell content');

    // Invoke the command the same way in every test.
    const runCommand = () => command.func(page, {
        url: 'https://scys.com/articleDetail/xq_topic/14422288551185512',
        wait: 6,
        'max-length': 4000,
    });

    beforeEach(() => {
        mockExtractScysArticle.mockReset();
        // mockClear (not mockReset) keeps the resolved-value implementation.
        page.closeWindow.mockClear();
    });

    it('retries once after shell-only EmptyResultError', async () => {
        mockExtractScysArticle
            .mockRejectedValueOnce(shellOnlyError())
            .mockResolvedValueOnce(okResult);

        const result = await runCommand();

        expect(result).toEqual({ topic_id: '14422288551185512', title: 'ok' });
        expect(mockExtractScysArticle).toHaveBeenCalledTimes(2);
        expect(page.closeWindow).toHaveBeenCalledTimes(1);
    });

    it('succeeds on the third attempt after two retryable shell-only errors', async () => {
        mockExtractScysArticle
            .mockRejectedValueOnce(shellOnlyError())
            .mockRejectedValueOnce(shellOnlyError())
            .mockResolvedValueOnce(okResult);

        const result = await runCommand();

        expect(result).toEqual({ topic_id: '14422288551185512', title: 'ok' });
        expect(mockExtractScysArticle).toHaveBeenCalledTimes(3);
        expect(page.closeWindow).toHaveBeenCalledTimes(2);
    });

    it('retries plain errors whose message matches a retryable pattern', async () => {
        mockExtractScysArticle
            .mockRejectedValueOnce(new Error('stale page identity'))
            .mockResolvedValueOnce(okResult);

        const result = await runCommand();

        expect(result).toEqual({ topic_id: '14422288551185512', title: 'ok' });
        expect(mockExtractScysArticle).toHaveBeenCalledTimes(2);
        expect(page.closeWindow).toHaveBeenCalledTimes(1);
    });

    it('gives up after five attempts when every attempt fails with a retryable error', async () => {
        mockExtractScysArticle.mockRejectedValue(shellOnlyError());

        await expect(runCommand()).rejects.toBeInstanceOf(EmptyResultError);

        // Five attempts in total; the window is reset between attempts only,
        // so there is no reset after the final failure.
        expect(mockExtractScysArticle).toHaveBeenCalledTimes(5);
        expect(page.closeWindow).toHaveBeenCalledTimes(4);
    });

    it('does not retry non-retryable errors', async () => {
        mockExtractScysArticle.mockRejectedValueOnce(new Error('boom'));

        await expect(runCommand()).rejects.toThrow('boom');

        expect(mockExtractScysArticle).toHaveBeenCalledTimes(1);
        expect(page.closeWindow).not.toHaveBeenCalled();
    });
});
99 changes: 99 additions & 0 deletions clis/scys/common.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import { ArgumentError } from '@jackwener/opencli/errors';
// Origin (scheme + host, no trailing slash) that the URL helpers in this
// module prepend when building absolute SCYS URLs.
const SCYS_ORIGIN = 'https://scys.com';
/**
 * Turn user input into an absolute URL string.
 *
 * Accepted forms, checked in this order:
 *  - an absolute http(s) URL, returned unchanged;
 *  - a root-relative path (`/opportunity`), appended to SCYS_ORIGIN;
 *  - a bare `scys.com...` host in any letter case, prefixed with `https://`;
 *  - anything else, treated as a path relative to the site root.
 *
 * @param {unknown} input - URL, path, or bare-domain string.
 * @returns {string} Absolute URL string.
 * @throws {ArgumentError} When the input is empty or whitespace-only.
 */
export function normalizeScysUrl(input) {
    const raw = String(input ?? '').trim();
    if (!raw) {
        throw new ArgumentError('SCYS URL is required');
    }
    if (/^https?:\/\//i.test(raw)) {
        return raw;
    }
    if (raw.startsWith('/')) {
        return `${SCYS_ORIGIN}${raw}`;
    }
    // Host names are case-insensitive, and the host must end at a URL
    // delimiter (or end of input). A plain prefix test would treat inputs
    // such as `scys.community/x` as a host and would miss `SCYS.com/x`.
    if (/^scys\.com(?=[/?#:]|$)/i.test(raw)) {
        return `https://${raw}`;
    }
    // `raw` cannot start with '/' here (handled above), so it is joined as-is.
    return `${SCYS_ORIGIN}/${raw}`;
}
/**
 * Resolve a course reference to an absolute URL string.
 *
 * An all-digit input is treated as a course id and expanded to
 * `<origin>/course/detail/<id>`; any other input goes through normalizeScysUrl.
 *
 * @param {unknown} input - Course id, URL, or path.
 * @returns {string} Absolute course URL string.
 * @throws {ArgumentError} When the input is empty or whitespace-only.
 */
export function toScysCourseUrl(input) {
    const candidate = String(input ?? '').trim();
    if (candidate.length === 0) {
        throw new ArgumentError('Course URL or course id is required');
    }
    const isBareCourseId = /^\d+$/.test(candidate);
    return isBareCourseId
        ? `${SCYS_ORIGIN}/course/detail/${candidate}`
        : normalizeScysUrl(candidate);
}
/**
 * Resolve an article reference to an absolute URL string.
 *
 * An input of eight or more digits is treated as a topic id and expanded to
 * `<origin>/articleDetail/xq_topic/<id>`. Any other input is normalized and
 * must have a path of the exact shape `/articleDetail/<entityType>/<topicId>`.
 *
 * @param {unknown} input - Topic id, URL, or path.
 * @returns {string} Absolute article URL string.
 * @throws {ArgumentError} When the input is empty or the path is not an
 *   article detail path.
 */
export function toScysArticleUrl(input) {
    const candidate = String(input ?? '').trim();
    if (candidate === '') {
        throw new ArgumentError('Article URL is required');
    }
    const isBareTopicId = /^\d{8,}$/.test(candidate);
    if (isBareTopicId) {
        return `${SCYS_ORIGIN}/articleDetail/xq_topic/${candidate}`;
    }
    const absoluteUrl = normalizeScysUrl(candidate);
    // Only the path is validated; query string and hash are carried through.
    const { pathname } = new URL(absoluteUrl);
    const isArticlePath = /^\/articleDetail\/([^/]+)\/([^/]+)$/.test(pathname);
    if (isArticlePath) {
        return absoluteUrl;
    }
    throw new ArgumentError(`Unsupported SCYS article URL: ${input}`, 'Use /articleDetail/<entityType>/<topicId> or pass a numeric topic id');
}
/**
 * Classify a SCYS URL by its path (and, for feeds, one query parameter).
 *
 * @param {unknown} input - URL, path, or bare-domain string; normalized first.
 * @returns {'course'|'opportunity'|'activity'|'article'|'feed'|'unknown'}
 *   The detected page type, or 'unknown' when no rule matches.
 */
export function detectScysPageType(input) {
    const { pathname, searchParams } = new URL(normalizeScysUrl(input));

    // Prefix rules are evaluated in order; the first match wins.
    const prefixRules = [
        ['/course/detail/', 'course'],
        ['/opportunity', 'opportunity'],
        ['/activity/landing/', 'activity'],
    ];
    for (const [prefix, pageType] of prefixRules) {
        if (pathname.startsWith(prefix)) {
            return pageType;
        }
    }

    if (/^\/articleDetail\/[^/]+\/[^/]+$/.test(pathname)) {
        return 'article';
    }

    // Query values are compared case-insensitively; a missing one reads as ''.
    const queryValue = (key) => (searchParams.get(key) || '').toLowerCase();

    const isProfilePosts = pathname.startsWith('/personal/') && queryValue('tab') === 'posts';
    if (isProfilePosts) {
        return 'feed';
    }

    const isHomepage = pathname === '/' || pathname === '';
    if (isHomepage && queryValue('filter') === 'essence') {
        return 'feed';
    }

    return 'unknown';
}
/**
 * Pull the numeric course id out of a course reference.
 *
 * @param {unknown} input - Course id, URL, or path; resolved via toScysCourseUrl.
 * @returns {string} The digits following `/course/detail/` in the path, or ''
 *   when the path contains no such segment.
 */
export function extractScysCourseId(input) {
    const { pathname } = new URL(toScysCourseUrl(input));
    const found = /\/course\/detail\/(\d+)/.exec(pathname);
    return found ? found[1] : '';
}
/**
 * Split an article reference into its entity type and topic id.
 *
 * @param {unknown} input - Topic id, URL, or path; resolved via toScysArticleUrl.
 * @returns {{entityType: string, topicId: string}} The two path segments after
 *   `/articleDetail/`; both are '' when the path does not have that shape.
 */
export function extractScysArticleMeta(input) {
    const { pathname } = new URL(toScysArticleUrl(input));
    const segments = pathname.match(/^\/articleDetail\/([^/]+)\/([^/]+)$/) ?? [];
    // Index 0 is the whole match; the defaults cover the no-match case.
    const [, entityType = '', topicId = ''] = segments;
    return { entityType, topicId };
}
/**
 * Convert a value to single-line text: null/undefined become '', every run of
 * whitespace collapses to one space, and surrounding whitespace is removed.
 *
 * @param {unknown} value - Value to stringify and tidy.
 * @returns {string} The cleaned text.
 */
export function cleanText(value) {
    const text = value == null ? '' : String(value);
    // Trimming first and collapsing second yields the same result as the
    // reverse order, because both steps operate on the same whitespace set.
    return text.trim().replace(/\s+/g, ' ');
}
/**
 * Reduce an interaction label to its numeric parts.
 *
 * Every number in the cleaned text (optionally with a decimal part and an
 * optional 万/亿 suffix) is kept and the numbers are joined with single
 * spaces. Text without any number is returned cleaned but otherwise unchanged.
 *
 * @param {unknown} raw - Raw interaction text.
 * @returns {string} Space-separated numbers, the cleaned text, or ''.
 */
export function extractInteractions(raw) {
    const cleaned = cleanText(raw);
    if (cleaned === '') {
        return '';
    }
    const numbers = cleaned.match(/[0-9]+(?:\.[0-9]+)?(?:万|亿)?/g) ?? [];
    return numbers.length > 0 ? numbers.join(' ') : cleaned;
}
/**
 * Resolve the URL to read for a given input. Currently this is exactly the
 * normalized form of the input.
 *
 * @param {unknown} input - URL, path, or bare-domain string.
 * @returns {string} Result of normalizeScysUrl for the input.
 */
export function inferScysReadUrl(input) {
    const readUrl = normalizeScysUrl(input);
    return readUrl;
}
/**
 * Build the homepage URL with the `filter=essence` query.
 *
 * @returns {string} `<origin>/?filter=essence`.
 */
export function buildScysHomeEssenceUrl() {
    const query = new URLSearchParams({ filter: 'essence' });
    return `${SCYS_ORIGIN}/?${query.toString()}`;
}
/**
 * Build the URL of the opportunity page.
 *
 * @returns {string} `<origin>/opportunity`.
 */
export function buildScysOpportunityUrl() {
    const opportunityPath = '/opportunity';
    return `${SCYS_ORIGIN}${opportunityPath}`;
}
68 changes: 68 additions & 0 deletions clis/scys/common.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import { describe, expect, it } from 'vitest';
import { cleanText, detectScysPageType, extractScysArticleMeta, extractInteractions, normalizeScysUrl, toScysArticleUrl, toScysCourseUrl, } from './common.js';
// normalizeScysUrl: bare-domain and root-relative inputs become absolute URLs.
describe('normalizeScysUrl', () => {
    it('normalizes bare domain and keeps path/query', () => {
        const normalized = normalizeScysUrl('scys.com/course/detail/142?chapterId=9445');
        expect(normalized).toBe('https://scys.com/course/detail/142?chapterId=9445');
    });

    it('normalizes root-relative paths', () => {
        const normalized = normalizeScysUrl('/opportunity');
        expect(normalized).toBe('https://scys.com/opportunity');
    });
});
// toScysCourseUrl: numeric ids expand to a detail URL; full URLs pass through.
describe('toScysCourseUrl', () => {
    it('accepts numeric course id', () => {
        const courseUrl = toScysCourseUrl('92');
        expect(courseUrl).toBe('https://scys.com/course/detail/92');
    });

    it('keeps full course detail URL unchanged', () => {
        const fullUrl = 'https://scys.com/course/detail/142?chapterId=9445';
        expect(toScysCourseUrl(fullUrl)).toBe(fullUrl);
    });
});
// toScysArticleUrl: numeric topic ids expand to an xq_topic detail URL; full
// article URLs pass through.
describe('toScysArticleUrl', () => {
    const articleUrl = 'https://scys.com/articleDetail/xq_topic/55188458224514554';

    it('accepts numeric topic id', () => {
        expect(toScysArticleUrl('55188458224514554')).toBe(articleUrl);
    });

    it('keeps full article detail url', () => {
        expect(toScysArticleUrl(articleUrl)).toBe(articleUrl);
    });
});
// extractScysArticleMeta: both path segments after /articleDetail/ are returned.
describe('extractScysArticleMeta', () => {
    it('extracts entity type and topic id from url', () => {
        const meta = extractScysArticleMeta('https://scys.com/articleDetail/xq_topic/55188458224514554');
        const expectedMeta = {
            entityType: 'xq_topic',
            topicId: '55188458224514554',
        };
        expect(meta).toEqual(expectedMeta);
    });
});
// detectScysPageType: one case per supported page type plus the fallback.
describe('detectScysPageType', () => {
    // [test title, input URL, expected page type] — registered in this order.
    const cases = [
        ['detects course detail with chapterId', 'https://scys.com/course/detail/142?chapterId=9445', 'course'],
        ['detects course detail without chapterId', 'https://scys.com/course/detail/92', 'course'],
        ['detects essence feed on homepage', 'https://scys.com/?filter=essence', 'feed'],
        ['detects profile posts feed', 'https://scys.com/personal/421122582111848?number=18563&tab=posts', 'feed'],
        ['detects opportunity page', 'https://scys.com/opportunity', 'opportunity'],
        ['detects activity landing page', 'https://scys.com/activity/landing/5505?tabIndex=1', 'activity'],
        ['detects article detail page', 'https://scys.com/articleDetail/xq_topic/55188458224514554', 'article'],
        ['returns unknown for unsupported pages', 'https://scys.com/help', 'unknown'],
    ];

    for (const [title, url, expectedType] of cases) {
        it(title, () => {
            expect(detectScysPageType(url)).toBe(expectedType);
        });
    }
});
// Text helpers: whitespace cleanup and numeric extraction from labels.
describe('text helpers', () => {
    it('cleanText collapses whitespace', () => {
        const cleaned = cleanText(' hello\n\nworld ');
        expect(cleaned).toBe('hello world');
    });

    it('extractInteractions keeps compact numeric text', () => {
        const numbers = extractInteractions('赞 1.2万 评论 35');
        expect(numbers).toBe('1.2万 35');
    });
});
Loading
Loading