Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
.env
.env.*
*.env
.DS_Store
node_modules/
data/
Expand Down
4 changes: 2 additions & 2 deletions scripts/create-source.ts
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ const apiBody = JSON.stringify(parsedPayload);

// Auto-provision API key if missing, resolve base URL
import { ensureKey, getBaseUrl } from "./ensure-key";
import { openUrlInBrowser } from "./security";
const baseUrl = getBaseUrl();
const apiKey = await ensureKey();
if (!apiKey) {
Expand Down Expand Up @@ -120,8 +121,7 @@ try {
}

// Open the live page in the user's browser automatically
const { $ } = await import("bun");
await $`open ${result.source_url}`.quiet().nothrow();
await openUrlInBrowser(result.source_url, new URL(baseUrl).hostname);

result.run_id = runId;
console.log(JSON.stringify(result));
Expand Down
14 changes: 12 additions & 2 deletions scripts/diarize.ts
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,14 @@ function extractVideoId(url: string): string | null {
return match?.[1] ?? null;
}

/**
 * Prepare a URL for use as a positional argument of an external command.
 *
 * Surrounding whitespace is stripped. A value that ends up empty, or that
 * begins with "-" (and could therefore be parsed as an option by a CLI), is
 * rejected.
 *
 * @param url - Raw URL text.
 * @returns The trimmed URL.
 * @throws Error when the trimmed value is empty or starts with "-".
 */
function sanitizeCliUrl(url: string): string {
  const candidate = url.trim();
  const isEmpty = candidate.length === 0;
  const looksLikeFlag = candidate.charAt(0) === "-";
  if (isEmpty || looksLikeFlag) {
    throw new Error("Invalid URL for command execution.");
  }
  return candidate;
}

// ---------------------------------------------------------------------------
// Timestamp math
// ---------------------------------------------------------------------------
Expand Down Expand Up @@ -92,9 +100,10 @@ interface YoutubeMeta {

/** Fetch YouTube video metadata via yt-dlp --dump-json */
async function fetchYoutubeMeta(url: string): Promise<YoutubeMeta> {
const safeUrl = sanitizeCliUrl(url);
const empty: YoutubeMeta = { publishedAt: null, title: null, channel: null, channelHandle: null, channelUrl: null };
try {
const result = await $`yt-dlp --dump-json --skip-download ${url}`.quiet().nothrow();
const result = await $`yt-dlp --dump-json --skip-download -- ${safeUrl}`.quiet().nothrow();
if (result.exitCode !== 0) return empty;
const meta = JSON.parse(result.stdout.toString());

Expand Down Expand Up @@ -123,12 +132,13 @@ async function fetchYoutubeMeta(url: string): Promise<YoutubeMeta> {
}

async function downloadAudio(url: string): Promise<string> {
const safeUrl = sanitizeCliUrl(url);
const videoId = extractVideoId(url) || "audio";
const outPath = join(tmpdir(), `diarize-${videoId}.mp3`);

const { streamLog } = await import("./stream-log");
streamLog("Downloading audio...");
const result = await $`yt-dlp --extract-audio --audio-format mp3 --audio-quality 5 -o ${outPath} ${url}`
const result = await $`yt-dlp --extract-audio --audio-format mp3 --audio-quality 5 -o ${outPath} -- ${safeUrl}`
.quiet()
.nothrow();

Expand Down
14 changes: 9 additions & 5 deletions scripts/ensure-key.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import { writeFileSync, existsSync, appendFileSync } from "fs";
import { getEnvSearchPaths, getPreferredEnvWritePath, readEnvValue } from "./runtime-paths";
import { normalizeTrustedBaseUrl } from "./security";

/** Read a key from process.env or the nearest user/project .env context. */
export function loadKey(key: string): string | undefined {
Expand All @@ -18,7 +19,10 @@ export function loadKey(key: string): string | undefined {

/** Resolve the base URL for paste.trade API. */
export function getBaseUrl(): string {
return loadKey("PASTE_TRADE_URL") || loadKey("BOARD_URL") || loadKey("BELIEF_BOARD_URL") || "https://paste.trade";
const configured = loadKey("PASTE_TRADE_URL") || loadKey("BOARD_URL") || loadKey("BELIEF_BOARD_URL");
const { baseUrl, trusted, reason } = normalizeTrustedBaseUrl(configured);
if (!trusted) throw new Error(reason ?? "Invalid base URL configuration.");
return baseUrl;
}

/**
Expand All @@ -30,11 +34,11 @@ export async function ensureKey(): Promise<string | null> {
const existing = loadKey("PASTE_TRADE_KEY");
if (existing) return existing;

// No key found — auto-provision
const baseUrl = getBaseUrl();
console.error(`[paste.trade] No API key found. Creating your identity...`);

try {
// No key found — auto-provision
const baseUrl = getBaseUrl();
console.error(`[paste.trade] No API key found. Creating your identity...`);

const res = await fetch(`${baseUrl}/api/keys`, {
method: "POST",
headers: { "Content-Type": "application/json" },
Expand Down
51 changes: 34 additions & 17 deletions scripts/extract.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import { mkdirSync } from "fs";
import { tmpdir } from "os";
import { join } from "path";
import { getRuntimeSourceDir, readEnvValue } from "./runtime-paths";
import { fetchWithSafeRedirects, parseSafeExternalUrl } from "./security";

// ---------------------------------------------------------------------------
// X API tokens (optional)
Expand Down Expand Up @@ -256,6 +257,14 @@ function extractVideoId(url: string): string | null {
return m?.[1] ?? null;
}

/**
 * Trim a URL and refuse values that are unusable as a command argument.
 *
 * @param url - Raw URL text.
 * @returns The URL with leading and trailing whitespace removed.
 * @throws Error when nothing remains after trimming, or when the trimmed
 *   value starts with "-" (which a CLI could interpret as an option).
 */
function sanitizeCliUrl(url: string): string {
  const cleaned = url.trim();
  // Accept only non-empty values that do not start with a dash.
  if (cleaned !== "" && cleaned[0] !== "-") {
    return cleaned;
  }
  throw new Error("Invalid URL for command execution.");
}

// ---------------------------------------------------------------------------
// YouTube transcript via yt-dlp
// ---------------------------------------------------------------------------
Expand Down Expand Up @@ -288,9 +297,10 @@ interface YoutubeMeta {

/** Fetch YouTube video metadata via yt-dlp --dump-json */
async function fetchYoutubeMeta(url: string): Promise<YoutubeMeta> {
const safeUrl = sanitizeCliUrl(url);
const empty: YoutubeMeta = { publishedAt: null, title: null, channel: null, channelHandle: null, channelUrl: null, description: null, durationSeconds: null };
try {
const result = await $`yt-dlp --dump-json --skip-download ${url}`.quiet().nothrow();
const result = await $`yt-dlp --dump-json --skip-download -- ${safeUrl}`.quiet().nothrow();
if (result.exitCode !== 0) return empty;
const meta = JSON.parse(result.stdout.toString());

Expand Down Expand Up @@ -333,6 +343,7 @@ async function streamStatus(message: string): Promise<void> {
}

async function extractYoutube(url: string): Promise<string> {
const safeUrl = sanitizeCliUrl(url);
const videoId = extractVideoId(url);
if (!videoId) throw new Error("Could not extract video ID from URL");

Expand All @@ -353,7 +364,7 @@ async function extractYoutube(url: string): Promise<string> {
const capFile = join(tmpdir(), `yt-transcript-${videoId}-${videoId}.en.json3`);

// Step 2: attempt caption download — quiet captures stderr for diagnosis
const result = await $`yt-dlp --write-auto-sub --write-sub --skip-download --sub-lang en --sub-format json3 -o ${outTemplate} ${url}`
const result = await $`yt-dlp --write-auto-sub --write-sub --skip-download --sub-lang en --sub-format json3 -o ${outTemplate} -- ${safeUrl}`
.quiet()
.nothrow();

Expand Down Expand Up @@ -413,7 +424,7 @@ async function extractYoutube(url: string): Promise<string> {
}

// Step 3: fetch metadata (parallel-safe, runs while we parse captions)
const metaPromise = fetchYoutubeMeta(url);
const metaPromise = fetchYoutubeMeta(safeUrl);

// Step 4: parse the caption file
const data = (await Bun.file(capFile).json()) as {
Expand Down Expand Up @@ -1189,20 +1200,21 @@ function extractArticleMetadataFromHtml(
}

async function fetchArticleMetadata(url: string): Promise<ArticleMetadata | null> {
const safeUrl = parseSafeExternalUrl(url)?.href;
if (!safeUrl) return null;
try {
const res = await fetch(url, {
const res = await fetchWithSafeRedirects(safeUrl, {
headers: {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
Accept: "text/html,application/xhtml+xml,*/*",
},
redirect: "follow",
});
if (!res.ok) return null;
const body = await res.text();
if (!body || body.length < 50) return null;
const looksLikeHtml = /<html|<meta|<title|<script/i.test(body);
if (!looksLikeHtml) return null;
return extractArticleMetadataFromHtml(body, url);
return extractArticleMetadataFromHtml(body, safeUrl);
} catch {
return null;
}
Expand Down Expand Up @@ -1242,20 +1254,24 @@ function buildArticlePayload(
// ---------------------------------------------------------------------------

async function extractText(url: string): Promise<string> {
const safeUrl = parseSafeExternalUrl(url)?.href;
if (!safeUrl) {
return JSON.stringify({ source: "text", url, error: "Blocked unsafe or invalid URL." });
}
streamStatus("Extracting article...");
const metadataPromise = fetchArticleMetadata(url);
const metadataPromise = fetchArticleMetadata(safeUrl);

// Try markdown.new first (clean article extraction, handles JS-rendered pages)
try {
const mdRes = await fetch(`https://markdown.new/${url}`, {
const mdRes = await fetch(`https://markdown.new/${encodeURIComponent(safeUrl)}`, {
headers: { Accept: "text/markdown" },
});
if (mdRes.ok) {
const md = (await mdRes.text()).trim();
if (md.length > 100) {
const images = extractImagesFromMarkdown(md);
const metadata = await metadataPromise;
const payload = buildArticlePayload("markdown.new", url, md, images, metadata);
const payload = buildArticlePayload("markdown.new", safeUrl, md, images, metadata);
const parsed = JSON.parse(payload) as { word_count?: number };
console.error(` markdown.new: ${parsed.word_count ?? 0} words, ${images.length} images extracted`);
return payload;
Expand All @@ -1266,23 +1282,22 @@ async function extractText(url: string): Promise<string> {
}

// Fallback: raw fetch + regex strip
const res = await fetch(url, {
const res = await fetchWithSafeRedirects(safeUrl, {
headers: {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
Accept: "text/html,application/xhtml+xml",
},
redirect: "follow",
});

if (!res.ok) {
return JSON.stringify({ source: "text", url, error: `HTTP ${res.status}` });
return JSON.stringify({ source: "text", url: safeUrl, error: `HTTP ${res.status}` });
}

const html = await res.text();

// Extract images before stripping HTML
const images = extractImagesFromHtml(html, url);
const htmlMetadata = extractArticleMetadataFromHtml(html, url, undefined, images);
const images = extractImagesFromHtml(html, safeUrl);
const htmlMetadata = extractArticleMetadataFromHtml(html, safeUrl, undefined, images);

const text = html
.replace(/<script[\s\S]*?<\/script>/gi, "")
Expand All @@ -1293,7 +1308,7 @@ async function extractText(url: string): Promise<string> {
.replace(/\s+/g, " ")
.trim();

const payload = buildArticlePayload("text", url, text, images, htmlMetadata);
const payload = buildArticlePayload("text", safeUrl, text, images, htmlMetadata);
const parsed = JSON.parse(payload) as { word_count?: number };
console.error(` raw fetch: ${parsed.word_count ?? 0} words, ${images.length} images extracted`);
return payload;
Expand Down Expand Up @@ -1342,10 +1357,12 @@ async function main() {
const img = parsed.images[i];
const imgUrl = typeof img === "string" ? img : img?.url;
if (!imgUrl) continue;
const safeImgUrl = parseSafeExternalUrl(imgUrl)?.href;
if (!safeImgUrl) continue;
try {
const imgRes = await fetch(imgUrl);
const imgRes = await fetchWithSafeRedirects(safeImgUrl);
if (imgRes.ok) {
const ext = imgUrl.match(/\.(jpg|jpeg|png|gif|webp)/i)?.[1] ?? "jpg";
const ext = safeImgUrl.match(/\.(jpg|jpeg|png|gif|webp)/i)?.[1] ?? "jpg";
const imgPath = join(dir, `source-${hash}-img${i}.${ext}`);
await Bun.write(imgPath, await imgRes.arrayBuffer());
imageFiles.push(imgPath);
Expand Down
126 changes: 126 additions & 0 deletions scripts/security.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
import { spawn } from "child_process";
import { isIP } from "net";
import { isAbsolute, relative, resolve, sep } from "path";

const DEFAULT_BASE_URL = "https://paste.trade";

/**
 * Decide whether a hostname is an IP literal that outbound requests must not
 * target.
 *
 * - Non-IP hostnames return `false`; name-based rules are handled elsewhere.
 * - IPv6 literals are accepted both bare ("::1") and in the bracketed form
 *   that `URL.hostname` produces ("[::1]"). Every IPv6 literal is treated as
 *   forbidden (fail closed): embedded-IPv4 forms are not classified here, so
 *   no IPv6 literal is assumed safe.
 * - IPv4 literals are forbidden when they fall in the "this network"
 *   (0.0.0.0/8), private (10/8, 172.16/12, 192.168/16), shared address space
 *   (100.64/10), loopback (127/8) or link-local (169.254/16) ranges.
 *
 * @param hostname - Hostname as found in a parsed URL.
 * @returns `true` when the host is an IP literal that must be blocked.
 */
function isPrivateIp(hostname: string): boolean {
  // URL.hostname wraps IPv6 literals in square brackets, which net.isIP does
  // not recognise; unwrap first so they cannot slip through as "not an IP".
  const host =
    hostname.startsWith("[") && hostname.endsWith("]") ? hostname.slice(1, -1) : hostname;

  const family = isIP(host);
  if (family === 0) return false;

  // Fail closed on all IPv6 literals (loopback, link-local, unique-local,
  // IPv4-mapped, and anything else).
  if (family === 6) return true;

  // isIP() === 4 guarantees four numeric octets; the defaults only satisfy
  // strict index checking and map to a blocked value.
  const [a = 0, b = 0] = host.split(".").map(Number);
  if (a === 0 || a === 10 || a === 127) return true;
  if (a === 100 && b >= 64 && b <= 127) return true;
  if (a === 169 && b === 254) return true;
  if (a === 172 && b >= 16 && b <= 31) return true;
  if (a === 192 && b === 168) return true;
  return false;
}

/**
 * Report whether a hostname points at a local or internal destination.
 *
 * A host is forbidden when it is `localhost`, ends in `.localhost`, `.local`
 * or `.internal`, or is rejected by `isPrivateIp`.
 *
 * @param hostname - Hostname as found in a parsed URL.
 * @returns `true` when the host must not be contacted.
 */
function hasForbiddenHost(hostname: string): boolean {
  // URL.hostname keeps a trailing root dot ("localhost.", "db.internal."),
  // which names the same host but would defeat the equality and suffix checks
  // below; strip it before comparing.
  const h = hostname.toLowerCase().replace(/\.+$/, "");
  return (
    h === "localhost" ||
    h.endsWith(".localhost") ||
    h.endsWith(".local") ||
    h.endsWith(".internal") ||
    isPrivateIp(h)
  );
}

/**
 * Parse a URL string and return it only if it is acceptable as an external
 * fetch target.
 *
 * The URL must parse, use the `https:` scheme, and have a hostname that
 * `hasForbiddenHost` does not reject.
 *
 * @param raw - URL text to validate.
 * @returns The parsed `URL`, or `null` when the input is invalid or rejected.
 */
export function parseSafeExternalUrl(raw: string): URL | null {
  try {
    const candidate = new URL(raw);
    const isHttps = candidate.protocol === "https:";
    return isHttps && !hasForbiddenHost(candidate.hostname) ? candidate : null;
  } catch {
    // Unparseable input is treated the same as a rejected URL.
    return null;
  }
}

/**
 * Validate a configured API base URL and reduce it to its origin.
 *
 * An absent or blank value falls back to `DEFAULT_BASE_URL`. A candidate that
 * parses and uses `http:` or `https:` is reported as trusted, with `baseUrl`
 * set to the URL's origin. Anything else is reported as untrusted, with the
 * candidate echoed back in `baseUrl` and an explanatory `reason`.
 *
 * @param configured - Optional base URL taken from configuration.
 * @returns The normalized base URL, a trust flag, and a reason when rejected.
 */
export function normalizeTrustedBaseUrl(
  configured?: string,
): { baseUrl: string; trusted: boolean; reason?: string } {
  const trimmed = (configured ?? "").trim();
  const candidate = trimmed || DEFAULT_BASE_URL;

  // Shared shape for both rejection paths.
  const reject = (reason: string) => ({ baseUrl: candidate, trusted: false, reason });

  let parsed: URL;
  try {
    parsed = new URL(candidate);
  } catch {
    return reject(`Invalid base URL "${candidate}" blocked.`);
  }

  switch (parsed.protocol) {
    case "https:":
    case "http:":
      return { baseUrl: parsed.origin, trusted: true };
    default:
      return reject(`Invalid base URL "${candidate}" blocked. Only http/https are allowed.`);
  }
}

/**
 * Open an HTTPS URL with the platform's URL handler.
 *
 * Nothing is launched when the URL cannot be parsed, is not `https:`, or
 * (when `allowedHost` is given) its hostname differs from `allowedHost`.
 * Otherwise a child process is spawned without a shell: `open` on macOS,
 * `xdg-open` on Linux, and `rundll32 url.dll,FileProtocolHandler` on every
 * other platform.
 *
 * @param rawUrl - URL to open.
 * @param allowedHost - Optional hostname the URL must match exactly.
 * @returns `true` when the launcher exits with code 0, `false` when the URL
 *   is rejected, the process fails to start, or it exits non-zero.
 */
export async function openUrlInBrowser(rawUrl: string, allowedHost?: string): Promise<boolean> {
  let target: URL;
  try {
    target = new URL(rawUrl);
  } catch {
    return false;
  }

  const hostMismatch = Boolean(allowedHost) && target.hostname !== allowedHost;
  if (target.protocol !== "https:" || hostMismatch) return false;

  let bin: string;
  let args: string[];
  switch (process.platform) {
    case "darwin":
      bin = "open";
      args = [target.href];
      break;
    case "linux":
      bin = "xdg-open";
      args = [target.href];
      break;
    default:
      bin = "rundll32";
      args = ["url.dll,FileProtocolHandler", target.href];
  }

  return new Promise<boolean>((settle) => {
    // shell: false keeps the URL a single argv entry; it is never shell-parsed.
    const launcher = spawn(bin, args, { stdio: "ignore", shell: false });
    launcher.on("error", () => settle(false));
    launcher.on("exit", (code) => settle(code === 0));
  });
}

/**
 * Resolve a path and confirm it lies within an allowed directory.
 *
 * Both arguments are resolved to absolute paths before comparison. The
 * directory itself counts as inside.
 *
 * @param filePath - Path to check.
 * @param allowedDir - Directory the path must stay within.
 * @returns The resolved absolute path when it is `allowedDir` or a descendant
 *   of it, otherwise `null`.
 */
export function ensurePathInsideDir(filePath: string, allowedDir: string): string | null {
  const resolvedFile = resolve(filePath);
  const resolvedDir = resolve(allowedDir);
  const rel = relative(resolvedDir, resolvedFile);

  // Same location as the allowed directory.
  if (rel === "") return resolvedFile;

  // path.relative returns an absolute path when no relative route exists
  // (e.g. a different drive on Windows); that is necessarily outside.
  if (isAbsolute(rel)) return null;

  // Only a leading ".." path *segment* escapes the directory; a name that
  // merely begins with two dots ("..hidden") is an ordinary child.
  const escapes = rel === ".." || rel.startsWith(`..${sep}`);
  return escapes ? null : resolvedFile;
}

/**
 * Fetch a URL while validating the initial target and every redirect hop.
 *
 * Redirects are followed manually so that each `Location` is resolved against
 * the current URL and passed through `parseSafeExternalUrl` before it is
 * requested. Any `redirect` option in `init` is overridden with `"manual"`.
 *
 * NOTE(review): validation is based on the URL's hostname text only; what a
 * hostname resolves to via DNS is not checked here.
 *
 * @param inputUrl - Initial URL to request.
 * @param init - Optional fetch options, reused for every hop.
 * @param maxRedirects - Maximum number of redirects to follow (default 5).
 * @returns The first response that is not a redirect, or a 3xx response that
 *   carries no `Location` header.
 * @throws Error when the initial URL or a redirect target is rejected, or
 *   when more than `maxRedirects` redirects are encountered.
 */
export async function fetchWithSafeRedirects(
  inputUrl: string,
  init?: RequestInit,
  maxRedirects = 5,
): Promise<Response> {
  let current = parseSafeExternalUrl(inputUrl);
  if (!current) throw new Error("Blocked unsafe or invalid URL.");

  for (let i = 0; i <= maxRedirects; i++) {
    const res = await fetch(current.href, { ...init, redirect: "manual" });
    if (res.status < 300 || res.status >= 400) return res;

    const location = res.headers.get("location");
    if (!location) return res;

    // This response is being discarded; cancel its body so the underlying
    // connection is not held open by an unread stream.
    try {
      await res.body?.cancel();
    } catch {
      // Best effort: a failed cancel must not mask the redirect handling.
    }

    // A malformed Location makes `new URL` throw; treat it like any other
    // unacceptable redirect target instead of leaking a raw TypeError.
    let next: URL | null;
    try {
      next = parseSafeExternalUrl(new URL(location, current).href);
    } catch {
      next = null;
    }
    if (!next) throw new Error("Blocked unsafe redirect target.");
    current = next;
  }
  throw new Error("Too many redirects.");
}
Loading