diff --git a/.github/workflows/llmstxt.yml b/.github/workflows/llmstxt.yml index 9e437b782..55fc3091e 100644 --- a/.github/workflows/llmstxt.yml +++ b/.github/workflows/llmstxt.yml @@ -1,9 +1,9 @@ name: Generate LLMs.txt on: - schedule: - - cron: "0 0 * * 0" # Run at 00:00 every Sunday workflow_dispatch: + pull_request: + types: [opened, synchronize, reopened] permissions: contents: write @@ -27,7 +27,8 @@ jobs: - name: Checkout code uses: actions/checkout@v4 with: - ref: main + ref: ${{ github.event.pull_request.head.ref || github.ref }} + token: ${{ secrets.DOCS_PUBLISHABLE_GH_TOKEN }} - name: Install dependencies run: npm install -g pnpm @@ -40,8 +41,26 @@ jobs: env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - # commit the changes and make a PR (branch protection) - - name: Create Pull Request + - name: Check for changes + id: check-changes + run: | + if [ -n "$(git status --porcelain)" ]; then + echo "has_changes=true" >> $GITHUB_OUTPUT + else + echo "has_changes=false" >> $GITHUB_OUTPUT + fi + + - name: Commit changes to PR + if: steps.check-changes.outputs.has_changes == 'true' && github.event_name == 'pull_request' + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add public/llms.txt + git commit -m "šŸ¤– Regenerate LLMs.txt" + git push + + - name: Create Pull Request (for scheduled/manual runs) + if: steps.check-changes.outputs.has_changes == 'true' && github.event_name != 'pull_request' id: cpr uses: peter-evans/create-pull-request@v7 with: @@ -55,6 +74,7 @@ jobs: torresmateo - name: Enable Pull Request Automerge + if: steps.check-changes.outputs.has_changes == 'true' && github.event_name != 'pull_request' run: gh pr merge --squash --auto ${{ steps.cpr.outputs.pull-request-number }} env: GH_TOKEN: ${{ secrets.DOCS_PUBLISHABLE_GH_TOKEN }} diff --git a/public/llms.txt b/public/llms.txt index 181c3d70c..82123458f 100644 --- a/public/llms.txt +++ b/public/llms.txt @@ -1,3 +1,5 
@@ + + # Arcade > Arcade is an AI Tool-calling Platform. For the first time, AI can securely act on behalf of users through Arcade's authenticated integrations, or "tools" in AI lingo. Connect AI to email, files, calendars, and APIs to build assistants that don't just chat - they get work done. Start building in minutes with our pre-built connectors or custom SDK. @@ -20,18 +22,13 @@ Arcade enables your AI agent to securely take real-world actions through user-sp - [Arcade API Reference](https://docs.arcade.dev/en/references/api.md): The Arcade API Reference documentation provides users with essential information about the Arcade API, including the base URL for API requests and the requirement for a valid account. It also features an interactive Swagger UI for exploring the API's functionalities. Users can learn how to effectively - [Arcade MCP (MCP Server SDK) - Python Overview](https://docs.arcade.dev/en/references/mcp/python/overview.md): This documentation page provides an overview of the Arcade MCP (MCP Server SDK) for Python, detailing its minimal API designed for programmatically building MCP servers. Users will learn how to configure server settings, manage tools, and utilize the `MCPApp -- [Claude Desktop](https://docs.arcade.dev/en/references/mcp/python/clients/claude-desktop.md): This documentation page provides a comprehensive guide for setting up and configuring the Arcade MCP server with Claude Desktop, including prerequisites, quick setup instructions, and advanced configuration options. Users will learn how to manage multiple servers, troubleshoot common issues, and implement best practices for -- [Cursor IDE](https://docs.arcade.dev/en/references/mcp/python/clients/cursor.md): This documentation page provides a comprehensive guide for integrating Arcade MCP servers with Cursor IDE, an AI-powered development environment. 
Users will learn how to configure their MCP servers, set up development workflows, utilize integration features, and troubleshoot common issues to enhance their coding experience - [Errors](https://docs.arcade.dev/en/references/mcp/python/errors.md): This documentation page provides an overview of domain-specific error types associated with the MCP server and its components, detailing the exception hierarchy for improved error handling and debugging. Users can learn about various exceptions, such as `MCPError`, `ServerError`, and -- [MCP Inspector](https://docs.arcade.dev/en/references/mcp/python/clients/mcp-inspector.md): The MCP Inspector documentation provides users with a comprehensive guide to install, configure, and utilize the MCP Inspector tool for debugging and testing Arcade MCP servers. It outlines features such as interactive testing, protocol monitoring, and resource browsing, along with advanced usage tips for - [Middleware](https://docs.arcade.dev/en/references/mcp/python/middleware.md): This documentation page provides an overview of the Middleware component in the Arcade MCP Server SDK for Python, detailing how users can intercept and modify MCP requests and responses through a series of handler methods. It includes information on base classes, built-in middleware options like logging - [Server](https://docs.arcade.dev/en/references/mcp/python/server.md): This documentation page provides a reference for the `MCPServer` class in the Arcade MCP Python library, detailing its purpose as a low-level server for hosting Arcade tools over MCP. Users can learn about its features, including middleware support, context injection, - [Settings](https://docs.arcade.dev/en/references/mcp/python/settings.md): This documentation page provides guidance on configuring global and environment-driven settings for the Arcade MCP Server, detailing the structure and usage of various settings containers such as MCPSettings, ServerSettings, and others. 
Users will learn how to create settings from environment variables, convert -- [Sharing your MCP server](https://docs.arcade.dev/en/references/mcp/python/sharing.md): This documentation page guides users on how to share their MCP server with others by creating a secure tunnel and registering it with Arcade. It outlines the steps for running the server, establishing a public URL using various tunneling options, and registering the server in the - [Telemetry](https://docs.arcade.dev/en/references/mcp/telemetry.md): This documentation page provides an overview of the telemetry data collected by the `arcade-mcp` framework, detailing what information is tracked, the purpose of this data collection, and how users can opt-out if desired. It emphasizes the optional nature of participation - [Transport Modes](https://docs.arcade.dev/en/references/mcp/python/transports.md): This documentation page provides an overview of the different transport modes (stdio and HTTP) available for MCP servers, detailing their characteristics, use cases, and configuration options. Users will learn how to implement and choose the appropriate transport mode based on their application needs, - [Types](https://docs.arcade.dev/en/references/mcp/python/types.md): This documentation page provides an overview of core Pydantic models and enums used in the MCP protocol, specifically detailing the `CallToolResult` and `SessionMessage` types. It helps users understand how to construct JSON-RPC requests and responses, as -- [Visual Studio Code](https://docs.arcade.dev/en/references/mcp/python/clients/visual-studio-code.md): This documentation page provides a comprehensive guide for integrating Arcade MCP servers with Visual Studio Code (VSCode) through various methods, including terminal integration, task runners, and launch configurations. 
It outlines prerequisites, development workflows, debugging tips, and best practices to enhance ## Arcade Cli @@ -42,7 +39,7 @@ Arcade enables your AI agent to securely take real-world actions through user-sp - [Authorized Tool Calling](https://docs.arcade.dev/en/home/auth/auth-tool-calling.md): The "Authorized Tool Calling" documentation provides a comprehensive guide for developers on how to implement an authorization system using Arcade, enabling AI agents to securely access external services on behalf of users. It covers the steps for initializing the client, authorizing tools, checking - [Checking Tool Authorization Status](https://docs.arcade.dev/en/home/auth/tool-auth-status.md): This documentation page provides a comprehensive guide on how to check the authorization status of tools before execution, helping users understand the necessary permissions and tool availability. It includes instructions for initializing the client in Python or JavaScript, checking the authorization status for all tools or - [Direct Third-Party API Call](https://docs.arcade.dev/en/home/auth/call-third-party-apis-directly.md): This documentation page provides a comprehensive guide on how to retrieve an authorization token using Arcade to directly call third-party APIs, exemplified by the Gmail API. Users will learn to manage user authentication flows, handle authorization requests, and utilize tokens for accessing external services -- [How Arcade helps with Agent Authorization](https://docs.arcade.dev/en/home/auth/how-arcade-helps.md): This documentation page explains how Arcade facilitates agent authorization for applications that require access to sensitive user data and services. 
It details the challenges of authentication and authorization, and how Arcade's system supports OAuth 2.0, API keys, and user tokens, enabling +- [How Arcade helps with Agent Authorization](https://docs.arcade.dev/en/home/auth/how-arcade-helps.md): This documentation page explains how Arcade facilitates agent authorization for AI applications, enabling them to securely access and act on user-specific data from external services like Gmail and Google Calendar. It details the challenges of authentication and outlines how Arcade's authorization system, which supports OAuth - [How Arcade helps with Agent Authorization](https://docs.arcade.dev/en/home/auth/how-arcade-helps.md): This documentation page explains how Arcade facilitates agent authorization for AI applications, enabling them to securely access and act on user-specific data from external services like Gmail and Google Calendar. It details the challenges of authentication and outlines how Arcade's authorization system, which supports OAuth - [Secure and Brand the Auth Flow in Production](https://docs.arcade.dev/en/home/auth/secure-auth-production.md): This documentation page provides guidance on securing and branding authentication flows in production using Arcade.dev. It outlines two methods for user verification: utilizing the default Arcade user verifier for development and implementing a custom user verifier for production applications. Users will learn how to configure these @@ -162,7 +159,7 @@ Arcade enables your AI agent to securely take real-world actions through user-sp ## Mcp Gateways -- [MCP Gateways](https://docs.arcade.dev/en/home/mcp-gateways.md): This documentation page provides a comprehensive guide on configuring and using MCP Gateways, which facilitate the connection of multiple MCP Servers to a single agent, application, or IDE. 
Users can learn how to create and customize MCP Gateways by selecting tools from different servers +- [MCP Gateways](https://docs.arcade.dev/en/home/mcp-gateways.md): This documentation page provides a comprehensive guide on configuring and using MCP Gateways, which facilitate the connection of multiple MCP Servers to a single agent, application, or IDE. Users will learn how to create and manage MCP Gateways, select tools from various MCP ## MCP Servers diff --git a/scripts/generate-llmstxt.ts b/scripts/generate-llmstxt.ts index 8f98ddb42..3fb91b471 100644 --- a/scripts/generate-llmstxt.ts +++ b/scripts/generate-llmstxt.ts @@ -1,3 +1,4 @@ +import { execSync } from "node:child_process"; import fs from "node:fs/promises"; import path from "node:path"; import glob from "fast-glob"; @@ -19,6 +20,11 @@ type Section = { }>; }; +type LlmsTxtMetadata = { + gitSha: string; + generationDate: string; +}; + const BASE_URL = "https://docs.arcade.dev"; const OUTPUT_PATH = path.join(process.cwd(), "public", "llms.txt"); @@ -29,16 +35,123 @@ const MDX_SUFFIX_REGEX = /\.mdx$/; const TITLE_H1_REGEX = /^#\s+(.+)$/m; const EN_LOCALE_PREFIX_REGEX = /^en\//; const MD_EXTENSION_REGEX = /\.md$/; +const METADATA_REGEX = +  /^<!-- llms-txt-metadata: sha=([0-9a-f]+|unknown) date=(\S+) -->/; +const LINK_REGEX = /- \[([^\]]+)\]\(([^)]+)\):\s*(.+)$/gm; // Constants for content processing const MAX_CONTENT_LENGTH = 4000; const BATCH_DELAY_MS = 1000; +const SHA_SHORT_LENGTH = 7; // Initialize OpenAI client const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY, }); +/** + * Gets the current git SHA + */ +function getCurrentGitSha(): string { + try { + return execSync("git rev-parse HEAD", { encoding: "utf-8" }).trim(); + } catch (_error) { + console.error( + pc.red("āœ— Could not get git SHA.
Make sure you're in a git repository.") + ); + throw new Error("Failed to get git SHA"); + } +} + +/** + * Parses metadata from existing llms.txt file + */ +async function parseLlmsTxtMetadata(): Promise<LlmsTxtMetadata | null> { + try { + const content = await fs.readFile(OUTPUT_PATH, "utf-8"); + const metadataMatch = content.match(METADATA_REGEX); + if (metadataMatch) { + return { + gitSha: metadataMatch[1], + generationDate: metadataMatch[2], + }; + } + } catch (_error) { + // File doesn't exist or can't be read - that's okay + } + return null; +} + +/** + * Gets changed files since the last git SHA + */ +function getChangedFilesSince(lastSha: string): Set<string> { + try { + // Get files that were added, modified, or deleted + const added = execSync( + `git diff --name-only --diff-filter=A ${lastSha} HEAD`, + { + encoding: "utf-8", + } + ) + .trim() + .split("\n") + .filter((line) => line.length > 0); + + const modified = execSync( + `git diff --name-only --diff-filter=M ${lastSha} HEAD`, + { encoding: "utf-8" } + ) + .trim() + .split("\n") + .filter((line) => line.length > 0); + + const deleted = execSync( + `git diff --name-only --diff-filter=D ${lastSha} HEAD`, + { + encoding: "utf-8", + } + ) + .trim() + .split("\n") + .filter((line) => line.length > 0); + + const allChanged = new Set([...added, ...modified, ...deleted]); + return allChanged; + } catch (_error) { + console.warn( + pc.yellow( + `⚠ Could not get changed files since ${lastSha}, processing all files` + ) + ); + return new Set(); + } +} + +/** + * Extracts existing page summaries from llms.txt + */ +async function extractExistingSummaries(): Promise< + Map<string, { title: string; description: string }> +> { + const summaries = new Map<string, { title: string; description: string }>(); + try { + const content = await fs.readFile(OUTPUT_PATH, "utf-8"); + // Match markdown links with descriptions: - [title](url): description + let match: RegExpExecArray | null; + // biome-ignore lint/suspicious/noAssignInExpressions: needed for regex.exec loop + while ((match = LINK_REGEX.exec(content)) !== null) { + const title =
match[1]; + const url = match[2]; + const description = match[3].trim(); + summaries.set(url, { title, description }); + } + } catch (_error) { + // File doesn't exist or can't be read - that's okay + } + return summaries; +} + /** * Discovers all MDX pages in the documentation */ @@ -217,9 +330,18 @@ function formatSectionName(segment: string): string { /** * Generates the llms.txt file content */ -function generateLlmsTxt(sections: Section[]): string { +function generateLlmsTxt( + sections: Section[], + metadata: LlmsTxtMetadata +): string { const lines: string[] = []; + // Metadata comment (hidden in markdown but parseable) + lines.push( + `<!-- llms-txt-metadata: sha=${metadata.gitSha} date=${metadata.generationDate} -->` + ); + lines.push(""); + // Header lines.push("# Arcade"); lines.push(""); @@ -261,6 +383,134 @@ return lines.join("\n"); } +/** + * Determines which pages need summarization based on changes + */ +function determinePagesToSummarize( + pages: PageMetadata[], + previousMetadata: LlmsTxtMetadata | null, + existingSummaries: Map<string, { title: string; description: string }> +): { + pagesToSummarize: PageMetadata[]; + pagesToKeep: Array<PageMetadata & { title: string; description: string }>; + hasChanges: boolean; +} { + const pagesToSummarize: PageMetadata[] = []; + const pagesToKeep: Array< + PageMetadata & { title: string; description: string } + > = []; + let hasChanges = false; + + if (previousMetadata && previousMetadata.gitSha !== "unknown") { + // Get changed files since last generation + const changedFiles = getChangedFilesSince(previousMetadata.gitSha); + console.log( + pc.blue( + `\nšŸ“Š Found ${changedFiles.size} changed files since last generation` + ) + ); + + // Create a set of current page URLs for quick lookup + const currentPageUrls = new Set(pages.map((page) => page.url)); + + // Identify deleted pages (pages that exist in previous llms.txt but not in current filesystem) + const deletedPageUrls = Array.from(existingSummaries.keys()).filter( + (url) => !currentPageUrls.has(url) + ); + + if (deletedPageUrls.length > 0) { + hasChanges = true; +
console.log( + pc.yellow( + `\nšŸ—‘ļø Found ${deletedPageUrls.length} deleted pages (will be removed from output)` + ) + ); + } + + // Filter pages based on changes + for (const page of pages) { + const url = page.url; + const existingSummary = existingSummaries.get(url); + + // Check if this page's file was changed + const isChanged = changedFiles.has(page.path); + + if (isChanged || !existingSummary) { + // Need to summarize this page + pagesToSummarize.push(page); + hasChanges = true; + } else { + // Keep existing summary + pagesToKeep.push({ + ...page, + title: existingSummary.title, + description: existingSummary.description, + }); + } + } + + console.log( + pc.green( + `āœ“ ${pagesToKeep.length} pages unchanged, ${pagesToSummarize.length} pages to summarize${deletedPageUrls.length > 0 ? `, ${deletedPageUrls.length} pages deleted` : ""}` + ) + ); + } else { + // No previous generation or can't determine, summarize all pages + console.log( + pc.yellow("⚠ No previous generation found, summarizing all pages") + ); + pagesToSummarize.push(...pages); + hasChanges = true; // Always regenerate if no previous metadata + } + + return { pagesToSummarize, pagesToKeep, hasChanges }; +} + +/** + * Summarizes pages in batches + */ +async function summarizePagesInBatches( + pagesToSummarize: PageMetadata[], + pagesToKeep: Array<PageMetadata & { title: string; description: string }> +): Promise<Array<PageMetadata & { title: string; description: string }>> { + const summarizedPages: Array< + PageMetadata & { title: string; description: string } + > = [...pagesToKeep]; + + if (pagesToSummarize.length === 0) { + return summarizedPages; + } + + console.log(pc.blue("\nšŸ“ Summarizing pages with OpenAI...")); + // Process in batches to avoid rate limits + const batchSize = 5; + for (let i = 0; i < pagesToSummarize.length; i += batchSize) { + const batch = pagesToSummarize.slice(i, i + batchSize); + const batchResults = await Promise.all(batch.map(summarizePage)); + + for (let j = 0; j < batch.length; j += 1) { + summarizedPages.push({ + ...batch[j], + ...batchResults[j], + }); + } + + 
console.log( + pc.gray( + ` Processed ${Math.min(i + batchSize, pagesToSummarize.length)}/${pagesToSummarize.length} pages` + ) + ); + + // Add a small delay between batches + if (i + batchSize < pagesToSummarize.length) { + await new Promise((resolve) => setTimeout(resolve, BATCH_DELAY_MS)); + } + } + + console.log(pc.green(`āœ“ Summarized ${pagesToSummarize.length} pages`)); + return summarizedPages; +} + /** * Main execution function */ @@ -274,54 +524,63 @@ async function main() { } try { - // Step 1: Discover all pages - const pages = await discoverPages(); - - // Step 2: Summarize each page using OpenAI - console.log(pc.blue("\nšŸ“ Summarizing pages with OpenAI...")); - const summarizedPages: Array< - PageMetadata & { title: string; description: string } - > = []; - - // Process in batches to avoid rate limits - const batchSize = 5; - for (let i = 0; i < pages.length; i += batchSize) { - const batch = pages.slice(i, i + batchSize); - const batchResults = await Promise.all(batch.map(summarizePage)); - - for (let j = 0; j < batch.length; j += 1) { - summarizedPages.push({ - ...batch[j], - ...batchResults[j], - }); - } + // Step 0: Get current git SHA and check for previous generation + const currentSha = getCurrentGitSha(); + const previousMetadata = await parseLlmsTxtMetadata(); + const existingSummaries = await extractExistingSummaries(); + console.log(pc.blue(`šŸ“Œ Current git SHA: ${currentSha}`)); + if (previousMetadata) { console.log( pc.gray( - ` Processed ${Math.min(i + batchSize, pages.length)}/${pages.length} pages` + ` Previous generation: ${previousMetadata.generationDate} (SHA: ${previousMetadata.gitSha.substring(0, SHA_SHORT_LENGTH)})` ) ); - - // Add a small delay between batches - if (i + batchSize < pages.length) { - await new Promise((resolve) => setTimeout(resolve, BATCH_DELAY_MS)); - } } - console.log(pc.green(`āœ“ Summarized ${summarizedPages.length} pages`)); + // Step 1: Discover all pages + const pages = await discoverPages(); + + // 
Step 2: Determine which pages need summarization and identify deleted pages + const { pagesToSummarize, pagesToKeep, hasChanges } = + determinePagesToSummarize(pages, previousMetadata, existingSummaries); + + // Step 3: Summarize changed/new pages using OpenAI + const summarizedPages = await summarizePagesInBatches( + pagesToSummarize, + pagesToKeep + ); - // Step 3: Organize into sections + // Step 4: Organize into sections console.log(pc.blue("\nšŸ“‚ Organizing sections...")); const sections = organizeSections(summarizedPages); console.log(pc.green(`āœ“ Created ${sections.length} sections`)); - // Step 4: Generate llms.txt content + // Step 5: Generate llms.txt content console.log(pc.blue("\nāœļø Generating llms.txt content...")); - const content = generateLlmsTxt(sections); - - // Step 5: Write to file + // Only update metadata if there are changes, otherwise keep previous metadata + const metadata: LlmsTxtMetadata = hasChanges + ? { + gitSha: currentSha, + generationDate: new Date().toISOString(), + } + : previousMetadata || { + gitSha: currentSha, + generationDate: new Date().toISOString(), + }; + const content = generateLlmsTxt(sections, metadata); + + // Step 6: Write to file await fs.writeFile(OUTPUT_PATH, content, "utf-8"); - console.log(pc.green(`āœ“ Generated llms.txt at ${OUTPUT_PATH}`)); + if (hasChanges) { + console.log(pc.green(`āœ“ Generated llms.txt at ${OUTPUT_PATH}`)); + } else { + console.log( + pc.gray( + "āœ“ No changes detected, llms.txt unchanged (SHA and date preserved)" + ) + ); + } console.log(pc.bold(pc.green("\n✨ Done!\n"))); } catch (error) {