diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ebfc2e4198bf5..5de11dda6d6b0 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -10,13 +10,10 @@ on: push: branches: - main - pull_request_target: + pull_request: branches: - main - types: - - labeled merge_group: - workflow_dispatch: defaults: run: @@ -33,11 +30,6 @@ env: jobs: build: - # This Job should run either on non-`pull_request_target` events, - # or `pull_request_target` event with a `labeled` action with a label named `github_actions:pull-request` - # since we want to run Website Builds on all these occasions. As this allows us to be certain the that builds are passing - if: github.event_name != 'pull_request_target' || github.event.label.name == 'github_actions:pull-request' - name: Build on ${{ matrix.os }} runs-on: ${{ matrix.os }} @@ -60,15 +52,6 @@ jobs: - name: Git Checkout uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - # Provides the Pull Request commit SHA or the GitHub merge group ref - ref: ${{ github.event_name == 'pull_request_target' && github.event.pull_request.head.sha || github.ref }} - # We only need to fetch the last commit from the head_ref - # since we're not using the `--filter` operation from turborepo - # We don't use the `--filter` as we always want to force builds regardless of having changes or not - # this ensures that our bundle analysis script always runs and that we always ensure next.js is building - # regardless of having code changes or not - fetch-depth: 1 - uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 with: @@ -90,7 +73,7 @@ jobs: # We only want to install required production packages run: pnpm install --prod --frozen-lockfile - - name: Build Next.js (ISR) + - name: Build Next.js # We want a ISR build on CI to ensure that regular Next.js builds work as expected. 
run: node_modules/.bin/turbo build ${{ env.TURBO_ARGS }} env: @@ -98,51 +81,5 @@ jobs: # this should be a last resort in case by any chances the build memory gets too high # but in general this should never happen NODE_OPTIONS: '--max_old_space_size=4096' - # Used for API requests that require GitHub API scopes - NEXT_GITHUB_API_KEY: ${{ secrets.GITHUB_TOKEN }} - - - name: Build Next.js (Static All Locales) - # We only run full static builds within Pull Requests. This step is also used to export - # static output in all languages, and it only works on `push` events. - if: github.event_name == 'push' - run: node_modules/.bin/turbo deploy ${{ env.TURBO_ARGS }} - env: - # We want to ensure we have enough RAM allocated to the Node.js process - # this should be a last resort in case by any chances the build memory gets too high - # but in general this should never happen - NODE_OPTIONS: '--max_old_space_size=4096' - # Used for API requests that require GitHub API scopes - NEXT_GITHUB_API_KEY: ${{ secrets.GITHUB_TOKEN }} - # We want to ensure that static exports for all locales are triggered only on `push` events to save resources - # and time. - NEXT_PUBLIC_STATIC_EXPORT_LOCALE: true - - - name: Build Next.js (Static Default Locale) - # We want to generate static output in the default language within Pull Requests - # in order to reduce source wastages and build times. 
- # Note that we skip full static builds on Crowdin-based Pull Requests as these PRs should only contain translation changes - if: | - (github.event_name == 'pull_request_target' && - github.event.pull_request.head.ref != 'chore/crowdin') - run: node_modules/.bin/turbo deploy ${{ env.TURBO_ARGS }} - env: - # We want to ensure we have enough RAM allocated to the Node.js process - # this should be a last resort in case by any chances the build memory gets too high - # but in general this should never happen - NODE_OPTIONS: '--max_old_space_size=4096' - # Used for API requests that require GitHub API scopes - NEXT_GITHUB_API_KEY: ${{ secrets.GITHUB_TOKEN }} - # We want to ensure that static exports for all locales do not occur on `pull_request_target` events - NEXT_PUBLIC_STATIC_EXPORT_LOCALE: false - - - name: Sync Orama Cloud - # We only want to sync the Orama Cloud production indexes on `push` events. - # We also want to sync the Orama Cloud preview (deployment) indexes on `pull_request_target` events (or manual triggers). - # We also want to ensure that the sync only happens on the `ubuntu-latest` runner to avoid duplicate syncs - # or Windows-based path issues. 
- env: - ORAMA_INDEX_ID: ${{ github.event_name == 'push' && secrets.ORAMA_PRODUCTION_INDEX_ID || secrets.ORAMA_INDEX_ID }} - ORAMA_SECRET_KEY: ${{ github.event_name == 'push' && secrets.ORAMA_PRODUCTION_SECRET_KEY || secrets.ORAMA_SECRET_KEY }} - if: matrix.os == 'ubuntu-latest' && github.event_name != 'merge_group' - working-directory: apps/site - run: node --run sync-orama + # We want to ensure that static exports for all locales do not occur on `pull_request` events + NEXT_PUBLIC_STATIC_EXPORT_LOCALE: ${{ github.event_name == 'push' }} diff --git a/.github/workflows/sync-orama.yml b/.github/workflows/sync-orama.yml new file mode 100644 index 0000000000000..4814e7af56856 --- /dev/null +++ b/.github/workflows/sync-orama.yml @@ -0,0 +1,59 @@ +# Security Notes +# This workflow uses `pull_request_target` and, once a Pull Request is labeled `github_actions:pull-request`, checks out and runs that Pull Request's code with access to secrets, so review the changes carefully before applying the label +# Only selected Actions are allowed within this repository. Please refer to (https://github.com/nodejs/nodejs.org/settings/actions) +# for the full list of available actions. If you want to add a new one, please reach out to a maintainer with Admin permissions. +# REVIEWERS, please always double-check security practices before merging a PR that contains Workflow changes!! +# AUTHORS, please only use actions with explicit SHA references, and avoid using `@master` or `@main` references or `@version` tags. 
+# MERGE QUEUE NOTE: This Workflow does not run on `merge_group` trigger, as this Workflow is not required for Merge Queue's + +name: Sync Orama Cloud + +on: + push: + branches: + - main + pull_request_target: + branches: + - main + types: + - labeled + +permissions: + contents: read + +jobs: + sync-orama-cloud: + name: Sync Orama Cloud + runs-on: ubuntu-latest + + # This Job should run either on non-`pull_request_target` events, + # or `pull_request_target` event with a `labeled` action with a label named `github_actions:pull-request` + # since we want to sync the Orama Cloud indexes on all these occasions (the production index on `push`, the non-production index otherwise) + if: github.event_name != 'pull_request_target' || github.event.label.name == 'github_actions:pull-request' + + steps: + - name: Git Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + ref: ${{ github.event_name == 'pull_request_target' && github.event.pull_request.head.sha || github.ref }} + + - name: Set up pnpm + uses: pnpm/action-setup@a7487c7e89a18df4991f7f222e4898a00d66ddda # v4.1.0 + + - name: Set up Node.js + uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0 + with: + # We want to ensure that the Node.js version running here respects our supported versions + node-version-file: '.nvmrc' + cache: 'pnpm' + + - name: Install packages + run: pnpm install --frozen-lockfile + + - name: Sync Orama Cloud + working-directory: apps/site + run: node --run sync-orama + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ORAMA_INDEX_ID: ${{ github.event_name == 'push' && secrets.ORAMA_PRODUCTION_INDEX_ID || secrets.ORAMA_INDEX_ID }} + ORAMA_SECRET_KEY: ${{ github.event_name == 'push' && secrets.ORAMA_PRODUCTION_SECRET_KEY || secrets.ORAMA_SECRET_KEY }} diff --git a/apps/site/app/[locale]/next-data/api-data/route.ts b/apps/site/app/[locale]/next-data/api-data/route.ts deleted file mode 100644 index 2e469ec77f717..0000000000000 --- 
a/apps/site/app/[locale]/next-data/api-data/route.ts +++ /dev/null @@ -1,89 +0,0 @@ -import { deflateSync } from 'node:zlib'; - -import provideReleaseData from '#site/next-data/providers/releaseData'; -import { GITHUB_API_KEY } from '#site/next.constants.mjs'; -import { defaultLocale } from '#site/next.locales.mjs'; -import type { GitHubApiFile } from '#site/types'; -import { getGitHubApiDocsUrl } from '#site/util/github'; -import { parseRichTextIntoPlainText } from '#site/util/string'; - -// Defines if we should use the GitHub API Key for the request -// based on the environment variable `GITHUB_API_KEY` -const authorizationHeaders = GITHUB_API_KEY - ? { headers: { Authorization: `Bearer ${GITHUB_API_KEY}` } } - : undefined; - -// Formats a pathname for an API file from Markdown file basename -const getPathnameForApiFile = (name: string, version: string) => - `docs/${version}/api/${name.replace('.md', '.html')}`; - -// This is the Route Handler for the `GET` method which handles the request -// for a digest and metadata of all API pages from the Node.js Website -// @see https://nextjs.org/docs/app/building-your-application/routing/router-handlers -export const GET = async () => { - const releases = provideReleaseData(); - - const { versionWithPrefix } = releases.find( - release => release.status === 'Active LTS' - )!; - - const gitHubApiResponse = await fetch( - getGitHubApiDocsUrl(versionWithPrefix), - authorizationHeaders - ); - - // transforms the response into an array of GitHubApiFile - const apiDocsFiles: Array = await gitHubApiResponse.json(); - - // prevent the route from crashing if the response is not an array of GitHubApiFile - // and return an empty array instead. This is a fallback for when the GitHub API is not available. 
- if (!Array.isArray(apiDocsFiles)) { - return Response.json([]); - } - - // maps over each api file and get the download_url, fetch the content and deflates it - const mappedApiFiles = apiDocsFiles.map( - async ({ name, path: filename, download_url }) => { - const apiFileResponse = await fetch(download_url); - - // Retrieves the content as a raw text string - const source = await apiFileResponse.text(); - - // Removes empty/blank lines or lines just with spaces and trims each line - // from leading and trailing paddings/spaces - const cleanedContent = parseRichTextIntoPlainText(source); - - const deflatedSource = deflateSync(cleanedContent).toString('base64'); - - return { - filename: filename, - pathname: getPathnameForApiFile(name, versionWithPrefix), - content: deflatedSource, - }; - } - ); - - const data = await Promise.all(mappedApiFiles); - - return Response.json(data); -}; - -// This function generates the static paths that come from the dynamic segments -// `[locale]/next-data/api-data/` and returns an array of all available static paths -// This is used for ISR static validation and generation -export const generateStaticParams = async () => [ - { locale: defaultLocale.code }, -]; - -// Enforces that only the paths from `generateStaticParams` are allowed, giving 404 on the contrary -// @see https://nextjs.org/docs/app/api-reference/file-conventions/route-segment-config#dynamicparams -export const dynamicParams = false; - -// Enforces that this route is used as static rendering -// @see https://nextjs.org/docs/app/api-reference/file-conventions/route-segment-config#dynamic -export const dynamic = 'force-static'; - -// Ensures that this endpoint is invalidated and re-executed every X minutes -// so that when new deployments happen, the data is refreshed -// @see https://nextjs.org/docs/app/api-reference/file-conventions/route-segment-config#revalidate -export const revalidate = 300; diff --git a/apps/site/app/[locale]/next-data/page-data/route.ts 
b/apps/site/app/[locale]/next-data/page-data/route.ts deleted file mode 100644 index 58d82a27e5993..0000000000000 --- a/apps/site/app/[locale]/next-data/page-data/route.ts +++ /dev/null @@ -1,77 +0,0 @@ -import { deflateSync } from 'node:zlib'; - -import matter from 'gray-matter'; - -import { dynamicRouter } from '#site/next.dynamic.mjs'; -import { defaultLocale } from '#site/next.locales.mjs'; -import { parseRichTextIntoPlainText } from '#site/util/string'; - -// This is the Route Handler for the `GET` method which handles the request -// for a digest and metadata of all existing pages on Node.js Website -// @see https://nextjs.org/docs/app/building-your-application/routing/router-handlers -export const GET = async () => { - // Retrieves all available routes for the default locale - const allAvailbleRoutes = await dynamicRouter.getAllRoutes(); - - // We exclude the blog routes from the available pages metadata - // as they are generated separately and are not part of the static pages - // and are not part of the static pages metadata - const routesExceptBlog = allAvailbleRoutes.filter( - route => !route.startsWith('blog') - ); - - const availablePagesMetadata = routesExceptBlog.map(async pathname => { - const { source, filename } = await dynamicRouter.getMarkdownFile( - defaultLocale.code, - pathname - ); - - // Gets the title and the Description from the Page Metadata - const { title, description } = await dynamicRouter.getPageMetadata( - defaultLocale.code, - pathname - ); - - // Parser the Markdown source with `gray-matter` and then only - // grabs the markdown content and cleanses it by removing HTML/JSX tags - // removing empty/blank lines or lines just with spaces and trims each line - // from leading and trailing paddings/spaces - const cleanedContent = parseRichTextIntoPlainText(matter(source).content); - - // Deflates a String into a base64 string-encoded (zlib compressed) - const content = deflateSync(cleanedContent).toString('base64'); - - // Returns 
metadata of each page available on the Website - return { - filename, - pathname, - title, - description, - content, - }; - }); - - const data = await Promise.all(availablePagesMetadata); - - return Response.json(data); -}; - -// This function generates the static paths that come from the dynamic segments -// `[locale]/next-data/page-data/` and returns an array of all available static paths -// This is used for ISR static validation and generation -export const generateStaticParams = async () => [ - { locale: defaultLocale.code }, -]; - -// Enforces that only the paths from `generateStaticParams` are allowed, giving 404 on the contrary -// @see https://nextjs.org/docs/app/api-reference/file-conventions/route-segment-config#dynamicparams -export const dynamicParams = false; - -// Enforces that this route is used as static rendering -// @see https://nextjs.org/docs/app/api-reference/file-conventions/route-segment-config#dynamic -export const dynamic = 'error'; - -// Ensures that this endpoint is invalidated and re-executed every X minutes -// so that when new deployments happen, the data is refreshed -// @see https://nextjs.org/docs/app/api-reference/file-conventions/route-segment-config#revalidate -export const revalidate = 300; diff --git a/apps/site/next.constants.mjs b/apps/site/next.constants.mjs index 6a6c184aa79b9..4009fbf4af9de 100644 --- a/apps/site/next.constants.mjs +++ b/apps/site/next.constants.mjs @@ -161,14 +161,6 @@ export const ORAMA_CLOUD_ENDPOINT = export const ORAMA_CLOUD_API_KEY = process.env.NEXT_PUBLIC_ORAMA_API_KEY || 'qopIuAERiWP2EZOpDjvczjws7WV40yrj'; -/** - * A GitHub Access Token for accessing the GitHub API and not being rate-limited - * The current token is registered on the "nodejs-vercel" GitHub Account. - * - * Note: This has no NEXT_PUBLIC prefix as it should not be exposed to the Browser. 
- */ -export const GITHUB_API_KEY = process.env.NEXT_GITHUB_API_KEY || ''; - /** * The resource we point people to when discussing internationalization efforts. */ diff --git a/apps/site/package.json b/apps/site/package.json index 7174e6dbe56f6..cebc9cb944d02 100644 --- a/apps/site/package.json +++ b/apps/site/package.json @@ -88,6 +88,7 @@ "@testing-library/user-event": "~14.6.1", "@types/mdx": "^2.0.13", "@types/semver": "~7.7.0", + "dedent": "^1.6.0", "eslint-config-next": "15.5.0", "eslint-import-resolver-typescript": "~4.4.4", "eslint-plugin-mdx": "~3.6.2", @@ -96,6 +97,9 @@ "global-jsdom": "^26.0.0", "handlebars": "4.7.8", "jsdom": "^26.0.0", + "mdast-util-from-markdown": "^2.0.2", + "mdast-util-to-string": "^4.0.0", + "nock": "^14.0.10", "remark-frontmatter": "^5.0.0", "stylelint": "16.23.0", "stylelint-config-standard": "39.0.0", diff --git a/apps/site/scripts/orama-search/__tests__/get-documents.test.mjs b/apps/site/scripts/orama-search/__tests__/get-documents.test.mjs new file mode 100644 index 0000000000000..3aec546acd434 --- /dev/null +++ b/apps/site/scripts/orama-search/__tests__/get-documents.test.mjs @@ -0,0 +1,38 @@ +import assert from 'node:assert/strict'; +import { test, mock } from 'node:test'; + +import nock from 'nock'; + +mock.module('node:fs/promises', { + namedExports: { + glob: () => ['filename'], + readFile: name => name.endsWith('filename') && 'content', + }, +}); + +const { getAPIDocs, getArticles } = await import('../get-documents.mjs'); + +test('getAPIDocs', async () => { + nock('https://api.github.com') + .get('/repos/nodejs/node/contents/doc/api') + .query(true) + .reply(200, [ + { + name: 'fs.md', + download_url: 'data:text/plain,fs', + }, + ]); + + const result = await getAPIDocs(); + + assert.equal(result.length, 1); + assert.equal(result[0].content, 'fs'); + assert.match(result[0].pathname, /^docs\/v[^/]+\/api\/fs\.html$/); +}); + +test('getArticles', async () => { + const result = await getArticles(); + 
assert.deepStrictEqual(result, [ + { content: 'content', pathname: 'filename' }, + ]); +}); diff --git a/apps/site/scripts/orama-search/__tests__/process-documents.test.mjs b/apps/site/scripts/orama-search/__tests__/process-documents.test.mjs new file mode 100644 index 0000000000000..83db237682721 --- /dev/null +++ b/apps/site/scripts/orama-search/__tests__/process-documents.test.mjs @@ -0,0 +1,104 @@ +import assert from 'node:assert'; +import test from 'node:test'; + +import dedent from 'dedent'; + +import { processDocument } from '../process-documents.mjs'; + +const testCases = [ + { + name: 'Uses front matter title if available', + input: { + pathname: 'blog/my-post.html', + content: dedent` + --- + title: Custom Title + --- + # Intro + Hello world + `, + }, + expected: [ + { + path: 'blog/my-post.html#intro', + siteSection: 'Blog', + pageTitle: 'Custom Title', + pageSectionTitle: 'Intro', + pageSectionContent: 'Hello world', + }, + ], + }, + { + name: 'Falls back to filename for title', + input: { + pathname: 'docs/another-post.html', + content: dedent` + # Start + Content here + `, + }, + expected: [ + { + path: 'docs/another-post.html#start', + siteSection: 'Docs', + pageTitle: 'another post', + pageSectionTitle: 'Start', + pageSectionContent: 'Content here', + }, + ], + }, + { + name: 'Handles multiple sections', + input: { + pathname: 'guides/test.html', + content: dedent` + # First + Paragraph A + + # Second + Paragraph B + `, + }, + expected: [ + { + path: 'guides/test.html#first', + siteSection: 'Guides', + pageTitle: 'test', + pageSectionTitle: 'First', + pageSectionContent: 'Paragraph A', + }, + { + path: 'guides/test.html#second', + siteSection: 'Guides', + pageTitle: 'test', + pageSectionTitle: 'Second', + pageSectionContent: 'Paragraph B', + }, + ], + }, + { + name: 'Section with no heading', + input: { + pathname: 'misc/untitled.html', + content: dedent` + Just some text without a heading + `, + }, + expected: [ + { + path: 'misc/untitled.html#', + 
siteSection: 'Misc', + pageTitle: 'untitled', + pageSectionTitle: '', + pageSectionContent: 'Just some text without a heading', + }, + ], + }, +]; + +for (const { name, input, expected } of testCases) { + test(name, () => { + const result = processDocument(input); + assert.deepStrictEqual(result, expected); + }); +} diff --git a/apps/site/scripts/orama-search/get-documents.mjs b/apps/site/scripts/orama-search/get-documents.mjs index 509f07d460299..01580f3833ecd 100644 --- a/apps/site/scripts/orama-search/get-documents.mjs +++ b/apps/site/scripts/orama-search/get-documents.mjs @@ -1,80 +1,79 @@ -import { existsSync, readFileSync } from 'node:fs'; -import { join } from 'node:path'; -import zlib from 'node:zlib'; - -import { slug } from 'github-slugger'; - -import { getRelativePath } from '../../next.helpers.mjs'; - -const currentRoot = getRelativePath(import.meta.url); -const dataBasePath = join(currentRoot, '../../.next/server/app/en/next-data'); - -if (!existsSync(dataBasePath)) { - throw new Error( - 'The data directory does not exist. Please run `npm run build` first.' +import { readFile, glob } from 'node:fs/promises'; +import { join, basename, posix, win32 } from 'node:path'; + +import generateReleaseData from '#site/next-data/generators/releaseData.mjs'; +import { getRelativePath } from '#site/next.helpers.mjs'; + +import { processDocument } from './process-documents.mjs'; + +// If a GitHub token is available, include it for higher rate limits +const fetchOptions = process.env.GITHUB_TOKEN + ? { headers: { Authorization: `Bearer ${process.env.GITHUB_TOKEN}` } } + : undefined; + +/** + * Fetch Node.js API documentation directly from GitHub + * for the current Active LTS version. 
+ */ +export const getAPIDocs = async () => { + // Find the current Active LTS version + const releaseData = await generateReleaseData(); + const { versionWithPrefix } = releaseData.find( + r => r.status === 'Active LTS' ); -} - -const nextPageData = readFileSync(`${dataBasePath}/page-data.body`, 'utf-8'); -const nextAPIPageData = readFileSync(`${dataBasePath}/api-data.body`, 'utf-8'); -const pageData = JSON.parse(nextPageData); -const apiData = JSON.parse(nextAPIPageData); - -const splitIntoSections = markdownContent => { - const lines = markdownContent.split(/\n/gm); - const sections = []; + // Get list of API docs from the Node.js repo + const fetchResponse = await fetch( + `https://api.github.com/repos/nodejs/node/contents/doc/api?ref=${versionWithPrefix}`, + fetchOptions + ); + const documents = await fetchResponse.json(); - let section = null; + // Download and return content + metadata for each doc + return Promise.all( + documents.map(async ({ name, download_url }) => { + const res = await fetch(download_url, fetchOptions); - for (const line of lines) { - if (line.match(/^#{1,6}\s/)) { - section = { - pageSectionTitle: line.replace(/^#{1,6}\s*/, ''), - pageSectionContent: [], + return { + content: await res.text(), + pathname: `docs/${versionWithPrefix}/api/${basename(name, '.md')}.html`, }; - - sections.push(section); - } else if (section) { - section.pageSectionContent.push(line); - } - } - - return sections.map(section => ({ - ...section, - pageSectionContent: section.pageSectionContent.join('\n'), - })); + }) + ); }; -const uppercaseFirst = string => - string.charAt(0).toUpperCase() + string.slice(1); - -const getPageTitle = data => - data.title || - data.pathname - .split('/') - .pop() - .replace(/\.html$/, '') - .replace(/-/g, ' '); - -export const siteContent = [...pageData, ...apiData] - .map(data => { - const { pathname, title = getPageTitle(data), content } = data; - const markdownContent = zlib - .inflateSync(Buffer.from(content, 'base64')) - 
.toString('utf-8'); - - const siteSection = pathname.split('/').shift(); - const subSections = splitIntoSections(markdownContent); - return subSections.map(section => { - const path = `${pathname}#${slug(section.pageSectionTitle)}`; +/** + * Collect all local markdown/mdx articles under /pages/en, + * excluding blog content. + */ +export const getArticles = async () => { + const relativePath = getRelativePath(import.meta.url); + const root = join(relativePath, '..', '..', 'pages', 'en'); + + // Find all markdown files (excluding blog) + const files = await Array.fromAsync(glob('**/*.{md,mdx}', { cwd: root })); + + // Read content + metadata + return Promise.all( + files + .filter(path => !path.startsWith('blog')) + .map(async path => ({ + content: await readFile(join(root, path), 'utf8'), + pathname: path + // Strip the extension + .replace(/\.mdx?$/, '') + // Normalize to a POSIX path + .replaceAll(win32.sep, posix.sep), + })) + ); +}; - return { - path: path, - siteSection: uppercaseFirst(siteSection), - pageTitle: title, - ...section, - }; - }); - }) - .flat(); +/** + * Aggregate all documents (API docs + local articles). + */ +export const getDocuments = async () => { + const documentPromises = await Promise.all([getAPIDocs(), getArticles()]); + return documentPromises.flatMap(documents => + documents.flatMap(processDocument) + ); +}; diff --git a/apps/site/scripts/orama-search/process-documents.mjs b/apps/site/scripts/orama-search/process-documents.mjs new file mode 100644 index 0000000000000..c327f0474d406 --- /dev/null +++ b/apps/site/scripts/orama-search/process-documents.mjs @@ -0,0 +1,89 @@ +import { basename } from 'node:path'; + +import { slug } from 'github-slugger'; +import matter from 'gray-matter'; +import { fromMarkdown } from 'mdast-util-from-markdown'; +import { toString } from 'mdast-util-to-string'; + +/** + * Extracts top-level sections from a Markdown AST. 
+ * Each section starts with a heading (if present) and includes all subsequent nodes + * until the next heading. + */ +const extractSections = tree => { + const sections = []; + let current = null; + + // Visit each top-level node + tree.children.forEach(node => { + if (node.type === 'heading') { + // Push the previous section if it exists + if (current) { + sections.push(current); + } + + // Start a new section with the current heading + current = { + heading: node, + children: [], + }; + } else { + // If no heading yet, initialize an empty section + if (!current) { + current = { heading: null, children: [] }; + } + + // Add the node to the current section's children + current.children.push(node); + } + }); + + // Push the last section if it exists + if (current) { + sections.push(current); + } + + // Convert AST nodes to strings and structure the output + return sections.map(({ heading, children }) => ({ + pageSectionTitle: toString(heading), + pageSectionContent: children + .map(child => toString(child, { includeHtml: false })) + .join('\n'), + })); +}; + +// Derive page title from path +const getPageTitle = path => basename(path, '.html').replace(/-/g, ' '); + +// Capitalize first character +const getSiteSection = path => { + const subpath = path.split('/')[0]; + + return subpath[0].toUpperCase() + subpath.slice(1); +}; + +/** + * Processes a Markdown document with front matter. + * Extracts its sections and returns one entry per section (path, site section, page title, section title and content). 
+ */ +export const processDocument = ({ pathname, content }) => { + // Parse front matter and separate body + const { data, content: body } = matter(content); + + // Convert Markdown body to AST + const ast = fromMarkdown(body); + + // Extract sections from the AST + const sections = extractSections(ast); + + // Get titles + const siteSection = getSiteSection(pathname); + const pageTitle = data.title || getPageTitle(pathname); + + return sections.map(section => ({ + path: `${pathname}#${slug(section.pageSectionTitle)}`, + siteSection: siteSection, + pageTitle: pageTitle, + ...section, + })); +}; diff --git a/apps/site/scripts/orama-search/sync-orama-cloud.mjs b/apps/site/scripts/orama-search/sync-orama-cloud.mjs index 276d19c4e45ad..fe8b53b1117e9 100644 --- a/apps/site/scripts/orama-search/sync-orama-cloud.mjs +++ b/apps/site/scripts/orama-search/sync-orama-cloud.mjs @@ -1,6 +1,6 @@ import { CloudManager } from '@oramacloud/client'; -import { siteContent } from './get-documents.mjs'; +import { getDocuments } from './get-documents.mjs'; import { ORAMA_SYNC_BATCH_SIZE } from '../../next.constants.mjs'; // The following follows the instructions at https://docs.orama.com/cloud/data-sources/custom-integrations/webhooks @@ -8,40 +8,47 @@ import { ORAMA_SYNC_BATCH_SIZE } from '../../next.constants.mjs'; const INDEX_ID = process.env.ORAMA_INDEX_ID; const API_KEY = process.env.ORAMA_SECRET_KEY; -const oramaCloudManager = new CloudManager({ - api_key: API_KEY, -}); - +const oramaCloudManager = new CloudManager({ api_key: API_KEY }); const oramaIndex = oramaCloudManager.index(INDEX_ID); -console.log(`Syncing ${siteContent.length} documents to Orama Cloud index`); - -// Orama allows to send several documents at once, so we batch them in groups of 50. -// This is not strictly necessary, but it makes the process faster. 
-const runUpdate = async () => { - const batchSize = ORAMA_SYNC_BATCH_SIZE; +// Helper to batch documents +const batchDocuments = (documents, batchSize) => { const batches = []; - - for (let i = 0; i < siteContent.length; i += batchSize) { - batches.push(siteContent.slice(i, i + batchSize)); + for (let i = 0; i < documents.length; i += batchSize) { + batches.push(documents.slice(i, i + batchSize)); } + return batches; +}; - console.log(`Sending ${batches.length} batches of ${batchSize} documents`); +// Orama allows to send several documents at once, so we batch them in groups of ORAMA_SYNC_BATCH_SIZE. +// This is not strictly necessary, but it makes the process faster. +const runUpdate = async documents => { + console.log(`Syncing ${documents.length} documents to Orama Cloud index`); + + const batches = batchDocuments(documents, ORAMA_SYNC_BATCH_SIZE); + console.log( + `Sending ${batches.length} batches of up to ${ORAMA_SYNC_BATCH_SIZE} documents` + ); - for (const batch of batches) { + for (const [i, batch] of batches.entries()) { // In Orama, "update" is an upsert operation. + console.log(`Updating batch ${i + 1} of ${batches.length}`); await oramaIndex.update(batch); } }; -// Now we proceed to call the APIs in order: +// Proceed to call the APIs in order: // 1. Empty the index // 2. Insert the documents // 3. Trigger a deployment // Once all these steps are done, the new documents will be available in the live index. // Allow Orama up to 1 minute to distribute the documents to all the 300+ nodes worldwide. 
+console.log('Emptying the Orama Cloud index...'); await oramaIndex.empty(); -await runUpdate(); + +await runUpdate(await getDocuments()); + +console.log('Triggering Orama Cloud deployment...'); await oramaIndex.deploy(); console.log('Orama Cloud sync completed successfully!'); diff --git a/apps/site/util/__tests__/github.test.mjs b/apps/site/util/__tests__/github.test.mjs index 03f20de2d4372..3d359863fc502 100644 --- a/apps/site/util/__tests__/github.test.mjs +++ b/apps/site/util/__tests__/github.test.mjs @@ -5,12 +5,8 @@ mock.module('github-slugger', { defaultExport: class {}, }); -const { - getGitHubAvatarUrl, - createGitHubSlugger, - getGitHubBlobUrl, - getGitHubApiDocsUrl, -} = await import('#site/util/github'); +const { getGitHubAvatarUrl, createGitHubSlugger, getGitHubBlobUrl } = + await import('#site/util/github'); describe('gitHubUtils', () => { it('getGitHubAvatarUrl returns the correct URL', () => { @@ -31,13 +27,6 @@ describe('gitHubUtils', () => { assert.equal(result, expected); }); - it('getGitHubApiDocsUrl returns the correct URL', () => { - const result = getGitHubApiDocsUrl('assert'); - const expected = - 'https://api.github.com/repos/nodejs/node/contents/doc/api?ref=assert'; - assert.equal(result, expected); - }); - describe('getGitHubAvatarUrl', () => { it('should return a valid GitHub avatar URL', () => { assert.equal( @@ -56,13 +45,4 @@ describe('gitHubUtils', () => { ); }); }); - - describe('getGitHubApiDocsUrl', () => { - it('should return the correct API docs URL', () => { - assert.equal( - getGitHubApiDocsUrl('v18.x'), - 'https://api.github.com/repos/nodejs/node/contents/doc/api?ref=v18.x' - ); - }); - }); }); diff --git a/apps/site/util/__tests__/string.test.mjs b/apps/site/util/__tests__/string.test.mjs index c4d95f125a9b8..dcae9d29a4c32 100644 --- a/apps/site/util/__tests__/string.test.mjs +++ b/apps/site/util/__tests__/string.test.mjs @@ -1,11 +1,7 @@ import assert from 'node:assert/strict'; import { describe, it } from 'node:test'; 
-import { - getAcronymFromString, - parseRichTextIntoPlainText, - dashToCamelCase, -} from '#site/util/string'; +import { getAcronymFromString, dashToCamelCase } from '#site/util/string'; describe('String utils', () => { it('getAcronymFromString returns the correct acronym', () => { @@ -20,50 +16,6 @@ describe('String utils', () => { assert.equal(getAcronymFromString(''), ''); }); - it('parseRichTextIntoPlainText returns plain text without HTML and JSX tags', () => { - const richText = '

This is bold and italic

'; - const result = parseRichTextIntoPlainText(richText); - assert.equal(result, 'This is bold and italic'); - }); - - it('parseRichTextIntoPlainText replaces Markdown links with their text content', () => { - const richText = - 'Check out [Node.js](https://nodejs.org/en/) for more information.'; - const result = parseRichTextIntoPlainText(richText); - assert.equal(result, 'Check out Node.js for more information.'); - }); - - it('parseRichTextIntoPlainText replaces Markdown lists with their content', () => { - const richText = '- Item 1\n- Item 2\n- Item 3'; - const result = parseRichTextIntoPlainText(richText); - assert.equal(result, 'Item 1\nItem 2\nItem 3'); - }); - - it('parseRichTextIntoPlainText replaces Markdown underscore, bold, and italic with their content', () => { - const richText = 'This is _underscore_, **bold**, and *italic*.'; - const result = parseRichTextIntoPlainText(richText); - assert.equal(result, 'This is underscore, bold, and italic.'); - }); - - it('parseRichTextIntoPlainText replaces Markdown multiline code blocks with an empty string', () => { - const richText = - 'Some text\n```\nconst x = 42;\nconsole.log(x);\n```\nMore text'; - const result = parseRichTextIntoPlainText(richText); - assert.equal(result, 'Some text\nMore text'); - }); - - it('parseRichTextIntoPlainText removes empty lines or lines with just spaces', () => { - const richText = 'Line 1\n \nLine 3'; - const result = parseRichTextIntoPlainText(richText); - assert.equal(result, 'Line 1\nLine 3'); - }); - - it('parseRichTextIntoPlainText removes leading and trailing spaces from each line', () => { - const richText = ' Line 1 \n Line 2 \n Line 3 '; - const result = parseRichTextIntoPlainText(richText); - assert.equal(result, 'Line 1\nLine 2\nLine 3'); - }); - it('dashToCamelCase returns correct camelCase', () => { assert.equal(dashToCamelCase('foo-bar-baz'), 'fooBarBaz'); }); @@ -81,13 +33,6 @@ describe('String utils', () => { assert.equal(result, 'NJ'); }); - 
it('parseRichTextIntoPlainText removes markdown syntax', () => { - const result = parseRichTextIntoPlainText( - 'Hello **world**!\n*italic text*' - ); - assert.equal(result, 'Hello world!\nitalic text'); - }); - it('dashToCamelCase converts dashed strings to camelCase', () => { const result = dashToCamelCase('es-2015'); assert.equal(result, 'es2015'); @@ -101,13 +46,6 @@ describe('String utils', () => { }); }); - describe('parseRichTextIntoPlainText', () => { - it('should remove markdown and HTML', () => { - const result = parseRichTextIntoPlainText('

Hello

*world*'); - assert.equal(result, 'Hello world'); - }); - }); - describe('dashToCamelCase', () => { it('should convert dash-case to camelCase', () => { assert.equal(dashToCamelCase('es-2015-config'), 'es2015Config'); diff --git a/apps/site/util/github.ts b/apps/site/util/github.ts index 813eaada810d2..3070612c3b5ce 100644 --- a/apps/site/util/github.ts +++ b/apps/site/util/github.ts @@ -11,6 +11,3 @@ export const createGitHubSlugger = () => { export const getGitHubBlobUrl = (filename: string) => `https://github.com/nodejs/nodejs.org/blob/main/apps/site/pages/en/${filename}`; - -export const getGitHubApiDocsUrl = (ref: string) => - `https://api.github.com/repos/nodejs/node/contents/doc/api?ref=${ref}`; diff --git a/apps/site/util/string.ts b/apps/site/util/string.ts index d830a6dfc4f2c..2dc5077462000 100644 --- a/apps/site/util/string.ts +++ b/apps/site/util/string.ts @@ -1,27 +1,6 @@ export const getAcronymFromString = (str: string) => [...(str.trim().match(/\b(\w)/g) || '')].join('').toUpperCase(); -// Note: We don't remove Markdown Headers delimiters as they're useful for delimiting sections -export const parseRichTextIntoPlainText = (richText: string) => - richText - // replaces JSX and HTML and their properties with an empty string - // keeping only the content left - .replace(/<[^>]+>/gm, '') - // replaces Markdown links with their text content - .replace(/\[([^\]]+)\]\([^)]+\)/gm, '$1') - // replaces Markdown lists with their content - .replace(/^[*-] (.*)$/gm, '$1') - // replaces Markdown underscore, bold and italic with their content - .replace(/(\*\*|\*|__|_)(.*?)\1/gm, '$2') - // replaces Markdown multiline codeblocks with their content - .replace(/```.+?```/gms, '') - // replaces empty lines or lines just with spaces with an empty string - .replace(/^\s*\n/gm, '') - // replaces leading and trailing spaces from each line with an empty string - .replace(/^[ ]+|[ ]+$/gm, '') - // replaces leading numbers and dots from each line with an empty string - 
.replace(/^\d+\.\s/gm, ''); - export const dashToCamelCase = (str: string) => str .replace(/-([a-z])/g, (match, chr) => chr.toUpperCase()) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index e7634528d75b1..6e6365353c76f 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -229,6 +229,9 @@ importers: '@types/semver': specifier: ~7.7.0 version: 7.7.0 + dedent: + specifier: ^1.6.0 + version: 1.6.0 eslint-config-next: specifier: 15.5.0 version: 15.5.0(eslint-plugin-import-x@4.16.1(@typescript-eslint/utils@8.38.0(eslint@9.32.0(jiti@2.4.2))(typescript@5.8.3))(eslint-import-resolver-node@0.3.9)(eslint@9.32.0(jiti@2.4.2)))(eslint@9.32.0(jiti@2.4.2))(typescript@5.8.3) @@ -253,6 +256,15 @@ importers: jsdom: specifier: ^26.0.0 version: 26.1.0 + mdast-util-from-markdown: + specifier: ^2.0.2 + version: 2.0.2 + mdast-util-to-string: + specifier: ^4.0.0 + version: 4.0.0 + nock: + specifier: ^14.0.10 + version: 14.0.10 remark-frontmatter: specifier: ^5.0.0 version: 5.0.0 @@ -1975,6 +1987,10 @@ packages: '@mdx-js/mdx@3.1.0': resolution: {integrity: sha512-/QxEhPAvGwbQmy1Px8F899L5Uc2KZ6JtXwlCgJmjSTBedwOZkByYcBG4GceIGPXRDsmfxhHazuS+hlOShRLeDw==} + '@mswjs/interceptors@0.39.6': + resolution: {integrity: sha512-bndDP83naYYkfayr/qhBHMhk0YGwS1iv6vaEGcr0SQbO0IZtbOPqjKjds/WcG+bJA+1T5vCx6kprKOzn5Bg+Vw==} + engines: {node: '>=18'} + '@napi-rs/wasm-runtime@0.2.11': resolution: {integrity: sha512-9DPkXtvHydrcOsopiYpUgPHpmj0HWZKMUnL2dZqpvC42lsratuBG06V5ipyno0fUek5VlFsNQ+AcFATSrJXgMA==} @@ -2105,6 +2121,15 @@ packages: resolution: {integrity: sha512-xhfYPXoV5Dy4UkY0D+v2KkwvnDfiA/8Mt3sWCGI/hM03NsYIH8ZaG6QzS9x7pje5vHZBZJ2v6VRFVTWACnqcmQ==} engines: {node: ^16.14.0 || >=18.0.0} + '@open-draft/deferred-promise@2.2.0': + resolution: {integrity: sha512-CecwLWx3rhxVQF6V4bAgPS5t+So2sTbPgAzafKkVizyi7tlwpcFpdFqq+wqF2OwNBmqFuu6tOyouTuxgpMfzmA==} + + '@open-draft/logger@0.3.0': + resolution: {integrity: sha512-X2g45fzhxH238HKO4xbSr7+wBS8Fvw6ixhTDuvLd5mqh6bJJCFAPwU9mPDxbcrRtfxv4u5IHCEH77BmxvXmmxQ==} + + 
'@open-draft/until@2.1.0': + resolution: {integrity: sha512-U69T3ItWHvLwGg5eJ0n3I62nWuE6ilHlmz7zM0npLBRvPRd7e6NYmg54vvRtP5mZG7kZqZCFVdsTWo7BPtBujg==} + '@opennextjs/aws@3.7.4': resolution: {integrity: sha512-s50dmKrgQ62GliffoI/hGGQVb3q/7ZN5VRa4jJw0ZsEGLfk25XuAulO/ySCYeE7/A23KoAYuhafnKr/h+vxOeQ==} hasBin: true @@ -5655,6 +5680,9 @@ packages: resolution: {integrity: sha512-5KoIu2Ngpyek75jXodFvnafB6DJgr3u8uuK0LEZJjrU19DrMD3EVERaR8sjz8CCGgpZvxPl9SuE1GMVPFHx1mw==} engines: {node: '>= 0.4'} + is-node-process@1.2.0: + resolution: {integrity: sha512-Vg4o6/fqPxIjtxgUH5QLJhwZ7gW5diGCVlXpuUfELC62CuxM1iHcRe51f2W1FDy04Ai4KJkagKjx3XaqyfRKXw==} + is-number-object@1.1.1: resolution: {integrity: sha512-lZhclumE1G6VYD8VHe35wFaIif+CTy5SJIi5+3y4psDgWu4wPDoBhF8NxUOinEc7pHgiTsT6MaBb92rKhhD+Xw==} engines: {node: '>= 0.4'} @@ -5798,6 +5826,9 @@ packages: json-stable-stringify-without-jsonify@1.0.1: resolution: {integrity: sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw==} + json-stringify-safe@5.0.1: + resolution: {integrity: sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==} + json5@1.0.2: resolution: {integrity: sha512-g1MWMLBiz8FKi1e4w0UyVL3w+iJceWAFBAaBnnGKOpNa5f8TLktkbre1+s6oICydWAm+HRUGTmI+//xv2hvXYA==} hasBin: true @@ -6407,6 +6438,10 @@ packages: no-case@3.0.4: resolution: {integrity: sha512-fgAN3jGAh+RoxUGZHTSOLJIqUc2wmoBwGR4tbpNAKmmovFoWq0OdRkb0VkldReO2a2iBT/OEulG9XSUc10r3zg==} + nock@14.0.10: + resolution: {integrity: sha512-Q7HjkpyPeLa0ZVZC5qpxBt5EyLczFJ91MEewQiIi9taWuA0KB/MDJlUWtON+7dGouVdADTQsf9RA7TZk6D8VMw==} + engines: {node: '>=18.20.0 <20 || >=20.12.1'} + node-abort-controller@3.1.1: resolution: {integrity: sha512-AGK2yQKIjRuqnc6VkX2Xj5d+QW8xZ87pa1UK6yA6ouUyuxfHuMP6umE5QK7UmTeOAymo+Zx1Fxiuw9rVx8taHQ==} @@ -6543,6 +6578,9 @@ packages: resolution: {integrity: sha512-6IpQ7mKUxRcZNLIObR0hz7lxsapSSIYNZJwXPGeF0mTVqGKFIXj1DQcMoT22S3ROcLyY/rz0PWaWZ9ayWmad9g==} engines: {node: '>= 0.8.0'} 
+ outvariant@1.4.3: + resolution: {integrity: sha512-+Sl2UErvtsoajRDKCE5/dBz4DIvHXQQnAxtQTF04OJxY0+DyZXSo5P5Bb7XYWOh81syohlYL24hbDwxedPUJCA==} + own-keys@1.0.1: resolution: {integrity: sha512-qFOyK5PjiWZd+QQIh+1jhdb9LpxTF0qs7Pm8o5QHYZ0M3vKqSqzsZaEB6oWlxZ+q2sJBMI/Ktgd2N5ZwQoRHfg==} engines: {node: '>= 0.4'} @@ -6909,6 +6947,10 @@ packages: prop-types@15.8.1: resolution: {integrity: sha512-oj87CgZICdulUohogVAR7AjlC0327U4el4L6eAvOqCeudMDVU0NThNaV+b9Df4dXgSP1gXMTnPdhfe/2qDH5cg==} + propagate@2.0.1: + resolution: {integrity: sha512-vGrhOavPSTz4QVNuBNdcNXePNdNMaO1xj9yBeH1ScQPjk/rhg9sSlCXPhMkFuaNNW/syTvYqsnbIJxMBfRbbag==} + engines: {node: '>= 8'} + property-information@7.0.0: resolution: {integrity: sha512-7D/qOz/+Y4X/rzSB6jKxKUsQnphO046ei8qxG59mtM3RG3DHgTK81HrxrmoDVINJb8NKT5ZsRbwHvQ6B68Iyhg==} @@ -7510,6 +7552,9 @@ packages: prettier: optional: true + strict-event-emitter@0.5.1: + resolution: {integrity: sha512-vMgjE/GGEPEFnhFub6pa4FmJBRBVOLpIII2hvCZ8Kzb7K0hlHo7mQv6xYrBvCL2LtAIBwFUK8wvuJgTVSQ5MFQ==} + string-argv@0.3.2: resolution: {integrity: sha512-aqD2Q0144Z+/RqG52NeHEkZauTAUWJO8c6yTftGJKO3Tja5tUgIfmIl6kExvhtxSDP7fXB6DvzkfMpCd/F3G+Q==} engines: {node: '>=0.6.19'} @@ -10222,6 +10267,15 @@ snapshots: - acorn - supports-color + '@mswjs/interceptors@0.39.6': + dependencies: + '@open-draft/deferred-promise': 2.2.0 + '@open-draft/logger': 0.3.0 + '@open-draft/until': 2.1.0 + is-node-process: 1.2.0 + outvariant: 1.4.3 + strict-event-emitter: 0.5.1 + '@napi-rs/wasm-runtime@0.2.11': dependencies: '@emnapi/core': 1.4.3 @@ -10363,6 +10417,15 @@ snapshots: dependencies: which: 4.0.0 + '@open-draft/deferred-promise@2.2.0': {} + + '@open-draft/logger@0.3.0': + dependencies: + is-node-process: 1.2.0 + outvariant: 1.4.3 + + '@open-draft/until@2.1.0': {} + '@opennextjs/aws@3.7.4': dependencies: '@ast-grep/napi': 0.35.0 @@ -14514,6 +14577,8 @@ snapshots: is-negative-zero@2.0.3: {} + is-node-process@1.2.0: {} + is-number-object@1.1.1: dependencies: call-bound: 1.0.4 @@ -14661,6 
+14726,8 @@ snapshots: json-stable-stringify-without-jsonify@1.0.1: {} + json-stringify-safe@5.0.1: {} + json5@1.0.2: dependencies: minimist: 1.2.8 @@ -15528,6 +15595,12 @@ snapshots: lower-case: 2.0.2 tslib: 2.8.1 + nock@14.0.10: + dependencies: + '@mswjs/interceptors': 0.39.6 + json-stringify-safe: 5.0.1 + propagate: 2.0.1 + node-abort-controller@3.1.1: {} node-domexception@1.0.0: {} @@ -15675,6 +15748,8 @@ snapshots: type-check: 0.4.0 word-wrap: 1.2.5 + outvariant@1.4.3: {} + own-keys@1.0.1: dependencies: get-intrinsic: 1.3.0 @@ -15983,6 +16058,8 @@ snapshots: object-assign: 4.1.1 react-is: 16.13.1 + propagate@2.0.1: {} + property-information@7.0.0: {} property-information@7.1.0: {} @@ -17035,6 +17112,8 @@ snapshots: - supports-color - utf-8-validate + strict-event-emitter@0.5.1: {} + string-argv@0.3.2: {} string-width@4.2.3: