diff --git a/scripts/notion-fetch/contentSanitizer.test.ts b/scripts/notion-fetch/contentSanitizer.test.ts
index 5a1a10f7..5f2cf17b 100644
--- a/scripts/notion-fetch/contentSanitizer.test.ts
+++ b/scripts/notion-fetch/contentSanitizer.test.ts
@@ -121,4 +121,42 @@ describe("contentSanitizer", () => {
       expect(result).toBe("[tag](#tag)");
     });
   });
+
+  describe("restoreSoftLineBreaks", () => {
+    it("should convert single newlines between text into <br /> elements", () => {
+      const input = "First line\nSecond line";
+      const result = scriptModule.restoreSoftLineBreaks(input);
+      expect(result).toBe("First line<br />\nSecond line");
+    });
+
+    it("should leave paragraph breaks (double newlines) untouched", () => {
+      const input = "First paragraph\n\nSecond paragraph";
+      const result = scriptModule.restoreSoftLineBreaks(input);
+      expect(result).toBe(input);
+    });
+
+    it("should ignore newlines that start markdown list items", () => {
+      const input = "Intro text\n- list item";
+      const result = scriptModule.restoreSoftLineBreaks(input);
+      expect(result).toBe(input);
+    });
+
+    it("should ignore newlines before numbered list items", () => {
+      const input = "Intro text\n1. First item";
+      const result = scriptModule.restoreSoftLineBreaks(input);
+      expect(result).toBe(input);
+    });
+
+    it("should not modify content inside fenced code blocks", () => {
+      const input = "```js\nconst x = 1;\nconst y = 2;\n```\nOutside";
+      const result = scriptModule.restoreSoftLineBreaks(input);
+      expect(result).toBe(input);
+    });
+
+    it("should normalize unicode line separators into <br /> line breaks", () => {
+      const input = "Line one\u2028Line two";
+      const result = scriptModule.restoreSoftLineBreaks(input);
+      expect(result).toBe("Line one<br />\nLine two");
+    });
+  });
 });
diff --git a/scripts/notion-fetch/contentSanitizer.ts b/scripts/notion-fetch/contentSanitizer.ts
index 153fc0d6..6efddf9e 100644
--- a/scripts/notion-fetch/contentSanitizer.ts
+++ b/scripts/notion-fetch/contentSanitizer.ts
@@ -105,3 +105,69 @@ export function sanitizeMarkdownContent(content: string): string {
 
   return content;
 }
+
+/**
+ * Restores intentional soft line breaks (Shift+Enter in Notion) by converting single
+ * newlines within paragraphs into `<br />` elements while avoiding structural markdown lines.
+ */
+export function restoreSoftLineBreaks(content: string): string {
+  if (!content) return content;
+
+  const codeBlocks: string[] = [];
+  const codeSpans: string[] = [];
+
+  const blockPlaceholder = (index: number) =>
+    `__SOFTBREAK_CODEBLOCK_${index}__`;
+  const spanPlaceholder = (index: number) => `__SOFTBREAK_CODESPAN_${index}__`;
+
+  // Protect fenced blocks and inline code so formatting is left untouched
+  let transformed = content.replace(/```[\s\S]*?```/g, (match) => {
+    codeBlocks.push(match);
+    return blockPlaceholder(codeBlocks.length - 1);
+  });
+
+  transformed = transformed.replace(/`[^`\n]*`/g, (match) => {
+    codeSpans.push(match);
+    return spanPlaceholder(codeSpans.length - 1);
+  });
+
+  // Normalize uncommon Unicode line separators that Notion may emit
+  transformed = transformed.replace(/[\u2028\u2029]/g, "\n");
+
+  transformed = transformed.replace(
+    /(?<=\S)\n(?=\S)/g,
+    (newline, offset, full) => {
+      const nextLine = full.slice(offset + newline.length);
+      const trimmedNextLine = nextLine.replace(/^[ \t]+/, "");
+
+      const before = full.slice(0, offset);
+      const prevLine = before.slice(before.lastIndexOf("\n") + 1);
+      const trimmedPrevLine = prevLine.trim();
+
+      // Skip markdown constructs that should remain as new lines
+      if (
+        /^([-*+>#|<])/.test(trimmedNextLine) ||
+        /^\d+[.)]/.test(trimmedNextLine) ||
+        /^```/.test(trimmedPrevLine) ||
+        /^---$/.test(trimmedPrevLine) ||
+        trimmedPrevLine.startsWith("__SOFTBREAK_CODEBLOCK_")
+      ) {
+        return newline;
+      }
+
+      return "<br />\n";
+    }
+  );
+
+  // Restore masked code sections
+  transformed = transformed.replace(
+    /__SOFTBREAK_CODEBLOCK_(\d+)__/g,
+    (_m, i) => codeBlocks[Number(i)]
+  );
+  transformed = transformed.replace(
+    /__SOFTBREAK_CODESPAN_(\d+)__/g,
+    (_m, i) => codeSpans[Number(i)]
+  );
+
+  return transformed;
+}
diff --git a/scripts/notion-fetch/generateBlocks.test.ts b/scripts/notion-fetch/generateBlocks.test.ts
index ae7ca469..fcb49e84 100644
--- a/scripts/notion-fetch/generateBlocks.test.ts
+++ b/scripts/notion-fetch/generateBlocks.test.ts
@@ -69,6 +69,7 @@ vi.mock("./imageProcessor", () => ({
 
 vi.mock("./utils", () => ({
   sanitizeMarkdownContent: vi.fn((content) => content),
+  restoreSoftLineBreaks: vi.fn((content) => content),
   compressImageToFileWithFallback: vi.fn(),
   detectFormatFromBuffer: vi.fn(() => "jpeg"),
   formatFromContentType: vi.fn(() => "jpeg"),
diff --git a/scripts/notion-fetch/generateBlocks.ts b/scripts/notion-fetch/generateBlocks.ts
index 7e597c61..985a1aea 100644
--- a/scripts/notion-fetch/generateBlocks.ts
+++ b/scripts/notion-fetch/generateBlocks.ts
@@ -13,6 +13,7 @@ import chalk from "chalk";
 import { processImage } from "./imageProcessor";
 import {
   sanitizeMarkdownContent,
+  restoreSoftLineBreaks,
   compressImageToFileWithFallback,
   detectFormatFromBuffer,
   formatFromContentType,
@@ -1550,6 +1551,9 @@ export async function generateBlocks(pages, progressCallback) {
             markdownString.parent = sanitizeMarkdownContent(
               markdownString.parent
             );
+            markdownString.parent = restoreSoftLineBreaks(
+              markdownString.parent
+            );
             // Remove duplicate title heading if it exists
             // The first H1 heading often duplicates the title in Notion exports
             let contentBody = markdownString.parent;
diff --git a/scripts/notion-fetch/utils.ts b/scripts/notion-fetch/utils.ts
index bdfb9bb0..d65228bd 100644
--- a/scripts/notion-fetch/utils.ts
+++ b/scripts/notion-fetch/utils.ts
@@ -4,8 +4,11 @@ import os from "node:os";
 import chalk from "chalk";
 import { compressImage } from "./imageCompressor";
 
-// Re-export sanitize so callers have a single utils entrypoint
-export { sanitizeMarkdownContent } from "./contentSanitizer";
+// Re-export sanitize helpers so callers have a single utils entrypoint
+export {
+  sanitizeMarkdownContent,
+  restoreSoftLineBreaks,
+} from "./contentSanitizer";
 
 // Fail-open toggle: defaults to true unless explicitly set to 'false'
 export const SOFT_FAIL: boolean =