From 2551c6290121fafb551962d2f9ddc60f370495ae Mon Sep 17 00:00:00 2001 From: Maximilian Falco Widjaya Date: Mon, 22 Dec 2025 12:52:33 +0700 Subject: [PATCH 01/10] wip: first pass --- __tests__/lib/mdxish/mdxish.test.ts | 39 ++- .../normalize-malformed-md-syntax.test.ts | 242 ++++++++++++++++++ lib/mdxish.ts | 2 + .../mdxish/normalize-malformed-md-syntax.ts | 110 ++++++++ 4 files changed, 387 insertions(+), 6 deletions(-) create mode 100644 __tests__/transformers/normalize-malformed-md-syntax.test.ts create mode 100644 processor/transform/mdxish/normalize-malformed-md-syntax.ts diff --git a/__tests__/lib/mdxish/mdxish.test.ts b/__tests__/lib/mdxish/mdxish.test.ts index f4c4f9417..7e13106e6 100644 --- a/__tests__/lib/mdxish/mdxish.test.ts +++ b/__tests__/lib/mdxish/mdxish.test.ts @@ -1,16 +1,43 @@ +import type { Root } from 'hast'; + import { mdxish } from '../../../lib/mdxish'; -describe('mdxish', () => { +describe('mdxish should render', () => { describe('invalid mdx syntax', () => { it('should render unclosed tags', () => { const md = '
'; expect(() => mdxish(md)).not.toThrow(); }); + }); - it('should render content in new lines', () => { - const md = `
hello -
`; - expect(() => mdxish(md)).not.toThrow(); + describe('relaxed md syntax, such as', () => { + it('wrong bold syntax', () => { + const md = `**Bold** + +Normal + +Hello** Wrong Bold**`; + const tree = mdxish(md); + + const getStrongTexts = (node: Root | Root['children'][number]): string[] => { + const texts: string[] = []; + if ('type' in node && node.type === 'element' && node.tagName === 'strong') { + const textNodes = + 'children' in node && Array.isArray(node.children) + ? node.children.filter(c => 'type' in c && c.type === 'text') + : []; + texts.push(textNodes.map(t => ('value' in t ? t.value : '')).join('')); + } + if ('children' in node && Array.isArray(node.children)) { + node.children.forEach(child => { + texts.push(...getStrongTexts(child)); + }); + } + return texts; + }; + + const strongTexts = getStrongTexts(tree); + expect(strongTexts.length).toBeGreaterThanOrEqual(2); }); }); -}); \ No newline at end of file +}); diff --git a/__tests__/transformers/normalize-malformed-md-syntax.test.ts b/__tests__/transformers/normalize-malformed-md-syntax.test.ts new file mode 100644 index 000000000..60ccc5710 --- /dev/null +++ b/__tests__/transformers/normalize-malformed-md-syntax.test.ts @@ -0,0 +1,242 @@ +import type { Code, Paragraph, Strong } from 'mdast'; + +import { remark } from 'remark'; +import remarkParse from 'remark-parse'; +import { removePosition } from 'unist-util-remove-position'; + +import normalizeEmphasisAST from '../../processor/transform/mdxish/normalize-malformed-md-syntax'; + +const processor = remark().use(remarkParse).use(normalizeEmphasisAST); + +describe('normalize-malformed-md-syntax', () => { + describe('bold patterns with spaces', () => { + it('should handle space after opening ** (with word before)', () => { + const md = 'Hello** Wrong Bold**'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Hello ' }, + { + type: 'strong', + children: [{ type: 'text', value: 'Wrong Bold' }], + }, + ], + }); + }); + + it('should preserve multiple spaces before opening **', () => { + const md = 'Hello ** World**'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Hello ' }, + { + type: 'strong', + children: [{ type: 'text', value: 'World' }], + }, + ], + }); + }); + + it('should handle space before closing **', () => { + const md = '**text **word'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { + type: 'strong', + children: [{ type: 'text', value: 'text' }], + }, + { type: 'text', value: ' w' }, + { type: 'text', value: 'ord' }, + ], + }); + }); + + it('should handle spaces on both sides', () => { + const md = '** text **'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { + type: 'strong', + children: [{ type: 'text', value: 'text' }], + }, + ], + }); + }); + + it('should handle multiple malformed bold patterns in one text', () => { + const md = 'Start** first** middle** second **end'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const children = paragraph.children; + const strongNodes = children.filter((c): c is Strong => c.type === 'strong'); + + expect(strongNodes).toHaveLength(2); + expect(strongNodes[0]).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'first' }], + }); + expect(strongNodes[1]).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'second' }], + }); + }); + + it('should handle complex case from migration tests', () => { + const md = 'Move to **Hello**> **World **from the top left menu'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const children = paragraph.children; + const strongNodes = children.filter((c): c is Strong => c.type === 'strong'); + + expect(strongNodes.length).toBeGreaterThanOrEqual(1); + const worldNode = strongNodes.find( + (n): n is Strong => + n.type === 'strong' && + Array.isArray(n.children) && + n.children[0]?.type === 'text' && + n.children[0].value === 'World', + ); + expect(worldNode).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'World' }], + }); + }); + + it('should handle case with word before and after', () => { + const md = 'Find** Hello World** and click'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Find ' }, + { + type: 'strong', + children: [{ type: 'text', value: 'Hello World' }], + }, + { type: 'text', value: ' and click' }, + ], + }); + }); + }); + + describe('should not modify valid bold syntax', () => { + it('should leave **valid** bold untouched', () => { + const md = '**valid**'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { + type: 'strong', + children: [{ type: 'text', value: 'valid' }], + }, + ], + }); + }); + + it('should leave word**valid**bold untouched', () => { + const md = 'word**valid**bold'; + const tree = processor.parse(md); + processor.runSync(tree); + + const paragraph = tree.children[0] as Paragraph; + const children = paragraph.children; + expect(children.length).toBeGreaterThanOrEqual(1); + }); + }); + + describe('should skip code blocks and inline code', () => { + it('should not modify malformed bold inside inline code', () => { + const md = '`** bold**`'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { + type: 'inlineCode', + value: '** bold**', + }, + ], + }); + }); + + it('should not modify malformed bold inside code blocks', () => { + const md = '```\n** bold**\n```'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const codeBlock = tree.children[0] as Code; + expect(codeBlock.type).toBe('code'); + expect(codeBlock.value).toContain('** bold**'); + }); + }); + + describe('edge cases', () => { + it('should handle empty content', () => { + const md = '** **'; + const tree = processor.parse(md); + processor.runSync(tree); + + expect(tree.children.length).toBeGreaterThan(0); + }); + + it('should handle newlines in content', () => { + const md = '** text\nwith newline**'; + const tree = processor.parse(md); + processor.runSync(tree); + + expect(tree.children.length).toBeGreaterThan(0); + }); + + it('should preserve text around malformed bold', () => { + const md = 'Before Hello** Wrong Bold** After'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const strong = paragraph.children.find((c): c is Strong => c.type === 'strong'); + + expect(strong).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'Wrong Bold' }], + }); + expect(paragraph.children.length).toBeGreaterThanOrEqual(3); + }); + }); +}); diff --git a/lib/mdxish.ts b/lib/mdxish.ts index c20b5caf1..7cd7bef6e 100644 --- a/lib/mdxish.ts +++ b/lib/mdxish.ts @@ -24,6 +24,7 @@ import mdxishComponentBlocks from '../processor/transform/mdxish/mdxish-componen import mdxishHtmlBlocks from '../processor/transform/mdxish/mdxish-html-blocks'; import magicBlockRestorer from '../processor/transform/mdxish/mdxish-magic-blocks'; import mdxishTables from '../processor/transform/mdxish/mdxish-tables'; +import normalizeEmphasisAST from '../processor/transform/mdxish/normalize-malformed-md-syntax'; import { preprocessJSXExpressions, type JSXContext } from '../processor/transform/mdxish/preprocess-jsx-expressions'; import variablesTextTransformer from '../processor/transform/mdxish/variables-text'; import tailwindTransformer from '../processor/transform/tailwind'; @@ -68,6 +69,7 @@ export function mdxish(mdContent: string, opts: MdxishOpts = {}): Root { .data('fromMarkdownExtensions', [mdxExpressionFromMarkdown()]) .use(remarkParse) .use(remarkFrontmatter) + .use(normalizeEmphasisAST) .use(magicBlockRestorer, { blocks }) .use(imageTransformer, { isMdxish: true }) .use(defaultTransformers) diff --git a/processor/transform/mdxish/normalize-malformed-md-syntax.ts b/processor/transform/mdxish/normalize-malformed-md-syntax.ts new file mode 100644 index 000000000..33be52b3d --- /dev/null +++ b/processor/transform/mdxish/normalize-malformed-md-syntax.ts @@ -0,0 +1,110 @@ +import type { Parent, Root, Strong, Text } from 'mdast'; +import type { Plugin } from 'unified'; + +import { visit } from 'unist-util-visit'; + +/** + * A remark plugin that normalizes malformed bold markers in text nodes. + * Detects patterns like `** bold**` or `Hello** Wrong Bold**` and converts them + * to proper strong nodes, matching the behavior of the legacy rdmd engine. + * + * This runs after remark-parse, which (in v11+) is strict and doesn't parse + * malformed bold syntax. This plugin post-processes the AST to handle these cases. + */ +const normalizeEmphasisAST: Plugin = () => (tree: Root) => { + visit(tree, 'text', (node: Text, index, parent: Parent) => { + if (index === undefined || !parent) return; + + // Skip if inside code blocks, inline code, or already inside strong/emphasis + if ( + parent.type === 'inlineCode' || + parent.type === 'code' || + parent.type === 'strong' || + parent.type === 'emphasis' + ) { + return; + } + + const text = node.value; + + // Patterns to detect: + // 1. ** text** (space after opening, preceded by space/start) + // 2. **text ** (space before closing, followed by non-whitespace or end) + // 3. word** text** (word before, space after opening) + // 4. ** text ** (spaces on both sides) + + // Combined regex to match all malformed bold patterns + // Pattern: (\S+)?\s*\*\*(?:\s+([^*\n]+?)\s*\*\*|([^*\n]+?)\s+\*\*)(\S|$)? + // - (\S+)? - optional word before (capture group 1) + // - \s* - optional whitespace before ** (to preserve spaces like "Hello **") + // - \*\* - opening ** + // - (?:...) - alternation: + // - \s+([^*\n]+?)\s*\*\* - space after opening, content, optional space before closing (group 2) + // - OR ([^*\n]+?)\s+\*\* - content, space before closing (group 3) + // - (\S|$)? - optional non-whitespace after or end of string (capture group 4) + const malformedBoldRegex = /(\S+)?\s*\*\*(?:\s+([^*\n]+?)\s*\*\*|([^*\n]+?)\s+\*\*)(\S|$)?/g; + + const matches = [...text.matchAll(malformedBoldRegex)]; + if (matches.length === 0) return; + + const parts: (Strong | Text)[] = []; + let lastIndex = 0; + + matches.forEach(match => { + const matchIndex = match.index ?? 0; + const fullMatch = match[0]; + + // Add text before the match + if (matchIndex > lastIndex) { + const beforeText = text.slice(lastIndex, matchIndex); + if (beforeText) { + parts.push({ type: 'text', value: beforeText } satisfies Text); + } + } + + const wordBefore = match[1]; // e.g., "Hello" in "Hello** Wrong Bold**" or "Hello" in "Hello ** World**" + const content = (match[2] || match[3] || '').trim(); // The bold content (from either pattern), trimmed + const afterChar = match[4]; // Character after closing ** (if any) + + const asteriskPos = fullMatch.indexOf('**'); + const spacesBefore = wordBefore + ? fullMatch.slice(wordBefore.length, asteriskPos) + : fullMatch.slice(0, asteriskPos); + + if (wordBefore) { + const spacing = spacesBefore || ' '; + parts.push({ type: 'text', value: wordBefore + spacing } satisfies Text); + } else if (spacesBefore) { + parts.push({ type: 'text', value: spacesBefore } satisfies Text); + } + + if (content) { + parts.push({ + type: 'strong', + children: [{ type: 'text', value: content } satisfies Text], + } satisfies Strong); + } + + if (afterChar) { + parts.push({ type: 'text', value: ` ${afterChar}` } satisfies Text); + } + + lastIndex = matchIndex + fullMatch.length; + }); + + if (lastIndex < text.length) { + const remainingText = text.slice(lastIndex); + if (remainingText) { + parts.push({ type: 'text', value: remainingText } satisfies Text); + } + } + + if (parts.length > 0) { + parent.children.splice(index, 1, ...parts); + } + }); + + return tree; +}; + +export default normalizeEmphasisAST; From d1200fc9aa313d3bff828efb3018801a862bfe84 Mon Sep 17 00:00:00 2001 From: Maximilian Falco Widjaya Date: Mon, 22 Dec 2025 13:23:55 +0700 Subject: [PATCH 02/10] feat: support underscore syntax - added a lot of comments for dev purposes, will remove before pushing --- .../normalize-malformed-md-syntax.test.ts | 218 +++++++++++++++++- .../mdxish/normalize-malformed-md-syntax.ts | 66 ++++-- 2 files changed, 258 insertions(+), 26 deletions(-) diff --git a/__tests__/transformers/normalize-malformed-md-syntax.test.ts b/__tests__/transformers/normalize-malformed-md-syntax.test.ts index 60ccc5710..c57307dd0 100644 --- a/__tests__/transformers/normalize-malformed-md-syntax.test.ts +++ b/__tests__/transformers/normalize-malformed-md-syntax.test.ts @@ -1,4 +1,4 @@ -import type { Code, Paragraph, Strong } from 'mdast'; +import type { Code, Paragraph, Strong, Text } from 'mdast'; import { remark } from 'remark'; import remarkParse from 'remark-parse'; @@ -147,6 +147,141 @@ describe('normalize-malformed-md-syntax', () => { }); }); + describe('underscore bold patterns with spaces', () => { + it('should handle space after opening __ (with word before)', () => { + const md = 'Hello__ Wrong Bold__'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Hello ' }, + { + type: 'strong', + children: [{ type: 'text', value: 'Wrong Bold' }], + }, + ], + }); + }); + + it('should preserve multiple spaces before opening __', () => { + const md = 'Hello __ World__'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Hello ' }, + { + type: 'strong', + children: [{ type: 'text', value: 'World' }], + }, + ], + }); + }); + + it('should handle space before closing __', () => { + const md = '__text __word'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { + type: 'strong', + children: [{ type: 'text', value: 'text' }], + }, + { type: 'text', value: ' w' }, + { type: 'text', value: 'ord' }, + ], + }); + }); + + it('should handle spaces on both sides', () => { + const md = '__ text __'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { + type: 'strong', + children: [{ type: 'text', value: 'text' }], + }, + ], + }); + }); + + it('should handle multiple malformed bold patterns in one text', () => { + const md = 'Start__ first__ middle__ second __end'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const children = paragraph.children; + const strongNodes = children.filter((c): c is Strong => c.type === 'strong'); + + expect(strongNodes).toHaveLength(2); + expect(strongNodes[0]).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'first' }], + }); + expect(strongNodes[1]).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'second' }], + }); + }); + + it('should handle case with word before and after', () => { + const md = 'Find__ Hello World__ and click'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Find ' }, + { + type: 'strong', + children: [{ type: 'text', value: 'Hello World' }], + }, + { type: 'text', value: ' and click' }, + ], + }); + }); + + it('should handle mixed ** and __ patterns', () => { + const md = 'Asterisk** first** Underscore__ second__'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const children = paragraph.children; + const strongNodes = children.filter((c): c is Strong => c.type === 'strong'); + + expect(strongNodes).toHaveLength(2); + expect(strongNodes[0]).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'first' }], + }); + expect(strongNodes[1]).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'second' }], + }); + }); + }); + describe('should not modify valid bold syntax', () => { it('should leave **valid** bold untouched', () => { const md = '**valid**'; @@ -174,6 +309,33 @@ describe('normalize-malformed-md-syntax', () => { const children = paragraph.children; expect(children.length).toBeGreaterThanOrEqual(1); }); + + it('should leave __valid__ bold untouched', () => { + const md = '__valid__'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { + type: 'strong', + children: [{ type: 'text', value: 'valid' }], + }, + ], + }); + }); + + it('should leave word__valid__bold untouched', () => { + const md = 'word__valid__bold'; + const tree = processor.parse(md); + processor.runSync(tree); + + const paragraph = tree.children[0] as Paragraph; + const children = paragraph.children; + expect(children.length).toBeGreaterThanOrEqual(1); + }); }); describe('should skip code blocks and inline code', () => { @@ -194,6 +356,23 @@ describe('normalize-malformed-md-syntax', () => { }); }); + it('should not modify malformed bold with __ inside inline code', () => { + const md = '`__ bold__`'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { + type: 'inlineCode', + value: '__ bold__', + }, + ], + }); + }); + it('should not modify malformed bold inside code blocks', () => { const md = '```\n** bold**\n```'; const tree = processor.parse(md); @@ -238,5 +417,42 @@ describe('normalize-malformed-md-syntax', () => { }); expect(paragraph.children.length).toBeGreaterThanOrEqual(3); }); + + it('should not add space when space is only before closing markers', () => { + const md = 'Hello**bold **'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Hello' }, + { + type: 'strong', + children: [{ type: 'text', value: 'bold' }], + }, + ], + }); + }); + + it('should not add space for valid bold syntax', () => { + const md = 'Hello**bold**'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const strong = paragraph.children.find((c): c is Strong => c.type === 'strong'); + + expect(strong).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'bold' }], + }); + const textNodes = paragraph.children.filter((c): c is Text => c.type === 'text'); + const helloText = textNodes.find(t => t.value.startsWith('Hello')); + expect(helloText).toBeDefined(); + expect(helloText?.value).toBe('Hello'); + }); }); }); diff --git a/processor/transform/mdxish/normalize-malformed-md-syntax.ts b/processor/transform/mdxish/normalize-malformed-md-syntax.ts index 33be52b3d..738c40d15 100644 --- a/processor/transform/mdxish/normalize-malformed-md-syntax.ts +++ b/processor/transform/mdxish/normalize-malformed-md-syntax.ts @@ -5,8 +5,10 @@ import { visit } from 'unist-util-visit'; /** * A remark plugin that normalizes malformed bold markers in text nodes. - * Detects patterns like `** bold**` or `Hello** Wrong Bold**` and converts them - * to proper strong nodes, matching the behavior of the legacy rdmd engine. + * Detects patterns like `** bold**`, `Hello** Wrong Bold**`, `__ bold__`, or `Hello__ Wrong Bold__` + * and converts them to proper strong nodes, matching the behavior of the legacy rdmd engine. + * + * Supports both asterisk (`**bold**`) and underscore (`__bold__`) bold syntax. * * This runs after remark-parse, which (in v11+) is strict and doesn't parse * malformed bold syntax. This plugin post-processes the AST to handle these cases. @@ -27,22 +29,22 @@ const normalizeEmphasisAST: Plugin = () => (tree: Root) => { const text = node.value; - // Patterns to detect: - // 1. ** text** (space after opening, preceded by space/start) - // 2. **text ** (space before closing, followed by non-whitespace or end) - // 3. word** text** (word before, space after opening) - // 4. ** text ** (spaces on both sides) + // Patterns to detect for both ** and __ syntax: + // 1. ** text** or __ text__ (space after opening, preceded by space/start) + // 2. **text ** or __text __ (space before closing, followed by non-whitespace or end) + // 3. word** text** or word__ text__ (word before, space after opening) + // 4. ** text ** or __ text __ (spaces on both sides) - // Combined regex to match all malformed bold patterns - // Pattern: (\S+)?\s*\*\*(?:\s+([^*\n]+?)\s*\*\*|([^*\n]+?)\s+\*\*)(\S|$)? + // Combined regex to match all malformed bold patterns for both ** and __ + // Pattern: (\S+)?\s*(\*\*|__)(?:\s+([^*_\n]+?)\s*\2|([^*_\n]+?)\s+\2)(\S|$)? // - (\S+)? - optional word before (capture group 1) - // - \s* - optional whitespace before ** (to preserve spaces like "Hello **") - // - \*\* - opening ** + // - \s* - optional whitespace before markers (to preserve spaces like "Hello **" or "Hello __") + // - (\*\*|__) - opening markers: ** or __ (capture group 2, used as \2 for closing) // - (?:...) - alternation: - // - \s+([^*\n]+?)\s*\*\* - space after opening, content, optional space before closing (group 2) - // - OR ([^*\n]+?)\s+\*\* - content, space before closing (group 3) - // - (\S|$)? - optional non-whitespace after or end of string (capture group 4) - const malformedBoldRegex = /(\S+)?\s*\*\*(?:\s+([^*\n]+?)\s*\*\*|([^*\n]+?)\s+\*\*)(\S|$)?/g; + // - \s+([^*_\n]+?)\s*\2 - space after opening, content, optional space before closing (group 3) + // - OR ([^*_\n]+?)\s+\2 - content, space before closing (group 4) + // - (\S|$)? - optional non-whitespace after or end of string (capture group 5) + const malformedBoldRegex = /(\S+)?\s*(\*\*|__)(?:\s+([^*_\n]+?)\s*\2|([^*_\n]+?)\s+\2)(\S|$)?/g; const matches = [...text.matchAll(malformedBoldRegex)]; if (matches.length === 0) return; @@ -63,20 +65,34 @@ const normalizeEmphasisAST: Plugin = () => (tree: Root) => { } const wordBefore = match[1]; // e.g., "Hello" in "Hello** Wrong Bold**" or "Hello" in "Hello ** World**" - const content = (match[2] || match[3] || '').trim(); // The bold content (from either pattern), trimmed - const afterChar = match[4]; // Character after closing ** (if any) - - const asteriskPos = fullMatch.indexOf('**'); - const spacesBefore = wordBefore - ? fullMatch.slice(wordBefore.length, asteriskPos) - : fullMatch.slice(0, asteriskPos); + const marker = match[2]; // Either "**" or "__" + const contentWithSpaceAfter = match[3]; // Content when there's a space after opening markers + const contentWithSpaceBefore = match[4]; // Content when there's only a space before closing markers + const content = (contentWithSpaceAfter || contentWithSpaceBefore || '').trim(); // The bold content, trimmed + const afterChar = match[5]; // Character after closing markers (if any) + + // Find position of opening markers (** or __) + const markerPos = fullMatch.indexOf(marker); + const spacesBeforeMarkers = wordBefore + ? fullMatch.slice(wordBefore.length, markerPos) + : fullMatch.slice(0, markerPos); + + // If there's a space after the opening markers (group 3), we should add a space before the word + // BUT only if there's actually a word before AND no spaces already exist before the markers + // If there's only a space before the closing markers (group 4), we should NOT add a space + // If spaces already exist before markers (like "Hello **"), we should preserve them and NOT add another + const shouldAddSpace = !!contentWithSpaceAfter && !!wordBefore && !spacesBeforeMarkers; if (wordBefore) { - const spacing = spacesBefore || ' '; + // Preserve spacing before markers, and add space only if there was one after opening markers + // and no spaces already exist before the markers + const spacing = spacesBeforeMarkers + (shouldAddSpace ? ' ' : ''); parts.push({ type: 'text', value: wordBefore + spacing } satisfies Text); - } else if (spacesBefore) { - parts.push({ type: 'text', value: spacesBefore } satisfies Text); + } else if (spacesBeforeMarkers) { + parts.push({ type: 'text', value: spacesBeforeMarkers } satisfies Text); } + // Note: We don't add a space when there's no word before, even if there was a space after opening markers + // This matches the behavior where "** text **" should become just a strong node, no leading space if (content) { parts.push({ From bc8793b7bf7f0904d202b88aeb551f3a78c1ce7d Mon Sep 17 00:00:00 2001 From: Maximilian Falco Widjaya Date: Mon, 22 Dec 2025 13:42:39 +0700 Subject: [PATCH 03/10] feat: add support for italics --- .../normalize-malformed-md-syntax.test.ts | 218 +++++++++++++++++- .../mdxish/normalize-malformed-md-syntax.ts | 71 +++--- 2 files changed, 259 insertions(+), 30 deletions(-) diff --git a/__tests__/transformers/normalize-malformed-md-syntax.test.ts b/__tests__/transformers/normalize-malformed-md-syntax.test.ts index c57307dd0..4bbb7f3f3 100644 --- a/__tests__/transformers/normalize-malformed-md-syntax.test.ts +++ b/__tests__/transformers/normalize-malformed-md-syntax.test.ts @@ -1,4 +1,4 @@ -import type { Code, Paragraph, Strong, Text } from 'mdast'; +import type { Code, Emphasis, Paragraph, Strong, Text } from 'mdast'; import { remark } from 'remark'; import remarkParse from 'remark-parse'; @@ -455,4 +455,220 @@ describe('normalize-malformed-md-syntax', () => { expect(helloText?.value).toBe('Hello'); }); }); + + describe('italic patterns with spaces (asterisk)', () => { + it('should handle space after opening * (with word before)', () => { + const md = 'Hello* Wrong Italic*'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Hello ' }, + { + type: 'emphasis', + children: [{ type: 'text', value: 'Wrong Italic' }], + }, + ], + }); + }); + + it('should preserve multiple spaces before opening *', () => { + const md = 'Hello * World*'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Hello ' }, + { + type: 'emphasis', + children: [{ type: 'text', value: 'World' }], + }, + ], + }); + }); + + it('should handle space before closing *', () => { + const md = '*text *word'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { + type: 'emphasis', + children: [{ type: 'text', value: 'text' }], + }, + { type: 'text', value: ' w' }, + { type: 'text', value: 'ord' }, + ], + }); + }); + + it('should handle spaces on both sides', () => { + const md = 'Before * text * after'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const emphasis = paragraph.children.find((c): c is Emphasis => c.type === 'emphasis'); + + expect(emphasis).toStrictEqual({ + type: 'emphasis', + children: [{ type: 'text', value: 'text' }], + }); + }); + + it('should not add space when space is only before closing *', () => { + const md = 'Hello*italic *'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Hello' }, + { + type: 'emphasis', + children: [{ type: 'text', value: 'italic' }], + }, + ], + }); + }); + }); + + describe('italic patterns with spaces (underscore)', () => { + it('should handle space after opening _ (with word before)', () => { + const md = 'Hello_ Wrong Italic_'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Hello ' }, + { + type: 'emphasis', + children: [{ type: 'text', value: 'Wrong Italic' }], + }, + ], + }); + }); + + it('should preserve multiple spaces before opening _', () => { + const md = 'Hello _ World_'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Hello ' }, + { + type: 'emphasis', + children: [{ type: 'text', value: 'World' }], + }, + ], + }); + }); + + it('should handle space before closing _', () => { + const md = '_text _word'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { + type: 'emphasis', + children: [{ type: 'text', value: 'text' }], + }, + { type: 'text', value: ' w' }, + { type: 'text', value: 'ord' }, + ], + }); + }); + + it('should handle spaces on both sides', () => { + const md = 'Before _ text _ after'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const emphasis = paragraph.children.find((c): c is Emphasis => c.type === 'emphasis'); + + expect(emphasis).toStrictEqual({ + type: 'emphasis', + children: [{ type: 'text', value: 'text' }], + }); + }); + + it('should not add space when space is only before closing _', () => { + const md = 'Hello_italic _'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Hello' }, + { + type: 'emphasis', + children: [{ type: 'text', value: 'italic' }], + }, + ], + }); + }); + }); + + describe('should not modify valid italic syntax', () => { + it('should leave *valid* italic untouched', () => { + const md = '*valid*'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { + type: 'emphasis', + children: [{ type: 'text', value: 'valid' }], + }, + ], + }); + }); + + it('should leave _valid_ italic untouched', () => { + const md = '_valid_'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { + type: 'emphasis', + children: [{ type: 'text', value: 'valid' }], + }, + ], + }); + }); + }); }); diff --git a/processor/transform/mdxish/normalize-malformed-md-syntax.ts b/processor/transform/mdxish/normalize-malformed-md-syntax.ts index 738c40d15..9d6ad7e7e 100644 --- a/processor/transform/mdxish/normalize-malformed-md-syntax.ts +++ b/processor/transform/mdxish/normalize-malformed-md-syntax.ts @@ -1,17 +1,18 @@ -import type { Parent, Root, Strong, Text } from 'mdast'; +import type { Emphasis, Parent, Root, Strong, Text } from 'mdast'; import type { Plugin } from 'unified'; import { visit } from 'unist-util-visit'; /** - * A remark plugin that normalizes malformed bold markers in text nodes. - * Detects patterns like `** bold**`, `Hello** Wrong Bold**`, `__ bold__`, or `Hello__ Wrong Bold__` - * and converts them to proper strong nodes, matching the behavior of the legacy rdmd engine. + * A remark plugin that normalizes malformed bold and italic markers in text nodes. + * Detects patterns like `** bold**`, `Hello** Wrong Bold**`, `__ bold__`, `Hello__ Wrong Bold__`, + * `* italic*`, `Hello* Wrong Italic*`, `_ italic_`, or `Hello_ Wrong Italic_` + * and converts them to proper strong/emphasis nodes, matching the behavior of the legacy rdmd engine. * - * Supports both asterisk (`**bold**`) and underscore (`__bold__`) bold syntax. + * Supports both asterisk (`**bold**`, `*italic*`) and underscore (`__bold__`, `_italic_`) syntax. * * This runs after remark-parse, which (in v11+) is strict and doesn't parse - * malformed bold syntax. This plugin post-processes the AST to handle these cases. + * malformed emphasis syntax. This plugin post-processes the AST to handle these cases. */ const normalizeEmphasisAST: Plugin = () => (tree: Root) => { visit(tree, 'text', (node: Text, index, parent: Parent) => { @@ -29,27 +30,29 @@ const normalizeEmphasisAST: Plugin = () => (tree: Root) => { const text = node.value; - // Patterns to detect for both ** and __ syntax: - // 1. ** text** or __ text__ (space after opening, preceded by space/start) - // 2. **text ** or __text __ (space before closing, followed by non-whitespace or end) - // 3. word** text** or word__ text__ (word before, space after opening) - // 4. ** text ** or __ text __ (spaces on both sides) - - // Combined regex to match all malformed bold patterns for both ** and __ - // Pattern: (\S+)?\s*(\*\*|__)(?:\s+([^*_\n]+?)\s*\2|([^*_\n]+?)\s+\2)(\S|$)? - // - (\S+)? - optional word before (capture group 1) - // - \s* - optional whitespace before markers (to preserve spaces like "Hello **" or "Hello __") - // - (\*\*|__) - opening markers: ** or __ (capture group 2, used as \2 for closing) + // Patterns to detect for bold (** and __) and italic (* and _) syntax: + // Bold: ** text**, **text **, word** text**, ** text ** + // Italic: * text*, *text *, word* text*, * text * + // Same patterns for underscore variants + + // Combined regex to match all malformed bold and italic patterns + // We match bold first (longer patterns), then italic (shorter patterns) + // Pattern: ([^*_\s]+)?\s*(\*\*|__|\*|_)(?:\s+([^*_\n]+?)\s*\2|([^*_\n]+?)\s+\2)(\S|$)? + // - ([^*_\s]+)? - optional word before, excluding * and _ (capture group 1) + // This ensures we don't match "Hello*" as a word when we have "Hello**" + // - \s* - optional whitespace before markers + // - (\*\*|__|\*|_) - opening markers: **, __, *, or _ (capture group 2, used as \2 for closing) + // Note: Order matters - longer patterns (**, __) are matched before shorter ones (*, _) // - (?:...) - alternation: // - \s+([^*_\n]+?)\s*\2 - space after opening, content, optional space before closing (group 3) // - OR ([^*_\n]+?)\s+\2 - content, space before closing (group 4) // - (\S|$)? - optional non-whitespace after or end of string (capture group 5) - const malformedBoldRegex = /(\S+)?\s*(\*\*|__)(?:\s+([^*_\n]+?)\s*\2|([^*_\n]+?)\s+\2)(\S|$)?/g; + const malformedEmphasisRegex = /([^*_\s]+)?\s*(\*\*|__|\*|_)(?:\s+([^*_\n]+?)\s*\2|([^*_\n]+?)\s+\2)(\S|$)?/g; - const matches = [...text.matchAll(malformedBoldRegex)]; + const matches = [...text.matchAll(malformedEmphasisRegex)]; if (matches.length === 0) return; - const parts: (Strong | Text)[] = []; + const parts: (Emphasis | Strong | Text)[] = []; let lastIndex = 0; matches.forEach(match => { @@ -64,14 +67,17 @@ const normalizeEmphasisAST: Plugin = () => (tree: Root) => { } } - const wordBefore = match[1]; // e.g., "Hello" in "Hello** Wrong Bold**" or "Hello" in "Hello ** World**" - const marker = match[2]; // Either "**" or "__" + const wordBefore = match[1]; // e.g., "Hello" in "Hello** Wrong Bold**" or "Hello* Wrong Italic*" + const marker = match[2]; // Either "**", "__", "*", or "_" const contentWithSpaceAfter = match[3]; // Content when there's a space after opening markers const contentWithSpaceBefore = match[4]; // Content when there's only a space before closing markers - const content = (contentWithSpaceAfter || contentWithSpaceBefore || '').trim(); // The bold content, trimmed + const content = (contentWithSpaceAfter || contentWithSpaceBefore || '').trim(); // The content, trimmed const afterChar = match[5]; // Character after closing markers (if any) - // Find position of opening markers (** or __) + // Determine if this is bold (double markers) or italic (single markers) + const isBold = marker === '**' || marker === '__'; + + // Find position of opening markers const markerPos = fullMatch.indexOf(marker); const spacesBeforeMarkers = wordBefore ? fullMatch.slice(wordBefore.length, markerPos) @@ -92,13 +98,20 @@ const normalizeEmphasisAST: Plugin = () => (tree: Root) => { parts.push({ type: 'text', value: spacesBeforeMarkers } satisfies Text); } // Note: We don't add a space when there's no word before, even if there was a space after opening markers - // This matches the behavior where "** text **" should become just a strong node, no leading space + // This matches the behavior where "** text **" or "* text *" should become just a strong/emphasis node, no leading space if (content) { - parts.push({ - type: 'strong', - children: [{ type: 'text', value: content } satisfies Text], - } satisfies Strong); + if (isBold) { + parts.push({ + type: 'strong', + children: [{ type: 'text', value: content } satisfies Text], + } satisfies Strong); + } else { + parts.push({ + type: 'emphasis', + children: [{ type: 'text', value: content } satisfies Text], + } satisfies Emphasis); + } } if (afterChar) { From bbd97ce69be6843034c99630af25dcf323f5f679 Mon Sep 17 00:00:00 2001 From: Maximilian Falco Widjaya Date: Mon, 22 Dec 2025 13:45:43 +0700 Subject: [PATCH 04/10] chore: code cleanup --- .../mdxish/normalize-malformed-md-syntax.ts | 28 ++----------------- 1 file changed, 2 insertions(+), 26 deletions(-) diff --git a/processor/transform/mdxish/normalize-malformed-md-syntax.ts b/processor/transform/mdxish/normalize-malformed-md-syntax.ts index 9d6ad7e7e..1b4f24c99 100644 --- a/processor/transform/mdxish/normalize-malformed-md-syntax.ts +++ b/processor/transform/mdxish/normalize-malformed-md-syntax.ts @@ -34,22 +34,9 @@ const normalizeEmphasisAST: Plugin = () => (tree: Root) => { // Bold: ** text**, **text **, word** text**, ** text ** // Italic: * text*, *text *, word* text*, * text * // Same patterns for underscore variants + const malformedRegex = /([^*_\s]+)?\s*(\*\*|__|\*|_)(?:\s+([^*_\n]+?)\s*\2|([^*_\n]+?)\s+\2)(\S|$)?/g; - // Combined regex to match all malformed bold and italic patterns - // We match bold first (longer patterns), then italic (shorter patterns) - // Pattern: ([^*_\s]+)?\s*(\*\*|__|\*|_)(?:\s+([^*_\n]+?)\s*\2|([^*_\n]+?)\s+\2)(\S|$)? - // - ([^*_\s]+)? - optional word before, excluding * and _ (capture group 1) - // This ensures we don't match "Hello*" as a word when we have "Hello**" - // - \s* - optional whitespace before markers - // - (\*\*|__|\*|_) - opening markers: **, __, *, or _ (capture group 2, used as \2 for closing) - // Note: Order matters - longer patterns (**, __) are matched before shorter ones (*, _) - // - (?:...) - alternation: - // - \s+([^*_\n]+?)\s*\2 - space after opening, content, optional space before closing (group 3) - // - OR ([^*_\n]+?)\s+\2 - content, space before closing (group 4) - // - (\S|$)? - optional non-whitespace after or end of string (capture group 5) - const malformedEmphasisRegex = /([^*_\s]+)?\s*(\*\*|__|\*|_)(?:\s+([^*_\n]+?)\s*\2|([^*_\n]+?)\s+\2)(\S|$)?/g; - - const matches = [...text.matchAll(malformedEmphasisRegex)]; + const matches = [...text.matchAll(malformedRegex)]; if (matches.length === 0) return; const parts: (Emphasis | Strong | Text)[] = []; @@ -59,7 +46,6 @@ const normalizeEmphasisAST: Plugin = () => (tree: Root) => { const matchIndex = match.index ?? 0; const fullMatch = match[0]; - // Add text before the match if (matchIndex > lastIndex) { const beforeText = text.slice(lastIndex, matchIndex); if (beforeText) { @@ -77,29 +63,19 @@ const normalizeEmphasisAST: Plugin = () => (tree: Root) => { // Determine if this is bold (double markers) or italic (single markers) const isBold = marker === '**' || marker === '__'; - // Find position of opening markers const markerPos = fullMatch.indexOf(marker); const spacesBeforeMarkers = wordBefore ? fullMatch.slice(wordBefore.length, markerPos) : fullMatch.slice(0, markerPos); - // If there's a space after the opening markers (group 3), we should add a space before the word - // BUT only if there's actually a word before AND no spaces already exist before the markers - // If there's only a space before the closing markers (group 4), we should NOT add a space - // If spaces already exist before markers (like "Hello **"), we should preserve them and NOT add another const shouldAddSpace = !!contentWithSpaceAfter && !!wordBefore && !spacesBeforeMarkers; if (wordBefore) { - // Preserve spacing before markers, and add space only if there was one after opening markers - // and no spaces already exist before the markers const spacing = spacesBeforeMarkers + (shouldAddSpace ? ' ' : ''); parts.push({ type: 'text', value: wordBefore + spacing } satisfies Text); } else if (spacesBeforeMarkers) { parts.push({ type: 'text', value: spacesBeforeMarkers } satisfies Text); } - // Note: We don't add a space when there's no word before, even if there was a space after opening markers - // This matches the behavior where "** text **" or "* text *" should become just a strong/emphasis node, no leading space - if (content) { if (isBold) { parts.push({ From 7a6e4e63f89ba5ac5e928842a39fcbf7ad28c05e Mon Sep 17 00:00:00 2001 From: Maximilian Falco Widjaya Date: Tue, 23 Dec 2025 17:17:52 +0700 Subject: [PATCH 05/10] tests: add some more edge cases related to snake_case --- .../normalize-malformed-md-syntax.test.ts | 446 +++++++++++++++++- 1 file changed, 445 insertions(+), 1 deletion(-) diff --git a/__tests__/transformers/normalize-malformed-md-syntax.test.ts b/__tests__/transformers/normalize-malformed-md-syntax.test.ts index 4bbb7f3f3..a0f815e62 100644 --- a/__tests__/transformers/normalize-malformed-md-syntax.test.ts +++ b/__tests__/transformers/normalize-malformed-md-syntax.test.ts @@ -1,12 +1,26 @@ -import type { Code, Emphasis, Paragraph, Strong, Text } from 'mdast'; +import type { + Blockquote, + Code, + Emphasis, + List, + ListItem, + Paragraph, + Strong, + Table, + TableCell, + TableRow, + Text, +} from 'mdast'; import { remark } from 'remark'; +import remarkGfm from 'remark-gfm'; import remarkParse from 'remark-parse'; import { removePosition } from 'unist-util-remove-position'; import normalizeEmphasisAST from '../../processor/transform/mdxish/normalize-malformed-md-syntax'; const processor = remark().use(remarkParse).use(normalizeEmphasisAST); +const processorWithGfm = remark().use(remarkParse).use(remarkGfm).use(normalizeEmphasisAST); describe('normalize-malformed-md-syntax', () => { describe('bold patterns with spaces', () => { @@ -671,4 +685,434 @@ describe('normalize-malformed-md-syntax', () => { }); }); }); + + describe('should not modify escaped markers', () => { + it('should leave escaped underscore untouched', () => { + const md = '\\_ not italic_'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const emphasis = paragraph.children.find((c): c is Emphasis => c.type === 'emphasis'); + + expect(emphasis).toBeDefined(); + expect(emphasis?.children[0]).toStrictEqual({ + type: 'text', + value: 'not italic', + }); + }); + + it('should leave escaped asterisk untouched', () => { + const md = '\\* not bold*'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const emphasis = paragraph.children.find((c): c is Emphasis => c.type === 'emphasis'); + + expect(emphasis).toBeDefined(); + expect(emphasis?.children[0]).toStrictEqual({ + type: 'text', + value: 'not bold', + }); + }); + + it('should leave escaped double asterisk untouched', () => { + const md = '\\*\\* not bold**'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const strong = paragraph.children.find((c): c is Strong => c.type === 'strong'); + + expect(strong).toBeDefined(); + expect(strong?.children[0]).toStrictEqual({ + type: 'text', + value: 'not bold', + }); + }); + + it('should leave escaped double underscore untouched', () => { + const md = '\\_\\_ not bold__'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const strong = paragraph.children.find((c): c is Strong => c.type === 'strong'); + + expect(strong).toBeDefined(); + expect(strong?.children[0]).toStrictEqual({ + type: 'text', + value: 'not bold', + }); + }); + + it('should handle multiple escaped markers', () => { + const md = 'Text with \\* asterisk and \\_ underscore and \\*\\* double asterisk'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + expect(paragraph.type).toBe('paragraph'); + expect(paragraph.children.length).toBeGreaterThan(0); + const textNodes = paragraph.children.filter((c): c is Text => c.type === 'text'); + const allText = textNodes.map(t => t.value).join(''); + expect(allText.length).toBeGreaterThan(0); + }); + }); + + describe('malformed syntax in callouts', () => { + it('should handle malformed bold in callout content', () => { + const md = '> 👍 Success\n>\n> This is ** Wrong Bold** text'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const blockquote = tree.children[0] as Blockquote; + expect(blockquote.type).toBe('blockquote'); + const paragraph = blockquote.children[blockquote.children.length - 1] as Paragraph; + const strong = paragraph.children.find((c): c is Strong => c.type === 'strong'); + + expect(strong).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'Wrong Bold' }], + }); + }); + + it('should handle malformed italic in callout content', () => { + const md = '> 📘 Info\n>\n> This is * Wrong Italic* text'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const blockquote = tree.children[0] as Blockquote; + expect(blockquote.type).toBe('blockquote'); + const paragraph = blockquote.children[blockquote.children.length - 1] as Paragraph; + const emphasis = paragraph.children.find((c): c is Emphasis => c.type === 'emphasis'); + + expect(emphasis).toStrictEqual({ + type: 'emphasis', + children: [{ type: 'text', value: 'Wrong Italic' }], + }); + }); + + it('should handle malformed bold with word before in callout', () => { + const md = '> ⚠️ Warning\n>\n> Find** Hello World** and click'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const blockquote = tree.children[0] as Blockquote; + expect(blockquote.type).toBe('blockquote'); + const paragraph = blockquote.children[blockquote.children.length - 1] as Paragraph; + const strong = paragraph.children.find((c): c is Strong => c.type === 'strong'); + + expect(strong).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'Hello World' }], + }); + }); + + it('should handle multiple malformed patterns in callout', () => { + const md = '> ❗ Error\n>\n> Start** first** middle* second *end'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const blockquote = tree.children[0] as Blockquote; + expect(blockquote.type).toBe('blockquote'); + const paragraph = blockquote.children[blockquote.children.length - 1] as Paragraph; + const strongNodes = paragraph.children.filter((c): c is Strong => c.type === 'strong'); + const emphasisNodes = paragraph.children.filter((c): c is Emphasis => c.type === 'emphasis'); + + expect(strongNodes.length).toBeGreaterThanOrEqual(1); + expect(emphasisNodes.length).toBeGreaterThanOrEqual(1); + }); + }); + + describe('malformed syntax in tables', () => { + it('should handle malformed bold in table cells', () => { + const md = '| Header |\n|--------|\n| ** Wrong Bold** |'; + const tree = processorWithGfm.parse(md); + processorWithGfm.runSync(tree); + removePosition(tree, { force: true }); + + const table = tree.children[0] as Table; + expect(table.type).toBe('table'); + const row = table.children[1] as TableRow; + expect(row.type).toBe('tableRow'); + const cell = row.children[0] as TableCell; + expect(cell.type).toBe('tableCell'); + const strong = cell.children.find((c): c is Strong => c.type === 'strong'); + expect(strong).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'Wrong Bold' }], + }); + }); + + it('should handle malformed italic in table cells', () => { + const md = '| Header |\n|--------|\n| * Wrong Italic* |'; + const tree = processorWithGfm.parse(md); + processorWithGfm.runSync(tree); + removePosition(tree, { force: true }); + + const table = tree.children[0] as Table; + expect(table.type).toBe('table'); + const row = table.children[1] as TableRow; + expect(row.type).toBe('tableRow'); + const cell = row.children[0] as TableCell; + expect(cell.type).toBe('tableCell'); + const emphasis = cell.children.find((c): c is Emphasis => c.type === 'emphasis'); + expect(emphasis).toStrictEqual({ + type: 'emphasis', + children: [{ type: 'text', value: 'Wrong Italic' }], + }); + }); + + it('should handle malformed bold with word before in table cell', () => { + const md = '| Column |\n|--------|\n| Find** Hello** text |'; + const tree = processorWithGfm.parse(md); + processorWithGfm.runSync(tree); + removePosition(tree, { force: true }); + + const table = tree.children[0] as Table; + expect(table.type).toBe('table'); + const row = table.children[1] as TableRow; + expect(row.type).toBe('tableRow'); + const cell = row.children[0] as TableCell; + expect(cell.type).toBe('tableCell'); + const strong = cell.children.find((c): c is Strong => c.type === 'strong'); + expect(strong).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'Hello' }], + }); + }); + + it('should handle malformed syntax in multiple table cells', () => { + const md = '| Col1 | Col2 |\n|------|------|\n| ** Bold** | * Italic* |'; + const tree = processorWithGfm.parse(md); + processorWithGfm.runSync(tree); + removePosition(tree, { force: true }); + + const table = tree.children[0] as Table; + expect(table.type).toBe('table'); + const row = table.children[1] as TableRow; + expect(row.type).toBe('tableRow'); + const cells = row.children.filter((c): c is TableCell => c.type === 'tableCell'); + expect(cells.length).toBeGreaterThanOrEqual(2); + + const firstCell = cells[0]; + const strong = firstCell.children.find((c): c is Strong => c.type === 'strong'); + expect(strong).toBeDefined(); + + const secondCell = cells[1]; + const emphasis = secondCell.children.find((c): c is Emphasis => c.type === 'emphasis'); + expect(emphasis).toBeDefined(); + }); + }); + + describe('malformed syntax in lists', () => { + it('should handle malformed bold in list items', () => { + const md = '- Item with ** Wrong Bold** text'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const list = tree.children[0] as List; + expect(list.type).toBe('list'); + const listItem = list.children[0] as ListItem; + expect(listItem.type).toBe('listItem'); + const paragraph = listItem.children[0] as Paragraph; + const strong = paragraph.children.find((c): c is Strong => c.type === 'strong'); + expect(strong).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'Wrong Bold' }], + }); + }); + + it('should handle malformed italic in list items', () => { + const md = '- Item with * Wrong Italic* text'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const list = tree.children[0] as List; + expect(list.type).toBe('list'); + const listItem = list.children[0] as ListItem; + expect(listItem.type).toBe('listItem'); + const paragraph = listItem.children[0] as Paragraph; + const emphasis = paragraph.children.find((c): c is Emphasis => c.type === 'emphasis'); + expect(emphasis).toStrictEqual({ + type: 'emphasis', + children: [{ type: 'text', value: 'Wrong Italic' }], + }); + }); + + it('should handle malformed syntax in nested list items', () => { + const md = '- Outer\n - Inner with ** Wrong Bold**'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const list = tree.children[0] as List; + expect(list.type).toBe('list'); + const outerItem = list.children[0] as ListItem; + expect(outerItem.type).toBe('listItem'); + expect(outerItem.children.length).toBeGreaterThan(1); + const nestedList = outerItem.children[1] as List; + expect(nestedList.type).toBe('list'); + const innerItem = nestedList.children[0] as ListItem; + expect(innerItem.type).toBe('listItem'); + const paragraph = innerItem.children[0] as Paragraph; + const strong = paragraph.children.find((c): c is Strong => c.type === 'strong'); + expect(strong).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'Wrong Bold' }], + }); + }); + }); + + describe('edge cases with nested syntax', () => { + it('should have malformed italic inside valid bold', () => { + const md = '**Bold with _ malformed italic_ inside**'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const strong = paragraph.children.find((c): c is Strong => c.type === 'strong'); + + // Should have valid bold + expect(strong).toBeDefined(); + expect(strong?.children.length).toBeGreaterThan(0); + // The malformed italic inside should be processed into an emphasis node + const emphasisInside = strong?.children.find((c): c is Emphasis => c.type === 'emphasis'); + expect(emphasisInside).toBeDefined(); + expect(emphasisInside?.children[0]).toStrictEqual({ + type: 'text', + value: 'malformed italic', + }); + }); + + it('should handle malformed bold with snake_case content', () => { + const md = '** some_snake_case**'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const strong = paragraph.children.find((c): c is Strong => c.type === 'strong'); + + expect(strong).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'some_snake_case' }], + }); + }); + + it('should handle malformed bold with multiple underscores in content', () => { + const md = '** some_snake_case_with_many_underscores**'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const strong = paragraph.children.find((c): c is Strong => c.type === 'strong'); + + expect(strong).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'some_snake_case_with_many_underscores' }], + }); + }); + + it('should handle malformed italic with snake_case content', () => { + const md = 'Text with * some_snake_case* here'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const emphasis = paragraph.children.find((c): c is Emphasis => c.type === 'emphasis'); + + expect(emphasis).toStrictEqual({ + type: 'emphasis', + children: [{ type: 'text', value: 'some_snake_case' }], + }); + }); + + it('should handle malformed bold with word before and snake_case content', () => { + const md = 'Find** some_snake_case** and click'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Find ' }, + { + type: 'strong', + children: [{ type: 'text', value: 'some_snake_case' }], + }, + { type: 'text', value: ' and click' }, + ], + }); + }); + + it('should handle malformed underscore italic inside valid asterisk bold', () => { + const md = '**Bold with _ malformed_ inside**'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const strong = paragraph.children.find((c): c is Strong => c.type === 'strong'); + + expect(strong).toBeDefined(); + expect(strong?.children.length).toBeGreaterThan(0); + const emphasisInside = strong?.children.find((c): c is Emphasis => c.type === 'emphasis'); + expect(emphasisInside).toBeDefined(); + expect(emphasisInside?.children[0]).toStrictEqual({ + type: 'text', + value: 'malformed', + }); + }); + + it('should handle malformed bold with mixed underscores and spaces', () => { + const md = '** some_snake_case with spaces**'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const strong = paragraph.children.find((c): c is Strong => c.type === 'strong'); + + expect(strong).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'some_snake_case with spaces' }], + }); + }); + + it('should handle malformed bold with underscore in word before', () => { + const md = 'some_word** Bold**'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'some_word ' }, + { + type: 'strong', + children: [{ type: 'text', value: 'Bold' }], + }, + ], + }); + }); + }); }); From 0d24e18b0175ecd0ce621c3d86b7dec2c0b19ec9 Mon Sep 17 00:00:00 2001 From: Maximilian Falco Widjaya Date: Tue, 23 Dec 2025 17:18:59 +0700 Subject: [PATCH 06/10] add some more tests regarding trailing spaces --- .../normalize-malformed-md-syntax.test.ts | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/__tests__/transformers/normalize-malformed-md-syntax.test.ts b/__tests__/transformers/normalize-malformed-md-syntax.test.ts index a0f815e62..a03599b04 100644 --- a/__tests__/transformers/normalize-malformed-md-syntax.test.ts +++ b/__tests__/transformers/normalize-malformed-md-syntax.test.ts @@ -96,6 +96,92 @@ describe('normalize-malformed-md-syntax', () => { }); }); + it('should NOT add space before punctuation when no trailing space before closing markers', () => { + const md = 'This is ** bold**!'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + // No space before ! because there was no space before closing ** + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'This ' }, + { type: 'text', value: 'is ' }, + { + type: 'strong', + children: [{ type: 'text', value: 'bold' }], + }, + { type: 'text', value: '!' }, + ], + }); + }); + + it('should preserve space before punctuation when trailing space before closing markers', () => { + const md = 'This is ** bold **!'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + // Space before ! because there was a space before closing ** + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'This ' }, + { type: 'text', value: 'is ' }, + { + type: 'strong', + children: [{ type: 'text', value: 'bold' }], + }, + { type: 'text', value: ' !' }, + ], + }); + }); + + it('should preserve space before word when trailing space before closing markers', () => { + const md = 'This is ** bold **Hello'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + // Space before H because there was a space before closing ** + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'This ' }, + { type: 'text', value: 'is ' }, + { + type: 'strong', + children: [{ type: 'text', value: 'bold' }], + }, + { type: 'text', value: ' H' }, + { type: 'text', value: 'ello' }, + ], + }); + }); + + it('should NOT add space before word when no trailing space before closing markers', () => { + const md = 'This is ** bold**Hello'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + // No space before H because there was no space before closing ** + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'This ' }, + { type: 'text', value: 'is ' }, + { + type: 'strong', + children: [{ type: 'text', value: 'bold' }], + }, + { type: 'text', value: 'H' }, + { type: 'text', value: 'ello' }, + ], + }); + }); + it('should handle multiple malformed bold patterns in one text', () => { const md = 'Start** first** middle** second **end'; const tree = processor.parse(md); From 335140e2c030913067159bca161b770bc074ddd3 Mon Sep 17 00:00:00 2001 From: Maximilian Falco Widjaya Date: Tue, 23 Dec 2025 17:19:47 +0700 Subject: [PATCH 07/10] fix: add new logic to support underscores and snake_case - also fix the trailing space issue --- .../mdxish/normalize-malformed-md-syntax.ts | 85 ++++++++++++++----- 1 file changed, 65 insertions(+), 20 deletions(-) diff --git a/processor/transform/mdxish/normalize-malformed-md-syntax.ts b/processor/transform/mdxish/normalize-malformed-md-syntax.ts index 1b4f24c99..eec0a1055 100644 --- a/processor/transform/mdxish/normalize-malformed-md-syntax.ts +++ b/processor/transform/mdxish/normalize-malformed-md-syntax.ts @@ -10,6 +10,7 @@ import { visit } from 'unist-util-visit'; * and converts them to proper strong/emphasis nodes, matching the behavior of the legacy rdmd engine. * * Supports both asterisk (`**bold**`, `*italic*`) and underscore (`__bold__`, `_italic_`) syntax. + * Also supports snake_case content like `** some_snake_case**`. * * This runs after remark-parse, which (in v11+) is strict and doesn't parse * malformed emphasis syntax. This plugin post-processes the AST to handle these cases. @@ -18,13 +19,8 @@ const normalizeEmphasisAST: Plugin = () => (tree: Root) => { visit(tree, 'text', (node: Text, index, parent: Parent) => { if (index === undefined || !parent) return; - // Skip if inside code blocks, inline code, or already inside strong/emphasis - if ( - parent.type === 'inlineCode' || - parent.type === 'code' || - parent.type === 'strong' || - parent.type === 'emphasis' - ) { + // Skip if inside code blocks or inline code + if (parent.type === 'inlineCode' || parent.type === 'code') { return; } @@ -34,15 +30,64 @@ const normalizeEmphasisAST: Plugin = () => (tree: Root) => { // Bold: ** text**, **text **, word** text**, ** text ** // Italic: * text*, *text *, word* text*, * text * // Same patterns for underscore variants - const malformedRegex = /([^*_\s]+)?\s*(\*\*|__|\*|_)(?:\s+([^*_\n]+?)\s*\2|([^*_\n]+?)\s+\2)(\S|$)?/g; + // We use separate patterns for each marker type to allow this flexibility. - const matches = [...text.matchAll(malformedRegex)]; - if (matches.length === 0) return; + // Pattern for ** bold ** + // Groups: 1=wordBefore, 2=marker, 3=contentWithSpaceAfter, 4=trailingSpace1, 5=contentWithSpaceBefore, 6=trailingSpace2, 7=afterChar + // trailingSpace1 is for "** text **" pattern, trailingSpace2 is for "**text **" pattern + const asteriskBoldRegex = /([^*\s]+)?\s*(\*\*)(?:\s+([^*\n]+?)(\s*)\2|([^*\n]+?)(\s+)\2)(\S|$)?/g; + + // Pattern for __ bold __ + const underscoreBoldRegex = /([^_\s]+)?\s*(__)(?:\s+([^_\n]+?)(\s*)\2|([^_\n]+?)(\s+)\2)(\S|$)?/g; + + // Pattern for * italic * + const asteriskItalicRegex = /([^*\s]+)?\s*(\*)(?!\*)(?:\s+([^*\n]+?)(\s*)\2|([^*\n]+?)(\s+)\2)(\S|$)?/g; + + // Pattern for _ italic _ + const underscoreItalicRegex = /([^_\s]+)?\s*(_)(?!_)(?:\s+([^_\n]+?)(\s*)\2|([^_\n]+?)(\s+)\2)(\S|$)?/g; + + interface MatchInfo { + isBold: boolean; + marker: string; + match: RegExpMatchArray; + } + + const allMatches: MatchInfo[] = []; + + [...text.matchAll(asteriskBoldRegex)].forEach(match => { + allMatches.push({ isBold: true, marker: '**', match }); + }); + [...text.matchAll(underscoreBoldRegex)].forEach(match => { + allMatches.push({ isBold: true, marker: '__', match }); + }); + [...text.matchAll(asteriskItalicRegex)].forEach(match => { + allMatches.push({ isBold: false, marker: '*', match }); + }); + [...text.matchAll(underscoreItalicRegex)].forEach(match => { + allMatches.push({ isBold: false, marker: '_', match }); + }); + + if (allMatches.length === 0) return; + + allMatches.sort((a, b) => (a.match.index ?? 0) - (b.match.index ?? 0)); + + const filteredMatches: MatchInfo[] = []; + let lastEnd = 0; + allMatches.forEach(info => { + const start = info.match.index ?? 0; + const end = start + info.match[0].length; + if (start >= lastEnd) { + filteredMatches.push(info); + lastEnd = end; + } + }); + + if (filteredMatches.length === 0) return; const parts: (Emphasis | Strong | Text)[] = []; let lastIndex = 0; - matches.forEach(match => { + filteredMatches.forEach(({ match, marker, isBold }) => { const matchIndex = match.index ?? 0; const fullMatch = match[0]; @@ -53,15 +98,14 @@ const normalizeEmphasisAST: Plugin = () => (tree: Root) => { } } - const wordBefore = match[1]; // e.g., "Hello" in "Hello** Wrong Bold**" or "Hello* Wrong Italic*" - const marker = match[2]; // Either "**", "__", "*", or "_" + const wordBefore = match[1]; // e.g., "Hello" in "Hello** Wrong Bold**" const contentWithSpaceAfter = match[3]; // Content when there's a space after opening markers - const contentWithSpaceBefore = match[4]; // Content when there's only a space before closing markers - const content = (contentWithSpaceAfter || contentWithSpaceBefore || '').trim(); // The content, trimmed - const afterChar = match[5]; // Character after closing markers (if any) - - // Determine if this is bold (double markers) or italic (single markers) - const isBold = marker === '**' || marker === '__'; + const trailingSpace1 = match[4] || ''; // Space before closing markers (for "** text **" pattern) + const contentWithSpaceBefore = match[5]; // Content when there's only a space before closing markers + const trailingSpace2 = match[6] || ''; // Space before closing markers (for "**text **" pattern) + const trailingSpace = trailingSpace1 || trailingSpace2; // Combined trailing space + const content = (contentWithSpaceAfter || contentWithSpaceBefore || '').trim(); + const afterChar = match[7]; // Character after closing markers (if any) const markerPos = fullMatch.indexOf(marker); const spacesBeforeMarkers = wordBefore @@ -91,7 +135,8 @@ const normalizeEmphasisAST: Plugin = () => (tree: Root) => { } if (afterChar) { - parts.push({ type: 'text', value: ` ${afterChar}` } satisfies Text); + const prefix = trailingSpace ? ' ' : ''; + parts.push({ type: 'text', value: prefix + afterChar } satisfies Text); } lastIndex = matchIndex + fullMatch.length; From fbfd1a81c92b70f0e9f78484d83b8d1bb428e7f4 Mon Sep 17 00:00:00 2001 From: Maximilian Falco Widjaya Date: Tue, 23 Dec 2025 17:34:12 +0700 Subject: [PATCH 08/10] fix: fixed a bug where escaped markers are incorrectly rendered --- .../normalize-malformed-md-syntax.test.ts | 21 +++++++++++++++++++ .../mdxish/normalize-malformed-md-syntax.ts | 6 ++++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/__tests__/transformers/normalize-malformed-md-syntax.test.ts b/__tests__/transformers/normalize-malformed-md-syntax.test.ts index a03599b04..1a196d668 100644 --- a/__tests__/transformers/normalize-malformed-md-syntax.test.ts +++ b/__tests__/transformers/normalize-malformed-md-syntax.test.ts @@ -138,6 +138,27 @@ describe('normalize-malformed-md-syntax', () => { }); }); + it('should handle escaped asterisk in content', () => { + const md = 'This is ** bo\\*ld**!'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + // Escaped asterisk should be preserved in content + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'This ' }, + { type: 'text', value: 'is ' }, + { + type: 'strong', + children: [{ type: 'text', value: 'bo*ld' }], + }, + { type: 'text', value: '!' }, + ], + }); + }); + it('should preserve space before word when trailing space before closing markers', () => { const md = 'This is ** bold **Hello'; const tree = processor.parse(md); diff --git a/processor/transform/mdxish/normalize-malformed-md-syntax.ts b/processor/transform/mdxish/normalize-malformed-md-syntax.ts index eec0a1055..59d5b9db1 100644 --- a/processor/transform/mdxish/normalize-malformed-md-syntax.ts +++ b/processor/transform/mdxish/normalize-malformed-md-syntax.ts @@ -35,10 +35,12 @@ const normalizeEmphasisAST: Plugin = () => (tree: Root) => { // Pattern for ** bold ** // Groups: 1=wordBefore, 2=marker, 3=contentWithSpaceAfter, 4=trailingSpace1, 5=contentWithSpaceBefore, 6=trailingSpace2, 7=afterChar // trailingSpace1 is for "** text **" pattern, trailingSpace2 is for "**text **" pattern - const asteriskBoldRegex = /([^*\s]+)?\s*(\*\*)(?:\s+([^*\n]+?)(\s*)\2|([^*\n]+?)(\s+)\2)(\S|$)?/g; + const asteriskBoldRegex = + /([^*\s]+)?\s*(\*\*)(?:\s+((?:[^*\n]|\*(?!\*))+?)(\s*)\2|((?:[^*\n]|\*(?!\*))+?)(\s+)\2)(\S|$)?/g; // Pattern for __ bold __ - const underscoreBoldRegex = /([^_\s]+)?\s*(__)(?:\s+([^_\n]+?)(\s*)\2|([^_\n]+?)(\s+)\2)(\S|$)?/g; + const underscoreBoldRegex = + /([^_\s]+)?\s*(__)(?:\s+((?:[^_\n]|_(?!_))+?)(\s*)\2|((?:[^_\n]|_(?!_))+?)(\s+)\2)(\S|$)?/g; // Pattern for * italic * const asteriskItalicRegex = /([^*\s]+)?\s*(\*)(?!\*)(?:\s+([^*\n]+?)(\s*)\2|([^*\n]+?)(\s+)\2)(\S|$)?/g; From 9d5ab905a559c7e1b1ab2f4409190a29ab3c21b1 Mon Sep 17 00:00:00 2001 From: Maximilian Falco Widjaya Date: Tue, 23 Dec 2025 17:39:17 +0700 Subject: [PATCH 09/10] fix: make sure plugin returns index and what to skip - returns new index to prevent the plugin from revisiting them --- .../mdxish/normalize-malformed-md-syntax.ts | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/processor/transform/mdxish/normalize-malformed-md-syntax.ts b/processor/transform/mdxish/normalize-malformed-md-syntax.ts index 59d5b9db1..79c00f49c 100644 --- a/processor/transform/mdxish/normalize-malformed-md-syntax.ts +++ b/processor/transform/mdxish/normalize-malformed-md-syntax.ts @@ -1,7 +1,7 @@ import type { Emphasis, Parent, Root, Strong, Text } from 'mdast'; import type { Plugin } from 'unified'; -import { visit } from 'unist-util-visit'; +import { SKIP, visit } from 'unist-util-visit'; /** * A remark plugin that normalizes malformed bold and italic markers in text nodes. @@ -16,12 +16,12 @@ import { visit } from 'unist-util-visit'; * malformed emphasis syntax. This plugin post-processes the AST to handle these cases. */ const normalizeEmphasisAST: Plugin = () => (tree: Root) => { - visit(tree, 'text', (node: Text, index, parent: Parent) => { - if (index === undefined || !parent) return; + visit(tree, 'text', function visitor(node: Text, index, parent: Parent) { + if (index === undefined || !parent) return undefined; // Skip if inside code blocks or inline code if (parent.type === 'inlineCode' || parent.type === 'code') { - return; + return undefined; } const text = node.value; @@ -69,7 +69,7 @@ const normalizeEmphasisAST: Plugin = () => (tree: Root) => { allMatches.push({ isBold: false, marker: '_', match }); }); - if (allMatches.length === 0) return; + if (allMatches.length === 0) return undefined; allMatches.sort((a, b) => (a.match.index ?? 0) - (b.match.index ?? 0)); @@ -84,7 +84,7 @@ const normalizeEmphasisAST: Plugin = () => (tree: Root) => { } }); - if (filteredMatches.length === 0) return; + if (filteredMatches.length === 0) return undefined; const parts: (Emphasis | Strong | Text)[] = []; let lastIndex = 0; @@ -153,7 +153,10 @@ const normalizeEmphasisAST: Plugin = () => (tree: Root) => { if (parts.length > 0) { parent.children.splice(index, 1, ...parts); + return [SKIP, index + parts.length]; } + + return undefined; }); return tree; From 0b42e1bf93f7a5b44b60887229e9a99ca803b6ef Mon Sep 17 00:00:00 2001 From: Maximilian Falco Widjaya Date: Wed, 24 Dec 2025 03:05:21 +0700 Subject: [PATCH 10/10] chore: move regex creation up the module scope --- .../mdxish/normalize-malformed-md-syntax.ts | 43 +++++++++---------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/processor/transform/mdxish/normalize-malformed-md-syntax.ts b/processor/transform/mdxish/normalize-malformed-md-syntax.ts index 79c00f49c..ca0aa896b 100644 --- a/processor/transform/mdxish/normalize-malformed-md-syntax.ts +++ b/processor/transform/mdxish/normalize-malformed-md-syntax.ts @@ -3,6 +3,27 @@ import type { Plugin } from 'unified'; import { SKIP, visit } from 'unist-util-visit'; +// Patterns to detect for bold (** and __) and italic (* and _) syntax: +// Bold: ** text**, **text **, word** text**, ** text ** +// Italic: * text*, *text *, word* text*, * text * +// Same patterns for underscore variants +// We use separate patterns for each marker type to allow this flexibility. + +// Pattern for ** bold ** +// Groups: 1=wordBefore, 2=marker, 3=contentWithSpaceAfter, 4=trailingSpace1, 5=contentWithSpaceBefore, 6=trailingSpace2, 7=afterChar +// trailingSpace1 is for "** text **" pattern, trailingSpace2 is for "**text **" pattern +const asteriskBoldRegex = + /([^*\s]+)?\s*(\*\*)(?:\s+((?:[^*\n]|\*(?!\*))+?)(\s*)\2|((?:[^*\n]|\*(?!\*))+?)(\s+)\2)(\S|$)?/g; + +// Pattern for __ bold __ +const underscoreBoldRegex = /([^_\s]+)?\s*(__)(?:\s+((?:[^_\n]|_(?!_))+?)(\s*)\2|((?:[^_\n]|_(?!_))+?)(\s+)\2)(\S|$)?/g; + +// Pattern for * italic * +const asteriskItalicRegex = /([^*\s]+)?\s*(\*)(?!\*)(?:\s+([^*\n]+?)(\s*)\2|([^*\n]+?)(\s+)\2)(\S|$)?/g; + +// Pattern for _ italic _ +const underscoreItalicRegex = /([^_\s]+)?\s*(_)(?!_)(?:\s+([^_\n]+?)(\s*)\2|([^_\n]+?)(\s+)\2)(\S|$)?/g; + /** * A remark plugin that normalizes malformed bold and italic markers in text nodes. * Detects patterns like `** bold**`, `Hello** Wrong Bold**`, `__ bold__`, `Hello__ Wrong Bold__`, @@ -26,28 +47,6 @@ const normalizeEmphasisAST: Plugin = () => (tree: Root) => { const text = node.value; - // Patterns to detect for bold (** and __) and italic (* and _) syntax: - // Bold: ** text**, **text **, word** text**, ** text ** - // Italic: * text*, *text *, word* text*, * text * - // Same patterns for underscore variants - // We use separate patterns for each marker type to allow this flexibility. - - // Pattern for ** bold ** - // Groups: 1=wordBefore, 2=marker, 3=contentWithSpaceAfter, 4=trailingSpace1, 5=contentWithSpaceBefore, 6=trailingSpace2, 7=afterChar - // trailingSpace1 is for "** text **" pattern, trailingSpace2 is for "**text **" pattern - const asteriskBoldRegex = - /([^*\s]+)?\s*(\*\*)(?:\s+((?:[^*\n]|\*(?!\*))+?)(\s*)\2|((?:[^*\n]|\*(?!\*))+?)(\s+)\2)(\S|$)?/g; - - // Pattern for __ bold __ - const underscoreBoldRegex = - /([^_\s]+)?\s*(__)(?:\s+((?:[^_\n]|_(?!_))+?)(\s*)\2|((?:[^_\n]|_(?!_))+?)(\s+)\2)(\S|$)?/g; - - // Pattern for * italic * - const asteriskItalicRegex = /([^*\s]+)?\s*(\*)(?!\*)(?:\s+([^*\n]+?)(\s*)\2|([^*\n]+?)(\s+)\2)(\S|$)?/g; - - // Pattern for _ italic _ - const underscoreItalicRegex = /([^_\s]+)?\s*(_)(?!_)(?:\s+([^_\n]+?)(\s*)\2|([^_\n]+?)(\s+)\2)(\S|$)?/g; - interface MatchInfo { isBold: boolean; marker: string;