diff --git a/__tests__/lib/mdxish/mdxish.test.ts b/__tests__/lib/mdxish/mdxish.test.ts index f4c4f9417..7e13106e6 100644 --- a/__tests__/lib/mdxish/mdxish.test.ts +++ b/__tests__/lib/mdxish/mdxish.test.ts @@ -1,16 +1,43 @@ +import type { Root } from 'hast'; + import { mdxish } from '../../../lib/mdxish'; -describe('mdxish', () => { +describe('mdxish should render', () => { describe('invalid mdx syntax', () => { it('should render unclosed tags', () => { const md = '
'; expect(() => mdxish(md)).not.toThrow(); }); + }); - it('should render content in new lines', () => { - const md = `
hello -
`; - expect(() => mdxish(md)).not.toThrow(); + describe('relaxed md syntax, such as', () => { + it('wrong bold syntax', () => { + const md = `**Bold** + +Normal + +Hello** Wrong Bold**`; + const tree = mdxish(md); + + const getStrongTexts = (node: Root | Root['children'][number]): string[] => { + const texts: string[] = []; + if ('type' in node && node.type === 'element' && node.tagName === 'strong') { + const textNodes = + 'children' in node && Array.isArray(node.children) + ? node.children.filter(c => 'type' in c && c.type === 'text') + : []; + texts.push(textNodes.map(t => ('value' in t ? t.value : '')).join('')); + } + if ('children' in node && Array.isArray(node.children)) { + node.children.forEach(child => { + texts.push(...getStrongTexts(child)); + }); + } + return texts; + }; + + const strongTexts = getStrongTexts(tree); + expect(strongTexts.length).toBeGreaterThanOrEqual(2); }); }); -}); \ No newline at end of file +}); diff --git a/__tests__/transformers/normalize-malformed-md-syntax.test.ts b/__tests__/transformers/normalize-malformed-md-syntax.test.ts new file mode 100644 index 000000000..1a196d668 --- /dev/null +++ b/__tests__/transformers/normalize-malformed-md-syntax.test.ts @@ -0,0 +1,1225 @@ +import type { + Blockquote, + Code, + Emphasis, + List, + ListItem, + Paragraph, + Strong, + Table, + TableCell, + TableRow, + Text, +} from 'mdast'; + +import { remark } from 'remark'; +import remarkGfm from 'remark-gfm'; +import remarkParse from 'remark-parse'; +import { removePosition } from 'unist-util-remove-position'; + +import normalizeEmphasisAST from '../../processor/transform/mdxish/normalize-malformed-md-syntax'; + +const processor = remark().use(remarkParse).use(normalizeEmphasisAST); +const processorWithGfm = remark().use(remarkParse).use(remarkGfm).use(normalizeEmphasisAST); + +describe('normalize-malformed-md-syntax', () => { + describe('bold patterns with spaces', () => { + it('should handle space after opening ** (with word before)', () => { + 
const md = 'Hello** Wrong Bold**'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Hello ' }, + { + type: 'strong', + children: [{ type: 'text', value: 'Wrong Bold' }], + }, + ], + }); + }); + + it('should preserve multiple spaces before opening **', () => { + const md = 'Hello ** World**'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Hello ' }, + { + type: 'strong', + children: [{ type: 'text', value: 'World' }], + }, + ], + }); + }); + + it('should handle space before closing **', () => { + const md = '**text **word'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { + type: 'strong', + children: [{ type: 'text', value: 'text' }], + }, + { type: 'text', value: ' w' }, + { type: 'text', value: 'ord' }, + ], + }); + }); + + it('should handle spaces on both sides', () => { + const md = '** text **'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { + type: 'strong', + children: [{ type: 'text', value: 'text' }], + }, + ], + }); + }); + + it('should NOT add space before punctuation when no trailing space before closing markers', () => { + const md = 'This is ** bold**!'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + // No space before ! 
because there was no space before closing ** + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'This ' }, + { type: 'text', value: 'is ' }, + { + type: 'strong', + children: [{ type: 'text', value: 'bold' }], + }, + { type: 'text', value: '!' }, + ], + }); + }); + + it('should preserve space before punctuation when trailing space before closing markers', () => { + const md = 'This is ** bold **!'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + // Space before ! because there was a space before closing ** + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'This ' }, + { type: 'text', value: 'is ' }, + { + type: 'strong', + children: [{ type: 'text', value: 'bold' }], + }, + { type: 'text', value: ' !' }, + ], + }); + }); + + it('should handle escaped asterisk in content', () => { + const md = 'This is ** bo\\*ld**!'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + // Escaped asterisk should be preserved in content + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'This ' }, + { type: 'text', value: 'is ' }, + { + type: 'strong', + children: [{ type: 'text', value: 'bo*ld' }], + }, + { type: 'text', value: '!' 
}, + ], + }); + }); + + it('should preserve space before word when trailing space before closing markers', () => { + const md = 'This is ** bold **Hello'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + // Space before H because there was a space before closing ** + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'This ' }, + { type: 'text', value: 'is ' }, + { + type: 'strong', + children: [{ type: 'text', value: 'bold' }], + }, + { type: 'text', value: ' H' }, + { type: 'text', value: 'ello' }, + ], + }); + }); + + it('should NOT add space before word when no trailing space before closing markers', () => { + const md = 'This is ** bold**Hello'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + // No space before H because there was no space before closing ** + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'This ' }, + { type: 'text', value: 'is ' }, + { + type: 'strong', + children: [{ type: 'text', value: 'bold' }], + }, + { type: 'text', value: 'H' }, + { type: 'text', value: 'ello' }, + ], + }); + }); + + it('should handle multiple malformed bold patterns in one text', () => { + const md = 'Start** first** middle** second **end'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const children = paragraph.children; + const strongNodes = children.filter((c): c is Strong => c.type === 'strong'); + + expect(strongNodes).toHaveLength(2); + expect(strongNodes[0]).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'first' }], + }); + expect(strongNodes[1]).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'second' }], + }); + }); + + it('should handle complex case from migration tests', () => { + 
const md = 'Move to **Hello**> **World **from the top left menu'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const children = paragraph.children; + const strongNodes = children.filter((c): c is Strong => c.type === 'strong'); + + expect(strongNodes.length).toBeGreaterThanOrEqual(1); + const worldNode = strongNodes.find( + (n): n is Strong => + n.type === 'strong' && + Array.isArray(n.children) && + n.children[0]?.type === 'text' && + n.children[0].value === 'World', + ); + expect(worldNode).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'World' }], + }); + }); + + it('should handle case with word before and after', () => { + const md = 'Find** Hello World** and click'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Find ' }, + { + type: 'strong', + children: [{ type: 'text', value: 'Hello World' }], + }, + { type: 'text', value: ' and click' }, + ], + }); + }); + }); + + describe('underscore bold patterns with spaces', () => { + it('should handle space after opening __ (with word before)', () => { + const md = 'Hello__ Wrong Bold__'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Hello ' }, + { + type: 'strong', + children: [{ type: 'text', value: 'Wrong Bold' }], + }, + ], + }); + }); + + it('should preserve multiple spaces before opening __', () => { + const md = 'Hello __ World__'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Hello ' }, + { + 
type: 'strong', + children: [{ type: 'text', value: 'World' }], + }, + ], + }); + }); + + it('should handle space before closing __', () => { + const md = '__text __word'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { + type: 'strong', + children: [{ type: 'text', value: 'text' }], + }, + { type: 'text', value: ' w' }, + { type: 'text', value: 'ord' }, + ], + }); + }); + + it('should handle spaces on both sides', () => { + const md = '__ text __'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { + type: 'strong', + children: [{ type: 'text', value: 'text' }], + }, + ], + }); + }); + + it('should handle multiple malformed bold patterns in one text', () => { + const md = 'Start__ first__ middle__ second __end'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const children = paragraph.children; + const strongNodes = children.filter((c): c is Strong => c.type === 'strong'); + + expect(strongNodes).toHaveLength(2); + expect(strongNodes[0]).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'first' }], + }); + expect(strongNodes[1]).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'second' }], + }); + }); + + it('should handle case with word before and after', () => { + const md = 'Find__ Hello World__ and click'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Find ' }, + { + type: 'strong', + children: [{ type: 'text', value: 'Hello World' }], + }, + { type: 'text', value: ' and click' 
}, + ], + }); + }); + + it('should handle mixed ** and __ patterns', () => { + const md = 'Asterisk** first** Underscore__ second__'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const children = paragraph.children; + const strongNodes = children.filter((c): c is Strong => c.type === 'strong'); + + expect(strongNodes).toHaveLength(2); + expect(strongNodes[0]).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'first' }], + }); + expect(strongNodes[1]).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'second' }], + }); + }); + }); + + describe('should not modify valid bold syntax', () => { + it('should leave **valid** bold untouched', () => { + const md = '**valid**'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { + type: 'strong', + children: [{ type: 'text', value: 'valid' }], + }, + ], + }); + }); + + it('should leave word**valid**bold untouched', () => { + const md = 'word**valid**bold'; + const tree = processor.parse(md); + processor.runSync(tree); + + const paragraph = tree.children[0] as Paragraph; + const children = paragraph.children; + expect(children.length).toBeGreaterThanOrEqual(1); + }); + + it('should leave __valid__ bold untouched', () => { + const md = '__valid__'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { + type: 'strong', + children: [{ type: 'text', value: 'valid' }], + }, + ], + }); + }); + + it('should leave word__valid__bold untouched', () => { + const md = 'word__valid__bold'; + const tree = processor.parse(md); + processor.runSync(tree); + + const paragraph = tree.children[0] as Paragraph; + const children = 
paragraph.children; + expect(children.length).toBeGreaterThanOrEqual(1); + }); + }); + + describe('should skip code blocks and inline code', () => { + it('should not modify malformed bold inside inline code', () => { + const md = '`** bold**`'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { + type: 'inlineCode', + value: '** bold**', + }, + ], + }); + }); + + it('should not modify malformed bold with __ inside inline code', () => { + const md = '`__ bold__`'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { + type: 'inlineCode', + value: '__ bold__', + }, + ], + }); + }); + + it('should not modify malformed bold inside code blocks', () => { + const md = '```\n** bold**\n```'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const codeBlock = tree.children[0] as Code; + expect(codeBlock.type).toBe('code'); + expect(codeBlock.value).toContain('** bold**'); + }); + }); + + describe('edge cases', () => { + it('should handle empty content', () => { + const md = '** **'; + const tree = processor.parse(md); + processor.runSync(tree); + + expect(tree.children.length).toBeGreaterThan(0); + }); + + it('should handle newlines in content', () => { + const md = '** text\nwith newline**'; + const tree = processor.parse(md); + processor.runSync(tree); + + expect(tree.children.length).toBeGreaterThan(0); + }); + + it('should preserve text around malformed bold', () => { + const md = 'Before Hello** Wrong Bold** After'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const strong = paragraph.children.find((c): c is Strong => c.type === 
'strong'); + + expect(strong).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'Wrong Bold' }], + }); + expect(paragraph.children.length).toBeGreaterThanOrEqual(3); + }); + + it('should not add space when space is only before closing markers', () => { + const md = 'Hello**bold **'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Hello' }, + { + type: 'strong', + children: [{ type: 'text', value: 'bold' }], + }, + ], + }); + }); + + it('should not add space for valid bold syntax', () => { + const md = 'Hello**bold**'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const strong = paragraph.children.find((c): c is Strong => c.type === 'strong'); + + expect(strong).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'bold' }], + }); + const textNodes = paragraph.children.filter((c): c is Text => c.type === 'text'); + const helloText = textNodes.find(t => t.value.startsWith('Hello')); + expect(helloText).toBeDefined(); + expect(helloText?.value).toBe('Hello'); + }); + }); + + describe('italic patterns with spaces (asterisk)', () => { + it('should handle space after opening * (with word before)', () => { + const md = 'Hello* Wrong Italic*'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Hello ' }, + { + type: 'emphasis', + children: [{ type: 'text', value: 'Wrong Italic' }], + }, + ], + }); + }); + + it('should preserve multiple spaces before opening *', () => { + const md = 'Hello * World*'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + 
expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Hello ' }, + { + type: 'emphasis', + children: [{ type: 'text', value: 'World' }], + }, + ], + }); + }); + + it('should handle space before closing *', () => { + const md = '*text *word'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { + type: 'emphasis', + children: [{ type: 'text', value: 'text' }], + }, + { type: 'text', value: ' w' }, + { type: 'text', value: 'ord' }, + ], + }); + }); + + it('should handle spaces on both sides', () => { + const md = 'Before * text * after'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const emphasis = paragraph.children.find((c): c is Emphasis => c.type === 'emphasis'); + + expect(emphasis).toStrictEqual({ + type: 'emphasis', + children: [{ type: 'text', value: 'text' }], + }); + }); + + it('should not add space when space is only before closing *', () => { + const md = 'Hello*italic *'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Hello' }, + { + type: 'emphasis', + children: [{ type: 'text', value: 'italic' }], + }, + ], + }); + }); + }); + + describe('italic patterns with spaces (underscore)', () => { + it('should handle space after opening _ (with word before)', () => { + const md = 'Hello_ Wrong Italic_'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Hello ' }, + { + type: 'emphasis', + children: [{ type: 'text', value: 'Wrong Italic' }], + }, + ], + 
}); + }); + + it('should preserve multiple spaces before opening _', () => { + const md = 'Hello _ World_'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Hello ' }, + { + type: 'emphasis', + children: [{ type: 'text', value: 'World' }], + }, + ], + }); + }); + + it('should handle space before closing _', () => { + const md = '_text _word'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { + type: 'emphasis', + children: [{ type: 'text', value: 'text' }], + }, + { type: 'text', value: ' w' }, + { type: 'text', value: 'ord' }, + ], + }); + }); + + it('should handle spaces on both sides', () => { + const md = 'Before _ text _ after'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const emphasis = paragraph.children.find((c): c is Emphasis => c.type === 'emphasis'); + + expect(emphasis).toStrictEqual({ + type: 'emphasis', + children: [{ type: 'text', value: 'text' }], + }); + }); + + it('should not add space when space is only before closing _', () => { + const md = 'Hello_italic _'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Hello' }, + { + type: 'emphasis', + children: [{ type: 'text', value: 'italic' }], + }, + ], + }); + }); + }); + + describe('should not modify valid italic syntax', () => { + it('should leave *valid* italic untouched', () => { + const md = '*valid*'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + 
expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { + type: 'emphasis', + children: [{ type: 'text', value: 'valid' }], + }, + ], + }); + }); + + it('should leave _valid_ italic untouched', () => { + const md = '_valid_'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { + type: 'emphasis', + children: [{ type: 'text', value: 'valid' }], + }, + ], + }); + }); + }); + + describe('should not modify escaped markers', () => { + it('should leave escaped underscore untouched', () => { + const md = '\\_ not italic_'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const emphasis = paragraph.children.find((c): c is Emphasis => c.type === 'emphasis'); + + expect(emphasis).toBeDefined(); + expect(emphasis?.children[0]).toStrictEqual({ + type: 'text', + value: 'not italic', + }); + }); + + it('should leave escaped asterisk untouched', () => { + const md = '\\* not bold*'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const emphasis = paragraph.children.find((c): c is Emphasis => c.type === 'emphasis'); + + expect(emphasis).toBeDefined(); + expect(emphasis?.children[0]).toStrictEqual({ + type: 'text', + value: 'not bold', + }); + }); + + it('should leave escaped double asterisk untouched', () => { + const md = '\\*\\* not bold**'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const strong = paragraph.children.find((c): c is Strong => c.type === 'strong'); + + expect(strong).toBeDefined(); + expect(strong?.children[0]).toStrictEqual({ + type: 'text', + value: 'not bold', + }); + }); + 
+ it('should leave escaped double underscore untouched', () => { + const md = '\\_\\_ not bold__'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const strong = paragraph.children.find((c): c is Strong => c.type === 'strong'); + + expect(strong).toBeDefined(); + expect(strong?.children[0]).toStrictEqual({ + type: 'text', + value: 'not bold', + }); + }); + + it('should handle multiple escaped markers', () => { + const md = 'Text with \\* asterisk and \\_ underscore and \\*\\* double asterisk'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + expect(paragraph.type).toBe('paragraph'); + expect(paragraph.children.length).toBeGreaterThan(0); + const textNodes = paragraph.children.filter((c): c is Text => c.type === 'text'); + const allText = textNodes.map(t => t.value).join(''); + expect(allText.length).toBeGreaterThan(0); + }); + }); + + describe('malformed syntax in callouts', () => { + it('should handle malformed bold in callout content', () => { + const md = '> 👍 Success\n>\n> This is ** Wrong Bold** text'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const blockquote = tree.children[0] as Blockquote; + expect(blockquote.type).toBe('blockquote'); + const paragraph = blockquote.children[blockquote.children.length - 1] as Paragraph; + const strong = paragraph.children.find((c): c is Strong => c.type === 'strong'); + + expect(strong).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'Wrong Bold' }], + }); + }); + + it('should handle malformed italic in callout content', () => { + const md = '> 📘 Info\n>\n> This is * Wrong Italic* text'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const blockquote = 
tree.children[0] as Blockquote; + expect(blockquote.type).toBe('blockquote'); + const paragraph = blockquote.children[blockquote.children.length - 1] as Paragraph; + const emphasis = paragraph.children.find((c): c is Emphasis => c.type === 'emphasis'); + + expect(emphasis).toStrictEqual({ + type: 'emphasis', + children: [{ type: 'text', value: 'Wrong Italic' }], + }); + }); + + it('should handle malformed bold with word before in callout', () => { + const md = '> ⚠️ Warning\n>\n> Find** Hello World** and click'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const blockquote = tree.children[0] as Blockquote; + expect(blockquote.type).toBe('blockquote'); + const paragraph = blockquote.children[blockquote.children.length - 1] as Paragraph; + const strong = paragraph.children.find((c): c is Strong => c.type === 'strong'); + + expect(strong).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'Hello World' }], + }); + }); + + it('should handle multiple malformed patterns in callout', () => { + const md = '> ❗ Error\n>\n> Start** first** middle* second *end'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const blockquote = tree.children[0] as Blockquote; + expect(blockquote.type).toBe('blockquote'); + const paragraph = blockquote.children[blockquote.children.length - 1] as Paragraph; + const strongNodes = paragraph.children.filter((c): c is Strong => c.type === 'strong'); + const emphasisNodes = paragraph.children.filter((c): c is Emphasis => c.type === 'emphasis'); + + expect(strongNodes.length).toBeGreaterThanOrEqual(1); + expect(emphasisNodes.length).toBeGreaterThanOrEqual(1); + }); + }); + + describe('malformed syntax in tables', () => { + it('should handle malformed bold in table cells', () => { + const md = '| Header |\n|--------|\n| ** Wrong Bold** |'; + const tree = processorWithGfm.parse(md); + 
processorWithGfm.runSync(tree); + removePosition(tree, { force: true }); + + const table = tree.children[0] as Table; + expect(table.type).toBe('table'); + const row = table.children[1] as TableRow; + expect(row.type).toBe('tableRow'); + const cell = row.children[0] as TableCell; + expect(cell.type).toBe('tableCell'); + const strong = cell.children.find((c): c is Strong => c.type === 'strong'); + expect(strong).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'Wrong Bold' }], + }); + }); + + it('should handle malformed italic in table cells', () => { + const md = '| Header |\n|--------|\n| * Wrong Italic* |'; + const tree = processorWithGfm.parse(md); + processorWithGfm.runSync(tree); + removePosition(tree, { force: true }); + + const table = tree.children[0] as Table; + expect(table.type).toBe('table'); + const row = table.children[1] as TableRow; + expect(row.type).toBe('tableRow'); + const cell = row.children[0] as TableCell; + expect(cell.type).toBe('tableCell'); + const emphasis = cell.children.find((c): c is Emphasis => c.type === 'emphasis'); + expect(emphasis).toStrictEqual({ + type: 'emphasis', + children: [{ type: 'text', value: 'Wrong Italic' }], + }); + }); + + it('should handle malformed bold with word before in table cell', () => { + const md = '| Column |\n|--------|\n| Find** Hello** text |'; + const tree = processorWithGfm.parse(md); + processorWithGfm.runSync(tree); + removePosition(tree, { force: true }); + + const table = tree.children[0] as Table; + expect(table.type).toBe('table'); + const row = table.children[1] as TableRow; + expect(row.type).toBe('tableRow'); + const cell = row.children[0] as TableCell; + expect(cell.type).toBe('tableCell'); + const strong = cell.children.find((c): c is Strong => c.type === 'strong'); + expect(strong).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'Hello' }], + }); + }); + + it('should handle malformed syntax in multiple table cells', () => { + const md = '| Col1 
| Col2 |\n|------|------|\n| ** Bold** | * Italic* |'; + const tree = processorWithGfm.parse(md); + processorWithGfm.runSync(tree); + removePosition(tree, { force: true }); + + const table = tree.children[0] as Table; + expect(table.type).toBe('table'); + const row = table.children[1] as TableRow; + expect(row.type).toBe('tableRow'); + const cells = row.children.filter((c): c is TableCell => c.type === 'tableCell'); + expect(cells.length).toBeGreaterThanOrEqual(2); + + const firstCell = cells[0]; + const strong = firstCell.children.find((c): c is Strong => c.type === 'strong'); + expect(strong).toBeDefined(); + + const secondCell = cells[1]; + const emphasis = secondCell.children.find((c): c is Emphasis => c.type === 'emphasis'); + expect(emphasis).toBeDefined(); + }); + }); + + describe('malformed syntax in lists', () => { + it('should handle malformed bold in list items', () => { + const md = '- Item with ** Wrong Bold** text'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const list = tree.children[0] as List; + expect(list.type).toBe('list'); + const listItem = list.children[0] as ListItem; + expect(listItem.type).toBe('listItem'); + const paragraph = listItem.children[0] as Paragraph; + const strong = paragraph.children.find((c): c is Strong => c.type === 'strong'); + expect(strong).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'Wrong Bold' }], + }); + }); + + it('should handle malformed italic in list items', () => { + const md = '- Item with * Wrong Italic* text'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const list = tree.children[0] as List; + expect(list.type).toBe('list'); + const listItem = list.children[0] as ListItem; + expect(listItem.type).toBe('listItem'); + const paragraph = listItem.children[0] as Paragraph; + const emphasis = paragraph.children.find((c): c is Emphasis => c.type === 'emphasis'); + 
expect(emphasis).toStrictEqual({ + type: 'emphasis', + children: [{ type: 'text', value: 'Wrong Italic' }], + }); + }); + + it('should handle malformed syntax in nested list items', () => { + const md = '- Outer\n - Inner with ** Wrong Bold**'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const list = tree.children[0] as List; + expect(list.type).toBe('list'); + const outerItem = list.children[0] as ListItem; + expect(outerItem.type).toBe('listItem'); + expect(outerItem.children.length).toBeGreaterThan(1); + const nestedList = outerItem.children[1] as List; + expect(nestedList.type).toBe('list'); + const innerItem = nestedList.children[0] as ListItem; + expect(innerItem.type).toBe('listItem'); + const paragraph = innerItem.children[0] as Paragraph; + const strong = paragraph.children.find((c): c is Strong => c.type === 'strong'); + expect(strong).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'Wrong Bold' }], + }); + }); + }); + + describe('edge cases with nested syntax', () => { + it('should have malformed italic inside valid bold', () => { + const md = '**Bold with _ malformed italic_ inside**'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const strong = paragraph.children.find((c): c is Strong => c.type === 'strong'); + + // Should have valid bold + expect(strong).toBeDefined(); + expect(strong?.children.length).toBeGreaterThan(0); + // The malformed italic inside should be processed into an emphasis node + const emphasisInside = strong?.children.find((c): c is Emphasis => c.type === 'emphasis'); + expect(emphasisInside).toBeDefined(); + expect(emphasisInside?.children[0]).toStrictEqual({ + type: 'text', + value: 'malformed italic', + }); + }); + + it('should handle malformed bold with snake_case content', () => { + const md = '** some_snake_case**'; + const tree = 
processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const strong = paragraph.children.find((c): c is Strong => c.type === 'strong'); + + expect(strong).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'some_snake_case' }], + }); + }); + + it('should handle malformed bold with multiple underscores in content', () => { + const md = '** some_snake_case_with_many_underscores**'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const strong = paragraph.children.find((c): c is Strong => c.type === 'strong'); + + expect(strong).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'some_snake_case_with_many_underscores' }], + }); + }); + + it('should handle malformed italic with snake_case content', () => { + const md = 'Text with * some_snake_case* here'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const emphasis = paragraph.children.find((c): c is Emphasis => c.type === 'emphasis'); + + expect(emphasis).toStrictEqual({ + type: 'emphasis', + children: [{ type: 'text', value: 'some_snake_case' }], + }); + }); + + it('should handle malformed bold with word before and snake_case content', () => { + const md = 'Find** some_snake_case** and click'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'Find ' }, + { + type: 'strong', + children: [{ type: 'text', value: 'some_snake_case' }], + }, + { type: 'text', value: ' and click' }, + ], + }); + }); + + it('should handle malformed underscore italic inside valid asterisk bold', () => { + const md = '**Bold with _ malformed_ 
inside**'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const strong = paragraph.children.find((c): c is Strong => c.type === 'strong'); + + expect(strong).toBeDefined(); + expect(strong?.children.length).toBeGreaterThan(0); + const emphasisInside = strong?.children.find((c): c is Emphasis => c.type === 'emphasis'); + expect(emphasisInside).toBeDefined(); + expect(emphasisInside?.children[0]).toStrictEqual({ + type: 'text', + value: 'malformed', + }); + }); + + it('should handle malformed bold with mixed underscores and spaces', () => { + const md = '** some_snake_case with spaces**'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + const paragraph = tree.children[0] as Paragraph; + const strong = paragraph.children.find((c): c is Strong => c.type === 'strong'); + + expect(strong).toStrictEqual({ + type: 'strong', + children: [{ type: 'text', value: 'some_snake_case with spaces' }], + }); + }); + + it('should handle malformed bold with underscore in word before', () => { + const md = 'some_word** Bold**'; + const tree = processor.parse(md); + processor.runSync(tree); + removePosition(tree, { force: true }); + + expect(tree.children[0]).toStrictEqual({ + type: 'paragraph', + children: [ + { type: 'text', value: 'some_word ' }, + { + type: 'strong', + children: [{ type: 'text', value: 'Bold' }], + }, + ], + }); + }); + }); +}); diff --git a/lib/mdxish.ts b/lib/mdxish.ts index c20b5caf1..7cd7bef6e 100644 --- a/lib/mdxish.ts +++ b/lib/mdxish.ts @@ -24,6 +24,7 @@ import mdxishComponentBlocks from '../processor/transform/mdxish/mdxish-componen import mdxishHtmlBlocks from '../processor/transform/mdxish/mdxish-html-blocks'; import magicBlockRestorer from '../processor/transform/mdxish/mdxish-magic-blocks'; import mdxishTables from '../processor/transform/mdxish/mdxish-tables'; +import 
normalizeEmphasisAST from '../processor/transform/mdxish/normalize-malformed-md-syntax'; import { preprocessJSXExpressions, type JSXContext } from '../processor/transform/mdxish/preprocess-jsx-expressions'; import variablesTextTransformer from '../processor/transform/mdxish/variables-text'; import tailwindTransformer from '../processor/transform/tailwind'; @@ -68,6 +69,7 @@ export function mdxish(mdContent: string, opts: MdxishOpts = {}): Root { .data('fromMarkdownExtensions', [mdxExpressionFromMarkdown()]) .use(remarkParse) .use(remarkFrontmatter) + .use(normalizeEmphasisAST) .use(magicBlockRestorer, { blocks }) .use(imageTransformer, { isMdxish: true }) .use(defaultTransformers) diff --git a/processor/transform/mdxish/normalize-malformed-md-syntax.ts b/processor/transform/mdxish/normalize-malformed-md-syntax.ts new file mode 100644 index 000000000..ca0aa896b --- /dev/null +++ b/processor/transform/mdxish/normalize-malformed-md-syntax.ts @@ -0,0 +1,164 @@ +import type { Emphasis, Parent, Root, Strong, Text } from 'mdast'; +import type { Plugin } from 'unified'; + +import { SKIP, visit } from 'unist-util-visit'; + +// Patterns to detect for bold (** and __) and italic (* and _) syntax: +// Bold: ** text**, **text **, word** text**, ** text ** +// Italic: * text*, *text *, word* text*, * text * +// Same patterns for underscore variants +// We use separate patterns for each marker type to allow this flexibility. 

// Pattern for ** bold **
// Groups: 1=wordBefore, 2=marker, 3=contentWithSpaceAfter, 4=trailingSpace1, 5=contentWithSpaceBefore, 6=trailingSpace2, 7=afterChar
// trailingSpace1 is for the "** text **" pattern, trailingSpace2 is for the "**text **" pattern
const asteriskBoldRegex =
  /([^*\s]+)?\s*(\*\*)(?:\s+((?:[^*\n]|\*(?!\*))+?)(\s*)\2|((?:[^*\n]|\*(?!\*))+?)(\s+)\2)(\S|$)?/g;

// Pattern for __ bold __ (same capture groups as above)
const underscoreBoldRegex = /([^_\s]+)?\s*(__)(?:\s+((?:[^_\n]|_(?!_))+?)(\s*)\2|((?:[^_\n]|_(?!_))+?)(\s+)\2)(\S|$)?/g;

// Pattern for * italic * (same capture groups as above)
const asteriskItalicRegex = /([^*\s]+)?\s*(\*)(?!\*)(?:\s+([^*\n]+?)(\s*)\2|([^*\n]+?)(\s+)\2)(\S|$)?/g;

// Pattern for _ italic _ (same capture groups as above)
const underscoreItalicRegex = /([^_\s]+)?\s*(_)(?!_)(?:\s+([^_\n]+?)(\s*)\2|([^_\n]+?)(\s+)\2)(\S|$)?/g;

/** A regex hit together with the marker that produced it and the node kind it maps to. */
interface MatchInfo {
  isBold: boolean;
  marker: string;
  match: RegExpMatchArray;
}

/** Node kinds this transformer can emit in place of a text node. */
type InlinePart = Emphasis | Strong | Text;

// Matches are collected in this order; the (stable) sort below keeps it for hits that
// start at the same offset.
const malformedPatterns: readonly { isBold: boolean; marker: string; regex: RegExp }[] = [
  { regex: asteriskBoldRegex, marker: '**', isBold: true },
  { regex: underscoreBoldRegex, marker: '__', isBold: true },
  { regex: asteriskItalicRegex, marker: '*', isBold: false },
  { regex: underscoreItalicRegex, marker: '_', isBold: false },
];

/** Returns the trimmed inner content of a match (capture group 3 or 5), or '' when there is none. */
function matchContent(match: RegExpMatchArray): string {
  return (match[3] || match[5] || '').trim();
}

/**
 * Finds every malformed bold/italic span in `text` and returns the non-overlapping ones
 * in document order (the earliest-starting match wins an overlap).
 *
 * Matches whose content is only whitespace (e.g. `**  **`) are discarded up front: there
 * is nothing to emphasize, and consuming them would silently delete the markers from the
 * output and shadow any real match they overlap.
 */
function collectMalformedMatches(text: string): MatchInfo[] {
  const candidates: MatchInfo[] = [];
  for (const { regex, marker, isBold } of malformedPatterns) {
    // matchAll works on a copy of the regex, so the shared /g patterns keep no lastIndex state.
    for (const match of text.matchAll(regex)) {
      if (matchContent(match)) candidates.push({ isBold, marker, match });
    }
  }

  candidates.sort((a, b) => (a.match.index ?? 0) - (b.match.index ?? 0));

  const accepted: MatchInfo[] = [];
  let lastEnd = 0;
  for (const info of candidates) {
    const start = info.match.index ?? 0;
    if (start >= lastEnd) {
      accepted.push(info);
      lastEnd = start + info.match[0].length;
    }
  }
  return accepted;
}

/**
 * Converts a single match into the nodes that replace it: optional leading text
 * (word before the markers plus spacing), the strong/emphasis node, and optional
 * trailing text (the character captured right after the closing markers).
 */
function buildMatchParts({ match, marker, isBold }: MatchInfo): InlinePart[] {
  const parts: InlinePart[] = [];
  const fullMatch = match[0];
  const wordBefore = match[1]; // e.g. "Hello" in "Hello** Wrong Bold**"
  const contentWithSpaceAfter = match[3]; // set when a space follows the opening markers
  const trailingSpace = match[4] || match[6] || ''; // whitespace right before the closing markers
  const afterChar = match[7]; // character right after the closing markers, if any
  const content = matchContent(match);

  const markerPos = fullMatch.indexOf(marker);
  const spacesBeforeMarkers = fullMatch.slice(wordBefore ? wordBefore.length : 0, markerPos);

  // "Hello** Bold**": the space that followed the opening markers is moved in front of
  // the emphasized node so the word and the node do not end up glued together.
  const shouldAddSpace = !!contentWithSpaceAfter && !!wordBefore && !spacesBeforeMarkers;

  if (wordBefore) {
    parts.push({ type: 'text', value: wordBefore + spacesBeforeMarkers + (shouldAddSpace ? ' ' : '') });
  } else if (spacesBeforeMarkers) {
    parts.push({ type: 'text', value: spacesBeforeMarkers });
  }

  if (content) {
    const inner: Text = { type: 'text', value: content };
    const wrapped: Emphasis | Strong = isBold
      ? { type: 'strong', children: [inner] }
      : { type: 'emphasis', children: [inner] };
    parts.push(wrapped);
  }

  if (afterChar) {
    // "**Bold **next": the space that preceded the closing markers moves after the node.
    parts.push({ type: 'text', value: (trailingSpace ? ' ' : '') + afterChar });
  }

  return parts;
}

/**
 * Splits `text` into text/strong/emphasis nodes.
 *
 * @returns the replacement nodes, or `undefined` when `text` contains no malformed
 *   emphasis and should be left untouched.
 */
function splitMalformedEmphasis(text: string): InlinePart[] | undefined {
  // Every pattern needs at least one marker character; skip the regex work otherwise.
  if (!text.includes('*') && !text.includes('_')) return undefined;

  const matches = collectMalformedMatches(text);
  if (matches.length === 0) return undefined;

  const parts: InlinePart[] = [];
  let cursor = 0;
  for (const info of matches) {
    const start = info.match.index ?? 0;
    if (start > cursor) {
      parts.push({ type: 'text', value: text.slice(cursor, start) });
    }
    parts.push(...buildMatchParts(info));
    cursor = start + info.match[0].length;
  }
  if (cursor < text.length) {
    parts.push({ type: 'text', value: text.slice(cursor) });
  }
  return parts;
}

/**
 * A remark plugin that normalizes malformed bold and italic markers in text nodes.
 * Detects patterns like `** bold**`, `Hello** Wrong Bold**`, `__ bold__`, `Hello__ Wrong Bold__`,
 * `* italic*`, `Hello* Wrong Italic*`, `_ italic_`, or `Hello_ Wrong Italic_`
 * and converts them to proper strong/emphasis nodes, matching the behavior of the legacy rdmd engine.
 *
 * Supports both asterisk (`**bold**`, `*italic*`) and underscore (`__bold__`, `_italic_`) syntax.
 * Also supports snake_case content like `** some_snake_case**`.
 *
 * This runs after remark-parse, which (in v11+) is strict and doesn't parse
 * malformed emphasis syntax. This plugin post-processes the AST to handle these cases.
 *
 * Marker pairs that wrap only whitespace (e.g. `**  **`) are left as literal text.
 *
 * @returns a transformer that mutates the tree in place and returns it.
 */
const normalizeEmphasisAST: Plugin = () => (tree: Root) => {
  visit(tree, 'text', function visitor(node: Text, index, parent: Parent) {
    if (index === undefined || !parent) return undefined;

    // Skip if inside code blocks or inline code
    if (parent.type === 'inlineCode' || parent.type === 'code') {
      return undefined;
    }

    const parts = splitMalformedEmphasis(node.value);
    if (!parts || parts.length === 0) return undefined;

    parent.children.splice(index, 1, ...parts);
    // Resume after the inserted nodes so freshly created text is not re-scanned.
    return [SKIP, index + parts.length];
  });

  return tree;
};

export default normalizeEmphasisAST;