From 326e482a7a7d69a9f671aed3d7b2dc0b233dcfb7 Mon Sep 17 00:00:00 2001 From: Yuriy Demidov Date: Fri, 3 Apr 2026 20:00:15 +0300 Subject: [PATCH] feat(core): backport 4 improvements to serializer from prosemirror-markdown upstream --- .../editor/src/core/markdown/Markdown.test.ts | 206 +++++++++++++++++- .../src/core/markdown/MarkdownSerializer.ts | 115 +++++++--- 2 files changed, 281 insertions(+), 40 deletions(-) diff --git a/packages/editor/src/core/markdown/Markdown.test.ts b/packages/editor/src/core/markdown/Markdown.test.ts index b22c3f28..46a037c5 100644 --- a/packages/editor/src/core/markdown/Markdown.test.ts +++ b/packages/editor/src/core/markdown/Markdown.test.ts @@ -49,7 +49,7 @@ const serializer = new MarkdownSerializer( }) as SerializerNodeToken, heading: ((state, node) => { state.write(state.repeat('#', node.attrs.level) + ' '); - state.renderInline(node); + state.renderInline(node, false); state.closeBlock(node); }) as SerializerNodeToken, list_item: ((state, node) => { @@ -94,11 +94,12 @@ const {em, strong, code} = builder; const {same, parse, serialize} = createMarkupChecker({parser, serializer}); -describe('markdown', () => { - it('parses a paragraph', () => same('hello!', doc(p('hello!')))); // TODO: move test to extensions? +// Tests ported from prosemirror-markdown +describe('markdown (from prosemirror-markdown)', () => { + it('parses a paragraph', () => same('hello!', doc(p('hello!')))); it('parses headings', () => { - same('# one\n\n## two\n\nthree', doc(h1('one'), h2('two'), p('three'))); // TODO: move test to extensions? + same('# one\n\n## two\n\nthree', doc(h1('one'), h2('two'), p('three'))); }); // FIXME bring back testing for preserving bullets and tight attrs @@ -118,6 +119,9 @@ describe('markdown', () => { ), )); + // todo: parser adds empty paragraph before heading in list_item + it.skip('can parse a heading in a list', () => same('* # Foo', doc(ul(li(h1('Foo')))))); + it('parses inline marks', () => same( 'Hello. Some *em* text, some **strong** text, and some `code`', @@ -147,19 +151,23 @@ describe('markdown', () => { it('parses code mark inside strong text', () => same('**`code` is bold**', doc(p(strong(code('code'), ' is bold'))))); - // tood + // todo: requires dynamic backtick fencing in code mark serializer it.skip('parses code mark containing backticks', () => same( '``` one backtick: ` two backticks: `` ```', doc(p(code('one backtick: ` two backticks: ``'))), )); - // todo + // todo: requires code mark to preserve whitespace-only content it.skip('parses code mark containing only whitespace', () => serialize(doc(p('Three spaces: ', code(' '))), 'Three spaces: ` `')); - it('parses a line break', () => - same('line one\\\nline two', doc(p('line one', br(), 'line two')))); + it('parses hard breaks', () => { + same('line one\\\nline two', doc(p('line one', br(), 'line two'))); + }); + + it('parses hard breaks inside emphasis', () => + same('*foo\\\nbar*', doc(p(em('foo', br(), 'bar'))))); it('ignores HTML tags', () => parse('Foo < img> bar', doc(p('Foo < img> bar')))); @@ -170,11 +178,11 @@ describe('markdown', () => { it('drops trailing hard breaks', () => serialize(doc(p('a', br(), br())), 'a')); - it('should remove marks from edge break (before)', () => - serialize(doc(p('text', strong(br(), 'text2'))), 'text\\\n**text2**')); + it('properly expels whitespace before a hard break', () => + serialize(doc(p(strong('foo ', br()), 'bar')), '**foo** \\\nbar')); - it('should remove marks from edge break (after)', () => - serialize(doc(p(strong('text', br()), 'text2')), '**text**\\\ntext2')); + it("doesn't crash when a block ends in a hard break", () => + serialize(doc(p(strong('foo', br()))), '**foo**')); it('expels enclosing whitespace from inside emphasis', () => serialize( @@ -204,6 +212,48 @@ describe('markdown', () => { it("doesn't escape characters in code", () => same('foo`*`', doc(p('foo', code('*'))))); + // todo: requires smarter startOfLine escape – don't escape `\d+.` without trailing space + it.skip('does not escape list markers without space after them', () => + same('1.2kg', doc(p('1.2kg')))); + + // todo: requires removing `+` from defaultEsc in esc() + it.skip("doesn't escape +++", () => same('+++', doc(p('+++')))); + + it('escapes list markers inside lists', () => { + same('* 1\\. hi\n\n* x', doc(ul(li(p('1. hi')), li(p('x'))))); + }); + + it("doesn't escape block-start characters in heading content", () => { + same('# 1. foo', doc(h1('1. foo'))); + }); + + it('escapes ATX heading markers with space after them', () => { + same('\\### text', doc(p('### text'))); + }); + + it('escapes ATX heading markers followed by end of line', () => { + same('\\###', doc(p('###'))); + }); + + // todo: requires smarter startOfLine escape for # – only escape #{1,6} followed by space or EOL + it.skip('does not escape ATX heading markers without space after them', () => { + same('#hashtag', doc(p('#hashtag'))); + }); + + // todo: requires smarter startOfLine escape for # – only escape #{1,6} followed by space or EOL + it.skip('does not escape ATX heading markers consisting of more than 6 in a sequence', () => { + same('#######', doc(p('#######'))); + }); +}); + +// Tests specific to the fork's extensions and customizations +describe('markdown (fork-specific)', () => { + it('should remove marks from edge break (before)', () => + serialize(doc(p('text', strong(br(), 'text2'))), 'text\\\n**text2**')); + + it('should remove marks from edge break (after)', () => + serialize(doc(p(strong('text', br()), 'text2')), '**text**\\\ntext2')); + it('escapes special characters in a text', () => { same( 'Markdown special characters: \\_underscore\\_, \\*asterisk\\*, \\`backtick\\`, \\$dollar\\$, \\{curly\\} brace, \\[square\\] bracket, and a \\|vertical\\| bar.', @@ -238,4 +288,136 @@ describe('markdown', () => { same('trailing\\_', doc(p('trailing_'))); same('space \\_ space', doc(p('space _ space'))); }); + + describe('expelEnclosingWhitespace with mark continuation', () => { + it('keeps trailing whitespace inside mark when mark continues', () => { + // strong("hello ") + strong(em("world")) — strong continues + // trailing space stays inside strong, before em opens + serialize(doc(p(strong('hello '), strong(em('world')))), '**hello *world***'); + }); + + it('expels trailing whitespace when mark does not continue', () => { + serialize(doc(p(strong('hello '), 'world')), '**hello** world'); + }); + + it('keeps leading whitespace inside mark when mark is already active', () => { + // em("hello") + em(code("x")) + em(" world") — em continues throughout + // leading space in " world" stays inside em + serialize(doc(p(em('hello'), em(code('x')), em(' world'))), '*hello`x` world*'); + }); + + it('expels leading whitespace when mark is opening', () => { + serialize(doc(p('before', strong(' hello'))), 'before **hello**'); + }); + }); + + describe('atBlockStart – startOfLine escaping precision', () => { + it('does not escape # after line break inside inline content', () => { + serialize(doc(p(strong('text1\n#text2'))), '**text1\n#text2**'); + }); + + it('does not escape - after line break inside inline content', () => { + serialize(doc(p(strong('line1\n-line2'))), '**line1\n-line2**'); + }); + + // todo: > is in defaultEsc so it's always escaped, not just at startOfLine; needs esc() change + it.skip('does not escape > after line break inside inline content', () => { + serialize(doc(p(strong('line1\n>quote'))), '**line1\n>quote**'); + }); + + it('does not escape numbered list after line break inside inline content', () => { + serialize(doc(p(strong('line1\n1. item'))), '**line1\n1. item**'); + }); + + it('still escapes # at actual block start', () => { + same('\\# not a heading', doc(p('# not a heading'))); + }); + + it('still escapes - at actual block start', () => { + same('\\- not a list', doc(p('- not a list'))); + }); + + it('still escapes > at actual block start', () => { + same('\\>not a quote', doc(p('>not a quote'))); + }); + + it('still escapes numbered list at actual block start', () => { + same('1\\. not a list', doc(p('1. not a list'))); + }); + }); + + describe('render (strict mode for nodes)', () => { + const nodeStrictSerializer = new MarkdownSerializer( + { + text: ((state, node) => { + state.text(node.text ?? ''); + }) as SerializerNodeToken, + paragraph: ((state, node) => { + state.renderInline(node); + state.closeBlock(node); + }) as SerializerNodeToken, + // 'heading' is NOT registered + }, + { + em: {open: '*', close: '*', mixable: true, expelEnclosingWhitespace: true}, + strong: {open: '**', close: '**', mixable: true, expelEnclosingWhitespace: true}, + }, + ); + + it('throws on unknown node in strict mode (default)', () => { + expect(() => nodeStrictSerializer.serialize(doc(h1('text')))).toThrow( + /Token type `heading` not supported by Markdown renderer/, + ); + }); + + it('renders inline content of unknown node in non-strict mode', () => { + expect(nodeStrictSerializer.serialize(doc(h1('text')), {strict: false})).toBe('text'); + }); + + it('renders inline content with marks of unknown node in non-strict mode', () => { + expect( + nodeStrictSerializer.serialize(doc(h1('hello ', strong('world'))), {strict: false}), + ).toBe('hello **world**'); + }); + }); + + describe('getMark (strict mode)', () => { + const strictSerializer = new MarkdownSerializer( + { + text: ((state, node) => { + state.text(node.text ?? ''); + }) as SerializerNodeToken, + paragraph: ((state, node) => { + state.renderInline(node); + state.closeBlock(node); + }) as SerializerNodeToken, + }, + { + // only 'strong' is registered, 'em' and 'code' are missing + strong: {open: '**', close: '**', mixable: true, expelEnclosingWhitespace: true}, + }, + ); + + it('throws on unknown mark in strict mode (default)', () => { + expect(() => strictSerializer.serialize(doc(p(em('text'))))).toThrow( + /Mark type `em` not supported by Markdown renderer/, + ); + }); + + it('silently ignores unknown mark in non-strict mode', () => { + expect(strictSerializer.serialize(doc(p(em('text'))), {strict: false})).toBe('text'); + }); + + it('serializes known marks normally in non-strict mode', () => { + expect(strictSerializer.serialize(doc(p(strong('text'))), {strict: false})).toBe( + '**text**', + ); + }); + + it('silently ignores unknown mark mixed with known marks in non-strict mode', () => { + expect(strictSerializer.serialize(doc(p(strong(em('text')))), {strict: false})).toBe( + '**text**', + ); + }); + }); }); diff --git a/packages/editor/src/core/markdown/MarkdownSerializer.ts b/packages/editor/src/core/markdown/MarkdownSerializer.ts index d80a258c..c53ce6ee 100644 --- a/packages/editor/src/core/markdown/MarkdownSerializer.ts +++ b/packages/editor/src/core/markdown/MarkdownSerializer.ts @@ -31,6 +31,8 @@ interface SerializerOptions { escape?: boolean; // Added to fix types } +const blankMark: SerializerMarkToken = {open: '', close: '', mixable: false}; + interface MarkMap { [markName: string]: SerializerMarkToken; } @@ -125,6 +127,7 @@ export class MarkdownSerializerState { inTightList: boolean; noAutoBlank: boolean; isAutolink: boolean | undefined; + atBlockStart: boolean; escapeWhitespace: boolean; escapeCharacters?: string[]; @@ -143,6 +146,7 @@ export class MarkdownSerializerState { this.inTightList = false; this.noAutoBlank = false; this.isAutolink = undefined; + this.atBlockStart = false; this.escapeWhitespace = false; // :: Object // The options passed to the serializer. @@ -226,13 +230,12 @@ export class MarkdownSerializerState { text(text: string, escape?: boolean) { const lines = text.split('\n'); for (let i = 0; i < lines.length; i++) { - const startOfLine = this.atBlank() || this.closed; this.write(); // Escape ! before [ to prevent being parsed as image syntax if (escape === false && lines[i][0] === '[' && /(^|[^\\])!$/.test(this.out)) this.out = this.out.slice(0, this.out.length - 1) + '\\!'; let text = lines[i]; - if (escape !== false && this.options.escape !== false) text = this.esc(text, startOfLine as any) + if (escape !== false && this.options.escape !== false) text = this.esc(text, this.atBlockStart) if (this.escapeWhitespace) text = this.escWhitespace(text); this.out += text if (i != lines.length - 1) this.out += '\n'; @@ -245,15 +248,22 @@ export class MarkdownSerializerState { if (typeof parent === 'number') { throw new Error('!'); } - if (!this.nodes[node.type.name]) { - throw new Error('Token type `' + node.type.name + '` not supported by Markdown renderer'); - } const callback = this.nodes[node.type.name]; - if (this.dynamicModifier) { - this.dynamicModifier.processNode(this, node, parent, index, callback); + if (callback) { + if (this.dynamicModifier) { + this.dynamicModifier.processNode(this, node, parent, index, callback); + } else { + callback(this, node, parent, index); + } } else { - callback(this, node, parent, index); + if (this.options.strict !== false) { + throw new Error('Token type `' + node.type.name + '` not supported by Markdown renderer'); + } else if (!node.type.isLeaf) { + if (node.type.inlineContent) this.renderInline(node); + else this.renderContent(node); + if (node.isBlock) this.closeBlock(node); + } } } @@ -263,11 +273,11 @@ export class MarkdownSerializerState { parent.forEach((node, _, i) => this.render(node, parent, i)); } - // :: (Node) // Render the contents of `parent` as inline content. - renderInline(parent: Node) { - const active: Mark[] = []; let - trailing = ''; + renderInline(parent: Node, fromBlockStart = true) { + this.atBlockStart = fromBlockStart; + const active: Mark[] = []; + let trailing = ''; const progress = (node: Node | null, _: any, index: number) => { let marks = node ? node.marks : []; @@ -275,9 +285,9 @@ export class MarkdownSerializerState { // that mark to prevent parser edge cases with new lines just // before closing or after opening marks. if (node && node.type.spec.isBreak) { - marks = marks.filter((m: any) => { + marks = marks.filter((m) => { if (index === 0) return false; - if (index + 1 == parent.childCount) return false; + if (index + 1 === parent.childCount) return false; const prev = parent.child(index - 1); const next = parent.child(index + 1); return ( @@ -291,21 +301,31 @@ export class MarkdownSerializerState { trailing = ''; // If whitespace has to be expelled from the node, adjust // leading and trailing accordingly. - if (node && node.isText && marks.some((mark: any) => { - const info = this.marks[mark.type.name]; - return info && info.expelEnclosingWhitespace; + if (node && node.isText && marks.some((mark) => { + const info = this.getMark(mark.type.name); + return info && info.expelEnclosingWhitespace && !mark.isInSet(active); + })) { + const [_, lead, rest] = /^(\s*)(.*)$/m.exec(node.text!)!; + if (lead) { + leading += lead; + node = rest ? (node as any).withText(rest) : null; + if (!node) marks = active; + } + } + if (node && node.isText && marks.some((mark) => { + const info = this.getMark(mark.type.name); + return info && info.expelEnclosingWhitespace && !this.isMarkAhead(parent, index + 1, mark); })) { - const [_, lead, inner, trail] = /^(\s*)(.*?)(\s*)$/m.exec(node.text ?? '')!; - leading += lead; - trailing = trail; - if (lead || trail) { - node = inner ? (node as any).withText(inner) : null; + const [_, rest, trail] = /^(.*?)(\s*)$/m.exec(node.text!)!; + if (trail) { + trailing = trail; + node = rest ? (node as any).withText(rest) : null; if (!node) marks = active; } } const inner = marks.length && marks[marks.length - 1]; - const noEsc = inner && this.marks[inner.type.name].escape === false; + const noEsc = inner && this.getMark(inner.type.name).escape === false; const len = marks.length - (noEsc ? 1 : 0); // Try to reorder 'mixable' marks, such as em and strong, which @@ -315,12 +335,17 @@ export class MarkdownSerializerState { // eslint-disable-next-line no-labels outer: for (let i = 0; i < len; i++) { const mark = marks[i]; - if (!this.marks[mark.type.name].mixable) break; + if (!this.getMark(mark.type.name).mixable) break; for (let j = 0; j < active.length; j++) { const other = active[j]; - if (!this.marks[other.type.name].mixable) break; + if (!this.getMark(other.type.name).mixable) break; if (mark.eq(other)) { - if (i > j) { marks = marks.slice(0, j).concat(mark).concat(marks.slice(j, i)).concat(marks.slice(i + 1, len)) } else if (j > i) { marks = marks.slice(0, i).concat(marks.slice(i + 1, j)).concat(mark).concat(marks.slice(j, len)) } + if (i > j) { + marks = marks.slice(0, j).concat(mark).concat(marks.slice(j, i)).concat(marks.slice(i + 1, len)) + } + else if (j > i) { + marks = marks.slice(0, i).concat(marks.slice(i + 1, j)).concat(mark).concat(marks.slice(j, len)) + } // eslint-disable-next-line no-labels continue outer; } @@ -345,6 +370,7 @@ export class MarkdownSerializerState { const add = marks[active.length]; active.push(add); this.text(this.markString(add, true, parent, index), false); + this.atBlockStart = false; } // Render the node. Special case code marks, since their content @@ -352,11 +378,25 @@ export class MarkdownSerializerState { if (noEsc && node.isText) { this.text(this.markString(inner, true, parent, index) + node.text + this.markString(inner, false, parent, index + 1), false); - } else { this.render(node, parent, index) } + } else { + this.render(node, parent, index) + } + this.atBlockStart = false; + } + + // After the first non-empty text node is rendered, the end of output + // is no longer at block start. + // + // FIXME: If a non-text node writes something to the output for this + // block, the end of output is also no longer at block start. But how + // can we detect that? + if (node?.isText && node.nodeSize > 0) { + this.atBlockStart = false; } }; parent.forEach(progress); progress(null, null, parent.childCount); + this.atBlockStart = false; } // :: (Node, string, (number) → string) @@ -423,7 +463,7 @@ export class MarkdownSerializerState { // :: (Mark, bool, string?) → string // Get the markdown string for a given opening or closing mark. markString(mark: Mark, open: boolean, parent: Node, index: number): string { - const info = this.marks[mark.type.name]; + const info = this.getMark(mark.type.name); const value = open ? info.open : info.close; return typeof value === 'string' ? value : value(this, mark, parent, index); } @@ -438,4 +478,23 @@ export class MarkdownSerializerState { trailing: (text.match(/(\s+)$/) || [])[0], }; } + + private isMarkAhead(parent: Node, index: number, mark: Mark): boolean { + for (;; index++) { + if (index >= parent.childCount) return false; + const next = parent.child(index); + if (!next.type.spec.isBreak) return !!mark.isInSet(next.marks); + } + } + + // Get mark info by name, with fallback to blankMark when strict mode is off. + private getMark(name: string): SerializerMarkToken { + const info = this.marks[name]; + if (!info) { + if (this.options.strict !== false) + throw new Error(`Mark type \`${name}\` not supported by Markdown renderer`); + return blankMark; + } + return info; + } }