From 326e482a7a7d69a9f671aed3d7b2dc0b233dcfb7 Mon Sep 17 00:00:00 2001
From: Yuriy Demidov <d3m1d0v@yandex-team.ru>
Date: Fri, 3 Apr 2026 20:00:15 +0300
Subject: [PATCH] feat(core): backport 4 improvements to serializer from
 prosemirror-markdown upstream

---
 .../editor/src/core/markdown/Markdown.test.ts | 206 +++++++++++++++++-
 .../src/core/markdown/MarkdownSerializer.ts   | 115 +++++++---
 2 files changed, 281 insertions(+), 40 deletions(-)

diff --git a/packages/editor/src/core/markdown/Markdown.test.ts b/packages/editor/src/core/markdown/Markdown.test.ts
index b22c3f28..46a037c5 100644
--- a/packages/editor/src/core/markdown/Markdown.test.ts
+++ b/packages/editor/src/core/markdown/Markdown.test.ts
@@ -49,7 +49,7 @@ const serializer = new MarkdownSerializer(
         }) as SerializerNodeToken,
         heading: ((state, node) => {
             state.write(state.repeat('#', node.attrs.level) + ' ');
-            state.renderInline(node);
+            state.renderInline(node, false);
             state.closeBlock(node);
         }) as SerializerNodeToken,
         list_item: ((state, node) => {
@@ -94,11 +94,12 @@ const {em, strong, code} = builder;
 
 const {same, parse, serialize} = createMarkupChecker({parser, serializer});
 
-describe('markdown', () => {
-    it('parses a paragraph', () => same('hello!', doc(p('hello!')))); // TODO: move test to extensions?
+// Tests ported from prosemirror-markdown
+describe('markdown (from prosemirror-markdown)', () => {
+    it('parses a paragraph', () => same('hello!', doc(p('hello!'))));
 
     it('parses headings', () => {
-        same('# one\n\n## two\n\nthree', doc(h1('one'), h2('two'), p('three'))); // TODO: move test to extensions?
+        same('# one\n\n## two\n\nthree', doc(h1('one'), h2('two'), p('three')));
     });
 
     // FIXME bring back testing for preserving bullets and tight attrs
@@ -118,6 +119,9 @@ describe('markdown', () => {
             ),
         ));
 
+    // todo: parser adds empty paragraph before heading in list_item
+    it.skip('can parse a heading in a list', () => same('* # Foo', doc(ul(li(h1('Foo'))))));
+
     it('parses inline marks', () =>
         same(
             'Hello. Some *em* text, some **strong** text, and some `code`',
@@ -147,19 +151,23 @@ describe('markdown', () => {
     it('parses code mark inside strong text', () =>
         same('**`code` is bold**', doc(p(strong(code('code'), ' is bold')))));
 
-    // tood
+    // todo: requires dynamic backtick fencing in code mark serializer
     it.skip('parses code mark containing backticks', () =>
         same(
             '``` one backtick: ` two backticks: `` ```',
             doc(p(code('one backtick: ` two backticks: ``'))),
         ));
 
-    // todo
+    // todo: requires code mark to preserve whitespace-only content
     it.skip('parses code mark containing only whitespace', () =>
         serialize(doc(p('Three spaces: ', code('   '))), 'Three spaces: `   `'));
 
-    it('parses a line break', () =>
-        same('line one\\\nline two', doc(p('line one', br(), 'line two'))));
+    it('parses hard breaks', () => {
+        same('line one\\\nline two', doc(p('line one', br(), 'line two')));
+    });
+
+    it('parses hard breaks inside emphasis', () =>
+        same('*foo\\\nbar*', doc(p(em('foo', br(), 'bar')))));
 
     it('ignores HTML tags', () => parse('Foo < img> bar', doc(p('Foo < img> bar'))));
 
@@ -170,11 +178,11 @@ describe('markdown', () => {
 
     it('drops trailing hard breaks', () => serialize(doc(p('a', br(), br())), 'a'));
 
-    it('should remove marks from edge break (before)', () =>
-        serialize(doc(p('text', strong(br(), 'text2'))), 'text\\\n**text2**'));
+    it('properly expels whitespace before a hard break', () =>
+        serialize(doc(p(strong('foo ', br()), 'bar')), '**foo** \\\nbar'));
 
-    it('should remove marks from edge break (after)', () =>
-        serialize(doc(p(strong('text', br()), 'text2')), '**text**\\\ntext2'));
+    it("doesn't crash when a block ends in a hard break", () =>
+        serialize(doc(p(strong('foo', br()))), '**foo**'));
 
     it('expels enclosing whitespace from inside emphasis', () =>
         serialize(
@@ -204,6 +212,48 @@ describe('markdown', () => {
 
     it("doesn't escape characters in code", () => same('foo`*`', doc(p('foo', code('*')))));
 
+    // todo: requires smarter startOfLine escape – don't escape `\d+.` without trailing space
+    it.skip('does not escape list markers without space after them', () =>
+        same('1.2kg', doc(p('1.2kg'))));
+
+    // todo: requires removing `+` from defaultEsc in esc()
+    it.skip("doesn't escape +++", () => same('+++', doc(p('+++'))));
+
+    it('escapes list markers inside lists', () => {
+        same('* 1\\. hi\n\n* x', doc(ul(li(p('1. hi')), li(p('x')))));
+    });
+
+    it("doesn't escape block-start characters in heading content", () => {
+        same('# 1. foo', doc(h1('1. foo')));
+    });
+
+    it('escapes ATX heading markers with space after them', () => {
+        same('\\### text', doc(p('### text')));
+    });
+
+    it('escapes ATX heading markers followed by end of line', () => {
+        same('\\###', doc(p('###')));
+    });
+
+    // todo: requires smarter startOfLine escape for # – only escape #{1,6} followed by space or EOL
+    it.skip('does not escape ATX heading markers without space after them', () => {
+        same('#hashtag', doc(p('#hashtag')));
+    });
+
+    // todo: requires smarter startOfLine escape for # – only escape #{1,6} followed by space or EOL
+    it.skip('does not escape ATX heading markers consisting of more than 6 in a sequence', () => {
+        same('#######', doc(p('#######')));
+    });
+});
+
+// Tests specific to the fork's extensions and customizations
+describe('markdown (fork-specific)', () => {
+    it('should remove marks from edge break (before)', () =>
+        serialize(doc(p('text', strong(br(), 'text2'))), 'text\\\n**text2**'));
+
+    it('should remove marks from edge break (after)', () =>
+        serialize(doc(p(strong('text', br()), 'text2')), '**text**\\\ntext2'));
+
     it('escapes special characters in a text', () => {
         same(
             'Markdown special characters: \\_underscore\\_, \\*asterisk\\*, \\`backtick\\`, \\$dollar\\$, \\{curly\\} brace, \\[square\\] bracket, and a \\|vertical\\| bar.',
@@ -238,4 +288,136 @@ describe('markdown', () => {
         same('trailing\\_', doc(p('trailing_')));
         same('space \\_ space', doc(p('space _ space')));
     });
+
+    describe('expelEnclosingWhitespace with mark continuation', () => {
+        it('keeps trailing whitespace inside mark when mark continues', () => {
+            // strong("hello ") + strong(em("world")) — strong continues
+            // trailing space stays inside strong, before em opens
+            serialize(doc(p(strong('hello '), strong(em('world')))), '**hello *world***');
+        });
+
+        it('expels trailing whitespace when mark does not continue', () => {
+            serialize(doc(p(strong('hello '), 'world')), '**hello** world');
+        });
+
+        it('keeps leading whitespace inside mark when mark is already active', () => {
+            // em("hello") + em(code("x")) + em(" world") — em continues throughout
+            // leading space in " world" stays inside em
+            serialize(doc(p(em('hello'), em(code('x')), em(' world'))), '*hello`x` world*');
+        });
+
+        it('expels leading whitespace when mark is opening', () => {
+            serialize(doc(p('before', strong(' hello'))), 'before **hello**');
+        });
+    });
+
+    describe('atBlockStart – startOfLine escaping precision', () => {
+        it('does not escape # after line break inside inline content', () => {
+            serialize(doc(p(strong('text1\n#text2'))), '**text1\n#text2**');
+        });
+
+        it('does not escape - after line break inside inline content', () => {
+            serialize(doc(p(strong('line1\n-line2'))), '**line1\n-line2**');
+        });
+
+        // todo: > is in defaultEsc so it's always escaped, not just at startOfLine; needs esc() change
+        it.skip('does not escape > after line break inside inline content', () => {
+            serialize(doc(p(strong('line1\n>quote'))), '**line1\n>quote**');
+        });
+
+        it('does not escape numbered list after line break inside inline content', () => {
+            serialize(doc(p(strong('line1\n1. item'))), '**line1\n1. item**');
+        });
+
+        it('still escapes # at actual block start', () => {
+            same('\\# not a heading', doc(p('# not a heading')));
+        });
+
+        it('still escapes - at actual block start', () => {
+            same('\\- not a list', doc(p('- not a list')));
+        });
+
+        it('still escapes > at actual block start', () => {
+            same('\\>not a quote', doc(p('>not a quote')));
+        });
+
+        it('still escapes numbered list at actual block start', () => {
+            same('1\\. not a list', doc(p('1. not a list')));
+        });
+    });
+
+    describe('render (strict mode for nodes)', () => {
+        const nodeStrictSerializer = new MarkdownSerializer(
+            {
+                text: ((state, node) => {
+                    state.text(node.text ?? '');
+                }) as SerializerNodeToken,
+                paragraph: ((state, node) => {
+                    state.renderInline(node);
+                    state.closeBlock(node);
+                }) as SerializerNodeToken,
+                // 'heading' is NOT registered
+            },
+            {
+                em: {open: '*', close: '*', mixable: true, expelEnclosingWhitespace: true},
+                strong: {open: '**', close: '**', mixable: true, expelEnclosingWhitespace: true},
+            },
+        );
+
+        it('throws on unknown node in strict mode (default)', () => {
+            expect(() => nodeStrictSerializer.serialize(doc(h1('text')))).toThrow(
+                /Token type `heading` not supported by Markdown renderer/,
+            );
+        });
+
+        it('renders inline content of unknown node in non-strict mode', () => {
+            expect(nodeStrictSerializer.serialize(doc(h1('text')), {strict: false})).toBe('text');
+        });
+
+        it('renders inline content with marks of unknown node in non-strict mode', () => {
+            expect(
+                nodeStrictSerializer.serialize(doc(h1('hello ', strong('world'))), {strict: false}),
+            ).toBe('hello **world**');
+        });
+    });
+
+    describe('getMark (strict mode)', () => {
+        const strictSerializer = new MarkdownSerializer(
+            {
+                text: ((state, node) => {
+                    state.text(node.text ?? '');
+                }) as SerializerNodeToken,
+                paragraph: ((state, node) => {
+                    state.renderInline(node);
+                    state.closeBlock(node);
+                }) as SerializerNodeToken,
+            },
+            {
+                // only 'strong' is registered, 'em' and 'code' are missing
+                strong: {open: '**', close: '**', mixable: true, expelEnclosingWhitespace: true},
+            },
+        );
+
+        it('throws on unknown mark in strict mode (default)', () => {
+            expect(() => strictSerializer.serialize(doc(p(em('text'))))).toThrow(
+                /Mark type `em` not supported by Markdown renderer/,
+            );
+        });
+
+        it('silently ignores unknown mark in non-strict mode', () => {
+            expect(strictSerializer.serialize(doc(p(em('text'))), {strict: false})).toBe('text');
+        });
+
+        it('serializes known marks normally in non-strict mode', () => {
+            expect(strictSerializer.serialize(doc(p(strong('text'))), {strict: false})).toBe(
+                '**text**',
+            );
+        });
+
+        it('silently ignores unknown mark mixed with known marks in non-strict mode', () => {
+            expect(strictSerializer.serialize(doc(p(strong(em('text')))), {strict: false})).toBe(
+                '**text**',
+            );
+        });
+    });
 });
diff --git a/packages/editor/src/core/markdown/MarkdownSerializer.ts b/packages/editor/src/core/markdown/MarkdownSerializer.ts
index d80a258c..c53ce6ee 100644
--- a/packages/editor/src/core/markdown/MarkdownSerializer.ts
+++ b/packages/editor/src/core/markdown/MarkdownSerializer.ts
@@ -31,6 +31,8 @@ interface SerializerOptions {
     escape?: boolean; // Added to fix types
 }
 
+// Non-strict fallback for unregistered marks; mixable so it never blocks mark reordering.
+const blankMark: SerializerMarkToken = {open: '', close: '', mixable: true};
 interface MarkMap {
     [markName: string]: SerializerMarkToken;
 }
@@ -125,6 +127,7 @@ export class MarkdownSerializerState {
     inTightList: boolean;
     noAutoBlank: boolean;
     isAutolink: boolean | undefined;
+    atBlockStart: boolean;
     escapeWhitespace: boolean;
     escapeCharacters?: string[];
 
@@ -143,6 +146,7 @@ export class MarkdownSerializerState {
         this.inTightList = false;
         this.noAutoBlank = false;
         this.isAutolink = undefined;
+        this.atBlockStart = false;
         this.escapeWhitespace = false;
         // :: Object
         // The options passed to the serializer.
@@ -226,13 +230,12 @@ export class MarkdownSerializerState {
     text(text: string, escape?: boolean) {
         const lines = text.split('\n');
         for (let i = 0; i < lines.length; i++) {
-            const startOfLine = this.atBlank() || this.closed;
             this.write();
             // Escape ! before [ to prevent being parsed as image syntax
             if (escape === false && lines[i][0] === '[' && /(^|[^\\])!$/.test(this.out))
                 this.out = this.out.slice(0, this.out.length - 1) + '\\!';
             let text = lines[i];
-            if (escape !== false && this.options.escape !== false) text = this.esc(text, startOfLine as any)
+            if (escape !== false && this.options.escape !== false) text = this.esc(text, this.atBlockStart)
             if (this.escapeWhitespace) text = this.escWhitespace(text);
             this.out += text
             if (i != lines.length - 1) this.out += '\n';
@@ -245,15 +248,22 @@ export class MarkdownSerializerState {
         if (typeof parent === 'number') {
             throw new Error('!');
         }
-        if (!this.nodes[node.type.name]) {
-            throw new Error('Token type `' + node.type.name + '` not supported by Markdown renderer');
-        }
 
         const callback = this.nodes[node.type.name];
-        if (this.dynamicModifier) {
-            this.dynamicModifier.processNode(this, node, parent, index, callback);
+        if (callback) {
+            if (this.dynamicModifier) {
+                this.dynamicModifier.processNode(this, node, parent, index, callback);
+            } else {
+                callback(this, node, parent, index);
+            }
         } else {
-            callback(this, node, parent, index);
+            if (this.options.strict !== false) {
+                throw new Error('Token type `' + node.type.name + '` not supported by Markdown renderer');
+            } else if (!node.type.isLeaf) {
+                if (node.type.inlineContent) this.renderInline(node);
+                else this.renderContent(node);
+                if (node.isBlock) this.closeBlock(node);
+            }
         }
     }
 
@@ -263,11 +273,11 @@ export class MarkdownSerializerState {
         parent.forEach((node, _, i) => this.render(node, parent, i));
     }
 
-    // :: (Node)
     // Render the contents of `parent` as inline content.
-    renderInline(parent: Node) {
-        const active: Mark[] = []; let
-            trailing = '';
+    renderInline(parent: Node, fromBlockStart = true) {
+        this.atBlockStart = fromBlockStart;
+        const active: Mark[] = [];
+        let trailing = '';
         const progress = (node: Node | null, _: any, index: number) => {
             let marks = node ? node.marks : [];
 
@@ -275,9 +285,9 @@ export class MarkdownSerializerState {
             // that mark to prevent parser edge cases with new lines just
             // before closing or after opening marks.
             if (node && node.type.spec.isBreak) {
-                marks = marks.filter((m: any) => {
+                marks = marks.filter((m) => {
                     if (index === 0) return false;
-                    if (index + 1 == parent.childCount) return false;
+                    if (index + 1 === parent.childCount) return false;
                     const prev = parent.child(index - 1);
                     const next = parent.child(index + 1);
                     return (
@@ -291,21 +301,31 @@ export class MarkdownSerializerState {
             trailing = '';
             // If whitespace has to be expelled from the node, adjust
             // leading and trailing accordingly.
-            if (node && node.isText && marks.some((mark: any) => {
-                const info = this.marks[mark.type.name];
-                return info && info.expelEnclosingWhitespace;
+            if (node && node.isText && marks.some((mark) => {
+                const info = this.getMark(mark.type.name);
+                return info && info.expelEnclosingWhitespace && !mark.isInSet(active);
+            })) {
+                const [_, lead, rest] = /^(\s*)(.*)$/m.exec(node.text!)!;
+                if (lead) {
+                    leading += lead;
+                    node = rest ? (node as any).withText(rest) : null;
+                    if (!node) marks = active;
+                }
+            }
+            if (node && node.isText && marks.some((mark) => {
+                const info = this.getMark(mark.type.name);
+                return info && info.expelEnclosingWhitespace && !this.isMarkAhead(parent, index + 1, mark);
             })) {
-                const [_, lead, inner, trail] = /^(\s*)(.*?)(\s*)$/m.exec(node.text ?? '')!;
-                leading += lead;
-                trailing = trail;
-                if (lead || trail) {
-                    node = inner ? (node as any).withText(inner) : null;
+                const [_, rest, trail] = /^(.*?)(\s*)$/m.exec(node.text!)!;
+                if (trail) {
+                    trailing = trail;
+                    node = rest ? (node as any).withText(rest) : null;
                     if (!node) marks = active;
                 }
             }
 
             const inner = marks.length && marks[marks.length - 1];
-            const noEsc = inner && this.marks[inner.type.name].escape === false;
+            const noEsc = inner && this.getMark(inner.type.name).escape === false;
             const len = marks.length - (noEsc ? 1 : 0);
 
             // Try to reorder 'mixable' marks, such as em and strong, which
@@ -315,12 +335,17 @@ export class MarkdownSerializerState {
             // eslint-disable-next-line  no-labels
             outer: for (let i = 0; i < len; i++) {
                 const mark = marks[i];
-                if (!this.marks[mark.type.name].mixable) break;
+                if (!this.getMark(mark.type.name).mixable) break;
                 for (let j = 0; j < active.length; j++) {
                     const other = active[j];
-                    if (!this.marks[other.type.name].mixable) break;
+                    if (!this.getMark(other.type.name).mixable) break;
                     if (mark.eq(other)) {
-                        if (i > j) { marks = marks.slice(0, j).concat(mark).concat(marks.slice(j, i)).concat(marks.slice(i + 1, len)) } else if (j > i) { marks = marks.slice(0, i).concat(marks.slice(i + 1, j)).concat(mark).concat(marks.slice(j, len)) }
+                        if (i > j) {
+                            marks = marks.slice(0, j).concat(mark).concat(marks.slice(j, i)).concat(marks.slice(i + 1, len))
+                        }
+                        else if (j > i) {
+                            marks = marks.slice(0, i).concat(marks.slice(i + 1, j)).concat(mark).concat(marks.slice(j, len))
+                        }
                         // eslint-disable-next-line no-labels
                         continue outer;
                     }
@@ -345,6 +370,7 @@ export class MarkdownSerializerState {
                     const add = marks[active.length];
                     active.push(add);
                     this.text(this.markString(add, true, parent, index), false);
+                    this.atBlockStart = false;
                 }
 
                 // Render the node. Special case code marks, since their content
@@ -352,11 +378,25 @@ export class MarkdownSerializerState {
                 if (noEsc && node.isText) {
                     this.text(this.markString(inner, true, parent, index) + node.text +
                       this.markString(inner, false, parent, index + 1), false);
-                } else { this.render(node, parent, index) }
+                } else {
+                    this.render(node, parent, index)
+                }
+                this.atBlockStart = false;
+            }
+
+            // After the first non-empty text node is rendered, the end of output
+            // is no longer at block start.
+            //
+            // FIXME: If a non-text node writes something to the output for this
+            // block, the end of output is also no longer at block start. But how
+            // can we detect that?
+            if (node?.isText && node.nodeSize > 0) {
+                this.atBlockStart = false;
             }
         };
         parent.forEach(progress);
         progress(null, null, parent.childCount);
+        this.atBlockStart = false;
     }
 
     // :: (Node, string, (number) → string)
@@ -423,7 +463,7 @@ export class MarkdownSerializerState {
     // :: (Mark, bool, string?) → string
     // Get the markdown string for a given opening or closing mark.
     markString(mark: Mark, open: boolean, parent: Node, index: number): string {
-        const info = this.marks[mark.type.name];
+        const info = this.getMark(mark.type.name);
         const value = open ? info.open : info.close;
         return typeof value === 'string' ? value : value(this, mark, parent, index);
     }
@@ -438,4 +478,23 @@ export class MarkdownSerializerState {
             trailing: (text.match(/(\s+)$/) || [])[0],
         };
     }
+
+    private isMarkAhead(parent: Node, index: number, mark: Mark): boolean {
+        // Break nodes never decide the answer, so step over them; the first
+        // other child at or after `index` tells whether `mark` is in its marks.
+        let i = index;
+        while (i < parent.childCount && parent.child(i).type.spec.isBreak) i++;
+        return i < parent.childCount && Boolean(mark.isInSet(parent.child(i).marks));
+    }
+
+    // Serializer spec registered for mark `name`; see below for unregistered names.
+    private getMark(name: string): SerializerMarkToken {
+        const spec = this.marks[name];
+        if (spec) return spec;
+
+        // Unregistered mark: tolerated only when `strict` is explicitly false.
+        if (this.options.strict === false) return blankMark;
+
+        throw new Error(`Mark type \`${name}\` not supported by Markdown renderer`);
+    }
 }