diff --git a/CHANGELOG.md b/CHANGELOG.md index 1dafa58..1d5634e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ - Add static math rendering for `$...$` and `$$...$$` via `texmath` (LaTeX → MathML at build time) - Wrap raw HTML/inline blobs in a unique `` element (with `display: contents`) instead of `
`/`` ([#13](https://github.com/srid/heist-extra/pull/13)). Avoids xmlhtml's "div cannot contain text looking like its end tag" crash when raw HTML contains a literal `
` — most painfully, mermaid SVG with `
` HTML labels. Fixes [srid/emanote#119](https://github.com/srid/emanote/issues/119). - Pandoc `Table` rendering now applies the AST fields it had been silently dropping ([#15](https://github.com/srid/heist-extra/pull/15)): per-column alignment from `ColSpec` (with cell-level `Alignment` overriding the column default), column widths via a generated ``, cell `RowSpan`/`ColSpan` as `rowspan`/`colspan` attributes, row & cell `Attr` merged into the rendered ``/``/``, and `TableFoot` rows rendered into ``. Captions are still skipped — commonmark-hs doesn't emit them. Pure helpers (`alignmentStyle`, `colSpecsToColgroup`, `cellSpanAttrs`, `cellColumnIndices`, `mergeStyleKVs`) live in the new `Heist.Extra.Splices.Pandoc.Render.Internal` module so consumers get a clean public API. Fixes [srid/emanote#27](https://github.com/srid/emanote/issues/27). +- Group orphan opener/closer raw-HTML blocks around the markdown content between them, so the renderer emits a real DOM element instead of two stranded `` wrappers. CommonMark "type 6" HTML blocks end at the next blank line, which makes Pandoc split `
\n\nbody\n\n
` into three blocks: an opener `RawBlock`, a `Para`, and a closer `RawBlock`. Without grouping, each raw blob lives in its own wrapper and the markdown paragraph ends up a sibling of the (empty) details element rather than its child. The new `Heist.Extra.Splices.Pandoc.RawHtmlGroup.groupRawHtmlBlocks` pass walks the AST and rewrites those triplets into a `B.Div` carrying the tag in its `"tag"` attribute, which the renderer already turns into the named element. The `Div` arm of the renderer now also strips that `"tag"` directive from the serialized attributes so it doesn't leak as a literal `tag="…"` on the output element. Fixes [srid/emanote#433](https://github.com/srid/emanote/issues/433). ## 0.4.0.0 (2025-08-19) diff --git a/heist-extra.cabal b/heist-extra.cabal index d376f25..9aaa9bf 100644 --- a/heist-extra.cabal +++ b/heist-extra.cabal @@ -99,6 +99,7 @@ library Heist.Extra.Splices.Pandoc.Attr Heist.Extra.Splices.Pandoc.Ctx Heist.Extra.Splices.Pandoc.Footnotes + Heist.Extra.Splices.Pandoc.RawHtmlGroup Heist.Extra.Splices.Pandoc.Render Heist.Extra.Splices.Pandoc.Render.Internal Heist.Extra.Splices.Pandoc.Skylighting @@ -111,7 +112,9 @@ test-suite test type: exitcode-stdio-1.0 hs-source-dirs: test main-is: Spec.hs - other-modules: Heist.Extra.Splices.Pandoc.RenderSpec + other-modules: + Heist.Extra.Splices.Pandoc.RawHtmlGroupSpec + Heist.Extra.Splices.Pandoc.RenderSpec default-language: Haskell2010 default-extensions: ImportQualifiedPost OverloadedStrings ghc-options: -Wall diff --git a/src/Heist/Extra/Splices/Pandoc/RawHtmlGroup.hs b/src/Heist/Extra/Splices/Pandoc/RawHtmlGroup.hs new file mode 100644 index 0000000..b485891 --- /dev/null +++ b/src/Heist/Extra/Splices/Pandoc/RawHtmlGroup.hs @@ -0,0 +1,151 @@ +{- | Group orphan raw-HTML opener / closer block pairs around the markdown +content between them, so the renderer can emit a real DOM element that +nests the content as children. + +Background. CommonMark "type 6" HTML blocks (any block-level start tag) end +at the next blank line. Markdown like + +@ +\ + +**bold** content + +\ +@ + +reaches a Pandoc renderer as three blocks: a 'B.RawBlock' with +@"\\\n"@, a 'B.Para' for the paragraph, and another 'B.RawBlock' +with @"\\\n"@. Heist-extra's renderer wraps each raw blob in +its own @\@ element to keep xmlhtml from mangling the bytes; the +side effect is that the @\@ open and close get trapped inside +those wrappers and the markdown paragraph ends up a sibling of the (now +empty) details element rather than its child. See @srid/emanote#433@. + +This pass walks a block list and, when it sees an unbalanced opening tag +followed downstream by a matching closing tag (depth counted only against +opens of the same tag name), replaces that span with a 'B.Div' carrying +the tag name in the @"tag"@ attribute. The render path already turns @Div@ +with a @"tag"@ attr into the named element, so the markdown content lands +as a real DOM child. + +Tags without a matching closer are left as raw blocks — emitting a +synthetic close would change the input's semantics. Tag names are +compared case-insensitively to follow HTML's own rules. Attributes on +the opener (e.g. @\
@) are dropped when we group; that is +deliberate scope until a real case demands otherwise. A tag like +@\foo\@ that already balances inside one +'B.RawBlock' is left alone — the renderer handles balanced raw-HTML +fragments correctly today. + +== Volatility & boundary + +This module exists to encapsulate one specific axis of change: the +strategy for rebalancing orphan raw-HTML blocks around CommonMark "type 6" +content. Future evolution (alternative wrapping strategies, attribute +preservation on the produced 'B.Div', void-element awareness, support for +new HTML element families, smarter nesting heuristics) lives here so the +renderer's interface stays stable. Treat the boundary as load-bearing — +do not inline @groupRawHtmlBlocks@ into the renderer or scatter pieces +of the parsing logic across other modules. + +== Public surface + +Exposed for tests and downstream tooling but not covered by the +library's stability guarantees; the API can change between minor +versions without a deprecation cycle. +-} +module Heist.Extra.Splices.Pandoc.RawHtmlGroup ( + groupRawHtmlBlocks, + + -- * The @"tag"@ directive on 'B.Div' + + -- | Wire format the pass produces and the renderer consumes. Owned here + -- (the producer) so a future change to the directive scheme starts at the + -- module that decides what shape to emit. + tagDirectiveKey, + divTag, + stripTagDirective, +) where + +import Data.Char (isAsciiLower, isAsciiUpper, isDigit, isSpace) +import Data.List (lookup) +import Data.Text qualified as T +import Text.Pandoc.Definition qualified as B + +groupRawHtmlBlocks :: [B.Block] -> [B.Block] +groupRawHtmlBlocks = \case + [] -> [] + b : rest + | Just tag <- openerTag b + , Just (inner, after) <- splitAtMatchingCloser tag rest -> + B.Div ("", [], [(tagDirectiveKey, tag)]) (groupRawHtmlBlocks inner) + : groupRawHtmlBlocks after + | otherwise -> b : groupRawHtmlBlocks rest + +-- | Read @b@ as an opening raw-HTML tag, returning the (lowercased) tag name. +openerTag :: B.Block -> Maybe Text +openerTag = \case + B.RawBlock (B.Format "html") s -> do + name <- parseTagAfterPrefix "<" s + -- Reject self-closing forms (@\
@): if the body between name and + -- '>' ends in '/', it's a void-element shorthand, not an opener. + let inside = T.takeWhile (/= '>') (T.drop (T.length name + 1) (T.strip s)) + guard $ not ("/" `T.isSuffixOf` T.stripEnd inside) + pure name + _ -> Nothing + +-- | Read @b@ as a closing tag for @tag@. +closerTag :: Text -> B.Block -> Bool +closerTag tag = \case + B.RawBlock (B.Format "html") s -> parseTagAfterPrefix " False + +{- | Shared parser for the bare-tag wire format both opener and closer follow: +@PREFIX@ + tag-name chars + optional attribute body + @\>@, with whitespace +the only thing allowed past the closing @\>@. Returns the lowercased tag +name, or 'Nothing' if any guard fails (no @\>@, malformed name, content +past the @\>@). +-} +parseTagAfterPrefix :: Text -> Text -> Maybe Text +parseTagAfterPrefix prefix s = do + body <- T.stripPrefix prefix (T.strip s) + let (name, rest) = T.span isTagNameChar body + guard $ not (T.null name) + afterGT <- T.stripPrefix ">" (T.dropWhile (/= '>') rest) + guard $ T.all isSpace afterGT + pure $ T.toLower name + +isTagNameChar :: Char -> Bool +isTagNameChar c = isAsciiLower c || isAsciiUpper c || isDigit c || c == '-' + +{- | Walk forward, tracking nesting depth of @tag@. On the matching closer +(depth back to zero) split into the inner span and the remainder. 'Nothing' +means the opener is orphan at this level. +-} +splitAtMatchingCloser :: Text -> [B.Block] -> Maybe ([B.Block], [B.Block]) +splitAtMatchingCloser tag = go (1 :: Int) [] + where + -- Caller has already consumed the opener, so depth starts at 1. + go _ _ [] = Nothing + go depth acc (b : bs) + | closerTag tag b = + if depth == 1 + then Just (reverse acc, bs) + else go (depth - 1) (b : acc) bs + | Just t <- openerTag b + , t == tag = + go (depth + 1) (b : acc) bs + | otherwise = go depth (b : acc) bs + +-- | Attribute key on 'B.Div' whose value overrides the rendered element name. +tagDirectiveKey :: Text +tagDirectiveKey = "tag" + +-- | Resolve the element name a 'B.Div' should render as, defaulting to @"div"@. +divTag :: B.Attr -> Text +divTag (_, _, kvs) = fromMaybe "div" (lookup tagDirectiveKey kvs) + +-- | Drop the directive from a 'B.Attr' so it doesn't survive into rendered HTML. +stripTagDirective :: B.Attr -> B.Attr +stripTagDirective (i, cs, kvs) = + (i, cs, filter ((/= tagDirectiveKey) . fst) kvs) diff --git a/src/Heist/Extra/Splices/Pandoc/Render.hs b/src/Heist/Extra/Splices/Pandoc/Render.hs index 7910897..9218dd8 100644 --- a/src/Heist/Extra/Splices/Pandoc/Render.hs +++ b/src/Heist/Extra/Splices/Pandoc/Render.hs @@ -11,7 +11,6 @@ module Heist.Extra.Splices.Pandoc.Render ( rawNode, ) where -import Data.Map.Strict qualified as Map import Data.Map.Syntax ((##)) import Data.Text qualified as T import Heist qualified as H @@ -24,6 +23,11 @@ import Heist.Extra.Splices.Pandoc.Ctx ( RenderFeatures (..), rewriteClass, ) +import Heist.Extra.Splices.Pandoc.RawHtmlGroup ( + divTag, + groupRawHtmlBlocks, + stripTagDirective, + ) import Heist.Extra.Splices.Pandoc.Render.Internal ( alignmentStyle, cellColumnIndices, @@ -40,9 +44,14 @@ import Text.Pandoc.Definition (Pandoc (..)) import Text.Pandoc.Walk as W import Text.XmlHtml qualified as X +-- | Pre-group orphan opener/closer raw-HTML pairs at every block-list level. +preprocess :: Pandoc -> Pandoc +preprocess = W.walk groupRawHtmlBlocks + renderPandocWith :: RenderCtx -> Pandoc -> HI.Splice Identity -renderPandocWith ctx (Pandoc _meta blocks) = - foldMapM (rpBlock ctx) blocks +renderPandocWith ctx doc = + let Pandoc _meta blocks = preprocess doc + in foldMapM (rpBlock ctx) blocks rpBlock :: RenderCtx -> B.Block -> HI.Splice Identity rpBlock ctx@RenderCtx {..} b = do @@ -145,15 +154,12 @@ rpBlock' ctx@RenderCtx {..} b = case b of tfoot <- wrapSection "tfoot" "td" frows pure $ cg <> thead <> tbody <> tfoot B.Div attr bs -> - one . X.Element (getTag "div" attr) (rpAttr $ rewriteClass ctx attr) + one . X.Element (divTag attr) (rpAttr $ rewriteClass ctx (stripTagDirective attr)) <$> foldMapM (rpBlock ctx) bs B.Figure attr _caption bs -> -- TODO: support caption one . X.Element "figure" (rpAttr attr) <$> foldMapM (rpBlock ctx) bs where - getTag defaultTag (_, _, Map.fromList -> attrs) = - Map.lookup "tag" attrs & fromMaybe defaultTag - headerSplices headerId innerSplice = do "header:id" ## HI.textSplice headerId "inlines" ## innerSplice diff --git a/test/Heist/Extra/Splices/Pandoc/RawHtmlGroupSpec.hs b/test/Heist/Extra/Splices/Pandoc/RawHtmlGroupSpec.hs new file mode 100644 index 0000000..60ec982 --- /dev/null +++ b/test/Heist/Extra/Splices/Pandoc/RawHtmlGroupSpec.hs @@ -0,0 +1,138 @@ +{- | Tests for "Heist.Extra.Splices.Pandoc.RawHtmlGroup". + +The pass solves @srid/emanote#433@: orphan opener/closer raw-HTML blocks +end up wrapping a markdown paragraph as siblings rather than parents +unless the renderer can see a real DOM 'B.Div' instead of two stranded +'B.RawBlock's. These tests pin the AST shape produced for every variant +that mattered enough to write code for. +-} +module Heist.Extra.Splices.Pandoc.RawHtmlGroupSpec (spec) where + +import Data.Text (Text) +import Heist.Extra.Splices.Pandoc.RawHtmlGroup (groupRawHtmlBlocks) +import Test.Hspec +import Text.Pandoc.Definition qualified as B +import Prelude + +raw :: Text -> B.Block +raw = B.RawBlock (B.Format "html") + +para :: Text -> B.Block +para s = B.Para [B.Str s] + +-- | A 'B.Div' with the named tag and no other attrs — what the pass produces. +taggedDiv :: Text -> [B.Block] -> B.Block +taggedDiv tag = B.Div ("", [], [("tag", tag)]) + +spec :: Spec +spec = describe "groupRawHtmlBlocks (srid/emanote#433)" $ do + it "groups the issue's example: opener, paragraph, closer" $ do + -- The exact Pandoc AST emitted by parseNoteMarkdown on the issue's MD. + let input = + [ para "aaaa" + , raw "
\n" + , para "bbb" + , raw "
\n" + , para "eee" + ] + expected = + [ para "aaaa" + , taggedDiv "details" [para "bbb"] + , para "eee" + ] + groupRawHtmlBlocks input `shouldBe` expected + + it "produces an empty Div when opener and closer are adjacent" $ do + -- The "empty group" case the branch is named after: nothing between + -- the open and close raw blocks. + let input = [raw "
\n", raw "
\n"] + expected = [taggedDiv "details" []] + groupRawHtmlBlocks input `shouldBe` expected + + it "leaves an opener with no matching closer untouched" $ do + let input = [raw "
\n", para "stuck open"] + groupRawHtmlBlocks input `shouldBe` input + + it "leaves a closer with no opener untouched" $ do + let input = [para "lone", raw "
\n"] + groupRawHtmlBlocks input `shouldBe` input + + it "groups consecutive opener/closer pairs independently" $ do + let input = + [ raw "
\n" + , para "x" + , raw "
\n" + , raw "\n" + ] + expected = + [ taggedDiv "details" [para "x"] + , taggedDiv "aside" [para "y"] + ] + groupRawHtmlBlocks input `shouldBe` expected + + it "handles same-tag nesting via depth counting" $ do + -- Two opens before a close: outer must wrap the inner pair. + let input = + [ raw "
\n" + , raw "
\n" + , para "inner" + , raw "
\n" + , para "between" + , raw "
\n" + ] + expected = + [ taggedDiv + "details" + [ taggedDiv "details" [para "inner"] + , para "between" + ] + ] + groupRawHtmlBlocks input `shouldBe` expected + + it "ignores self-closing forms like
" $ do + let input = [raw "
\n", para "after"] + groupRawHtmlBlocks input `shouldBe` input + + it "ignores raw blocks that already balance internally" $ do + -- Single-line `x` doesn't get split into open/close + -- by Pandoc, so it doesn't match the opener pattern. + let input = [raw "
foo
\n", para "after"] + groupRawHtmlBlocks input `shouldBe` input + + it "matches tag names case-insensitively" $ do + let input = [raw "
\n", para "x", raw "
\n"] + expected = [taggedDiv "details" [para "x"]] + groupRawHtmlBlocks input `shouldBe` expected + + it "accepts opener attributes (and drops them on the produced Div)" $ do + -- Attribute support on the output is out of scope for this pass; + -- what matters is that the attributes don't trip the parser. + let input = + [ raw "
\n" + , para "x" + , raw "
\n" + ] + expected = [taggedDiv "details" [para "x"]] + groupRawHtmlBlocks input `shouldBe` expected + + it "groups custom-element tag names with hyphens" $ do + let input = + [ raw "\n" + , para "x" + , raw "\n" + ] + expected = [taggedDiv "my-card" [para "x"]] + groupRawHtmlBlocks input `shouldBe` expected + + it "leaves orphan tags whose closers belong to a different element" $ do + -- An
downstream stays open. + let input = [raw "