From 54feb59b933fe98f5c92cfabe5d7040b694fbe9c Mon Sep 17 00:00:00 2001 From: Martin Kavalar Date: Wed, 26 Nov 2025 14:30:47 +1100 Subject: [PATCH 1/6] Move opts to own map --- dev/nextjournal/markdown/parser.cljc | 53 +++---- src/nextjournal/markdown.cljc | 13 +- src/nextjournal/markdown/impl.clj | 6 +- src/nextjournal/markdown/impl.cljs | 4 +- src/nextjournal/markdown/impl/extensions.clj | 2 +- src/nextjournal/markdown/impl/utils.cljc | 20 ++- test/nextjournal/markdown_test.cljc | 144 +++++++++---------- 7 files changed, 118 insertions(+), 124 deletions(-) diff --git a/dev/nextjournal/markdown/parser.cljc b/dev/nextjournal/markdown/parser.cljc index eeeaecf..c91dbe6 100644 --- a/dev/nextjournal/markdown/parser.cljc +++ b/dev/nextjournal/markdown/parser.cljc @@ -19,12 +19,13 @@ ;; - `:heading-level` specific of `:heading` nodes ;; - `:attrs` attributes as passed by markdown-it tokens (e.g `{:style "some style info"}`) (ns nextjournal.markdown.parser - (:require [clojure.string :as str] + (:require #?@(:cljs [[applied-science.js-interop :as j]]) + [clojure.string :as str] [clojure.zip :as z] [nextjournal.markdown :as md] + [nextjournal.markdown.impl.utils :as utils] [nextjournal.markdown.transform :as md.transform] - [nextjournal.markdown.utils.emoji :as emoji] - #?@(:cljs [[applied-science.js-interop :as j]]))) + [nextjournal.markdown.utils.emoji :as emoji])) ;; clj common accessors (def get-in* #?(:clj get-in :cljs j/get-in)) @@ -193,8 +194,10 @@ (fn [node cs] (assoc node :content (vec cs))) doc)) -(defn assign-node-id+emoji [{:as doc ::keys [id->index path] :keys [text->id+emoji-fn]}] - (let [{:keys [id emoji]} (when (ifn? text->id+emoji-fn) (-> doc (get-in path) text->id+emoji-fn)) +(defn assign-node-id+emoji [{:as doc ::keys [id->index path] :keys [opts]}] + (let [{:keys [text->id+emoji-fn]} opts + {:keys [id emoji]} (when (ifn? 
text->id+emoji-fn) + (-> doc (get-in path) text->id+emoji-fn)) id-count (when id (get id->index id))] (cond-> doc id @@ -446,7 +449,7 @@ And what. insert-sidenote-containers) (-> empty-doc - (update :text-tokenizers (partial map normalize-tokenizer)) + (update-in [:opts :text-tokenizers] (partial map normalize-tokenizer)) (apply-tokens (nextjournal.markdown/tokenize "what^[the heck]")) insert-sidenote-columns (apply-tokens (nextjournal.markdown/tokenize "# Hello")) @@ -510,7 +513,7 @@ And what. _this #should be a tag_, but this [_actually #foo shouldnt_](/bar/) is not." nextjournal.markdown/tokenize - (parse (update empty-doc :text-tokenizers conj hashtag-tokenizer)))) + (parse (update-in empty-doc [:opts :text-tokenizers] conj hashtag-tokenizer)))) (defn normalize-tokenizer @@ -546,7 +549,7 @@ _this #should be a tag_, but this [_actually #foo shouldnt_](/bar/) is not." (conj (text-hnode remaining-text)))) [node]))) -(defmethod apply-token "text" [{:as doc :keys [text-tokenizers]} {:keys [content]}] +(defmethod apply-token "text" [{:as doc :keys [opts]} {:keys [content]}] (reduce (fn [doc {:as node :keys [doc-handler]}] (doc-handler doc (dissoc node :doc-handler))) doc (reduce (fn [nodes tokenizer] @@ -554,19 +557,19 @@ _this #should be a tag_, but this [_actually #foo shouldnt_](/bar/) is not." 
(if (= :text type) (tokenize-text-node tokenizer doc node) [node])) nodes)) [{:type :text :text content :doc-handler push-node}] - text-tokenizers))) + (:text-tokenizers opts)))) (comment (def mustache (normalize-tokenizer {:regex #"\{\{([^\{]+)\}\}" :handler (fn [m] {:type :eval :text (m 1)})})) (tokenize-text-node mustache {} {:text "{{what}} the {{hellow}}"}) - (apply-token (assoc empty-doc :text-tokenizers [mustache]) + (apply-token (assoc-in empty-doc [:opts :text-tokenizers] [mustache]) {:type "text" :content "foo [[bar]] dang #hashy taggy [[what]] #dangy foo [[great]] and {{eval}} me"}) - (parse (assoc empty-doc - :text-tokenizers - [(normalize-tokenizer {:regex #"\{\{([^\{]+)\}\}" - :doc-handler (fn [{:as doc ::keys [path]} {[_ meta] :match}] - (update-in doc (ppop path) assoc :meta meta))})]) + (parse (assoc-in empty-doc + [:opts :text-tokenizers] + [(normalize-tokenizer {:regex #"\{\{([^\{]+)\}\}" + :doc-handler (fn [{:as doc ::keys [path]} {[_ meta] :match}] + (update-in doc (ppop path) assoc :meta meta))})]) (nextjournal.markdown/tokenize "# Title {{id=heading}} * one * two"))) @@ -606,27 +609,17 @@ _this #should be a tag_, but this [_actually #foo shouldnt_](/bar/) is not." (let [mapify-attrs-xf (map (fn [x] (update* x :attrs pairs->kmap)))] (reduce (mapify-attrs-xf apply-token) doc tokens))) -(def empty-doc {:type :doc - :content [] - ;; Id -> Nat, to disambiguate ids for nodes with the same textual content - ::id->index {} - ;; Node -> {id : String, emoji String}, dissoc from context to opt-out of ids - :text->id+emoji-fn (comp text->id+emoji md.transform/->text) - :toc {:type :toc} - :footnotes [] - ::path [:content -1] ;; private - :text-tokenizers []}) +(def empty-doc utils/empty-doc) (defn parse "Takes a doc and a collection of markdown-it tokens, applies tokens to doc. Uses an emtpy doc in arity 1." 
([tokens] (parse empty-doc tokens)) ([doc tokens] (-> doc - (update :text-tokenizers (partial map normalize-tokenizer)) + (update-in [:opts :text-tokenizers] (partial map normalize-tokenizer)) (apply-tokens tokens) - (dissoc ::path - ::id->index - :text-tokenizers - :text->id+emoji-fn)))) + (dissoc :opts + ::path + ::id->index)))) (comment diff --git a/src/nextjournal/markdown.cljc b/src/nextjournal/markdown.cljc index b37a0f5..b454d31 100644 --- a/src/nextjournal/markdown.cljc +++ b/src/nextjournal/markdown.cljc @@ -20,16 +20,13 @@ (defn parse "Turns the given `markdown-string` into an AST of nested clojure data. - Accepted `config` options: + Accepted `opts`: - `:text-tokenizers`: customize parsing of text in leaf nodes (see https://nextjournal.github.io/markdown/notebooks/parsing_extensibility). - - `:disable-inline-formulas`: turn off parsing of $-delimited inline formulas. - " + - `:disable-inline-formulas`: turn off parsing of $-delimited inline formulas." ([markdown-string] (parse {} markdown-string)) - ([config markdown-string] - (-> (parse* config markdown-string) - (dissoc :disable-inline-formulas - :text-tokenizers - :text->id+emoji-fn + ([opts markdown-string] + (-> (parse* {:opts opts} markdown-string) + (dissoc :opts ::impl/footnote-offset ::impl/id->index ::impl/label->footnote-ref diff --git a/src/nextjournal/markdown/impl.clj b/src/nextjournal/markdown/impl.clj index 5513b35..e4cf0a0 100644 --- a/src/nextjournal/markdown/impl.clj +++ b/src/nextjournal/markdown/impl.clj @@ -287,8 +287,8 @@ (set (keys ctx)) (set (keys u/empty-doc)))) ;; only settings were provided, we add the empty doc - (recur (merge u/empty-doc ctx) md) - (node->data (update ctx :text-tokenizers (partial map u/normalize-tokenizer)) + (recur (merge ctx (update u/empty-doc :opts merge (:opts ctx))) md) + (node->data (update-in ctx [:opts :text-tokenizers] (partial mapv u/normalize-tokenizer)) (.parse (parser ctx) md))))) (comment @@ -311,7 +311,7 @@ (parse "some para^[with other 
note]")) (parse "some `marks` inline and inline $formula$ with a [link _with_ em](https://what.tfk)") - (parse (assoc u/empty-doc :text-tokenizers [u/internal-link-tokenizer]) + (parse (assoc-in u/empty-doc [:opts :text-tokenizers] [u/internal-link-tokenizer]) "what a [[link]] is this") (parse "what the real deal is") (parse "some diff --git a/src/nextjournal/markdown/impl.cljs b/src/nextjournal/markdown/impl.cljs index 735e8d1..803a450 100644 --- a/src/nextjournal/markdown/impl.cljs +++ b/src/nextjournal/markdown/impl.cljs @@ -170,7 +170,7 @@ #_ u/insert-sidenote-containers) (-> empty-doc - (update :text-tokenizers (partial map u/normalize-tokenizer)) + (update-in [:opts :text-tokenizers] (partial map u/normalize-tokenizer)) (apply-tokens (nextjournal.markdown/tokenize "what^[the heck]")) insert-sidenote-columns (apply-tokens (nextjournal.markdown/tokenize "# Hello")) @@ -284,7 +284,7 @@ _this #should be a tag_, but this [_actually #foo shouldnt_](/bar/) is not." (let [{:as ctx-out :keys [doc title toc footnotes] ::keys [label->footnote-ref]} (-> ctx-in (assoc ::footnote-offset (count (::label->footnote-ref ctx-in))) - (update :text-tokenizers (partial map u/normalize-tokenizer)) + (update-in [:opts :text-tokenizers] (partial map u/normalize-tokenizer)) (assoc :doc (u/->zip ctx-in) :footnotes (u/->zip {:type :footnotes :content (or (:footnotes ctx-in) [])})) diff --git a/src/nextjournal/markdown/impl/extensions.clj b/src/nextjournal/markdown/impl/extensions.clj index a6e293d..1d274dc 100644 --- a/src/nextjournal/markdown/impl/extensions.clj +++ b/src/nextjournal/markdown/impl/extensions.clj @@ -103,7 +103,7 @@ (extend [^Parser$Builder pb] (.customBlockParserFactory pb block-toc-parser-factory) (.customBlockParserFactory pb block-formula-parser-factory) - (when-not (:disable-inline-formulas ctx) + (when-not (:disable-inline-formulas (:opts ctx)) (.customInlineContentParserFactory pb (reify InlineContentParserFactory (getTriggerCharacters [_] #{\$}) (create [_] 
(inline-formula-parser)))))))) diff --git a/src/nextjournal/markdown/impl/utils.cljc b/src/nextjournal/markdown/impl/utils.cljc index 71faf13..a0e9a99 100644 --- a/src/nextjournal/markdown/impl/utils.cljc +++ b/src/nextjournal/markdown/impl/utils.cljc @@ -51,9 +51,9 @@ :content [] :toc {:type :toc} :footnotes [] - :text-tokenizers [] - ;; Node -> {id : String, emoji String}, dissoc from context to opt-out of ids - :text->id+emoji-fn (comp text->id+emoji md.transform/->text) + :opts {:text-tokenizers [] + ;; Node -> {id : String, emoji String}, dissoc from context to opt-out of ids + :text->id+emoji-fn (comp text->id+emoji md.transform/->text)} ;; private ;; Id -> Nat, to disambiguate ids for nodes with the same textual content @@ -162,7 +162,8 @@ (reduce (xf rf) (assoc doc :toc {:type :toc}) content))) (defn handle-close-heading [ctx] - (let [{:keys [text->id+emoji-fn] :nextjournal.markdown.impl/keys [id->index]} ctx + (let [{:keys [opts] :nextjournal.markdown.impl/keys [id->index]} ctx + {:keys [text->id+emoji-fn]} opts heading-loc (current-loc ctx) heading (z/node heading-loc) {:keys [id emoji]} (when (ifn? text->id+emoji-fn) @@ -231,6 +232,11 @@ end" (defn tokenize-text-node [{:as tkz :keys [tokenizer-fn pred doc-handler]} ctx {:as node :keys [text]}] ;; TokenizerFn -> HNode -> [HNode] + (when-not (and (fn? tokenizer-fn) + (fn? doc-handler) + (fn? pred) + (string? text)) + (throw (ex-info "boom" tkz))) (assert (and (fn? tokenizer-fn) (fn? doc-handler) (fn? 
pred) @@ -244,7 +250,7 @@ end" (-> acc (update :remaining-text subs 0 start) (cond-> - (< end (count remaining-text)) + (< end (count remaining-text)) (update :nodes conj (text-hnode (subs remaining-text end)))) (update :nodes conj {:doc-handler doc-handler :match match :text text @@ -256,7 +262,7 @@ end" (conj (text-hnode remaining-text)))) [node]))) -(defn handle-text-token [{:as ctx :keys [text-tokenizers]} text] +(defn handle-text-token [{:as ctx :keys [opts]} text] (reduce (fn [ctx {:as node :keys [doc-handler]}] (update-current-loc ctx doc-handler (dissoc node :doc-handler))) ctx (reduce (fn [nodes tokenizer] @@ -266,7 +272,7 @@ end" (cond-> [] (not (empty? text)) (conj {:type :text :text text :doc-handler z/append-child})) - text-tokenizers))) + (:text-tokenizers opts)))) ;; clj #_(handle-text-token (->zip {:type :doc :content []}) "some-text") diff --git a/test/nextjournal/markdown_test.cljc b/test/nextjournal/markdown_test.cljc index 99e27db..a62c73e 100644 --- a/test/nextjournal/markdown_test.cljc +++ b/test/nextjournal/markdown_test.cljc @@ -62,85 +62,83 @@ $$\\int_a^bf(t)dt$$ :content [{:type :text, :text "https://clerk.vision"}]}]}]} (md/parse "https://clerk.vision")))) -(defn parse-internal-links [text] - (md/parse* (update u/empty-doc :text-tokenizers conj u/internal-link-tokenizer) - text)) +(def parse-internal-links + (partial md/parse {:text-tokenizers [u/internal-link-tokenizer]})) -(defn parse-hashtags [text] - (md/parse* (update u/empty-doc :text-tokenizers conj u/hashtag-tokenizer) - text)) +(def parse-hashtags + (partial md/parse {:text-tokenizers [u/hashtag-tokenizer]})) (deftest parse-test (testing "ingests markdown returns nested nodes" - (is (match? 
- {:type :doc - :footnotes [] - :title "🎱 Hello" - :content [{:content [{:text "🎱 Hello" - :type :text}] - :heading-level 1 - :attrs {:id "hello"} - :emoji "🎱" - :type :heading} - {:content [{:text "some " - :type :text} - {:content [{:text "strong" - :type :text}] - :type :strong} - {:text " " - :type :text} - {:content [{:text "assertion" - :type :text}] - :type :em} - {:text " and a " - :type :text} - {:attrs {:href "/path/to/something"} - :content [{:text "link" - :type :text}] - :type :link} - {:text " and a " - :type :text} - {:text "\\pi" - :type :formula} - {:text " formula" - :type :text}] - :type :paragraph} - {:content [{:text "(+ 1 2 3)\n" :type :text}] - :info "clojure" - :language "clojure" - :type :code} - {:content [{:text "no language\n" :type :text}] - :type :code} - {:text "\\int_a^bf(t)dt" - :type :block-formula} - {:content [{:content [{:content [{:text "one" - :type :text}] - :type :paragraph}] - :type :list-item} - {:content [{:content [{:text "two" - :type :text}] - :type :paragraph}] - :type :list-item}] - :type :bullet-list}] - :toc {:type :toc - :children [{:type :toc - :content [{:type :text, :text "🎱 Hello"}] - :heading-level 1 - :attrs {:id "hello"} - :emoji "🎱" - :path [:content 0]}]}} - (md/parse markdown-text)))) + (is (= {:type :doc + :footnotes [] + :title "🎱 Hello" + :content [{:content [{:text "🎱 Hello" + :type :text}] + :heading-level 1 + :attrs {:id "hello"} + :emoji "🎱" + :type :heading} + {:content [{:text "some " + :type :text} + {:content [{:text "strong" + :type :text}] + :type :strong} + {:text " " + :type :text} + {:content [{:text "assertion" + :type :text}] + :type :em} + {:text " and a " + :type :text} + {:attrs {:href "/path/to/something"} + :content [{:text "link" + :type :text}] + :type :link} + {:text " and a " + :type :text} + {:text "\\pi" + :type :formula} + {:text " formula" + :type :text}] + :type :paragraph} + {:content [{:text "(+ 1 2 3)\n" :type :text}] + :info "clojure" + :language "clojure" + :type 
:code} + {:content [{:text "no language\n" :type :text}] + :info "" + :type :code} + {:text "\\int_a^bf(t)dt" + :type :block-formula} + {:content [{:content [{:content [{:text "one" + :type :text}] + :type :paragraph}] + :type :list-item} + {:content [{:content [{:text "two" + :type :text}] + :type :paragraph}] + :type :list-item}] + :type :bullet-list}] + :toc {:type :toc + :children [{:type :toc + :content [{:type :text, :text "🎱 Hello"}] + :heading-level 1 + :attrs {:id "hello"} + :emoji "🎱" + :path [:content 0]}]}} + (md/parse markdown-text)))) (testing "parses internal links / plays well with todo lists" - (is (match? {:type :doc - :content [{:type :paragraph - :content [{:text "a " - :type :text} - {:text "wikistyle" - :type :internal-link} - {:text " link" - :type :text}]}]} - (parse-internal-links "a [[wikistyle]] link"))) + (is (= {:type :doc + :toc {:type :toc} + :footnotes [] + :content [{:type :paragraph, + :content + [{:type :text, :text "a "} + {:type :internal-link, :text "wikistyle"} + {:type :text, :text " link"}]}]} + (parse-internal-links "a [[wikistyle]] link"))) (is (match? {:type :doc :content [{:heading-level 1 From ea6030e363576f2ffd887ad7a455ed4fdbbd8666 Mon Sep 17 00:00:00 2001 From: Martin Kavalar Date: Wed, 26 Nov 2025 14:41:55 +1100 Subject: [PATCH 2/6] remove throw --- src/nextjournal/markdown/impl/utils.cljc | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/nextjournal/markdown/impl/utils.cljc b/src/nextjournal/markdown/impl/utils.cljc index a0e9a99..335fa5a 100644 --- a/src/nextjournal/markdown/impl/utils.cljc +++ b/src/nextjournal/markdown/impl/utils.cljc @@ -232,11 +232,6 @@ end" (defn tokenize-text-node [{:as tkz :keys [tokenizer-fn pred doc-handler]} ctx {:as node :keys [text]}] ;; TokenizerFn -> HNode -> [HNode] - (when-not (and (fn? tokenizer-fn) - (fn? doc-handler) - (fn? pred) - (string? text)) - (throw (ex-info "boom" tkz))) (assert (and (fn? tokenizer-fn) (fn? doc-handler) (fn? 
pred) From fe75fee8eaec0d66fb71c9fe6300b5a06a9892ab Mon Sep 17 00:00:00 2001 From: Martin Kavalar Date: Wed, 26 Nov 2025 14:54:59 +1100 Subject: [PATCH 3/6] init for cljs --- src/nextjournal/markdown/impl.cljs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/nextjournal/markdown/impl.cljs b/src/nextjournal/markdown/impl.cljs index 803a450..e7da0ee 100644 --- a/src/nextjournal/markdown/impl.cljs +++ b/src/nextjournal/markdown/impl.cljs @@ -212,7 +212,7 @@ > what about #this _this #should be a tag_, but this [_actually #foo shouldnt_](/bar/) is not." - (parse (update empty-doc :text-tokenizers conj (u/normalize-tokenizer u/hashtag-tokenizer))))) + (parse (update-in empty-doc [:opts :text-tokenizers] conj (u/normalize-tokenizer u/hashtag-tokenizer))))) (defmethod apply-token "text" [ctx ^js token] (u/handle-text-token ctx (.-content token))) @@ -220,7 +220,7 @@ _this #should be a tag_, but this [_actually #foo shouldnt_](/bar/) is not." (comment (def mustache (u/normalize-tokenizer {:regex #"\{\{([^\{]+)\}\}" :handler (fn [m] {:type :eval :text (m 1)})})) (u/tokenize-text-node mustache {} {:text "{{what}} the {{hellow}}"}) - (u/handle-text-token (assoc u/empty-doc :text-tokenizers [mustache]) + (u/handle-text-token (assoc-in u/empty-doc [:opts :text-tokenizers] [mustache]) "foo [[bar]] dang #hashy taggy [[what]] #dangy foo [[great]] and {{eval}} me")) ;; inlines @@ -280,7 +280,7 @@ _this #should be a tag_, but this [_actually #foo shouldnt_](/bar/) is not." (if (not (set/superset? (set (keys ctx-in)) (set (keys u/empty-doc)))) - (recur (merge u/empty-doc ctx-in) markdown) + (recur (merge ctx (update u/empty-doc :opts merge (:opts ctx))) markdown) (let [{:as ctx-out :keys [doc title toc footnotes] ::keys [label->footnote-ref]} (-> ctx-in (assoc ::footnote-offset (count (::label->footnote-ref ctx-in))) @@ -288,7 +288,7 @@ _this #should be a tag_, but this [_actually #foo shouldnt_](/bar/) is not." 
(assoc :doc (u/->zip ctx-in) :footnotes (u/->zip {:type :footnotes :content (or (:footnotes ctx-in) [])})) - (apply-tokens (md/tokenize #js {:disable_inline_formulas (:disable-inline-formulas ctx-in)} + (apply-tokens (md/tokenize #js {:disable_inline_formulas (:disable-inline-formulas (:opts ctx-in))} markdown)))] (-> ctx-out (dissoc :doc) From 435614bce23f7540916c2851bd17a19c3d1652b4 Mon Sep 17 00:00:00 2001 From: Martin Kavalar Date: Wed, 26 Nov 2025 14:58:50 +1100 Subject: [PATCH 4/6] mapv --- src/nextjournal/markdown/impl.cljs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nextjournal/markdown/impl.cljs b/src/nextjournal/markdown/impl.cljs index e7da0ee..03ab619 100644 --- a/src/nextjournal/markdown/impl.cljs +++ b/src/nextjournal/markdown/impl.cljs @@ -170,7 +170,7 @@ #_ u/insert-sidenote-containers) (-> empty-doc - (update-in [:opts :text-tokenizers] (partial map u/normalize-tokenizer)) + (update-in [:opts :text-tokenizers] (partial mapv u/normalize-tokenizer)) (apply-tokens (nextjournal.markdown/tokenize "what^[the heck]")) insert-sidenote-columns (apply-tokens (nextjournal.markdown/tokenize "# Hello")) @@ -284,7 +284,7 @@ _this #should be a tag_, but this [_actually #foo shouldnt_](/bar/) is not." 
(let [{:as ctx-out :keys [doc title toc footnotes] ::keys [label->footnote-ref]} (-> ctx-in (assoc ::footnote-offset (count (::label->footnote-ref ctx-in))) - (update-in [:opts :text-tokenizers] (partial map u/normalize-tokenizer)) + (update-in [:opts :text-tokenizers] (partial mapv u/normalize-tokenizer)) (assoc :doc (u/->zip ctx-in) :footnotes (u/->zip {:type :footnotes :content (or (:footnotes ctx-in) [])})) From f7115cce4c482a4a18d1526dda15c0dade29e064 Mon Sep 17 00:00:00 2001 From: Martin Kavalar Date: Wed, 26 Nov 2025 15:02:36 +1100 Subject: [PATCH 5/6] fix --- src/nextjournal/markdown/impl.cljs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/nextjournal/markdown/impl.cljs b/src/nextjournal/markdown/impl.cljs index 03ab619..03d185e 100644 --- a/src/nextjournal/markdown/impl.cljs +++ b/src/nextjournal/markdown/impl.cljs @@ -280,7 +280,7 @@ _this #should be a tag_, but this [_actually #foo shouldnt_](/bar/) is not." (if (not (set/superset? (set (keys ctx-in)) (set (keys u/empty-doc)))) - (recur (merge ctx (update u/empty-doc :opts merge (:opts ctx))) markdown) + (recur (merge ctx-in (update u/empty-doc :opts merge (:opts ctx-in))) markdown) (let [{:as ctx-out :keys [doc title toc footnotes] ::keys [label->footnote-ref]} (-> ctx-in (assoc ::footnote-offset (count (::label->footnote-ref ctx-in))) @@ -365,3 +365,4 @@ some final par" (mapcat (partial tree-seq (comp seq :children) :children)) (map #(select-keys % [:type :content :hidden :level :info :meta]))) tokens)) + From d48a506a08fc164eb7236ab1f5c3c1b8fc3ecd92 Mon Sep 17 00:00:00 2001 From: Andrea Amantini Date: Fri, 28 Nov 2025 15:52:21 +0100 Subject: [PATCH 6/6] Try delete old parser from dev folder --- dev/nextjournal/markdown/parser.cljc | 739 --------------------------- 1 file changed, 739 deletions(-) delete mode 100644 dev/nextjournal/markdown/parser.cljc diff --git a/dev/nextjournal/markdown/parser.cljc b/dev/nextjournal/markdown/parser.cljc deleted file mode 100644 index 
c91dbe6..0000000 --- a/dev/nextjournal/markdown/parser.cljc +++ /dev/null @@ -1,739 +0,0 @@ -;; # 🧩 Parsing -;; -;; Deals with transforming a sequence of tokens obtained by [markdown-it] into a nested AST composed of nested _nodes_. -;; -;; A _node_ is a clojure map and has no closed specification at the moment. We do follow a few conventions for its keys: -;; -;; - `:type` a keyword (:heading, :paragraph, :text, :code etc.) present on all nodes. -;; -;; When a node contains other child nodes, then it will have a -;; -;; - `:content` a collection of nodes representing nested content -;; -;; when a node is a textual leaf (as in a `:text` or `:formula` nodes) it carries a -;; - `:text` key with a string value -;; -;; Other keys might include e.g. -;; -;; - `:info` specific of fenced code blocks -;; - `:heading-level` specific of `:heading` nodes -;; - `:attrs` attributes as passed by markdown-it tokens (e.g `{:style "some style info"}`) -(ns nextjournal.markdown.parser - (:require #?@(:cljs [[applied-science.js-interop :as j]]) - [clojure.string :as str] - [clojure.zip :as z] - [nextjournal.markdown :as md] - [nextjournal.markdown.impl.utils :as utils] - [nextjournal.markdown.transform :as md.transform] - [nextjournal.markdown.utils.emoji :as emoji])) - -;; clj common accessors -(def get-in* #?(:clj get-in :cljs j/get-in)) -(def update* #?(:clj update :cljs j/update!)) - -#?(:clj (defn re-groups* [m] (let [g (re-groups m)] (cond-> g (not (vector? g)) vector)))) -(defn re-idx-seq - "Takes a regex and a string, returns a seq of triplets comprised of match groups followed by indices delimiting each match." - [re text] - #?(:clj (let [m (re-matcher re text)] - (take-while some? (repeatedly #(when (.find m) [(re-groups* m) (.start m) (.end m)])))) - :cljs (let [rex (js/RegExp. (.-source re) "g")] - (take-while some? 
(repeatedly #(when-some [m (.exec rex text)] [(vec m) (.-index m) (.-lastIndex rex)])))))) - - -(comment (re-idx-seq #"\{\{([^{]+)\}\}" "foo {{hello}} bar")) -(comment (re-idx-seq #"\{\{[^{]+\}\}" "foo {{hello}} bar")) -;; region node operations -;; helpers -(defn inc-last [path] (update path (dec (count path)) inc)) -(defn hlevel [{:as _token hn :tag}] (when (string? hn) (some-> (re-matches #"h([\d])" hn) second #?(:clj Integer/parseInt :cljs js/parseInt)))) - -(defn split-by-emoji [s] - (let [[match start end] (first (re-idx-seq emoji/regex s))] - (if match - [(subs s start end) (str/trim (subs s end))] - [nil s]))) - -#_(split-by-emoji " Stop") -#_(split-by-emoji "🀚🏽 Stop") -#_(split-by-emoji "🀚🏽🀚 Stop") -#_(split-by-emoji "🀚🏽Stop") -#_(split-by-emoji "🀚🏽 Stop") -#_(split-by-emoji "πŸ˜€ Stop") -#_(split-by-emoji "βš›οΈ Stop") -#_(split-by-emoji "βš› Stop") -#_(split-by-emoji "⬇ Stop") -#_(split-by-emoji "Should not πŸ™οΈ Split") - -(defn text->id+emoji [text] - (when (string? text) - (let [[emoji text'] (split-by-emoji (str/trim text))] - (cond-> {:id (apply str (map (comp str/lower-case (fn [c] (case c (\space \_) \- c))) text'))} - emoji (assoc :emoji emoji))))) - -#_(text->id+emoji "Hello There") -#_(text->id+emoji "Hello_There") -#_(text->id+emoji "πŸ‘©β€πŸ”¬ Quantum Physics") - -;; `parse-fence-info` ingests nextjournal, GFM, Pandoc and RMarkdown fenced code block info (any text following the leading 3 backticks) and returns a map -;; -;; _nextjournal_ / _GFM_ -;; -;; ```python id=2e3541da-0735-4b7f-a12f-4fb1bfcb6138 -;; python code -;; ``` -;; -;; _Pandoc_ -;; -;; ```{#pandoc-id .languge .extra-class key=Val} -;; code in language -;; ``` -;; -;; _Rmd_ -;; -;; ```{r cars, echo=FALSE} -;; R code -;; ``` -;; -;; See also: -;; - https://github.github.com/gfm/#info-string -;; - https://pandoc.org/MANUAL.html#fenced-code-blocks -;; - https://rstudio.com/wp-content/uploads/2016/03/rmarkdown-cheatsheet-2.0.pdf" - -(defn parse-fence-info [info-str] - (try - ;; 
NOTE: this fix is backported - ;; from the new implementation πŸ‘‡ - (when (and (string? info-str) (seq info-str)) - (let [tokens (-> info-str - str/trim - (str/replace #"[\{\}\,]" "") ;; remove Pandoc/Rmarkdown brackets and commas - (str/replace "." "") ;; remove dots - (str/split #" "))] ;; split by spaces - (reduce - (fn [{:as info-map :keys [language]} token] - (let [[_ k v] (re-matches #"^([^=]+)=([^=]+)$" token)] - (cond - (str/starts-with? token "#") (assoc info-map :id (str/replace token #"^#" "")) ;; pandoc #id - (and k v) (assoc info-map (keyword k) v) - (not language) (assoc info-map :language token) ;; language is the first simple token which is not a pandoc's id - :else (assoc info-map (keyword token) true)))) - {} - tokens))) - (catch #?(:clj Throwable :cljs :default) _ {}))) - -(comment - (parse-fence-info "python runtime-id=5f77e475-6178-47a3-8437-45c9c34d57ff") - (parse-fence-info "{#some-id .lang foo=nex}") - (parse-fence-info "#id clojure") - (parse-fence-info "clojure #id") - (parse-fence-info "clojure") - (parse-fence-info "{r cars, echo=FALSE}")) - -;; leaf nodes -(defn text-node [text] {:type :text :text text}) -(defn formula [text] {:type :formula :text text}) -(defn block-formula [text] {:type :block-formula :text text}) -(defn footnote-ref [ref label] (cond-> {:type :footnote-ref :ref ref} label (assoc :label label))) - -;; node constructors -(defn node - [type content attrs top-level] - (cond-> {:type type :content content} - (seq attrs) (assoc :attrs attrs) - (seq top-level) (merge top-level))) - -(defn empty-text-node? [{text :text t :type}] (and (= :text t) (empty? text))) - -(defn push-node [{:as doc ::keys [path]} node] - (try - (cond-> doc - ;; ⬇ mdit produces empty text tokens at mark boundaries, see edge cases below - (not (empty-text-node? 
node)) - (-> #_doc - (update ::path inc-last) - (update-in (pop path) conj node))) - (catch #?(:clj Exception :cljs js/Error) e - (throw (ex-info (str "nextjournal.markdown cannot add node: " node " at path: " path) - {:doc doc :node node} e))))) - -(def push-nodes (partial reduce push-node)) - -(defn open-node - ([doc type] (open-node doc type {})) - ([doc type attrs] (open-node doc type attrs {})) - ([doc type attrs top-level] - (-> doc - (push-node (node type [] attrs top-level)) - (update ::path into [:content -1])))) - -;; after closing a node, document ::path will point at it -(def ppop (comp pop pop)) -(defn close-node [doc] (update doc ::path ppop)) -(defn update-current [{:as doc path ::path} fn & args] (apply update-in doc path fn args)) - -(defn current-parent-node - "Given an open parsing context `doc`, returns the parent of the node which was last parsed into the document." - [{:as doc ::keys [path]}] - (assert path "A path is needed in document context to retrieve the current node: `current-parent-node` cannot be called after `parse`.") - (get-in doc (ppop path))) - -(defn current-ancestor-nodes - "Given an open parsing context `doc`, returns the list of ancestors of the node last parsed into the document, up to but - not including the top document." - [{:as doc ::keys [path]}] - (assert path "A path is needed in document context to retrieve the current node: `current-ancestor-nodes` cannot be called after `parse`.") - (loop [p (ppop path) ancestors []] - (if (seq p) - (recur (ppop p) (conj ancestors (get-in doc p))) - ancestors))) - -;; TODO: consider rewriting parse in terms of this zipper -(defn ->zip [doc] - (z/zipper (every-pred map? :type) :content - (fn [node cs] (assoc node :content (vec cs))) - doc)) - -(defn assign-node-id+emoji [{:as doc ::keys [id->index path] :keys [opts]}] - (let [{:keys [text->id+emoji-fn]} opts - {:keys [id emoji]} (when (ifn? 
text->id+emoji-fn) - (-> doc (get-in path) text->id+emoji-fn)) - id-count (when id (get id->index id))] - (cond-> doc - id - (update-in [::id->index id] (fnil inc 0)) - (or id emoji) - (update-in path (fn [node] - (cond-> node - id (assoc-in [:attrs :id] (cond-> id id-count (str "-" (inc id-count)))) - emoji (assoc :emoji emoji))))))) - -(comment ;; path after call - (-> empty-doc ;; [:content -1] - (open-node :heading) ;; [:content 0 :content -1] - (push-node {:node/type :text :text "foo"}) ;; [:content 0 :content 0] - (push-node {:node/type :text :text "foo"}) ;; [:content 0 :content 1] - close-node ;; [:content 1] - - (open-node :paragraph) ;; [:content 1 :content] - (push-node {:node/type :text :text "hello"}) - close-node - (open-node :bullet-list) - ;; - )) -;; endregion - -;; region TOC builder: -;; toc nodes are heading nodes but with `:type` `:toc` and an extra branching along -;; the key `:children` representing the sub-sections of the node -(defn into-toc [toc {:as toc-item :keys [heading-level]}] - (loop [toc toc l heading-level toc-path [:children]] - ;; `toc-path` is `[:children i₁ :children iβ‚‚ ... :children]` - (let [type-path (assoc toc-path (dec (count toc-path)) :type)] - (cond - ;; insert intermediate default empty :content collections for the final update-in (which defaults to maps otherwise) - (not (get-in toc toc-path)) - (recur (assoc-in toc toc-path []) l toc-path) - - ;; fill in toc types for non-contiguous jumps like h1 -> h3 - (not (get-in toc type-path)) - (recur (assoc-in toc type-path :toc) l toc-path) - - (= 1 l) - (update-in toc toc-path (fnil conj []) toc-item) - - :else - (recur toc - (dec l) - (conj toc-path - (max 0 (dec (count (get-in toc toc-path)))) ;; select last child at level if it exists - :children)))))) - -(defn add-to-toc [doc {:as h :keys [heading-level]}] - (cond-> doc (pos-int? 
heading-level) (update :toc into-toc (assoc h :type :toc)))) - -(defn set-title-when-missing [{:as doc :keys [title]} heading] - (cond-> doc (nil? title) (assoc :title (md.transform/->text heading)))) - -(defn add-title+toc - "Computes and adds a :title and a :toc to the document-like structure `doc` which might have not been constructed by means of `parse`." - [{:as doc :keys [content]}] - (let [rf (fn [doc heading] (-> doc (add-to-toc heading) (set-title-when-missing heading))) - xf (filter (comp #{:heading} :type))] - (reduce (xf rf) (assoc doc :toc {:type :toc}) content))) - -(comment - (-> {:type :toc} - ;;(into-toc {:heading-level 3 :title "Foo"}) - ;;(into-toc {:heading-level 2 :title "Section 1"}) - (into-toc {:heading-level 1 :title "Title" :type :toc}) - (into-toc {:heading-level 4 :title "Section 2" :type :toc}) - ;;(into-toc {:heading-level 4 :title "Section 2.1"}) - ;;(into-toc {:heading-level 2 :title "Section 3"}) - ) - - (-> "# Top _Title_ - -par - -### Three - -## Two - -par -- and a nested -- ### Heading not included - -foo - -## Two Again - -par - -# One Again - -[[TOC]] - -#### Four - -end" - md/parse - :toc - )) -;; endregion - -;; region token handlers -(declare apply-tokens) -(defmulti apply-token (fn [_doc token] (:type token))) -(defmethod apply-token :default [doc token] - (prn :apply-token/unknown-type {:token token}) - doc) - -;; blocks -(defmethod apply-token "heading_open" [doc token] (open-node doc :heading {} {:heading-level (hlevel token)})) -(defmethod apply-token "heading_close" [doc {doc-level :level}] - (let [{:as doc ::keys [path]} (close-node doc) - doc' (assign-node-id+emoji doc) - heading (-> doc' (get-in path) (assoc :path path))] - (cond-> doc' - ;; We're only considering top-level headings (e.g. not those contained inside quotes or lists) - (zero? doc-level) - (-> (add-to-toc heading) - (set-title-when-missing heading))))) - -;; for building the TOC we just care about headings at document top level (not e.g. 
nested under lists) ⬆ - -(defmethod apply-token "paragraph_open" [doc {:as _token :keys [hidden]}] (open-node doc (if hidden :plain :paragraph))) -(defmethod apply-token "paragraph_close" [doc _token] (close-node doc)) - -(defmethod apply-token "bullet_list_open" [doc {{:as attrs :keys [has-todos]} :attrs}] (open-node doc (if has-todos :todo-list :bullet-list) attrs)) -(defmethod apply-token "bullet_list_close" [doc _token] (close-node doc)) - -(defmethod apply-token "ordered_list_open" [doc {:keys [attrs]}] (open-node doc :numbered-list attrs)) -(defmethod apply-token "ordered_list_close" [doc _token] (close-node doc)) - -(defmethod apply-token "list_item_open" [doc {{:as attrs :keys [todo]} :attrs}] (open-node doc (if todo :todo-item :list-item) attrs)) -(defmethod apply-token "list_item_close" [doc _token] (close-node doc)) - -(defmethod apply-token "math_block" [doc {text :content}] (push-node doc (block-formula text))) -(defmethod apply-token "math_block_end" [doc _token] doc) - -(defmethod apply-token "hr" [doc _token] (push-node doc {:type :ruler})) - -(defmethod apply-token "blockquote_open" [doc _token] (open-node doc :blockquote)) -(defmethod apply-token "blockquote_close" [doc _token] (close-node doc)) - -(defmethod apply-token "tocOpen" [doc _token] (open-node doc :toc)) -(defmethod apply-token "tocBody" [doc _token] doc) ;; ignore body -(defmethod apply-token "tocClose" [doc _token] (-> doc close-node (update-current dissoc :content))) - -(defmethod apply-token "code_block" [doc {:as _token c :content}] - (-> doc - (open-node :code) - (push-node (text-node c)) - close-node)) -(defmethod apply-token "fence" [doc {:as _token i :info c :content}] - (-> doc - (open-node :code {} (assoc (parse-fence-info i) :info i)) - (push-node (text-node c)) - close-node)) - -;; footnotes -(defmethod apply-token "footnote_ref" [{:as doc :keys [footnotes]} token] - (push-node doc (footnote-ref (+ (count footnotes) (get-in* token [:meta :id])) - (get-in* token [:meta 
:label])))) - -(defmethod apply-token "footnote_anchor" [doc _token] doc) - -(defmethod apply-token "footnote_open" [{:as doc ::keys [footnote-offset]} token] - ;; consider an offset in case we're parsing multiple inputs into the same context - (let [ref (+ (get-in* token [:meta :id]) footnote-offset) - label (get-in* token [:meta :label])] - (open-node doc :footnote nil (cond-> {:ref ref} label (assoc :label label))))) - -(defmethod apply-token "footnote_close" [doc token] (close-node doc)) - -(defmethod apply-token "footnote_block_open" [{:as doc :keys [footnotes] ::keys [path]} _token] - ;; store footnotes at a top level `:footnote` key - (let [footnote-offset (count footnotes)] - (-> doc - (assoc ::path [:footnotes (dec footnote-offset)] - ::footnote-offset footnote-offset - ::path-to-restore path)))) - -(defmethod apply-token "footnote_block_close" - ;; restores path for addding new tokens - [{:as doc ::keys [path-to-restore]} _token] - (-> doc - (assoc ::path path-to-restore) - (dissoc ::path-to-restore ::footnote-offset))) - -(defn footnote->sidenote [{:keys [ref label content]}] - ;; this assumes the footnote container is a paragraph, won't work for lists - (node :sidenote (-> content first :content) nil (cond-> {:ref ref} label (assoc :label label)))) - -(defn node-with-sidenote-refs [p-node] - (loop [l (->zip p-node) refs []] - (if (z/end? l) - (when (seq refs) - {:node (z/root l) :refs refs}) - (let [{:keys [type ref]} (z/node l)] - (if (= :footnote-ref type) - (recur (z/next (z/edit l assoc :type :sidenote-ref)) (conj refs ref)) - (recur (z/next l) refs)))))) - -(defn insert-sidenote-containers - "Handles footnotes as sidenotes. - - Takes and returns a parsed document. When the document has footnotes, wraps every top-level block which contains footnote references - with a `:footnote-container` node, into each of such nodes, adds a `:sidenote-column` node containing a `:sidenote` node for each found ref. - Renames type `:footnote-ref` to `:sidenote-ref." 
- [{:as doc ::keys [_path] :keys [footnotes]}] - (if-not (seq footnotes) - doc - (let [root (->zip doc)] - (loop [loc (z/down root) parent root] - (cond - (nil? loc) - (-> parent z/node (assoc :sidenotes? true)) - (contains? #{:plain :paragraph :blockquote :numbered-list :bullet-list :todo-list :heading :table} - (:type (z/node loc))) - (if-some [{:keys [node refs]} (node-with-sidenote-refs (z/node loc))] - (let [new-loc (-> loc (z/replace {:type :sidenote-container :content []}) - (z/append-child node) - (z/append-child {:type :sidenote-column - ;; TODO: broken in the old implementation - ;; should be :content (mapv #(footnote->sidenote (get footnotes %)) (distinct refs))}))] - :content (mapv #(footnote->sidenote (get footnotes %)) refs)}))] - (recur (z/right new-loc) (z/up new-loc))) - (recur (z/right loc) parent)) - :else - (recur (z/right loc) parent)))))) - -(comment - (-> "_hello_ what and foo[^note1] and^[some other note]. - -And what. - -[^note1]: the _what_ - -* and new text[^endnote] at the end. -* the - * hell^[that warm place] - -[^endnote]: conclusion. -" - nextjournal.markdown/tokenize - parse - #_ flatten-tokens - insert-sidenote-containers) - - (-> empty-doc - (update-in [:opts :text-tokenizers] (partial map normalize-tokenizer)) - (apply-tokens (nextjournal.markdown/tokenize "what^[the heck]")) - insert-sidenote-columns - (apply-tokens (nextjournal.markdown/tokenize "# Hello")) - insert-sidenote-columns - (apply-tokens (nextjournal.markdown/tokenize "is^[this thing]")) - insert-sidenote-columns)) - -;; tables -;; table data tokens might have {:style "text-align:right|left"} attrs, maybe better nested node > :attrs > :style ? 
-(defmethod apply-token "table_open" [doc _token] (open-node doc :table)) -(defmethod apply-token "table_close" [doc _token] (close-node doc)) -(defmethod apply-token "thead_open" [doc _token] (open-node doc :table-head)) -(defmethod apply-token "thead_close" [doc _token] (close-node doc)) -(defmethod apply-token "tr_open" [doc _token] (open-node doc :table-row)) -(defmethod apply-token "tr_close" [doc _token] (close-node doc)) -(defmethod apply-token "th_open" [doc token] (open-node doc :table-header (:attrs token))) -(defmethod apply-token "th_close" [doc _token] (close-node doc)) -(defmethod apply-token "tbody_open" [doc _token] (open-node doc :table-body)) -(defmethod apply-token "tbody_close" [doc _token] (close-node doc)) -(defmethod apply-token "td_open" [doc token] (open-node doc :table-data (:attrs token))) -(defmethod apply-token "td_close" [doc _token] (close-node doc)) - -(comment - (-> -" -| Syntax | JVM | JavaScript | -|--------|:------------------------:|--------------------------------:| -| foo | Loca _lDate_ ahoiii | goog.date.Date | -| bar | java.time.LocalTime | some [kinky](link/to/something) | -| bag | java.time.LocalDateTime | $\\phi$ | -" - nextjournal.markdown/parse - nextjournal.markdown.transform/->hiccup - )) - -;; ## Handling of Text Tokens -;; -;; normalize-tokenizer :: {:regex, :doc-handler} | {:tokenizer-fn, :handler} -> Tokenizer -;; Tokenizer :: {:tokenizer-fn :: TokenizerFn, :doc-handler :: DocHandler} -;; -;; Match :: Any -;; Handler :: Match -> Node -;; IndexedMatch :: (Match, Int, Int) -;; TokenizerFn :: String -> [IndexedMatch] -;; DocHandler :: Doc -> {:match :: Match} -> Doc - -(def hashtag-tokenizer - {:regex #"(^|\B)#[\w-]+" - :pred #(every? (complement #{:link}) (map :type (current-ancestor-nodes %))) - :handler (fn [match] {:type :hashtag :text (subs (match 0) 1)})}) - -(def internal-link-tokenizer - {:regex #"\[\[([^\]]+)\]\]" - :pred #(every? 
(complement #{:link}) (map :type (current-ancestor-nodes %))) - :handler (fn [match] {:type :internal-link :text (match 1)})}) - -(comment - (->> "# Hello #Fishes - -> what about #this - -_this #should be a tag_, but this [_actually #foo shouldnt_](/bar/) is not." - nextjournal.markdown/tokenize - (parse (update-in empty-doc [:opts :text-tokenizers] conj hashtag-tokenizer)))) - - -(defn normalize-tokenizer - "Normalizes a map of regex and handler into a Tokenizer" - [{:as tokenizer :keys [doc-handler pred handler regex tokenizer-fn]}] - (assert (and (or doc-handler handler) (or regex tokenizer-fn))) - (cond-> tokenizer - (not doc-handler) (assoc :doc-handler (fn [doc {:keys [match]}] (push-node doc (handler match)))) - (not tokenizer-fn) (assoc :tokenizer-fn (partial re-idx-seq regex)) - (not pred) (assoc :pred (constantly true)))) - -(defn tokenize-text-node [{:as tkz :keys [tokenizer-fn pred doc-handler]} doc {:as node :keys [text]}] - ;; TokenizerFn -> HNode -> [HNode] - (assert (and (fn? tokenizer-fn) (fn? doc-handler) (fn? pred) (string? 
text)) - {:text text :tokenizer tkz}) - (let [idx-seq (when (pred doc) (tokenizer-fn text))] - (if (seq idx-seq) - (let [text-hnode (fn [s] (assoc (text-node s) :doc-handler push-node)) - {:keys [nodes remaining-text]} - (reduce (fn [{:as acc :keys [remaining-text]} [match start end]] - (-> acc - (update :remaining-text subs 0 start) - (cond-> - (< end (count remaining-text)) - (update :nodes conj (text-hnode (subs remaining-text end)))) - (update :nodes conj {:doc-handler doc-handler - :match match :text text - :start start :end end}))) - {:remaining-text text :nodes ()} - (reverse idx-seq))] - (cond-> nodes - (seq remaining-text) - (conj (text-hnode remaining-text)))) - [node]))) - -(defmethod apply-token "text" [{:as doc :keys [opts]} {:keys [content]}] - (reduce (fn [doc {:as node :keys [doc-handler]}] (doc-handler doc (dissoc node :doc-handler))) - doc - (reduce (fn [nodes tokenizer] - (mapcat (fn [{:as node :keys [type]}] - (if (= :text type) (tokenize-text-node tokenizer doc node) [node])) - nodes)) - [{:type :text :text content :doc-handler push-node}] - (:text-tokenizers opts)))) - -(comment - (def mustache (normalize-tokenizer {:regex #"\{\{([^\{]+)\}\}" :handler (fn [m] {:type :eval :text (m 1)})})) - (tokenize-text-node mustache {} {:text "{{what}} the {{hellow}}"}) - (apply-token (assoc-in empty-doc [:opts :text-tokenizers] [mustache]) - {:type "text" :content "foo [[bar]] dang #hashy taggy [[what]] #dangy foo [[great]] and {{eval}} me"}) - - (parse (assoc-in empty-doc - [:opts :text-tokenizers] - [(normalize-tokenizer {:regex #"\{\{([^\{]+)\}\}" - :doc-handler (fn [{:as doc ::keys [path]} {[_ meta] :match}] - (update-in doc (ppop path) assoc :meta meta))})]) - (nextjournal.markdown/tokenize "# Title {{id=heading}} -* one -* two"))) - -;; inlines -(defmethod apply-token "inline" [doc {:as _token ts :children}] (apply-tokens doc ts)) -(defmethod apply-token "math_inline" [doc {text :content}] (push-node doc (formula text))) -(defmethod apply-token 
"math_inline_double" [doc {text :content}] (push-node doc (formula text))) - -;; https://spec.commonmark.org/0.30/#softbreak -(defmethod apply-token "softbreak" [doc _token] (push-node doc {:type :softbreak})) -;; https://spec.commonmark.org/0.30/#hard-line-break -(defmethod apply-token "hardbreak" [doc _token] (push-node doc {:type :hardbreak})) - -;; images -(defmethod apply-token "image" [doc {:keys [attrs children]}] (-> doc (open-node :image attrs) (apply-tokens children) close-node)) - -;; marks -(defmethod apply-token "em_open" [doc _token] (open-node doc :em)) -(defmethod apply-token "em_close" [doc _token] (close-node doc)) -(defmethod apply-token "strong_open" [doc _token] (open-node doc :strong)) -(defmethod apply-token "strong_close" [doc _token] (close-node doc)) -(defmethod apply-token "s_open" [doc _token] (open-node doc :strikethrough)) -(defmethod apply-token "s_close" [doc _token] (close-node doc)) -(defmethod apply-token "link_open" [doc token] (open-node doc :link (:attrs token))) -(defmethod apply-token "link_close" [doc _token] (close-node doc)) -(defmethod apply-token "code_inline" [doc {text :content}] (-> doc (open-node :monospace) (push-node (text-node text)) close-node)) - -;; html (ignored) -(defmethod apply-token "html_inline" [doc _] doc) -(defmethod apply-token "html_block" [doc _] doc) -;; endregion - -;; region data builder api -(defn pairs->kmap [pairs] (into {} (map (juxt (comp keyword first) second)) pairs)) -(defn apply-tokens [doc tokens] - (let [mapify-attrs-xf (map (fn [x] (update* x :attrs pairs->kmap)))] - (reduce (mapify-attrs-xf apply-token) doc tokens))) - -(def empty-doc utils/empty-doc) - -(defn parse - "Takes a doc and a collection of markdown-it tokens, applies tokens to doc. Uses an emtpy doc in arity 1." 
- ([tokens] (parse empty-doc tokens)) - ([doc tokens] (-> doc - (update-in [:opts :text-tokenizers] (partial map normalize-tokenizer)) - (apply-tokens tokens) - (dissoc :opts - ::path - ::id->index)))) - -(comment - - (-> "# 🎱 Markdown Data - -some _emphatic_ **strong** [link](https://foo.com) - ---- - -> some ~~nice~~ quote -> for fun - -## Formulas - -[[TOC]] - -$$\\Pi^2$$ - -- [ ] and -- [x] some $\\Phi_{\\alpha}$ latext -- [ ] bullets - -## Sidenotes - -here [^mynote] to somewhere - -## Fences - -```py id=\"aaa-bbb-ccc\" -1 -print(\"this is some python\") -2 -3 -``` - -![Image Text](https://img.icons8.com/officel/16/000000/public.png) - -Hline Section -------------- - -### but also [[indented code]] - - import os - os.listdir('/') - -or monospace mark [`real`](/foo/bar) fun. - -[^mynote]: Here you _can_ `explain` at lenght -" - nextjournal.markdown/tokenize - parse - ;;seq - ;;(->> (take 10)) - ;;(->> (take-last 4)) - )) -;; endregion - -;; region zoom-in at section -(defn section-at [{:as doc :keys [content]} [_ pos :as path]] - ;; TODO: generalize over path (zoom-in at) - ;; supports only top-level headings atm (as found in TOC) - (let [{:as h section-level :heading-level} (get-in doc path) - in-section? (fn [{l :heading-level}] (or (not l) (< section-level l)))] - (when section-level - {:type :doc - :content (cons h - (->> content - (drop (inc pos)) - (take-while in-section?)))}))) - -(comment - (some-> "# Title - -## Section 1 - -foo - -- # What is this? (no!) 
-- maybe - -### Section 1.2 - -## Section 2 - -some par - -### Section 2.1 - -some other par - -### Section 2.2 - -#### Section 2.2.1 - -two two one - -#### Section 2.2.2 - -two two two - -## Section 3 - -some final par" - nextjournal.markdown/parse - (section-at [:content 9]) ;; β¬… paths are stored in TOC sections - nextjournal.markdown.transform/->hiccup)) -;; endregion - - -;; ## πŸ”§ Debug -;; A view on flattened tokens to better inspect tokens -(defn flatten-tokens [tokens] - (into [] - (comp - (mapcat (partial tree-seq (comp seq :children) :children)) - (map #(select-keys % [:type :content :hidden :level :info :meta]))) - tokens))