diff --git a/astro.config.ts b/astro.config.ts index 4d803f425..a5faa1826 100644 --- a/astro.config.ts +++ b/astro.config.ts @@ -396,6 +396,7 @@ export default defineConfig({ "adventure/minimessage/api", "adventure/minimessage/dynamic-replacements", "adventure/minimessage/translator", + "adventure/minimessage/specification", ], }, "adventure/serializer/ansi", diff --git a/ec.config.mjs b/ec.config.mjs index cb21c5af1..ee20df8f2 100644 --- a/ec.config.mjs +++ b/ec.config.mjs @@ -1,5 +1,6 @@ import { pluginCollapsibleSections } from "@expressive-code/plugin-collapsible-sections"; import { pluginLineNumbers } from "@expressive-code/plugin-line-numbers"; +import backusNaurHighlight from "./src/utils/shiki/bnf.tmLanguage.json" with { type: "json" }; import miniMessageHighlight from "./src/utils/shiki/mm.tmLanguage.json" with { type: "json" }; /** @type {import('@astrojs/starlight/expressive-code').StarlightExpressiveCodeOptions} */ @@ -14,6 +15,6 @@ export default { }, emitExternalStylesheet: false, shiki: { - langs: [miniMessageHighlight], + langs: [miniMessageHighlight, backusNaurHighlight], }, }; diff --git a/src/content/docs/adventure/minimessage/specification.md b/src/content/docs/adventure/minimessage/specification.md new file mode 100644 index 000000000..4394acca1 --- /dev/null +++ b/src/content/docs/adventure/minimessage/specification.md @@ -0,0 +1,289 @@ +--- +title: Language Specification +slug: adventure/minimessage/specification +description: A developer-facing specification of the MiniMessage format. +tableOfContents: + minHeadingLevel: 2 + maxHeadingLevel: 4 +--- + +This document outlines the MiniMessage format in detail to aid developers who wish to implement their own MiniMessage +parser from scratch or understand the internal processes happening during the parsing of MiniMessage formatted strings. 
+ +The keywords “MUST”, “MUST NOT”, “REQUIRED”, “SHALL”, “SHALL NOT”, “SHOULD”, “SHOULD NOT”, “RECOMMENDED”, +“MAY”, and “OPTIONAL” in this document are to be interpreted as described in +[RFC 2119](https://www.rfc-editor.org/rfc/rfc2119.html). + +## The MiniMessage language + +The MiniMessage language is a markup format used for representing Minecraft's component-based text +system in a human-readable and modifiable way. Broadly speaking, the language consists of two types +of tokens: **plain text** and **tags**. + +Plain text is any string. This string is UTF-16 compatible. The following is an example of a valid +plain text part of a MiniMessage-formatted string: + +```mm +The MiniMessage format was made to be as simple as possible. +Emojis are allowed 😅. So are Japanese characters, like 紙. +``` + +MiniMessage tags are primarily used for adding markup information to plain text parts. They can, however, +also add entirely new content into the serialized component. How a tag is resolved makes no +difference to the MiniMessage lexer. A tag has the following structure: + +```mm + +``` + +A tag consists of the following parts: + +- `< >`: All tags are surrounded by less-than and more-than symbols. +- `tagname`: Every tag starts with the name. The name follows the list of characters mentioned as allowed + in the [misc/identifiers](#identifiers) section. +- Tags can have arguments. There are two types of arguments: named and sequenced. Named arguments + are, as the name implies, named in some way. Sequenced arguments do not have a name; instead, they are a simple list + of string values. [Tag argument documentation can be found later on this page](#tag-arguments). + +### Tag syntax + +MiniMessage tags can surround text. + +```mm +Inner text and outer text. +``` + +Tags can be closed by repeating the tag, with a slash in front of the name. Tags are closed implicitly +when the end of the string is reached. 
Furthermore, tags can be nested: + +```mm +Some text even more text, and that's really it! +``` + +Nested tags are closed implicitly when the outer tag is closed. + +```mm +This text is unmarked marked, inner, and again no longer marked. +``` + +If a tag has arguments, these must not be repeated on the closing tag. + +```mm +Some text +``` + +Lastly, normal tags can be closed instantly by prepending a `/` to the more-than symbol of an opening tag. + +```mm +This tag is auto-closed: +``` + +## Tag arguments + +Arguments are placed between the tag name and the closing more-than symbol. + +```mm + +``` + +### Named argument types + +Before each named argument, a piece of [whitespace](#whitespace) must be present. + +There exist two types of named arguments: value-based arguments and flag arguments. + +#### Flag argument type + +Flags may be preceded by a single exclamation mark `!` and must follow the rules set by [identifiers](#identifiers). + +```mm + +``` + +The following shows a tag with invalid flags: + +```mm + +``` + +#### Valued argument type + +Named arguments with a value consist of an identifier, an equals sign `=`, and a value. + +The identifier follows the rules as explained in the [misc/identifiers](#identifiers) section of this page. +The value may consist of any UTF-16 characters, but must not contain any whitespace, unless explicitly quoted. +Please refer to [misc/quoting](#quoting) for any specifics. + +Here is an example of valid valued named arguments: + +```mm + +``` + +An example of invalid valued named arguments: + +```mm + +``` + +:::note + +The above tag, assuming the identifiers were valid, would actually parse both `and` and `blue` as flags. + +::: + +#### Combining flags and values + +These two named argument types can be combined in any way. + +```mm + +``` + +### Sequential arguments + +Sequential arguments are declared at the end of the tag. Each sequential argument starts with a colon `:`. 
+Unless named arguments are present, whitespace before the first colon `:` is not necessary. + +Sequential arguments may contain any UTF-16 characters. Any instances of `<`, `>`, or `:` characters +must either be escaped (see [misc/escaping](#escaping)) or the argument must be wrapped in quotes +(see [misc/quoting](#quoting)). + +The following are valid MiniMessage tags with sequential arguments: + +```mm + + + + + + + and a \: colon!> + + and a : colon, but it's quoted!"> +``` + +### Combining argument types + +Named and sequential arguments can be used together. The general syntax looks as follows: + +```mm + +``` + +All named arguments must be located between the tag name and the first non-value colon. + +A few examples of valid tags making use of both named and sequenced arguments: + +```mm + + + + + +``` + +## Misc + +This section defines miscellaneous behavior of common parts. + +### Identifiers + +All identifiers must be lowercase and contain only alphanumeric characters, `_`, or `-`. All identifiers +used as named argument names should be unique. + +### Quoting +Argument values can be quoted. A value counts as quoted if the first character is a `'` or `"`. The quoted +value ends as soon as another unquoted quote of the same character as the starting quote is found at the +end of an argument. + +Between the opening and the closing quote, any UTF-16 characters may be present. This also includes the same +quote as used for the string. The following would be a valid tag: + +```mm + +``` + +This is because the `"` in the middle is **not the last character of the value**. Therefore, it is read +literally, since the tag would otherwise be invalid. + +:::tip + +As long as the quote is not closed, the lexer must continue reading characters. If the end of the +input is reached before a closing quote is found, the tag and any following characters should be +read as plain text, as the tag is never closed. This is to aid users in finding the error in their syntax. 
+ +::: + +### Whitespace + +A whitespace character may be a classical space `\s`, a tab character `\t`, +a newline `\n`, or a carriage return `\r`. + +### Escaping + +In MiniMessage, certain symbols that would otherwise be interpreted specially by the lexer may be preceded by a backslash `\` +to instead be included literally. This includes backslash `\` characters, if they would have any effect on the next +symbol. If a backslash character has no effect, it is included literally. + +## Formal grammar + +This segment declares the formal grammar (in a flavor of the Backus-Naur form) which specifies the MiniMessage language. + +The specific flavor used here differs in that non-terminal symbols are not enclosed in angle brackets `<>` +and the `::=` meta symbol is replaced by `→`. Curly brackets `{}` declare optional parts. Lastly, a `+` suffix +declares that a symbol should appear at least once, but may appear more often, whilst a `*` suffix declares that +a symbol may appear zero or more times. + +```bnf +; Important notes regarding this specific grammar: due to the massive number of characters included +; in the UTF-16 character set, some special non-terminal symbols have been added: +; +; utf-16-char → includes all UTF-16 characters. +; +; utf-16-char-no-whitespace → includes all UTF-16 characters except for spaces (\s), tabs (\t), newlines (\n) +; and carriage returns (\r). +; +; utf-16-char-no-angle-or-colon → includes all UTF-16 characters except for the +; angle-bracket characters (<>) and colon (:). However, +; those characters are valid if an odd number of backslash +; characters is located in front of them. 
+ +minimessage → string {tag string} + +string → utf-16-char* + +tag → "<" tag-name tag-arguments "/>" +tag → "<" tag-name tag-arguments ">" minimessage {""} + +tag-name → identifier + +tag-arguments → "" | named-argument " "+ sequential-argument | named-argument | " "* sequential-argument + +named-argument → "" | " "+ {"!"} identifier {named-argument} | " "+ identifier "=" named-value {named-argument} + +named-value → "" | quoted | no-whitespace-string + +no-whitespace-string → utf-16-char-no-whitespace* + +sequential-argument → ":" sequential-value {sequential-argument} + +sequential-value → "" | quoted | sequential-string + +sequential-string → utf-16-char-no-angle-or-colon* + +quoted → "'" string "'" | """ string """ + +identifier → alphanumeric+ + +alphanumeric → "a" | "b" | "c" | "d" + | "e" | "f" | "g" | "h" + | "i" | "j" | "k" | "l" + | "m" | "n" | "o" | "p" + | "q" | "r" | "s" | "t" + | "u" | "v" | "w" | "x" + | "y" | "z" | "_" | "0" + | "1" | "2" | "3" | "4" + | "5" | "6" | "7" | "8" + | "9" | "-" +``` diff --git a/src/utils/shiki/bnf.tmLanguage.json b/src/utils/shiki/bnf.tmLanguage.json new file mode 100644 index 000000000..065e3d9a3 --- /dev/null +++ b/src/utils/shiki/bnf.tmLanguage.json @@ -0,0 +1,82 @@ +{ + "$schema": "https://raw.githubusercontent.com/martinring/tmlanguage/master/tmlanguage.json", + "name": "bnf", + "scopeName": "source.bnf", + "patterns": [{ "include": "#comment" }, { "include": "#rule" }, { "include": "#meta" }, { "include": "#strings" }], + "repository": { + "comment": { + "name": "comment.line.semicolon.bnf", + "match": ";.*$" + }, + + "rule": { + "name": "meta.rule.bnf", + "begin": "^(\\s*)([A-Za-z0-9_-]+)(\\s*→)", + "beginCaptures": { + "2": { "name": "entity.name.function.nonterminal.bnf" }, + "3": { "name": "keyword.reserved.arrow.bnf" } + }, + "end": "(?=^\\s*[A-Za-z0-9_-]+\\s*→|\\Z)", + "patterns": [ + { "include": "#tripleQuotedNonterminal" }, + { "include": "#strings" }, + { "include": "#meta" }, + { + "match": 
"\\b[A-Za-z0-9_-]+\\b", + "name": "variable.language.nonterminal.bnf" + } + ] + }, + + "tripleQuotedNonterminal": { + "name": "meta.triplequoted.nonterminal.bnf", + "begin": "\"{3}", + "beginCaptures": { + "0": { "name": "string.quoted.double.bnf" } + }, + "end": "\"{3}", + "endCaptures": { + "0": { "name": "string.quoted.double.bnf" } + }, + "patterns": [ + { + "match": "\\b[A-Za-z0-9_-]+\\b", + "name": "variable.language.nonterminal.bnf" + } + ] + }, + + "strings": { + "patterns": [ + { + "name": "string.quoted.double.bnf", + "begin": "\"", + "end": "\"", + "patterns": [{ "match": "\"\"", "name": "constant.character.escape.doublequote.bnf" }] + }, + { + "name": "string.quoted.single.bnf", + "begin": "'", + "end": "'" + } + ] + }, + + "meta": { + "patterns": [ + { + "match": "→", + "name": "keyword.reserved.arrow.bnf" + }, + { + "match": "\\|", + "name": "keyword.reserved.choice.bnf" + }, + { + "match": "[{}()]", + "name": "punctuation.section.group.bnf" + } + ] + } + } +} diff --git a/src/utils/shiki/mm.tmLanguage.json b/src/utils/shiki/mm.tmLanguage.json index 5788d0d86..e7683527e 100644 --- a/src/utils/shiki/mm.tmLanguage.json +++ b/src/utils/shiki/mm.tmLanguage.json @@ -4,39 +4,79 @@ "patterns": [ { "name": "meta.tag.mm", - "begin": "(<)(/?|!?)([a-zA-Z0-9_#]+)", + "begin": "()", + "end": "/?>", "endCaptures": { - "1": { "name": "constant.language.tag.mm" } + "1": { "name": "punctuation.definition.tag.mm" } }, "patterns": [ { - "name": "string.quoted.single.argument.mm", - "match": "(:)'([^']*)'", + "name": "variable.parameter.sequenced.quoted.mm", + "match": "(\\s*)(:)(\".*\")", "captures": { - "1": { "name": "constant.language.tag.mm" }, - "2": { "name": "string.quoted.single.argument.mm" } + "2": { "name": "punctuation.definition.identifier.mm" }, + "3": { "name": "string.quoted.double.mm" } } }, { - "name": "string.quoted.double.argument.mm", - "match": "(:)\"([^\"]*)\"", + "name": "variable.parameter.sequenced.single-quoted.mm", + "match": 
"(\\s*)(:)('.*')", "captures": { - "1": { "name": "constant.language.tag.mm" }, - "2": { "name": "string.quoted.double.argument.mm" } + "2": { "name": "punctuation.definition.identifier.mm" }, + "3": { "name": "string.quoted.single.mm" } } }, { - "name": "variable.language.argument.mm", - "match": "(:)([^:\\s\"'>]+(?:\\[[^\\]]*\\])?)", + "name": "variable.parameter.sequenced.unquoted.mm", + "match": "(\\s*)(:)(((\\\\>)|(\\\\:)|[^>:])*)", "captures": { - "1": { "name": "constant.language.tag.mm" }, - "2": { "name": "variable.language.argument.mm" } + "2": { "name": "punctuation.definition.identifier.mm" }, + "3": { "name": "string.unquoted.mm" } + } + }, + { + "name": "variable.parameter.named-argument.quoted.mm", + "match": "(\\s+)([a-z_0-9-]+)(=)(\".*\")", + "captures": { + "2": { "name": "keyword.reserved.identifier.mm" }, + "3": { "name": "punctuation.definition.identifier.mm" }, + "4": { "name": "string.quoted.double.mm" } + } + }, + { + "name": "variable.parameter.named-argument.single-quoted.mm", + "match": "(\\s+)([a-z_0-9-]+)(=)('.*')", + "captures": { + "2": { "name": "keyword.reserved.identifier.mm" }, + "3": { "name": "punctuation.definition.identifier.mm" }, + "4": { "name": "string.quoted.single.mm" } + } + }, + { + "name": "variable.parameter.named-argument.unquoted.mm", + "match": "(\\s+)([a-z_0-9-]+)(=)([^ >:]*)", + "captures": { + "2": { "name": "keyword.reserved.identifier.mm" }, + "3": { "name": "punctuation.definition.identifier.mm" }, + "4": { "name": "string.unquoted.mm" } + } + }, + { + "name": "variable.parameter.inverse-flag.mm", + "match": "(\\s+)(![a-z_0-9-]+)", + "captures": { + "2": { "name": "keyword.reserved.inverse-flag.mm" } + } + }, + { + "name": "variable.parameter.flag.mm", + "match": "(\\s+)([a-z_0-9-]+)", + "captures": { + "2": { "name": "keyword.reserved.flag.mm" } } } ]