diff --git a/mozuku-lsp/CMakeLists.txt b/mozuku-lsp/CMakeLists.txt index aebda49..68e42f6 100644 --- a/mozuku-lsp/CMakeLists.txt +++ b/mozuku-lsp/CMakeLists.txt @@ -126,6 +126,9 @@ set(MOZUKU_SOURCES src/grammar_checker.cpp src/wikipedia.cpp src/comment_extractor.cpp + src/document_preprocessor.cpp + src/presenter.cpp + src/tree_sitter_document.cpp ) add_executable(mozuku-lsp ${MOZUKU_SOURCES}) diff --git a/mozuku-lsp/include/analyzer.hpp b/mozuku-lsp/include/analyzer.hpp index 4c02f6f..4146b06 100644 --- a/mozuku-lsp/include/analyzer.hpp +++ b/mozuku-lsp/include/analyzer.hpp @@ -1,99 +1,12 @@ #pragma once +#include "mozuku/core/config.hpp" +#include "mozuku/core/types.hpp" + #include #include #include -struct TokenData; -struct Diagnostic; - -struct DetailedPOS { - std::string mainPOS; // 主品詞 (名詞, 動詞, 助詞...) - std::string subPOS1; // 品詞細分類1 (格助詞, 副助詞, 係助詞...) - std::string subPOS2; // 品詞細分類2 - std::string subPOS3; // 品詞細分類3 - std::string inflection; // 活用型 - std::string conjugation; // 活用形 - std::string baseForm; // 原形 - std::string reading; // 読み - std::string pronunciation; // 発音 - - bool isParticle() const { return mainPOS == "助詞"; } - bool isVerb() const { return mainPOS == "動詞"; } - bool isNoun() const { return mainPOS == "名詞"; } -}; - -// Information about a particle (助詞) token -struct ParticleInfo { - std::string surface; // 表層形 - std::string function; // 格助詞, 副助詞, 係助詞, 接続助詞 - std::string role; // より詳細な役割 - size_t position; // 文中の位置 (バイト単位) - int tokenIndex; // トークン配列内のインデックス - int sentenceId; // 所属する文のID -}; - -// Sentence boundary information -struct SentenceBoundary { - size_t start; // 文の開始位置 (バイト単位) - size_t end; // 文の終了位置 (バイト単位) - int sentenceId; // 文のID - std::string text; // 文の内容 -}; - -// Dependency parsing information from CaboCha -struct DependencyInfo { - int chunkId; // チャンクID - int headId; // 係り先チャンクID - double score; // 係り受けスコア - std::string text; // チャンクのテキスト -}; - -// Configuration structures (shared between LSP server and analyzer) -struct MeCabConfig { - std::string dicPath; // Dictionary directory path - std::string charset = "UTF-8"; // Character encoding -}; - -struct AnalysisConfig { - bool enableCaboCha = true; // Enable CaboCha dependency parsing - bool grammarCheck = true; // Enable grammar diagnostics - double minJapaneseRatio = - 0.1; // Minimum Japanese character ratio for analysis - - struct RuleToggles { - bool commaLimit = true; - bool adversativeGa = true; - bool duplicateParticleSurface = true; - bool adjacentParticles = true; - bool conjunctionRepeat = true; - bool raDropping = true; - int commaLimitMax = 3; - int adversativeGaMax = 1; - int duplicateParticleSurfaceMaxRepeat = 1; - int adjacentParticlesMaxRepeat = 1; - int conjunctionRepeatMax = 1; - } rules; - - // Enhanced grammar warning settings - struct WarningLevels { - bool particleDuplicate = true; // 二重助詞警告 - bool particleSequence = true; // 不適切助詞連続 - bool particleMismatch = true; // 動詞-助詞不整合 - bool sentenceStructure = false; // 文構造問題 (実験的) - bool styleConsistency = false; // 文体混在 (実験的) - bool redundancy = false; // 冗長表現 (実験的) - } warnings; - - int warningMinSeverity = - 2; // 最小警告レベル (1=Error, 2=Warning, 3=Info, 4=Hint) -}; - -struct MoZukuConfig { - MeCabConfig mecab; - AnalysisConfig analysis; -}; - void analyzeText(const std::string &text, std::vector &tokens, std::vector &diags, const MoZukuConfig *config = nullptr); @@ -101,15 +14,6 @@ void analyzeText(const std::string &text, std::vector &tokens, void performGrammarDiagnostics(const std::string &text, std::vector &diags); -size_t computeByteOffset(const std::string &text, int line, int character); - -namespace MoZukuModifiers { -static constexpr unsigned Proper = 1u << 0; // "proper" -static constexpr unsigned Numeric = 1u << 1; // "numeric" -static constexpr unsigned Kana = 1u << 2; // "kana" -static constexpr unsigned Kanji = 1u << 3; // "kanji" -} // namespace MoZukuModifiers - namespace MoZuku { namespace mecab { @@ -132,6 +36,16 @@ class Analyzer { bool isCaboChaAvailable() const; private: + struct PreparedText { + std::string cleanText; + double japaneseRatio{0.0}; + bool belowMinJapaneseRatio{false}; + }; + + PreparedText prepareText(const std::string &text, + bool enforceMinJapaneseRatio) const; + std::vector analyzePreparedText(const PreparedText &prepared); + std::unique_ptr mecab_manager_; MoZukuConfig config_; std::string system_charset_; diff --git a/mozuku-lsp/include/encoding_utils.hpp b/mozuku-lsp/include/encoding_utils.hpp index 3260a75..c13ec83 100644 --- a/mozuku-lsp/include/encoding_utils.hpp +++ b/mozuku-lsp/include/encoding_utils.hpp @@ -5,9 +5,14 @@ namespace MoZuku { namespace encoding { +struct ConversionOptions { + bool skipInvalidInput{false}; +}; + std::string convertEncoding(const std::string &input, const std::string &fromCharset, - const std::string &toCharset = "UTF-8"); + const std::string &toCharset = "UTF-8", + ConversionOptions options = {}); std::string systemToUtf8(const std::string &input, const std::string &systemCharset); @@ -15,5 +20,9 @@ std::string systemToUtf8(const std::string &input, std::string utf8ToSystem(const std::string &input, const std::string &systemCharset); +std::string sanitizeUtf8(const std::string &input); + +size_t utf8SequenceLength(unsigned char c); + } // namespace encoding } // namespace MoZuku diff --git a/mozuku-lsp/include/grammar_checker.hpp b/mozuku-lsp/include/grammar_checker.hpp index 60d68f2..f76e76f 100644 --- a/mozuku-lsp/include/grammar_checker.hpp +++ b/mozuku-lsp/include/grammar_checker.hpp @@ -1,7 +1,7 @@ #pragma once -#include "analyzer.hpp" -#include "lsp.hpp" +#include "mozuku/core/config.hpp" +#include "mozuku/core/types.hpp" #include #include diff --git a/mozuku-lsp/include/lsp.hpp b/mozuku-lsp/include/lsp.hpp index cc136d4..1cf0f34 100644 --- a/mozuku-lsp/include/lsp.hpp +++ b/mozuku-lsp/include/lsp.hpp @@ -1,6 +1,10 @@ #pragma once #include "analyzer.hpp" +#include "mozuku/analysis/document_preprocessor.hpp" +#include "mozuku/core/config.hpp" +#include "mozuku/core/types.hpp" +#include "mozuku/lsp/presenter.hpp" #include #include #include @@ -15,78 +19,35 @@ using json = nlohmann::json; -struct Position { - int line{0}; - int character{0}; -}; - -struct Range { - Position start; - Position end; -}; - -struct Diagnostic { - Range range; - int severity{2}; - std::string message; -}; - -struct TokenData { - int line{0}; - int startChar{0}; - int endChar{0}; - std::string tokenType; // e.g. "noun", "verb" ... - unsigned int tokenModifiers{0}; - - std::string surface; // 表層形 - std::string - feature; // 品詞,品詞細分類1,品詞細分類2,品詞細分類3,活用型,活用形,原形,読み,発音 - std::string baseForm; // 原形 - std::string reading; // 読み - std::string pronunciation; // 発音 -}; - -struct AnalyzerResult { - std::vector tokens; - std::vector diags; -}; - -struct ByteRange { - size_t startByte{0}; - size_t endByte{0}; -}; - class LSPServer { public: LSPServer(std::istream &in, std::ostream &out); void run(); private: + struct DocumentState { + std::string text; + std::string languageId; + std::vector tokens; + bool tokensCached{false}; + std::unordered_map> diagnosticsByLine; + std::vector commentSegments; + std::vector contentHighlightRanges; + }; + std::istream &in_; std::ostream &out_; - // インメモリテキストストア: uri -> 全テキスト - std::unordered_map docs_; - // ドキュメントの言語ID: uri -> languageId - std::unordered_map docLanguages_; - // hover用トークン情報: uri -> トークンデータ - std::unordered_map> docTokens_; - // 行ベースの診断キャッシュ: uri -> 行番号 -> 診断情報 - std::unordered_map>> - docDiagnostics_; - // コメント解析に使用するセグメント - std::unordered_map> - docCommentSegments_; - // HTML/LaTeX 本文ハイライト用の範囲 - std::unordered_map> - docContentHighlightRanges_; + // ドキュメント単位の状態: uri -> テキスト/解析結果/補助メタデータ + std::unordered_map documents_; std::vector tokenTypes_; std::vector tokenModifiers_; MoZukuConfig config_; std::unique_ptr analyzer_; + MoZuku::analysis::DocumentPreprocessor preprocessor_; + MoZuku::lsp::Presenter presenter_; bool readMessage(std::string &jsonPayload); void reply(const json &msg); @@ -103,26 +64,21 @@ class LSPServer { json onSemanticTokensRange(const json &id, const json ¶ms); json onHover(const json &id, const json ¶ms); - void analyzeAndPublish(const std::string &uri, const std::string &text); + DocumentState &ensureDocument(const std::string &uri); + DocumentState *findDocument(const std::string &uri); + const DocumentState *findDocument(const std::string &uri) const; + static bool isJapaneseLanguage(const DocumentState &document); + + void analyzeAndPublish(const std::string &uri); void analyzeChangedLines(const std::string &uri, const std::string &newText, const std::string &oldText); - std::string prepareAnalysisText(const std::string &uri, - const std::string &text); - void sendCommentHighlights( - const std::string &uri, const std::string &text, - const std::vector &segments); - void sendSemanticHighlights(const std::string &uri, - const std::vector &tokens); - void sendContentHighlights(const std::string &uri, const std::string &text, - const std::vector &ranges); + MoZuku::analysis::ProcessedDocument prepareDocument(DocumentState &document); json buildSemanticTokens(const std::string &uri); - json buildSemanticTokensFromTokens(const std::vector &tokens); - void cacheDiagnostics(const std::string &uri, + void cacheDiagnostics(DocumentState &document, const std::vector &diags); - void removeDiagnosticsForLines(const std::string &uri, + void removeDiagnosticsForLines(DocumentState &document, const std::set &lines); - std::vector getAllDiagnostics(const std::string &uri) const; std::set findChangedLines(const std::string &oldText, const std::string &newText) const; }; diff --git a/mozuku-lsp/include/mozuku/analysis/document_preprocessor.hpp b/mozuku-lsp/include/mozuku/analysis/document_preprocessor.hpp new file mode 100644 index 0000000..56038cf --- /dev/null +++ b/mozuku-lsp/include/mozuku/analysis/document_preprocessor.hpp @@ -0,0 +1,23 @@ +#pragma once + +#include "comment_extractor.hpp" +#include "mozuku/core/types.hpp" + +#include +#include + +namespace MoZuku::analysis { + +struct ProcessedDocument { + std::string analysisText; + std::vector commentSegments; + std::vector contentHighlightRanges; +}; + +class DocumentPreprocessor { +public: + ProcessedDocument prepare(const std::string &languageId, + const std::string &text) const; +}; + +} // namespace MoZuku::analysis diff --git a/mozuku-lsp/include/mozuku/core/config.hpp b/mozuku-lsp/include/mozuku/core/config.hpp new file mode 100644 index 0000000..0be9ef6 --- /dev/null +++ b/mozuku-lsp/include/mozuku/core/config.hpp @@ -0,0 +1,52 @@ +#pragma once + +#include + +namespace MoZuku::core { + +struct MeCabConfig { + std::string dicPath; + std::string charset = "UTF-8"; +}; + +struct AnalysisConfig { + bool enableCaboCha = true; + bool grammarCheck = true; + double minJapaneseRatio = 0.1; + + struct RuleToggles { + bool commaLimit = true; + bool adversativeGa = true; + bool duplicateParticleSurface = true; + bool adjacentParticles = true; + bool conjunctionRepeat = true; + bool raDropping = true; + int commaLimitMax = 3; + int adversativeGaMax = 1; + int duplicateParticleSurfaceMaxRepeat = 1; + int adjacentParticlesMaxRepeat = 1; + int conjunctionRepeatMax = 1; + } rules; + + struct WarningLevels { + bool particleDuplicate = true; + bool particleSequence = true; + bool particleMismatch = true; + bool sentenceStructure = false; + bool styleConsistency = false; + bool redundancy = false; + } warnings; + + int warningMinSeverity = 2; +}; + +struct MoZukuConfig { + MeCabConfig mecab; + AnalysisConfig analysis; +}; + +} // namespace MoZuku::core + +using MeCabConfig = MoZuku::core::MeCabConfig; +using AnalysisConfig = MoZuku::core::AnalysisConfig; +using MoZukuConfig = MoZuku::core::MoZukuConfig; diff --git a/mozuku-lsp/include/mozuku/core/debug.hpp b/mozuku-lsp/include/mozuku/core/debug.hpp new file mode 100644 index 0000000..112e13c --- /dev/null +++ b/mozuku-lsp/include/mozuku/core/debug.hpp @@ -0,0 +1,12 @@ +#pragma once + +#include + +namespace MoZuku::debug { + +inline bool isEnabled() { + static const bool enabled = std::getenv("MOZUKU_DEBUG") != nullptr; + return enabled; +} + +} // namespace MoZuku::debug diff --git a/mozuku-lsp/include/mozuku/core/types.hpp b/mozuku-lsp/include/mozuku/core/types.hpp new file mode 100644 index 0000000..73dc5ed --- /dev/null +++ b/mozuku-lsp/include/mozuku/core/types.hpp @@ -0,0 +1,106 @@ +#pragma once + +#include +#include +#include + +namespace MoZuku::core { + +struct Position { + int line{0}; + int character{0}; +}; + +struct Range { + Position start; + Position end; +}; + +struct Diagnostic { + Range range; + int severity{2}; + std::string message; +}; + +struct TokenData { + int line{0}; + int startChar{0}; + int endChar{0}; + std::string tokenType; + unsigned int tokenModifiers{0}; + + std::string surface; + std::string feature; + std::string baseForm; + std::string reading; + std::string pronunciation; +}; + +struct AnalyzerResult { + std::vector tokens; + std::vector diags; +}; + +struct ByteRange { + size_t startByte{0}; + size_t endByte{0}; +}; + +struct DetailedPOS { + std::string mainPOS; + std::string subPOS1; + std::string subPOS2; + std::string subPOS3; + std::string inflection; + std::string conjugation; + std::string baseForm; + std::string reading; + std::string pronunciation; + + bool isParticle() const { return mainPOS == "助詞"; } + bool isVerb() const { return mainPOS == "動詞"; } + bool isNoun() const { return mainPOS == "名詞"; } +}; + +struct ParticleInfo { + std::string surface; + std::string function; + std::string role; + size_t position{0}; + int tokenIndex{0}; + int sentenceId{0}; +}; + +struct SentenceBoundary { + size_t start{0}; + size_t end{0}; + int sentenceId{0}; + std::string text; +}; + +struct DependencyInfo { + int chunkId{0}; + int headId{0}; + double score{0.0}; + std::string text; +}; + +} // namespace MoZuku::core + +using Position = MoZuku::core::Position; +using Range = MoZuku::core::Range; +using Diagnostic = MoZuku::core::Diagnostic; +using TokenData = MoZuku::core::TokenData; +using AnalyzerResult = MoZuku::core::AnalyzerResult; +using ByteRange = MoZuku::core::ByteRange; +using DetailedPOS = MoZuku::core::DetailedPOS; +using ParticleInfo = MoZuku::core::ParticleInfo; +using SentenceBoundary = MoZuku::core::SentenceBoundary; +using DependencyInfo = MoZuku::core::DependencyInfo; + +namespace MoZukuModifiers { +static constexpr unsigned Proper = 1u << 0; +static constexpr unsigned Numeric = 1u << 1; +static constexpr unsigned Kana = 1u << 2; +static constexpr unsigned Kanji = 1u << 3; +} // namespace MoZukuModifiers diff --git a/mozuku-lsp/include/mozuku/lsp/presenter.hpp b/mozuku-lsp/include/mozuku/lsp/presenter.hpp new file mode 100644 index 0000000..346d4af --- /dev/null +++ b/mozuku-lsp/include/mozuku/lsp/presenter.hpp @@ -0,0 +1,35 @@ +#pragma once + +#include "comment_extractor.hpp" +#include "mozuku/core/types.hpp" + +#include +#include +#include + +namespace MoZuku::lsp { + +class Presenter { +public: + using json = nlohmann::json; + + json publishDiagnosticsParams(const std::string &uri, + const std::vector &diags) const; + + json commentHighlightsParams( + const std::string &uri, const std::string &text, + const std::vector &segments) const; + + json contentHighlightsParams(const std::string &uri, const std::string &text, + const std::vector &ranges) const; + + json semanticHighlightsParams(const std::string &uri, bool isJapanese, + const std::vector &tokens) const; + + json semanticTokensData(const std::vector &tokens, + const std::vector &tokenTypes) const; + + json hoverResult(const TokenData &token, const std::string &markdown) const; +}; + +} // namespace MoZuku::lsp diff --git a/mozuku-lsp/include/mozuku/treesitter/document.hpp b/mozuku-lsp/include/mozuku/treesitter/document.hpp new file mode 100644 index 0000000..de395e1 --- /dev/null +++ b/mozuku-lsp/include/mozuku/treesitter/document.hpp @@ -0,0 +1,64 @@ +#pragma once + +#include +#include +#include + +#include + +namespace MoZuku::treesitter { + +const TSLanguage *resolveLanguage(const std::string &languageId); +bool isLanguageSupported(const std::string &languageId); + +class ParsedDocument { +public: + ParsedDocument(); + ParsedDocument(const std::string &languageId, const std::string &text); + ParsedDocument(const TSLanguage *language, const std::string &text); + + ParsedDocument(ParsedDocument &&other) noexcept = default; + ParsedDocument &operator=(ParsedDocument &&other) noexcept = default; + + ParsedDocument(const ParsedDocument &) = delete; + ParsedDocument &operator=(const ParsedDocument &) = delete; + + bool isValid() const; + TSNode root() const; + +private: + std::unique_ptr tree_; +}; + +template +void walkDepthFirst(TSNode root, Visitor &&visitor) { + if (ts_node_is_null(root)) { + return; + } + + std::vector stack; + stack.push_back(root); + + while (!stack.empty()) { + TSNode node = stack.back(); + stack.pop_back(); + + if (ts_node_is_null(node)) { + continue; + } + + if (!visitor(node)) { + continue; + } + + uint32_t childCount = ts_node_child_count(node); + for (uint32_t i = childCount; i > 0; --i) { + TSNode child = ts_node_child(node, i - 1); + if (!ts_node_is_null(child)) { + stack.push_back(child); + } + } + } +} + +} // namespace MoZuku::treesitter diff --git a/mozuku-lsp/include/pos_analyzer.hpp b/mozuku-lsp/include/pos_analyzer.hpp index e11e014..81fe0d8 100644 --- a/mozuku-lsp/include/pos_analyzer.hpp +++ b/mozuku-lsp/include/pos_analyzer.hpp @@ -1,6 +1,6 @@ #pragma once -#include "analyzer.hpp" +#include "mozuku/core/types.hpp" #include #include @@ -10,6 +10,11 @@ namespace pos { class POSAnalyzer { public: static std::string mapPosToType(const char *feature); + static bool isNounFeature(const std::string &feature); + static bool isParticleFeature(const std::string &feature); + static bool isConjunctionFeature(const std::string &feature); + static bool isAdversativeGaFeature(const std::string &feature); + static std::string particleKey(const std::string &feature); static void parseFeatureDetails(const char *feature, std::string &baseForm, std::string &reading, @@ -24,6 +29,8 @@ class POSAnalyzer { size_t length, const char *feature); private: + static std::vector + parseFeatureFields(const std::string &feature); static std::vector splitFeature(const std::string &feature); static void analyzeCharacterTypes(const std::string &text, size_t start, diff --git a/mozuku-lsp/include/text_processor.hpp b/mozuku-lsp/include/text_processor.hpp index a197e27..d3354c6 100644 --- a/mozuku-lsp/include/text_processor.hpp +++ b/mozuku-lsp/include/text_processor.hpp @@ -1,8 +1,9 @@ #pragma once -#include "analyzer.hpp" +#include "mozuku/core/types.hpp" #include #include +#include namespace MoZuku { namespace text { diff --git a/mozuku-lsp/include/utf16.hpp b/mozuku-lsp/include/utf16.hpp index 24984a7..06f0a9a 100644 --- a/mozuku-lsp/include/utf16.hpp +++ b/mozuku-lsp/include/utf16.hpp @@ -1,6 +1,24 @@ #pragma once -#include "lsp.hpp" +#include "mozuku/core/types.hpp" + +#include +#include + +class TextOffsetMapper { +public: + explicit TextOffsetMapper(const std::string &text); + + const std::vector &lineStarts() const; + Position byteOffsetToPosition(size_t offset) const; + size_t positionToByteOffset(int line, int character) const; + size_t positionToByteOffset(const Position &position) const; + size_t tokenStartByteOffset(const TokenData &token) const; + +private: + const std::string &text_; + std::vector line_starts_; +}; std::vector computeLineStarts(const std::string &text); @@ -8,4 +26,10 @@ Position byteOffsetToPosition(const std::string &text, const std::vector &lineStarts, size_t offset); +size_t positionToByteOffset(const std::string &text, + const std::vector &lineStarts, int line, + int character); + +size_t positionToByteOffset(const std::string &text, int line, int character); + size_t utf8ToUtf16Length(const std::string &utf8Str); diff --git a/mozuku-lsp/src/analyzer.cpp b/mozuku-lsp/src/analyzer.cpp index 4b80368..7ea8045 100644 --- a/mozuku-lsp/src/analyzer.cpp +++ b/mozuku-lsp/src/analyzer.cpp @@ -2,29 +2,19 @@ #include "encoding_utils.hpp" #include "grammar_checker.hpp" #include "mecab_manager.hpp" +#include "mozuku/core/debug.hpp" #include "pos_analyzer.hpp" #include "text_processor.hpp" #include "utf16.hpp" #include -#include #include #include namespace MoZuku { -static bool isDebugEnabled() { - static bool initialized = false; - static bool debug = false; - if (!initialized) { - debug = (std::getenv("MOZUKU_DEBUG") != nullptr); - initialized = true; - } - return debug; -} - Analyzer::Analyzer() { - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] Analyzer created" << std::endl; } } @@ -36,7 +26,7 @@ bool Analyzer::initialize(const MoZukuConfig &config) { mecab_manager_ = std::make_unique(config.analysis.enableCaboCha); - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] Initializing analyzer with config" << std::endl; } @@ -52,7 +42,7 @@ bool Analyzer::initialize(const MoZukuConfig &config) { system_charset_ = mecab_manager_->getSystemCharset(); - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] Analyzer initialized successfully with charset: " << system_charset_ << std::endl; } @@ -60,31 +50,38 @@ bool Analyzer::initialize(const MoZukuConfig &config) { return true; } -std::vector Analyzer::analyzeText(const std::string &text) { - std::vector tokens; - +Analyzer::PreparedText +Analyzer::prepareText(const std::string &text, + bool enforceMinJapaneseRatio) const { + PreparedText prepared; if (text.empty()) { - return tokens; + return prepared; } - if (isDebugEnabled()) { - std::cerr << "[DEBUG] Analyzing text of length: " << text.size() - << std::endl; + prepared.cleanText = text::TextProcessor::sanitizeUTF8(text); + if (prepared.cleanText.empty()) { + return prepared; } - std::string cleanText = text::TextProcessor::sanitizeUTF8(text); - double japaneseRatio = text::TextProcessor::calculateJapaneseRatio(cleanText); - if (config_.analysis.minJapaneseRatio > 0.0 && - japaneseRatio < config_.analysis.minJapaneseRatio) { - if (isDebugEnabled()) { - std::cerr << "[DEBUG] Skipping analysis due to low Japanese ratio: " - << japaneseRatio << " < " << config_.analysis.minJapaneseRatio - << std::endl; - } + prepared.japaneseRatio = + text::TextProcessor::calculateJapaneseRatio(prepared.cleanText); + prepared.belowMinJapaneseRatio = + enforceMinJapaneseRatio && config_.analysis.minJapaneseRatio > 0.0 && + prepared.japaneseRatio < config_.analysis.minJapaneseRatio; + + return prepared; +} + +std::vector +Analyzer::analyzePreparedText(const PreparedText &prepared) { + std::vector tokens; + + if (prepared.cleanText.empty()) { return tokens; } - std::string systemText = encoding::utf8ToSystem(cleanText, system_charset_); + std::string systemText = + encoding::utf8ToSystem(prepared.cleanText, system_charset_); MeCab::Tagger *tagger = mecab_manager_->getMeCabTagger(); if (!tagger) { @@ -98,7 +95,7 @@ std::vector Analyzer::analyzeText(const std::string &text) { return tokens; } - std::vector lineStarts = computeLineStarts(cleanText); + TextOffsetMapper offsetMapper(prepared.cleanText); size_t currentBytePos = 0; @@ -116,17 +113,17 @@ std::vector Analyzer::analyzeText(const std::string &text) { if (token.surface.empty()) continue; - while (currentBytePos < cleanText.size()) { - size_t remainingBytes = cleanText.size() - currentBytePos; + while (currentBytePos < prepared.cleanText.size()) { + size_t remainingBytes = prepared.cleanText.size() - currentBytePos; if (remainingBytes >= token.surface.size() && - cleanText.substr(currentBytePos, token.surface.size()) == + prepared.cleanText.substr(currentBytePos, token.surface.size()) == token.surface) { break; } currentBytePos++; } - Position pos = byteOffsetToPosition(cleanText, lineStarts, currentBytePos); + Position pos = offsetMapper.byteOffsetToPosition(currentBytePos); token.line = pos.line; token.startChar = pos.character; token.endChar = pos.character + utf8ToUtf16Length(token.surface); @@ -142,13 +139,14 @@ std::vector Analyzer::analyzeText(const std::string &text) { token.tokenType = pos::POSAnalyzer::mapPosToType(token.feature.c_str()); token.tokenModifiers = pos::POSAnalyzer::computeModifiers( - cleanText, currentBytePos, token.surface.size(), token.feature.c_str()); + prepared.cleanText, currentBytePos, token.surface.size(), + token.feature.c_str()); tokens.push_back(token); currentBytePos += token.surface.size(); } - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] Analysis completed: " << tokens.size() << " tokens generated" << std::endl; } @@ -156,6 +154,25 @@ std::vector Analyzer::analyzeText(const std::string &text) { return tokens; } +std::vector Analyzer::analyzeText(const std::string &text) { + if (debug::isEnabled()) { + std::cerr << "[DEBUG] Analyzing text of length: " << text.size() + << std::endl; + } + + PreparedText prepared = prepareText(text, true); + if (prepared.belowMinJapaneseRatio) { + if (debug::isEnabled()) { + std::cerr << "[DEBUG] Skipping analysis due to low Japanese ratio: " + << prepared.japaneseRatio << " < " + << config_.analysis.minJapaneseRatio << std::endl; + } + return {}; + } + + return analyzePreparedText(prepared); +} + std::vector Analyzer::checkGrammar(const std::string &text) { std::vector diagnostics; @@ -163,31 +180,29 @@ std::vector Analyzer::checkGrammar(const std::string &text) { return diagnostics; } - std::string cleanText = text::TextProcessor::sanitizeUTF8(text); - double japaneseRatio = text::TextProcessor::calculateJapaneseRatio(cleanText); - if (config_.analysis.minJapaneseRatio > 0.0 && - japaneseRatio < config_.analysis.minJapaneseRatio) { - if (isDebugEnabled()) { + PreparedText prepared = prepareText(text, true); + if (prepared.belowMinJapaneseRatio) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] Skipping grammar check due to low Japanese ratio: " - << japaneseRatio << " < " << config_.analysis.minJapaneseRatio - << std::endl; + << prepared.japaneseRatio << " < " + << config_.analysis.minJapaneseRatio << std::endl; } return diagnostics; } - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] Starting grammar check" << std::endl; } - std::vector tokens = analyzeText(text); + std::vector tokens = analyzePreparedText(prepared); std::vector sentences = - text::TextProcessor::splitIntoSentences(text); + text::TextProcessor::splitIntoSentences(prepared.cleanText); - grammar::GrammarChecker::checkGrammar(text, tokens, sentences, diagnostics, - &config_); + grammar::GrammarChecker::checkGrammar(prepared.cleanText, tokens, sentences, + diagnostics, &config_); - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] Grammar check completed: " << diagnostics.size() << " diagnostics generated" << std::endl; } @@ -200,19 +215,24 @@ Analyzer::analyzeDependencies(const std::string &text) { std::vector dependencies; if (!mecab_manager_->isCaboChaAvailable()) { - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] CaboCha not available for dependency analysis" << std::endl; } return dependencies; } - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] Starting dependency analysis" << std::endl; } - std::string cleanText = text::TextProcessor::sanitizeUTF8(text); - std::string systemText = encoding::utf8ToSystem(cleanText, system_charset_); + PreparedText prepared = prepareText(text, false); + if (prepared.cleanText.empty()) { + return dependencies; + } + + std::string systemText = + encoding::utf8ToSystem(prepared.cleanText, system_charset_); cabocha_t *parser = mecab_manager_->getCaboChaParser(); if (!parser) { @@ -258,7 +278,7 @@ Analyzer::analyzeDependencies(const std::string &text) { dependencies.push_back(dep); } - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] Dependency analysis completed: " << dependencies.size() << " chunks found" << std::endl; } @@ -277,24 +297,3 @@ bool Analyzer::isCaboChaAvailable() const { } } // namespace MoZuku - -size_t computeByteOffset(const std::string &text, int line, int character) { - std::vector lineStarts = computeLineStarts(text); - if (line >= static_cast(lineStarts.size())) { - return text.size(); - } - - size_t lineStart = lineStarts[line]; - size_t bytePos = lineStart; - int utf16Pos = 0; - - while (bytePos < text.size() && utf16Pos < character && - text[bytePos] != '\n') { - unsigned char c = static_cast(text[bytePos]); - int seqLen = (c >= 0xF0) ? 4 : (c >= 0xE0) ? 3 : (c >= 0xC0) ? 2 : 1; - bytePos += seqLen; - utf16Pos += (seqLen == 4) ? 2 : 1; - } - - return bytePos; -} diff --git a/mozuku-lsp/src/comment_extractor.cpp b/mozuku-lsp/src/comment_extractor.cpp index a45bc7e..a093419 100644 --- a/mozuku-lsp/src/comment_extractor.cpp +++ b/mozuku-lsp/src/comment_extractor.cpp @@ -1,55 +1,13 @@ #include "comment_extractor.hpp" +#include "mozuku/treesitter/document.hpp" -#include #include -#include #include #include -#include #include -#include - -extern "C" { -const TSLanguage *tree_sitter_c(); -const TSLanguage *tree_sitter_cpp(); -const TSLanguage *tree_sitter_html(); -const TSLanguage *tree_sitter_javascript(); -const TSLanguage *tree_sitter_python(); -const TSLanguage *tree_sitter_rust(); -const TSLanguage *tree_sitter_typescript(); -const TSLanguage *tree_sitter_tsx(); -const TSLanguage *tree_sitter_latex(); -} - namespace { -using LanguageFactory = const TSLanguage *(*)(); - -const std::unordered_map &languageMap() { - static const std::unordered_map map = { - {"c", tree_sitter_c}, - {"cpp", tree_sitter_cpp}, - {"html", tree_sitter_html}, - {"c++", tree_sitter_cpp}, - {"javascript", tree_sitter_javascript}, - {"javascriptreact", tree_sitter_tsx}, - {"typescript", tree_sitter_typescript}, - {"typescriptreact", tree_sitter_tsx}, - {"tsx", tree_sitter_tsx}, - {"python", tree_sitter_python}, - {"rust", tree_sitter_rust}, - {"latex", tree_sitter_latex}}; - return map; -} - -std::string toLower(std::string input) { - std::transform( - input.begin(), input.end(), input.begin(), - [](unsigned char c) { return static_cast(std::tolower(c)); }); - return input; -} - inline bool isNewline(char c) { return c == '\n' || c == '\r'; } inline void setSpace(char &c) { @@ -216,64 +174,22 @@ namespace MoZuku { namespace comments { const TSLanguage *resolveLanguage(const std::string &languageId) { - const auto &map = languageMap(); - auto it = map.find(toLower(languageId)); - if (it == map.end()) { - return nullptr; - } - return it->second(); + return treesitter::resolveLanguage(languageId); } bool isLanguageSupported(const std::string &languageId) { - const auto &map = languageMap(); - return map.find(toLower(languageId)) != map.end(); + return treesitter::isLanguageSupported(languageId); } std::vector extractComments(const std::string &languageId, const std::string &text) { std::vector segments; - - const TSLanguage *language = resolveLanguage(languageId); - if (!language) { - return segments; - } - - TSParser *parser = ts_parser_new(); - if (!parser) { + treesitter::ParsedDocument document(languageId, text); + if (!document.isValid()) { return segments; } - std::unique_ptr parserGuard( - parser, &ts_parser_delete); - - if (!ts_parser_set_language(parser, language)) { - return segments; - } - - TSTree *tree = - ts_parser_parse_string(parser, nullptr, text.c_str(), text.size()); - if (!tree) { - return segments; - } - - std::unique_ptr treeGuard(tree, - &ts_tree_delete); - - TSNode root = ts_tree_root_node(tree); - if (ts_node_is_null(root)) { - return segments; - } - std::vector stack; - stack.push_back(root); - - while (!stack.empty()) { - TSNode node = stack.back(); - stack.pop_back(); - - if (ts_node_is_null(node)) { - continue; - } - + treesitter::walkDepthFirst(document.root(), [&](TSNode node) { const char *type = ts_node_type(node); if (type) { std::string_view nodeType(type); @@ -290,18 +206,11 @@ std::vector extractComments(const std::string &languageId, segment.sanitized = std::move(segmentText); segments.push_back(std::move(segment)); } - continue; + return false; } } - - uint32_t childCount = ts_node_child_count(node); - for (uint32_t i = 0; i < childCount; ++i) { - TSNode child = ts_node_child(node, i); - if (!ts_node_is_null(child)) { - stack.push_back(child); - } - } - } + return true; + }); return segments; } diff --git a/mozuku-lsp/src/document_preprocessor.cpp b/mozuku-lsp/src/document_preprocessor.cpp new file mode 100644 index 0000000..dc3b240 --- /dev/null +++ b/mozuku-lsp/src/document_preprocessor.cpp @@ -0,0 +1,335 @@ +#include "mozuku/analysis/document_preprocessor.hpp" +#include "encoding_utils.hpp" +#include "mozuku/treesitter/document.hpp" + +#include +#include +#include + +namespace { + +struct LocalByteRange { + size_t startByte{0}; + size_t endByte{0}; +}; + +bool isEscaped(const std::string &text, size_t pos) { + size_t count = 0; + while (pos > count && text[pos - count - 1] == '\\') { + ++count; + } + return (count % 2) == 1; +} + +size_t findClosingDollar(const std::string &text, size_t pos) { + for (size_t i = pos; i < text.size(); ++i) { + if (text[i] == '$' && !isEscaped(text, i)) { + return i; + } + } + return std::string::npos; +} + +size_t findClosingDoubleDollar(const std::string &text, size_t pos) { + for (size_t i = pos; i + 1 < text.size(); ++i) { + if (text[i] == '$' && text[i + 1] == '$' && !isEscaped(text, i)) { + return i; + } + } + return std::string::npos; +} + +std::string sanitizeLatexCommentText(const std::string &raw) { + if (raw.empty()) { + return raw; + } + + std::string sanitized = raw; + sanitized[0] = ' '; + size_t idx = 1; + while (idx < sanitized.size() && sanitized[idx] == '%') { + sanitized[idx] = ' '; + ++idx; + } + while (idx < sanitized.size() && + (sanitized[idx] == ' ' || sanitized[idx] == '\t')) { + sanitized[idx] = ' '; + ++idx; + } + return sanitized; +} + +std::vector +collectLatexComments(const std::string &text) { + std::vector segments; + size_t pos = 0; + while (pos < text.size()) { + size_t lineStart = pos; + size_t lineEnd = text.find('\n', pos); + if (lineEnd == std::string::npos) { + lineEnd = text.size(); + } + + size_t current = lineStart; + bool found = false; + while (current < lineEnd) { + if (text[current] == '%' && !isEscaped(text, current)) { + found = true; + break; + } + ++current; + } + + if (found) { + MoZuku::comments::CommentSegment segment; + segment.startByte = current; + segment.endByte = lineEnd; + segment.sanitized = + sanitizeLatexCommentText(text.substr(current, lineEnd - current)); + segments.push_back(std::move(segment)); + } + + if (lineEnd >= text.size()) { + break; + } + pos = lineEnd + 1; + } + + return segments; +} + +std::vector collectHtmlContentRanges(const std::string &text) { + std::vector ranges; + MoZuku::treesitter::ParsedDocument document("html", text); + if (!document.isValid()) { + return ranges; + } + + MoZuku::treesitter::walkDepthFirst(document.root(), [&](TSNode node) { + const char *type = ts_node_type(node); + if (type && std::strcmp(type, "text") == 0) { + size_t start = ts_node_start_byte(node); + size_t end = ts_node_end_byte(node); + if (start >= end || end > text.size()) { + return false; + } + + size_t trimmedStart = start; + while (trimmedStart < end && + std::isspace(static_cast(text[trimmedStart]))) { + ++trimmedStart; + } + size_t trimmedEnd = end; + while (trimmedEnd > trimmedStart && + std::isspace(static_cast(text[trimmedEnd - 1]))) { + --trimmedEnd; + } + if (trimmedEnd > trimmedStart) { + ranges.push_back({trimmedStart, trimmedEnd}); + } + return false; + } + return true; + }); + + return ranges; +} + +std::vector collectLatexContentRanges(const std::string &text) { + std::vector ranges; + size_t i = 0; + while (i < text.size()) { + unsigned char c = static_cast(text[i]); + if (c == '%' && !isEscaped(text, i)) { + size_t lineEnd = text.find('\n', i); + if (lineEnd == std::string::npos) { + break; + } + i = lineEnd + 1; + continue; + } + if (c == '$' && !isEscaped(text, i)) { + if (i + 1 < text.size() && text[i + 1] == '$') { + size_t closing = findClosingDoubleDollar(text, i + 2); + if (closing == std::string::npos) { + break; + } + i = closing + 2; + continue; + } + + size_t closing = findClosingDollar(text, i + 1); + if (closing == std::string::npos) { + break; + } + i = closing + 1; + continue; + } + if (c == '\\') { + ++i; + while (i < text.size()) { + unsigned char ch = static_cast(text[i]); + if (!std::isalpha(ch) && ch != '@') { + break; + } + ++i; + } + if (i < text.size() && text[i] == '*') { + ++i; + } + continue; + } + if (c == '{' || c == '}') { + ++i; + continue; + } + if (std::isspace(c)) { + ++i; + continue; + } + + size_t start = i; + bool advanced = false; + while (i < text.size()) { + unsigned char d = static_cast(text[i]); + if (d == '\\' || d == '$' || d == '{' || d == '}' || + (d == '%' && !isEscaped(text, i))) { + break; + } + if (d < 0x80 && (std::isspace(d) || std::ispunct(d))) { + break; + } + i += MoZuku::encoding::utf8SequenceLength(d); + advanced = true; + } + if (advanced) { + ranges.push_back({start, i}); + continue; + } + ++i; + } + + return ranges; +} + +std::vector toByteRanges(const std::vector &ranges) { + std::vector converted; + converted.reserve(ranges.size()); + for (const auto &range : ranges) { + converted.push_back(ByteRange{range.startByte, range.endByte}); + } + return converted; +} + +void appendCommentRanges( + std::vector &ranges, + const std::vector &segments) { + ranges.reserve(ranges.size() + segments.size()); + for (const auto &segment : segments) { + ranges.push_back(ByteRange{segment.startByte, segment.endByte}); + } +} + +std::string buildMaskWithContentRanges( + const std::string &text, const std::vector &contentRanges, + const std::vector &commentSegments) { + std::string masked = text; + for (char &ch : masked) { + if (ch != '\n' && ch != '\r') { + ch = ' '; + } + } + + for (const auto &range : contentRanges) { + if (range.startByte >= masked.size()) { + continue; + } + size_t len = std::min(range.endByte - range.startByte, + masked.size() - range.startByte); + for (size_t i = 0; i < len; ++i) { + masked[range.startByte + i] = text[range.startByte + i]; + } + } + + for (const auto &segment : commentSegments) { + if (segment.startByte >= masked.size()) { + continue; + } + size_t len = + std::min(segment.sanitized.size(), masked.size() - segment.startByte); + for (size_t i = 0; i < len; ++i) { + masked[segment.startByte + i] = segment.sanitized[i]; + } + } + + return masked; +} + +std::string buildCommentOnlyMask( + const std::string &text, + const std::vector &segments) { + std::string masked = text; + for (char &ch : masked) { + if (ch != '\n' && ch != '\r') { + ch = ' '; + } + } + + const size_t docSize = masked.size(); + for (const auto &segment : segments) { + if (segment.startByte >= docSize) { + continue; + } + size_t maxCopy = + std::min(docSize - segment.startByte, segment.sanitized.size()); + for (size_t i = 0; i < maxCopy; ++i) { + masked[segment.startByte + i] = segment.sanitized[i]; + } + } + + return masked; +} + +} // namespace + +namespace MoZuku::analysis { + +ProcessedDocument DocumentPreprocessor::prepare(const std::string &languageId, + const std::string &text) const { + ProcessedDocument result; + result.analysisText = text; + + if (languageId.empty() || languageId == "japanese") { + return result; + } + + if (languageId == "html") { + result.commentSegments = comments::extractComments(languageId, text); + std::vector contentRanges = collectHtmlContentRanges(text); + result.contentHighlightRanges = toByteRanges(contentRanges); + appendCommentRanges(result.contentHighlightRanges, result.commentSegments); + result.analysisText = + buildMaskWithContentRanges(text, contentRanges, result.commentSegments); + return result; + } + + if (languageId == "latex") { + result.commentSegments = collectLatexComments(text); + std::vector contentRanges = collectLatexContentRanges(text); + result.contentHighlightRanges = toByteRanges(contentRanges); + appendCommentRanges(result.contentHighlightRanges, result.commentSegments); + result.analysisText = + buildMaskWithContentRanges(text, contentRanges, result.commentSegments); + return result; + } + + if (!comments::isLanguageSupported(languageId)) { + return result; + } + + result.commentSegments = comments::extractComments(languageId, text); + result.analysisText = buildCommentOnlyMask(text, result.commentSegments); + return result; +} + +} // namespace MoZuku::analysis diff --git a/mozuku-lsp/src/encoding_utils.cpp b/mozuku-lsp/src/encoding_utils.cpp index 89f6ea0..b46cc47 100644 --- a/mozuku-lsp/src/encoding_utils.cpp +++ b/mozuku-lsp/src/encoding_utils.cpp @@ -1,44 +1,141 @@ #include "encoding_utils.hpp" + +#include +#include +#include #include #include +namespace { + +std::string normalizeCharsetName(const std::string &charset) { + std::string normalized; + normalized.reserve(charset.size()); + for (unsigned char c : charset) { + if (std::isalnum(c)) { + normalized.push_back(static_cast(std::toupper(c))); + } + } + return normalized; +} + +bool isUtf8Charset(const std::string &charset) { + return normalizeCharsetName(charset) == "UTF8"; +} + +bool isSameCharset(const std::string &lhs, const std::string &rhs) { + return normalizeCharsetName(lhs) == normalizeCharsetName(rhs); +} + +struct IconvCloser { + explicit IconvCloser(iconv_t handle) : handle_(handle) {} + ~IconvCloser() { + if (handle_ != (iconv_t)-1) { + iconv_close(handle_); + } + } + + iconv_t get() const { return handle_; } + +private: + iconv_t handle_; +}; + +void appendBuffer(std::string &result, const std::array &buffer, + size_t remaining) { + result.append(buffer.data(), buffer.size() - remaining); +} + +void stripUnsupportedControlChars(std::string &text) { + std::string filtered; + filtered.reserve(text.size()); + for (unsigned char c : text) { + if (c >= 0x20 || c == 0x09 || c == 0x0A || c == 0x0D || c >= 0x80) { + filtered.push_back(static_cast(c)); + } + } + text.swap(filtered); +} + +} // namespace + namespace MoZuku { namespace encoding { std::string convertEncoding(const std::string &input, const std::string &fromCharset, - const std::string &toCharset) { + const std::string &toCharset, + ConversionOptions options) { if (input.empty()) return input; + if (!options.skipInvalidInput && isSameCharset(fromCharset, toCharset)) { + return input; + } + iconv_t cd = iconv_open(toCharset.c_str(), fromCharset.c_str()); if (cd == (iconv_t)-1) { return input; } + IconvCloser guard(cd); + char *inBuf = const_cast(input.data()); size_t inBytesLeft = input.size(); - size_t outBytesLeft = input.size() * 4; // Conservative estimate + std::string result; + result.reserve(input.size() * 2 + 16); + std::array outputBuffer{}; - std::string result(outBytesLeft, '\0'); + while (true) { + char *outBuf = outputBuffer.data(); + size_t outBytesLeft = outputBuffer.size(); + size_t status = + iconv(guard.get(), &inBuf, &inBytesLeft, &outBuf, &outBytesLeft); + appendBuffer(result, outputBuffer, outBytesLeft); - char *inBuf = const_cast(input.data()); - char *outBuf = &result[0]; + if (status != static_cast(-1)) { + break; + } + + if (errno == E2BIG) { + continue; + } + + if (options.skipInvalidInput && (errno == EILSEQ || errno == EINVAL)) { + if (inBytesLeft == 0) { + break; + } + ++inBuf; + --inBytesLeft; + continue; + } - if (iconv(cd, &inBuf, &inBytesLeft, &outBuf, &outBytesLeft) == (size_t)-1) { - iconv_close(cd); return input; } - iconv_close(cd); + while (true) { + char *outBuf = outputBuffer.data(); + size_t outBytesLeft = outputBuffer.size(); + size_t status = + iconv(guard.get(), nullptr, nullptr, &outBuf, &outBytesLeft); + appendBuffer(result, outputBuffer, outBytesLeft); + + if (status != static_cast(-1)) { + break; + } + + if (errno == E2BIG) { + continue; + } + + return input; + } - // Resize result to actual converted size - result.resize(result.size() - outBytesLeft); return result; } std::string systemToUtf8(const std::string &input, const std::string &systemCharset) { - if (systemCharset == "UTF-8" || systemCharset.empty()) { + if (systemCharset.empty() || isUtf8Charset(systemCharset)) { return input; } return convertEncoding(input, systemCharset, "UTF-8"); @@ -46,11 +143,34 @@ std::string systemToUtf8(const std::string &input, std::string utf8ToSystem(const std::string &input, const std::string &systemCharset) { - if (systemCharset == "UTF-8" || systemCharset.empty()) { + if (systemCharset.empty() || isUtf8Charset(systemCharset)) { return input; } return convertEncoding(input, "UTF-8", systemCharset); } +std::string sanitizeUtf8(const std::string &input) { + std::string sanitized = + convertEncoding(input, "UTF-8", "UTF-8", ConversionOptions{true}); + stripUnsupportedControlChars(sanitized); + return sanitized; +} + +size_t utf8SequenceLength(unsigned char c) { + if (c < 0x80) { + return 1; + } + if ((c & 0xE0) == 0xC0) { + return 2; + } + if ((c & 0xF0) == 0xE0) { + return 3; + } + if ((c & 0xF8) == 0xF0) { + return 4; + } + return 1; +} + } // namespace encoding } // namespace MoZuku diff --git a/mozuku-lsp/src/grammar_checker.cpp b/mozuku-lsp/src/grammar_checker.cpp index a8ce345..eb51cf8 100644 --- a/mozuku-lsp/src/grammar_checker.cpp +++ b/mozuku-lsp/src/grammar_checker.cpp @@ -1,7 +1,7 @@ #include "grammar_checker.hpp" +#include "mozuku/core/debug.hpp" #include "pos_analyzer.hpp" #include "utf16.hpp" -#include #include namespace MoZuku { @@ -13,69 +13,11 @@ struct RuleContext { const std::string &text; const std::vector &tokens; const std::vector &sentences; - const std::vector &lineStarts; + const TextOffsetMapper &offsets; const std::vector &tokenBytePositions; int severity{2}; }; -bool isAdversativeGa(const std::string &feature) { - // MeCab: 品詞,品詞細分類1,品詞細分類2,品詞細分類3,活用型,活用形,原形,... - // 逆接の接続助詞「が」: 助詞,接続助詞,*,*,*,*,が,ガ,ガ - int fieldIndex = 0; - size_t start = 0; - size_t end = 0; - - std::string pos, sub1, base; - while (end != std::string::npos) { - end = feature.find(',', start); - std::string part = feature.substr( - start, end == std::string::npos ? std::string::npos : end - start); - if (fieldIndex == 0) - pos = part; - else if (fieldIndex == 1) - sub1 = part; - else if (fieldIndex == 6) - base = part; - - if (end == std::string::npos) - break; - start = end + 1; - ++fieldIndex; - if (fieldIndex > 6 && !base.empty()) { - break; - } - } - - return pos == "助詞" && sub1 == "接続助詞" && base == "が"; -} - -bool isConjunction(const std::string &feature) { - size_t comma = feature.find(','); - std::string pos = - (comma == std::string::npos) ? feature : feature.substr(0, comma); - return pos == "接続詞"; -} - -bool isParticle(const std::string &feature) { - size_t comma = feature.find(','); - std::string pos = - (comma == std::string::npos) ? feature : feature.substr(0, comma); - return pos == "助詞"; -} - -std::string particleKey(const std::string &feature) { - // "助詞,格助詞,一般,..." -> "助詞,格助詞" - size_t firstComma = feature.find(','); - if (firstComma == std::string::npos) { - return feature; - } - size_t secondComma = feature.find(',', firstComma + 1); - if (secondComma == std::string::npos) { - return feature.substr(0, firstComma); - } - return feature.substr(0, secondComma); -} - DetailedPOS parsePos(const std::string &feature) { return MoZuku::pos::POSAnalyzer::parseDetailedPOS(feature.c_str(), "UTF-8"); } @@ -95,44 +37,21 @@ bool isSpecialRaCase(const DetailedPOS &pos) { (pos.baseForm == "来れる" || pos.baseForm == "見れる"); } -// UTF-16ベースのトークン位置をUTF-8バイトオフセットに変換 -size_t toByteOffset(const TokenData &token, const std::string &text, - const std::vector &lineStarts) { - if (token.line >= static_cast(lineStarts.size())) { - return text.size(); - } - - size_t lineStart = lineStarts[token.line]; - size_t bytePos = lineStart; - int utf16Pos = 0; - - while (bytePos < text.size() && utf16Pos < token.startChar && - text[bytePos] != '\n') { - unsigned char c = static_cast(text[bytePos]); - int seqLen = (c >= 0xF0) ? 4 : (c >= 0xE0) ? 3 : (c >= 0xC0) ? 2 : 1; - bytePos += seqLen; - utf16Pos += (seqLen == 4) ? 2 : 1; - } - - return bytePos; -} - std::vector computeTokenBytePositions(const std::vector &tokens, - const std::string &text, - const std::vector &lineStarts) { + const TextOffsetMapper &offsetMapper) { std::vector positions; positions.reserve(tokens.size()); for (const auto &token : tokens) { - positions.push_back(toByteOffset(token, text, lineStarts)); + positions.push_back(offsetMapper.tokenStartByteOffset(token)); } return positions; } Range makeRange(const RuleContext &ctx, size_t startByte, size_t endByte) { Range range; - range.start = byteOffsetToPosition(ctx.text, ctx.lineStarts, startByte); - range.end = byteOffsetToPosition(ctx.text, ctx.lineStarts, endByte); + range.start = ctx.offsets.byteOffsetToPosition(startByte); + range.end = ctx.offsets.byteOffsetToPosition(endByte); return range; } @@ -159,16 +78,6 @@ size_t countCommas(const std::string &text) { } // namespace -static bool isDebugEnabled() { - static bool initialized = false; - static bool debug = false; - if (!initialized) { - debug = (std::getenv("MOZUKU_DEBUG") != nullptr); - initialized = true; - } - return debug; -} - void checkCommaLimit(const RuleContext &ctx, std::vector &diags, int limit) { if (limit <= 0) @@ -186,7 +95,7 @@ void checkCommaLimit(const RuleContext &ctx, std::vector &diags, diag.message = "一文に使用できる読点「、」は最大" + std::to_string(limit) + "個までです (現在" + std::to_string(commaCount) + "個) "; - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] Comma limit exceeded in sentence " << sentence.sentenceId << ": count=" << commaCount << "\n"; } @@ -203,7 +112,7 @@ void checkAdversativeGa(const RuleContext &ctx, std::vector &diags, for (const auto &sentence : ctx.sentences) { size_t count = 0; for (size_t i = 0; i < ctx.tokens.size(); ++i) { - if (!isAdversativeGa(ctx.tokens[i].feature)) { + if (!pos::POSAnalyzer::isAdversativeGaFeature(ctx.tokens[i].feature)) { continue; } size_t bytePos = ctx.tokenBytePositions[i]; @@ -223,7 +132,7 @@ void checkAdversativeGa(const RuleContext &ctx, std::vector &diags, std::to_string(maxCount + 1) + "回以上使われています (" + std::to_string(count) + "回) "; - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] Adversative 'が' exceeded in sentence " << sentence.sentenceId << ": count=" << count << "\n"; } @@ -252,11 +161,11 @@ void checkDuplicateParticleSurface(const RuleContext &ctx, continue; } - if (!isParticle(token.feature)) { + if (!pos::POSAnalyzer::isParticleFeature(token.feature)) { continue; } - std::string currentKey = particleKey(token.feature); + std::string currentKey = pos::POSAnalyzer::particleKey(token.feature); if (hasLast && token.surface == lastSurface && currentKey == lastKey) { ++streak; @@ -267,7 +176,7 @@ void checkDuplicateParticleSurface(const RuleContext &ctx, diag.severity = ctx.severity; diag.message = "同じ助詞「" + token.surface + "」が連続しています"; - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] Duplicate particle '" << token.surface << "' in sentence " << sentence.sentenceId << "\n"; } @@ -305,8 +214,9 @@ void checkAdjacentParticles(const RuleContext &ctx, continue; } - bool currentIsParticle = isParticle(token.feature); - std::string currentKey = particleKey(token.feature); + bool currentIsParticle = + pos::POSAnalyzer::isParticleFeature(token.feature); + std::string currentKey = pos::POSAnalyzer::particleKey(token.feature); if (currentIsParticle && prevIsParticle && currentKey == prevKey && bytePos == prevStartByte + prevToken.surface.size()) { ++streak; @@ -317,7 +227,7 @@ void checkAdjacentParticles(const RuleContext &ctx, diag.severity = ctx.severity; diag.message = "助詞が連続して使われています"; - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] Consecutive particles '" << prevToken.surface << "' -> '" << token.surface << "' in sentence " << sentence.sentenceId << "\n"; @@ -355,7 +265,7 @@ void checkConjunctionRepeats(const RuleContext &ctx, for (size_t i = 0; i < ctx.tokens.size(); ++i) { const auto &token = ctx.tokens[i]; - if (!isConjunction(token.feature)) { + if (!pos::POSAnalyzer::isConjunctionFeature(token.feature)) { continue; } @@ -374,7 +284,7 @@ void checkConjunctionRepeats(const RuleContext &ctx, diag.severity = ctx.severity; diag.message = "同じ接続詞「" + token.surface + "」が連続しています"; - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] Duplicate conjunction '" << token.surface << "' detected across punctuation\n"; } @@ -412,7 +322,7 @@ void checkRaDropping(const RuleContext &ctx, std::vector &diags) { diag.message = messageRa; diags.push_back(std::move(diag)); - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] Ra-dropping special case detected: " << token.surface << "\n"; } @@ -436,7 +346,7 @@ void checkRaDropping(const RuleContext &ctx, std::vector &diags) { diag.message = messageRa; diags.push_back(std::move(diag)); - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] Ra-dropping detected between tokens '" << prevToken.surface << "' + '" << token.surface << "'\n"; } @@ -456,9 +366,9 @@ void GrammarChecker::checkGrammar( return; } - std::vector lineStarts = computeLineStarts(text); + TextOffsetMapper offsetMapper(text); std::vector tokenBytePositions = - computeTokenBytePositions(tokens, text, lineStarts); + computeTokenBytePositions(tokens, offsetMapper); // ルール共通設定 (現状は警告レベル固定) const int severity = 2; // Warning @@ -468,7 +378,7 @@ void GrammarChecker::checkGrammar( return; } - RuleContext ctx{text, tokens, sentences, lineStarts, tokenBytePositions, + RuleContext ctx{text, tokens, sentences, offsetMapper, tokenBytePositions, severity}; if (config && config->analysis.rules.commaLimit) { diff --git a/mozuku-lsp/src/lsp.cpp b/mozuku-lsp/src/lsp.cpp index 6e0ea84..960595b 100644 --- a/mozuku-lsp/src/lsp.cpp +++ b/mozuku-lsp/src/lsp.cpp @@ -1,38 +1,23 @@ #include "lsp.hpp" #include "analyzer.hpp" #include "comment_extractor.hpp" +#include "mozuku/core/debug.hpp" +#include "pos_analyzer.hpp" #include "utf16.hpp" #include "wikipedia.hpp" #include -#include +#include #include #include #include #include #include -#include - using nlohmann::json; -static bool isDebugEnabled() { - static bool initialized = false; - static bool debug = false; - if (!initialized) { - debug = (std::getenv("MOZUKU_DEBUG") != nullptr); - initialized = true; - } - return debug; -} - namespace { -struct LocalByteRange { - size_t startByte{0}; - size_t endByte{0}; -}; - bool readBoolOption(const json &obj, const char *key, bool &out) { if (!obj.contains(key)) { return false; @@ -52,277 +37,6 @@ bool readBoolOption(const json &obj, const char *key, bool &out) { return false; } -bool isEscaped(const std::string &text, size_t pos) { - size_t count = 0; - while (pos > count && text[pos - count - 1] == '\\') { - ++count; - } - return (count % 2) == 1; -} - -size_t findClosingDollar(const std::string &text, size_t pos) { - for (size_t i = pos; i < text.size(); ++i) { - if (text[i] == '$' && !isEscaped(text, i)) { - return i; - } - } - return std::string::npos; -} - -size_t findClosingDoubleDollar(const std::string &text, size_t pos) { - for (size_t i = pos; i + 1 < text.size(); ++i) { - if (text[i] == '$' && text[i + 1] == '$' && !isEscaped(text, i)) { - return i; - } - } - return std::string::npos; -} - -size_t findClosingCommand(const std::string &text, size_t pos, - const std::string &closing) { - size_t current = pos; - while (current < text.size()) { - size_t found = text.find(closing, current); - if (found == std::string::npos) - return std::string::npos; - if (!isEscaped(text, found)) - return found; - current = found + closing.size(); - } - return std::string::npos; -} - -std::string processLatexMath(const std::string &text) { return text; } - -std::string sanitizeLatexCommentText(const std::string &raw) { - if (raw.empty()) - return raw; - - std::string sanitized = raw; - sanitized[0] = ' '; - size_t idx = 1; - while (idx < sanitized.size() && sanitized[idx] == '%') { - sanitized[idx] = ' '; - ++idx; - } - while (idx < sanitized.size() && - (sanitized[idx] == ' ' || sanitized[idx] == '\t')) { - sanitized[idx] = ' '; - ++idx; - } - return sanitized; -} - -std::vector -collectLatexComments(const std::string &text) { - std::vector segments; - size_t pos = 0; - while (pos < text.size()) { - size_t lineStart = pos; - size_t lineEnd = text.find('\n', pos); - if (lineEnd == std::string::npos) - lineEnd = text.size(); - - size_t current = lineStart; - bool found = false; - while (current < lineEnd) { - if (text[current] == '%' && !isEscaped(text, current)) { - found = true; - break; - } - ++current; - } - - if (found) { - MoZuku::comments::CommentSegment segment; - segment.startByte = current; - segment.endByte = lineEnd; - segment.sanitized = - sanitizeLatexCommentText(text.substr(current, lineEnd - current)); - segments.push_back(std::move(segment)); - } - - if (lineEnd >= text.size()) - break; - pos = lineEnd + 1; - } - - return segments; -} - -size_t utf8CharLen(unsigned char c) { - if (c < 0x80) - return 1; - if ((c >> 5) == 0x6) - return 2; - if ((c >> 4) == 0xE) - return 3; - if ((c >> 3) == 0x1E) - return 4; - return 1; -} - -std::vector collectHtmlContentRanges(const std::string &text) { - std::vector ranges; - const TSLanguage *language = MoZuku::comments::resolveLanguage("html"); - if (!language) - return ranges; - - TSParser *parser = ts_parser_new(); - if (!parser) - return ranges; - - std::unique_ptr parserGuard( - parser, &ts_parser_delete); - if (!ts_parser_set_language(parser, language)) { - return ranges; - } - - TSTree *tree = - ts_parser_parse_string(parser, nullptr, text.c_str(), text.size()); - if (!tree) - return ranges; - - std::unique_ptr treeGuard(tree, - &ts_tree_delete); - - TSNode root = ts_tree_root_node(tree); - if (ts_node_is_null(root)) - return ranges; - - std::vector stack; - stack.push_back(root); - - while (!stack.empty()) { - TSNode node = stack.back(); - stack.pop_back(); - - if (ts_node_is_null(node)) - continue; - - const char *type = ts_node_type(node); - if (type && std::strcmp(type, "text") == 0) { - size_t start = ts_node_start_byte(node); - size_t end = ts_node_end_byte(node); - if (start >= end || end > text.size()) - continue; - - size_t trimmedStart = start; - while (trimmedStart < end && - std::isspace(static_cast(text[trimmedStart]))) { - ++trimmedStart; - } - size_t trimmedEnd = end; - while (trimmedEnd > trimmedStart && - std::isspace(static_cast(text[trimmedEnd - 1]))) { - --trimmedEnd; - } - if (trimmedEnd > trimmedStart) { - ranges.push_back({trimmedStart, trimmedEnd}); - } - continue; - } - - uint32_t childCount = ts_node_child_count(node); - for (uint32_t i = 0; i < childCount; ++i) { - TSNode child = ts_node_child(node, i); - if (!ts_node_is_null(child)) { - stack.push_back(child); - } - } - } - - return ranges; -} - -std::vector collectLatexContentRanges(const std::string &text) { - std::vector ranges; - size_t i = 0; - while (i < text.size()) { - unsigned char c = static_cast(text[i]); - if (c == '%' && !isEscaped(text, i)) { - size_t lineEnd = text.find('\n', i); - if (lineEnd == std::string::npos) - break; - i = lineEnd + 1; - continue; - } - if (c == '$' && !isEscaped(text, i)) { - if (i + 1 < text.size() && text[i + 1] == '$') { - size_t closing = findClosingDoubleDollar(text, i + 2); - if (closing == std::string::npos) - break; - i = closing + 2; - continue; - } else { - size_t closing = findClosingDollar(text, i + 1); - if (closing == std::string::npos) - break; - i = closing + 1; - continue; - } - } - if (c == '\\') { - ++i; - while (i < text.size()) { - unsigned char ch = static_cast(text[i]); - if (!std::isalpha(ch) && ch != '@') - break; - ++i; - } - if (i < text.size() && text[i] == '*') - ++i; - continue; - } - if (c == '{' || c == '}') { - ++i; - continue; - } - if (std::isspace(c)) { - ++i; - continue; - } - - size_t start = i; - bool advanced = false; - while (i < text.size()) { - unsigned char d = static_cast(text[i]); - if (d == '\\' || d == '$' || d == '{' || d == '}' || - (d == '%' && !isEscaped(text, i))) { - break; - } - if (d < 0x80) { - if (std::isspace(d) || std::ispunct(d)) - break; - } - size_t len = utf8CharLen(d); - i += len; - advanced = true; - } - if (advanced) { - ranges.push_back({start, i}); - continue; - } - // ensure progress to avoid infinite loop - if (!advanced) - ++i; - } - - return ranges; -} - -std::vector -collectContentHighlightRanges(const std::string &languageId, - const std::string &text) { - if (languageId == "html") { - return collectHtmlContentRanges(text); - } - if (languageId == "latex") { - return collectLatexContentRanges(text); - } - return {}; -} - } // namespace LSPServer::LSPServer(std::istream &in, std::ostream &out) : in_(in), out_(out) { @@ -419,7 +133,7 @@ void LSPServer::run() { json req = json::parse(jsonPayload); handle(req); } catch (const json::parse_error &e) { - if (isDebugEnabled()) { + if (MoZuku::debug::isEnabled()) { std::cerr << "[DEBUG] JSON parse error: " << e.what() << std::endl; } } @@ -537,22 +251,48 @@ void LSPServer::onInitialized() { // 初期化完了 } +LSPServer::DocumentState &LSPServer::ensureDocument(const std::string &uri) { + return documents_[uri]; +} + +LSPServer::DocumentState *LSPServer::findDocument(const std::string &uri) { + auto it = documents_.find(uri); + return it == documents_.end() ? nullptr : &it->second; +} + +const LSPServer::DocumentState * +LSPServer::findDocument(const std::string &uri) const { + auto it = documents_.find(uri); + return it == documents_.end() ? nullptr : &it->second; +} + +bool LSPServer::isJapaneseLanguage(const DocumentState &document) { + return document.languageId == "japanese"; +} + void LSPServer::onDidOpen(const json ¶ms) { std::string uri = params["textDocument"]["uri"]; std::string text = params["textDocument"]["text"]; - docs_[uri] = text; + auto &document = ensureDocument(uri); + document.text = text; + document.tokens.clear(); + document.tokensCached = false; + document.diagnosticsByLine.clear(); if (params["textDocument"].contains("languageId") && params["textDocument"]["languageId"].is_string()) { - docLanguages_[uri] = params["textDocument"]["languageId"]; + document.languageId = params["textDocument"]["languageId"]; + } else { + document.languageId.clear(); } - analyzeAndPublish(uri, text); + analyzeAndPublish(uri); } void LSPServer::onDidChange(const json ¶ms) { std::string uri = params["textDocument"]["uri"]; auto changes = params["contentChanges"]; - std::string &text = docs_[uri]; + auto &document = ensureDocument(uri); + std::string &text = document.text; std::string oldText = text; // 位置を維持するため変更を逆順に適用 @@ -566,8 +306,8 @@ void LSPServer::onDidChange(const json ¶ms) { int endLine = range["end"]["line"]; int endChar = range["end"]["character"]; - size_t startOffset = computeByteOffset(text, startLine, startChar); - size_t endOffset = computeByteOffset(text, endLine, endChar); + size_t startOffset = positionToByteOffset(text, startLine, startChar); + size_t endOffset = positionToByteOffset(text, endLine, endChar); std::string newText = change["text"]; text.replace(startOffset, endOffset - startOffset, newText); @@ -577,25 +317,28 @@ void LSPServer::onDidChange(const json ¶ms) { } } + document.tokensCached = false; + document.tokens.clear(); + // 最適化: 変更された行のみ再解析 analyzeChangedLines(uri, text, oldText); } void LSPServer::onDidSave(const json ¶ms) { std::string uri = params["textDocument"]["uri"]; - if (docs_.find(uri) != docs_.end()) { - analyzeAndPublish(uri, docs_[uri]); + if (findDocument(uri) != nullptr) { + analyzeAndPublish(uri); } } json LSPServer::onSemanticTokensFull(const json &id, const json ¶ms) { std::string uri = params["textDocument"]["uri"]; - if (docs_.find(uri) == docs_.end()) { + const auto *document = findDocument(uri); + if (!document) { return json{{"jsonrpc", "2.0"}, {"id", id}, {"result", nullptr}}; } - auto langIt = docLanguages_.find(uri); - if (langIt == docLanguages_.end() || langIt->second != "japanese") { + if (!isJapaneseLanguage(*document)) { return json{{"jsonrpc", "2.0"}, {"id", id}, {"result", nullptr}}; } @@ -605,12 +348,12 @@ json LSPServer::onSemanticTokensFull(const json &id, const json ¶ms) { json LSPServer::onSemanticTokensRange(const json &id, const json ¶ms) { std::string uri = params["textDocument"]["uri"]; - if (docs_.find(uri) == docs_.end()) { + const auto *document = findDocument(uri); + if (!document) { return json{{"jsonrpc", "2.0"}, {"id", id}, {"result", nullptr}}; } - auto langIt = docLanguages_.find(uri); - if (langIt == docLanguages_.end() || langIt->second != "japanese") { + if (!isJapaneseLanguage(*document)) { return json{{"jsonrpc", "2.0"}, {"id", id}, {"result", nullptr}}; } @@ -618,70 +361,36 @@ json LSPServer::onSemanticTokensRange(const json &id, const json ¶ms) { return json{{"jsonrpc", "2.0"}, {"id", id}, {"result", {{"data", tokens}}}}; } -bool isNoun(const std::string &tokenType, const std::string &feature) { - // tokenTypeが "noun" の場合 - if (tokenType == "noun") { - return true; - } - - // MeCabのfeature文字列から品詞を判定 - // feature形式: - // "品詞,品詞細分類1,品詞細分類2,品詞細分類3,活用型,活用形,原形,読み,発音" - if (!feature.empty()) { - size_t commaPos = feature.find(','); - if (commaPos != std::string::npos) { - std::string mainPOS = feature.substr(0, commaPos); - return mainPOS == "名詞"; - } - } - - return false; -} - json LSPServer::onHover(const json &id, const json ¶ms) { std::string uri = params["textDocument"]["uri"]; - if (docs_.find(uri) == docs_.end() || - docTokens_.find(uri) == docTokens_.end()) { + const auto *document = findDocument(uri); + if (!document || !document->tokensCached) { return json{{"jsonrpc", "2.0"}, {"id", id}, {"result", nullptr}}; } int line = params["position"]["line"]; int character = params["position"]["character"]; - const auto docIt = docs_.find(uri); - if (docIt == docs_.end()) { - return json{{"jsonrpc", "2.0"}, {"id", id}, {"result", nullptr}}; - } - // japanese 以外の言語では、コメント/コンテンツ範囲内でのみ hover を表示 // (HTML: タグ内テキスト、LaTeX: タグ・数式以外のテキスト、その他: コメント内) - auto langIt = docLanguages_.find(uri); - bool isJapanese = - (langIt != docLanguages_.end() && langIt->second == "japanese"); + bool isJapanese = isJapaneseLanguage(*document); if (!isJapanese) { - size_t offset = computeByteOffset(docIt->second, line, character); + size_t offset = positionToByteOffset(document->text, line, character); bool insideComment = false; - const auto segmentsIt = docCommentSegments_.find(uri); - if (segmentsIt != docCommentSegments_.end()) { - for (const auto &segment : segmentsIt->second) { - if (offset >= segment.startByte && offset < segment.endByte) { - insideComment = true; - break; - } + for (const auto &segment : document->commentSegments) { + if (offset >= segment.startByte && offset < segment.endByte) { + insideComment = true; + break; } } bool insideContent = false; - if (langIt != docLanguages_.end() && - (langIt->second == "html" || langIt->second == "latex")) { - const auto contentIt = docContentHighlightRanges_.find(uri); - if (contentIt != docContentHighlightRanges_.end()) { - for (const auto &range : contentIt->second) { - if (offset >= range.startByte && offset < range.endByte) { - insideContent = true; - break; - } + if (document->languageId == "html" || document->languageId == "latex") { + for (const auto &range : document->contentHighlightRanges) { + if (offset >= range.startByte && offset < range.endByte) { + insideContent = true; + break; } } } @@ -692,7 +401,7 @@ json LSPServer::onHover(const json &id, const json ¶ms) { } // 位置にあるトークンを検索 - const auto &tokens = docTokens_[uri]; + const auto &tokens = document->tokens; for (const auto &token : tokens) { if (token.line == line && character >= token.startChar && character < token.endChar) { @@ -712,7 +421,8 @@ json LSPServer::onHover(const json &id, const json ¶ms) { } // 名詞の場合、Wikipediaサマリを追加 - if (isNoun(token.tokenType, token.feature)) { + if (token.tokenType == "noun" || + MoZuku::pos::POSAnalyzer::isNounFeature(token.feature)) { std::string query = token.baseForm.empty() ? token.surface : token.baseForm; @@ -730,7 +440,7 @@ json LSPServer::onHover(const json &id, const json ¶ms) { cached_entry->response_code); } } else { - if (isDebugEnabled()) { + if (MoZuku::debug::isEnabled()) { std::cerr << "[DEBUG] fetching Wikipedia: " << query << std::endl; } @@ -739,13 +449,13 @@ json LSPServer::onHover(const json &id, const json ¶ms) { std::thread([query, future = std::move(future)]() mutable { try { auto result = future.get(); - if (isDebugEnabled()) { + if (MoZuku::debug::isEnabled()) { std::cerr << "[DEBUG] Wikipedia取得完了: " << query << ", ステータス: " << result.response_code << std::endl; } } catch (const std::exception &e) { - if (isDebugEnabled()) { + if (MoZuku::debug::isEnabled()) { std::cerr << "[DEBUG] Wikipedia取得失敗: " << query << ", エラー: " << e.what() << std::endl; } @@ -754,71 +464,61 @@ json LSPServer::onHover(const json &id, const json ¶ms) { } } - return json{ - {"jsonrpc", "2.0"}, - {"id", id}, - {"result", - {{"contents", {{"kind", "markdown"}, {"value", markdown.str()}}}, - {"range", - {{"start", {{"line", token.line}, {"character", token.startChar}}}, - {"end", - {{"line", token.line}, {"character", token.endChar}}}}}}}}; + return json{{"jsonrpc", "2.0"}, + {"id", id}, + {"result", presenter_.hoverResult(token, markdown.str())}}; } } return json{{"jsonrpc", "2.0"}, {"id", id}, {"result", nullptr}}; } -void LSPServer::analyzeAndPublish(const std::string &uri, - const std::string &text) { +void LSPServer::analyzeAndPublish(const std::string &uri) { + auto &document = ensureDocument(uri); + const std::string &text = document.text; + if (!analyzer_->isInitialized()) { analyzer_->initialize(config_); } - std::string analysisText = prepareAnalysisText(uri, text); - - std::vector tokens = analyzer_->analyzeText(analysisText); - std::vector diags = analyzer_->checkGrammar(analysisText); + auto prepared = prepareDocument(document); - docTokens_[uri] = tokens; - cacheDiagnostics(uri, diags); + std::vector tokens = analyzer_->analyzeText(prepared.analysisText); + std::vector diags = + analyzer_->checkGrammar(prepared.analysisText); - // 診断情報を配信 - json diagnostics = json::array(); - for (const auto &diag : diags) { - diagnostics.push_back({{"range", - {{"start", - {{"line", diag.range.start.line}, - {"character", diag.range.start.character}}}, - {"end", - {{"line", diag.range.end.line}, - {"character", diag.range.end.character}}}}}, - {"severity", diag.severity}, - {"message", diag.message}}); - } + document.tokens = tokens; + document.tokensCached = true; + cacheDiagnostics(document, diags); notify("textDocument/publishDiagnostics", - {{"uri", uri}, {"diagnostics", diagnostics}}); + presenter_.publishDiagnosticsParams(uri, diags)); // コンテンツ範囲を通知 (コメント範囲 or HTML/LaTeX のコンテンツ範囲) // HTML: タグ内テキスト、LaTeX: タグ・数式以外のテキスト - const auto segmentsIt = docCommentSegments_.find(uri); - if (segmentsIt != docCommentSegments_.end()) { - sendCommentHighlights(uri, text, segmentsIt->second); + static const std::vector kEmptySegments; + if (!document.commentSegments.empty()) { + notify("mozuku/commentHighlights", + presenter_.commentHighlightsParams(uri, text, + document.commentSegments)); } else { - static const std::vector kEmptySegments; - sendCommentHighlights(uri, text, kEmptySegments); + notify("mozuku/commentHighlights", + presenter_.commentHighlightsParams(uri, text, kEmptySegments)); } - const auto contentIt = docContentHighlightRanges_.find(uri); - if (contentIt != docContentHighlightRanges_.end()) { - sendContentHighlights(uri, text, contentIt->second); + static const std::vector kEmptyContent; + if (!document.contentHighlightRanges.empty()) { + notify("mozuku/contentHighlights", + presenter_.contentHighlightsParams(uri, text, + document.contentHighlightRanges)); } else { - static const std::vector kEmptyContent; - sendContentHighlights(uri, text, kEmptyContent); + notify("mozuku/contentHighlights", + presenter_.contentHighlightsParams(uri, text, kEmptyContent)); } - sendSemanticHighlights(uri, tokens); + bool isJapanese = isJapaneseLanguage(document); + notify("mozuku/semanticHighlights", + presenter_.semanticHighlightsParams(uri, isJapanese, tokens)); } void LSPServer::analyzeChangedLines(const std::string &uri, @@ -828,319 +528,67 @@ void LSPServer::analyzeChangedLines(const std::string &uri, std::set changedLines = findChangedLines(oldText, newText); // 変更行の診断情報を削除 - removeDiagnosticsForLines(uri, changedLines); + if (auto *document = findDocument(uri)) { + removeDiagnosticsForLines(*document, changedLines); + } // 現在は文書全体を再解析 // TODO: パフォーマンス向上のため行単位の解析を実装 - analyzeAndPublish(uri, newText); -} - -std::string LSPServer::prepareAnalysisText(const std::string &uri, - const std::string &text) { - auto langIt = docLanguages_.find(uri); - if (langIt == docLanguages_.end()) { - docCommentSegments_.erase(uri); - docContentHighlightRanges_.erase(uri); - return text; - } - - const std::string &languageId = langIt->second; - if (languageId == "japanese") { - docCommentSegments_.erase(uri); - docContentHighlightRanges_.erase(uri); - return text; - } - - // HTML: ドキュメント本文をハイライト (
text
の text 部分) - if (languageId == "html") { - std::vector commentSegments = - MoZuku::comments::extractComments(languageId, text); - docCommentSegments_[uri] = commentSegments; - - std::vector contentRanges = collectHtmlContentRanges(text); - std::vector contentByteRanges; - contentByteRanges.reserve(contentRanges.size()); - for (const auto &range : contentRanges) { - contentByteRanges.push_back(ByteRange{range.startByte, range.endByte}); - } - // コメントも本文ハイライト対象に含める (クライアント側で装飾しやすくする) - for (const auto &segment : commentSegments) { - contentByteRanges.push_back( - ByteRange{segment.startByte, segment.endByte}); - } - docContentHighlightRanges_[uri] = std::move(contentByteRanges); - - // 全体をマスクしてコンテンツ部分のみ復元 - std::string masked = text; - for (char &ch : masked) { - if (ch != '\n' && ch != '\r') { - ch = ' '; - } - } - - for (const auto &range : contentRanges) { - if (range.startByte >= masked.size()) - continue; - size_t len = std::min(range.endByte - range.startByte, - masked.size() - range.startByte); - for (size_t i = 0; i < len; ++i) { - masked[range.startByte + i] = text[range.startByte + i]; - } - } - - for (const auto &segment : commentSegments) { - if (segment.startByte >= masked.size()) - continue; - size_t len = - std::min(segment.sanitized.size(), masked.size() - segment.startByte); - for (size_t i = 0; i < len; ++i) { - masked[segment.startByte + i] = segment.sanitized[i]; - } - } - - return masked; - } - - // LaTeX: ドキュメント本文をハイライト (タグ・数式を除くテキスト部分) - if (languageId == "latex") { - std::vector commentSegments = - collectLatexComments(text); - docCommentSegments_[uri] = commentSegments; - - std::vector contentRanges = collectLatexContentRanges(text); - std::vector contentByteRanges; - contentByteRanges.reserve(contentRanges.size()); - for (const auto &range : contentRanges) { - contentByteRanges.push_back(ByteRange{range.startByte, range.endByte}); - } - for (const auto &segment : commentSegments) { - contentByteRanges.push_back( - ByteRange{segment.startByte, segment.endByte}); - } - docContentHighlightRanges_[uri] = std::move(contentByteRanges); - - // 全体をマスクしてコンテンツ部分のみ復元 - std::string masked = text; - for (char &ch : masked) { - if (ch != '\n' && ch != '\r') { - ch = ' '; - } - } - - for (const auto &range : contentRanges) { - if (range.startByte >= masked.size()) - continue; - size_t len = std::min(range.endByte - range.startByte, - masked.size() - range.startByte); - for (size_t i = 0; i < len; ++i) { - masked[range.startByte + i] = text[range.startByte + i]; - } - } - - for (const auto &segment : commentSegments) { - if (segment.startByte >= masked.size()) - continue; - size_t len = - std::min(segment.sanitized.size(), masked.size() - segment.startByte); - for (size_t i = 0; i < len; ++i) { - masked[segment.startByte + i] = segment.sanitized[i]; - } - } - - return masked; - } - - if (!MoZuku::comments::isLanguageSupported(languageId)) { - docCommentSegments_.erase(uri); - docContentHighlightRanges_.erase(uri); - return text; - } - - // その他の言語: コメント部分をハイライト - std::vector segments = - MoZuku::comments::extractComments(languageId, text); - docCommentSegments_[uri] = segments; - docContentHighlightRanges_.erase(uri); - - std::string masked = text; - for (char &ch : masked) { - if (ch != '\n' && ch != '\r') { - ch = ' '; - } - } - - if (segments.empty()) { - return masked; - } - - const size_t docSize = masked.size(); - for (const auto &segment : segments) { - if (segment.startByte >= docSize) { - continue; - } - const std::string &sanitized = segment.sanitized; - size_t maxCopy = std::min(docSize - segment.startByte, sanitized.size()); - for (size_t i = 0; i < maxCopy; ++i) { - masked[segment.startByte + i] = sanitized[i]; - } - } - - return masked; -} - -void LSPServer::sendCommentHighlights( - const std::string &uri, const std::string &text, - const std::vector &segments) { - json ranges = json::array(); - - std::vector lineStarts = computeLineStarts(text); - for (const auto &segment : segments) { - Position start = byteOffsetToPosition(text, lineStarts, segment.startByte); - Position end = byteOffsetToPosition(text, lineStarts, segment.endByte); - - json range = { - {"start", {{"line", start.line}, {"character", start.character}}}, - {"end", {{"line", end.line}, {"character", end.character}}}}; - ranges.push_back(std::move(range)); - } - - notify("mozuku/commentHighlights", {{"uri", uri}, {"ranges", ranges}}); -} - -void LSPServer::sendContentHighlights(const std::string &uri, - const std::string &text, - const std::vector &ranges) { - json lspRanges = json::array(); - - std::vector lineStarts = computeLineStarts(text); - for (const auto &range : ranges) { - Position start = byteOffsetToPosition(text, lineStarts, range.startByte); - Position end = byteOffsetToPosition(text, lineStarts, range.endByte); - - lspRanges.push_back( - {{"start", {{"line", start.line}, {"character", start.character}}}, - {"end", {{"line", end.line}, {"character", end.character}}}}); - } - - notify("mozuku/contentHighlights", {{"uri", uri}, {"ranges", lspRanges}}); + analyzeAndPublish(uri); } -void LSPServer::sendSemanticHighlights(const std::string &uri, - const std::vector &tokens) { - auto langIt = docLanguages_.find(uri); - bool isJapanese = - (langIt != docLanguages_.end() && langIt->second == "japanese"); - - // japanese の場合のみセマンティックハイライトを無効化 - // (.ja.txt, .ja.md は LSP 側のセマンティックトークンを使用) - // HTML/LaTeX など他の言語は VS Code 拡張側の上塗りハイライトを使用 - if (isJapanese) { - notify("mozuku/semanticHighlights", - {{"uri", uri}, {"tokens", json::array()}}); - return; +MoZuku::analysis::ProcessedDocument +LSPServer::prepareDocument(DocumentState &document) { + if (document.languageId.empty()) { + document.commentSegments.clear(); + document.contentHighlightRanges.clear(); + return {document.text, {}, {}}; } - json tokenEntries = json::array(); - for (const auto &token : tokens) { - tokenEntries.push_back( - {{"range", - {{"start", {{"line", token.line}, {"character", token.startChar}}}, - {"end", {{"line", token.line}, {"character", token.endChar}}}}}, - {"type", token.tokenType}, - {"modifiers", token.tokenModifiers}}); - } + auto prepared = preprocessor_.prepare(document.languageId, document.text); + document.commentSegments = prepared.commentSegments; + document.contentHighlightRanges = prepared.contentHighlightRanges; - notify("mozuku/semanticHighlights", {{"uri", uri}, {"tokens", tokenEntries}}); + return prepared; } json LSPServer::buildSemanticTokens(const std::string &uri) { - auto docIt = docs_.find(uri); - if (docIt == docs_.end()) { + auto *document = findDocument(uri); + if (!document) { return json::array(); } - auto cached = docTokens_.find(uri); - if (cached != docTokens_.end()) { - return buildSemanticTokensFromTokens(cached->second); + if (document->tokensCached) { + return presenter_.semanticTokensData(document->tokens, tokenTypes_); } if (!analyzer_->isInitialized()) { analyzer_->initialize(config_); } - std::string analysisText = prepareAnalysisText(uri, docIt->second); - std::vector tokens = analyzer_->analyzeText(analysisText); - docTokens_[uri] = tokens; - - return buildSemanticTokensFromTokens(tokens); -} - -json LSPServer::buildSemanticTokensFromTokens( - const std::vector &tokens) { - json data = json::array(); + auto prepared = prepareDocument(*document); + std::vector tokens = analyzer_->analyzeText(prepared.analysisText); + document->tokens = tokens; + document->tokensCached = true; - int prevLine = 0, prevChar = 0; - - for (const auto &token : tokens) { - int deltaLine = token.line - prevLine; - int deltaChar = - (deltaLine == 0) ? token.startChar - prevChar : token.startChar; - - auto typeIt = - std::find(tokenTypes_.begin(), tokenTypes_.end(), token.tokenType); - int typeIndex = - (typeIt != tokenTypes_.end()) - ? static_cast(std::distance(tokenTypes_.begin(), typeIt)) - : 0; - - data.push_back(deltaLine); - data.push_back(deltaChar); - data.push_back(token.endChar - token.startChar); - data.push_back(typeIndex); - data.push_back(token.tokenModifiers); - - prevLine = token.line; - prevChar = token.startChar; - } - - return data; + return presenter_.semanticTokensData(document->tokens, tokenTypes_); } -void LSPServer::cacheDiagnostics(const std::string &uri, +void LSPServer::cacheDiagnostics(DocumentState &document, const std::vector &diags) { - docDiagnostics_[uri].clear(); + document.diagnosticsByLine.clear(); for (const auto &diag : diags) { int line = diag.range.start.line; - docDiagnostics_[uri][line].push_back(diag); + document.diagnosticsByLine[line].push_back(diag); } } -void LSPServer::removeDiagnosticsForLines(const std::string &uri, +void LSPServer::removeDiagnosticsForLines(DocumentState &document, const std::set &lines) { - if (docDiagnostics_.find(uri) == docDiagnostics_.end()) - return; - - auto &uriDiags = docDiagnostics_[uri]; for (int line : lines) { - uriDiags.erase(line); - } -} - -std::vector -LSPServer::getAllDiagnostics(const std::string &uri) const { - std::vector allDiags; - - auto uriIt = docDiagnostics_.find(uri); - if (uriIt != docDiagnostics_.end()) { - for (const auto &linePair : uriIt->second) { - for (const auto &diag : linePair.second) { - allDiags.push_back(diag); - } - } + document.diagnosticsByLine.erase(line); } - - return allDiags; } std::set LSPServer::findChangedLines(const std::string &oldText, diff --git a/mozuku-lsp/src/mecab_manager.cpp b/mozuku-lsp/src/mecab_manager.cpp index b042182..04fd796 100644 --- a/mozuku-lsp/src/mecab_manager.cpp +++ b/mozuku-lsp/src/mecab_manager.cpp @@ -1,6 +1,6 @@ #include "mecab_manager.hpp" +#include "mozuku/core/debug.hpp" #include -#include #include #include #include @@ -14,22 +14,12 @@ namespace MoZuku { namespace mecab { -static bool isDebugEnabled() { - static bool initialized = false; - static bool debug = false; - if (!initialized) { - debug = (std::getenv("MOZUKU_DEBUG") != nullptr); - initialized = true; - } - return debug; -} - MeCabManager::MeCabManager(bool enableCaboCha) : mecab_tagger_(nullptr), cabocha_parser_(nullptr), system_charset_("UTF-8"), cabocha_available_(false), enable_cabocha_(enableCaboCha) { - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] MeCabManager created with CaboCha " << (enableCaboCha ? "enabled" : "disabled") << std::endl; } @@ -50,7 +40,7 @@ bool MeCabManager::initialize(const std::string &mecabDicPath, const std::string &mecabCharset) { SystemLibInfo systemMeCab = detectSystemMeCab(); if (!systemMeCab.isAvailable) { - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[ERROR] System MeCab not detected" << std::endl; } return false; @@ -67,13 +57,13 @@ bool MeCabManager::initialize(const std::string &mecabDicPath, mecab_args = "-d " + mecabDicPath; } else if (!systemMeCab.dicPath.empty()) { mecab_args = "-d " + systemMeCab.dicPath + "/ipadic"; - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] Using detected MeCab dicdir: " << systemMeCab.dicPath << "/ipadic" << std::endl; } } - if (isDebugEnabled() && !mecab_args.empty()) { + if (debug::isEnabled() && !mecab_args.empty()) { std::cerr << "[DEBUG] MeCab args: " << mecab_args << std::endl; } @@ -81,13 +71,13 @@ bool MeCabManager::initialize(const std::string &mecabDicPath, if (!mecab_tagger_) { std::string error = MeCab::getTaggerError() ? MeCab::getTaggerError() : "Unknown MeCab error"; - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[ERROR] MeCab initialization failed with args '" << mecab_args << "': " << error << std::endl; } if (!mecab_args.empty()) { - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] Trying MeCab without explicit dictionary path..." << std::endl; } @@ -95,7 +85,7 @@ bool MeCabManager::initialize(const std::string &mecabDicPath, if (!mecab_tagger_) { error = MeCab::getTaggerError() ? MeCab::getTaggerError() : "Unknown MeCab error"; - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[ERROR] MeCab fallback initialization also failed: " << error << std::endl; } @@ -108,7 +98,7 @@ bool MeCabManager::initialize(const std::string &mecabDicPath, system_charset_ = testMeCabCharset(mecab_tagger_, system_charset_); - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] MeCab successfully initialized with charset: " << system_charset_ << std::endl; } @@ -119,22 +109,22 @@ bool MeCabManager::initialize(const std::string &mecabDicPath, cabocha_parser_ = cabocha_new2(""); if (cabocha_parser_) { cabocha_available_ = true; - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] CaboCha successfully initialized" << std::endl; } } else { const char *error = cabocha_strerror(nullptr); - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] CaboCha initialization failed: " << (error ? error : "Unknown error") << std::endl; } } - } else if (isDebugEnabled()) { + } else if (debug::isEnabled()) { std::cerr << "[DEBUG] CaboCha not available on system" << std::endl; } } - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] MeCabManager initialized - MeCab: " << (mecab_tagger_ ? "OK" : "FAIL") << ", CaboCha: " << (cabocha_available_ ? "OK" : "N/A") @@ -147,7 +137,7 @@ bool MeCabManager::initialize(const std::string &mecabDicPath, SystemLibInfo MeCabManager::detectSystemMeCab() { SystemLibInfo info; - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] Detecting system MeCab installation..." << std::endl; } @@ -164,7 +154,7 @@ SystemLibInfo MeCabManager::detectSystemMeCab() { } info.dicPath = dicdir; - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] mecab-config --dicdir: " << dicdir << std::endl; } } @@ -187,7 +177,7 @@ SystemLibInfo MeCabManager::detectSystemMeCab() { charset.erase(charset.find_last_not_of(" \t") + 1); info.charset = charset; - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] Found charset in dicrc: " << charset << std::endl; } @@ -200,12 +190,12 @@ SystemLibInfo MeCabManager::detectSystemMeCab() { if (info.charset.empty()) { info.charset = "UTF-8"; - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] Using default charset: UTF-8" << std::endl; } } else if (info.charset != "UTF-8") { // Test if MeCab actually works with UTF-8 despite dicrc settings - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] dicrc says charset: " << info.charset << ", testing actual behavior..." << std::endl; } @@ -228,7 +218,7 @@ SystemLibInfo MeCabManager::detectSystemMeCab() { if (surface == testUtf8 && surface.size() == 6) { // "誤解" is 6 bytes in UTF-8 utf8Works = true; - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] MeCab actually works with UTF-8 input, " "overriding dicrc charset from " << info.charset << " to UTF-8" << std::endl; @@ -246,7 +236,7 @@ SystemLibInfo MeCabManager::detectSystemMeCab() { info.isAvailable = !info.dicPath.empty(); - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] System MeCab detection result - Available: " << (info.isAvailable ? "yes" : "no") << ", DicPath: " << info.dicPath << ", Charset: " << info.charset @@ -259,7 +249,7 @@ SystemLibInfo MeCabManager::detectSystemMeCab() { SystemLibInfo MeCabManager::detectSystemCaboCha() { SystemLibInfo info; - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] Detecting system CaboCha installation..." << std::endl; } @@ -271,7 +261,7 @@ SystemLibInfo MeCabManager::detectSystemCaboCha() { char buffer[256]; if (fgets(buffer, sizeof(buffer), pipe)) { info.isAvailable = true; - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] cabocha-config found, system CaboCha available" << std::endl; } @@ -282,7 +272,7 @@ SystemLibInfo MeCabManager::detectSystemCaboCha() { SystemLibInfo mecabInfo = detectSystemMeCab(); info.charset = mecabInfo.charset; - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] System CaboCha detection result - Available: " << (info.isAvailable ? "yes" : "no") << ", Charset: " << info.charset << std::endl; @@ -310,7 +300,7 @@ std::string MeCabManager::testMeCabCharset(MeCab::Tagger *tagger, // If we get back the same UTF-8 text, MeCab is working in UTF-8 mode if (surface == testUtf8 && surface.size() == 6) { - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] MeCab accepts UTF-8 input directly, using UTF-8" << std::endl; } @@ -318,7 +308,7 @@ std::string MeCabManager::testMeCabCharset(MeCab::Tagger *tagger, } } - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] MeCab requires " << originalCharset << " encoding" << std::endl; } diff --git a/mozuku-lsp/src/pos_analyzer.cpp b/mozuku-lsp/src/pos_analyzer.cpp index 76200b4..3e7e00d 100644 --- a/mozuku-lsp/src/pos_analyzer.cpp +++ b/mozuku-lsp/src/pos_analyzer.cpp @@ -9,9 +9,8 @@ std::string POSAnalyzer::mapPosToType(const char *feature) { if (!feature) return "unknown"; - std::string f = text::TextProcessor::sanitizeUTF8(std::string(feature)); - auto p = f.find(','); - std::string pos = (p == std::string::npos) ? f : f.substr(0, p); + std::vector fields = parseFeatureFields(std::string(feature)); + std::string pos = fields.empty() ? "" : fields.front(); if (pos.find("名詞") != std::string::npos) return "noun"; @@ -39,6 +38,38 @@ std::string POSAnalyzer::mapPosToType(const char *feature) { return "unknown"; } +bool POSAnalyzer::isNounFeature(const std::string &feature) { + std::vector fields = parseFeatureFields(feature); + return !fields.empty() && fields[0] == "名詞"; +} + +bool POSAnalyzer::isParticleFeature(const std::string &feature) { + std::vector fields = parseFeatureFields(feature); + return !fields.empty() && fields[0] == "助詞"; +} + +bool POSAnalyzer::isConjunctionFeature(const std::string &feature) { + std::vector fields = parseFeatureFields(feature); + return !fields.empty() && fields[0] == "接続詞"; +} + +bool POSAnalyzer::isAdversativeGaFeature(const std::string &feature) { + std::vector fields = parseFeatureFields(feature); + return fields.size() > 6 && fields[0] == "助詞" && fields[1] == "接続助詞" && + fields[6] == "が"; +} + +std::string POSAnalyzer::particleKey(const std::string &feature) { + std::vector fields = parseFeatureFields(feature); + if (fields.empty()) { + return ""; + } + if (fields.size() == 1) { + return fields[0]; + } + return fields[0] + "," + fields[1]; +} + void POSAnalyzer::parseFeatureDetails(const char *feature, std::string &baseForm, std::string &reading, @@ -75,12 +106,12 @@ DetailedPOS POSAnalyzer::parseDetailedPOS(const char *feature, if (!feature) return pos; - std::string f = + std::string featureText = (systemCharset == "UTF-8") - ? std::string(feature) + ? text::TextProcessor::sanitizeUTF8(std::string(feature)) : encoding::systemToUtf8(std::string(feature), systemCharset); - std::vector fields = splitFeature(f); + std::vector fields = splitFeature(featureText); // Fill in the detailed POS structure if (fields.size() > 0) @@ -134,6 +165,11 @@ unsigned POSAnalyzer::computeModifiers(const std::string &text, size_t start, return mods; } +std::vector +POSAnalyzer::parseFeatureFields(const std::string &feature) { + return splitFeature(text::TextProcessor::sanitizeUTF8(feature)); +} + std::vector POSAnalyzer::splitFeature(const std::string &feature) { std::vector fields; size_t pos = 0; diff --git a/mozuku-lsp/src/presenter.cpp b/mozuku-lsp/src/presenter.cpp new file mode 100644 index 0000000..9f2a3e4 --- /dev/null +++ b/mozuku-lsp/src/presenter.cpp @@ -0,0 +1,123 @@ +#include "mozuku/lsp/presenter.hpp" + +#include "utf16.hpp" + +#include +#include + +namespace { + +nlohmann::json makeRangeJson(const Position &start, const Position &end) { + return {{"start", {{"line", start.line}, {"character", start.character}}}, + {"end", {{"line", end.line}, {"character", end.character}}}}; +} + +nlohmann::json makeTokenRangeJson(const TokenData &token) { + return makeRangeJson(Position{token.line, token.startChar}, + Position{token.line, token.endChar}); +} + +} // namespace + +namespace MoZuku::lsp { + +Presenter::json Presenter::publishDiagnosticsParams( + const std::string &uri, const std::vector &diags) const { + json diagnostics = json::array(); + for (const auto &diag : diags) { + diagnostics.push_back( + {{"range", makeRangeJson(diag.range.start, diag.range.end)}, + {"severity", diag.severity}, + {"message", diag.message}}); + } + + return {{"uri", uri}, {"diagnostics", diagnostics}}; +} + +Presenter::json Presenter::commentHighlightsParams( + const std::string &uri, const std::string &text, + const std::vector &segments) const { + json ranges = json::array(); + TextOffsetMapper offsetMapper(text); + + for (const auto &segment : segments) { + Position start = offsetMapper.byteOffsetToPosition(segment.startByte); + Position end = offsetMapper.byteOffsetToPosition(segment.endByte); + ranges.push_back(makeRangeJson(start, end)); + } + + return {{"uri", uri}, {"ranges", ranges}}; +} + +Presenter::json +Presenter::contentHighlightsParams(const std::string &uri, + const std::string &text, + const std::vector &ranges) const { + json lspRanges = json::array(); + TextOffsetMapper offsetMapper(text); + + for (const auto &range : ranges) { + Position start = offsetMapper.byteOffsetToPosition(range.startByte); + Position end = offsetMapper.byteOffsetToPosition(range.endByte); + lspRanges.push_back(makeRangeJson(start, end)); + } + + return {{"uri", uri}, {"ranges", lspRanges}}; +} + +Presenter::json Presenter::semanticHighlightsParams( + const std::string &uri, bool isJapanese, + const std::vector &tokens) const { + if (isJapanese) { + return {{"uri", uri}, {"tokens", json::array()}}; + } + + json tokenEntries = json::array(); + for (const auto &token : tokens) { + tokenEntries.push_back({{"range", makeTokenRangeJson(token)}, + {"type", token.tokenType}, + {"modifiers", token.tokenModifiers}}); + } + + return {{"uri", uri}, {"tokens", tokenEntries}}; +} + +Presenter::json Presenter::semanticTokensData( + const std::vector &tokens, + const std::vector &tokenTypes) const { + json data = json::array(); + int prevLine = 0; + int prevChar = 0; + + for (const auto &token : tokens) { + int deltaLine = token.line - prevLine; + int deltaChar = + (deltaLine == 0) ? token.startChar - prevChar : token.startChar; + + auto typeIt = + std::find(tokenTypes.begin(), tokenTypes.end(), token.tokenType); + int typeIndex = + (typeIt != tokenTypes.end()) + ? static_cast(std::distance(tokenTypes.begin(), typeIt)) + : 0; + + data.push_back(deltaLine); + data.push_back(deltaChar); + data.push_back(token.endChar - token.startChar); + data.push_back(typeIndex); + data.push_back(token.tokenModifiers); + + prevLine = token.line; + prevChar = token.startChar; + } + + return data; +} + +Presenter::json Presenter::hoverResult(const TokenData &token, + const std::string &markdown) const { + return {{"contents", {{"kind", "markdown"}, {"value", markdown}}}, + {"range", makeTokenRangeJson(token)}}; +} + +} // namespace MoZuku::lsp diff --git a/mozuku-lsp/src/text_processor.cpp b/mozuku-lsp/src/text_processor.cpp index 2fa03b9..06cbcbc 100644 --- a/mozuku-lsp/src/text_processor.cpp +++ b/mozuku-lsp/src/text_processor.cpp @@ -1,85 +1,28 @@ #include "text_processor.hpp" +#include "encoding_utils.hpp" +#include "mozuku/core/debug.hpp" #include -#include #include +#include +#include namespace MoZuku { namespace text { -static bool isDebugEnabled() { - static bool initialized = false; - static bool debug = false; - if (!initialized) { - debug = (std::getenv("MOZUKU_DEBUG") != nullptr); - initialized = true; - } - return debug; -} - std::string TextProcessor::sanitizeUTF8(const std::string &input) { - if (input.empty()) - return input; - - std::string result; - result.reserve(input.size()); - - for (size_t i = 0; i < input.size(); ++i) { - unsigned char c = static_cast(input[i]); - - // ASCII characters (0x00-0x7F) are safe - if (c < 0x80) { - // Skip control characters except tab, newline, carriage return - if (c >= 0x20 || c == 0x09 || c == 0x0A || c == 0x0D) { - result += static_cast(c); - } - continue; - } - - // Handle multi-byte UTF-8 sequences - size_t seqLen = 0; - if ((c & 0xE0) == 0xC0) - seqLen = 2; // 110xxxxx (2-byte) - else if ((c & 0xF0) == 0xE0) - seqLen = 3; // 1110xxxx (3-byte) - else if ((c & 0xF8) == 0xF0) - seqLen = 4; // 11110xxx (4-byte) - else { - // Invalid UTF-8 start byte, skip it - continue; - } - - // Check if we have enough bytes for the sequence - if (i + seqLen > input.size()) { - break; // Incomplete sequence at end of string - } - - // Validate all continuation bytes - if (isValidUtf8Sequence(input, i, seqLen)) { - // Valid sequence, copy it - for (size_t j = 0; j < seqLen; ++j) { - result += input[i + j]; - } - i += seqLen - 1; // -1 because loop will increment i - } else { - // Invalid sequence, skip start byte (continuation bytes will be handled - // in next iterations) - continue; - } - } - - return result; + return encoding::sanitizeUtf8(input); } std::vector TextProcessor::splitIntoSentences(const std::string &text) { - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] splitIntoSentences called with text length: " << text.size() << std::endl; } std::vector sentences; if (text.empty()) { - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] Empty text, returning empty sentences" << std::endl; } return sentences; @@ -158,7 +101,7 @@ TextProcessor::splitIntoSentences(const std::string &text) { sentence.text = sentence.text.substr(textStart, textEnd - textStart); sentences.push_back(sentence); - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] Created sentence " << sentenceId - 1 << ": length=" << sentence.text.size() << ", start=" << sentence.start << ", end=" << sentence.end @@ -178,7 +121,7 @@ TextProcessor::splitIntoSentences(const std::string &text) { } } - if (isDebugEnabled()) { + if (debug::isEnabled()) { std::cerr << "[DEBUG] splitIntoSentences completed: created " << sentences.size() << " sentences" << std::endl; } diff --git a/mozuku-lsp/src/tree_sitter_document.cpp b/mozuku-lsp/src/tree_sitter_document.cpp new file mode 100644 index 0000000..00187b3 --- /dev/null +++ b/mozuku-lsp/src/tree_sitter_document.cpp @@ -0,0 +1,107 @@ +#include "mozuku/treesitter/document.hpp" + +#include +#include +#include +#include +#include + +extern "C" { +const TSLanguage *tree_sitter_c(); +const TSLanguage *tree_sitter_cpp(); +const TSLanguage *tree_sitter_html(); +const TSLanguage *tree_sitter_javascript(); +const TSLanguage *tree_sitter_python(); +const TSLanguage *tree_sitter_rust(); +const TSLanguage *tree_sitter_typescript(); +const TSLanguage *tree_sitter_tsx(); +const TSLanguage *tree_sitter_latex(); +} + +namespace { + +using LanguageFactory = const TSLanguage *(*)(); + +const std::unordered_map &languageMap() { + static const std::unordered_map map = { + {"c", tree_sitter_c}, + {"cpp", tree_sitter_cpp}, + {"c++", tree_sitter_cpp}, + {"html", tree_sitter_html}, + {"javascript", tree_sitter_javascript}, + {"javascriptreact", tree_sitter_tsx}, + {"typescript", tree_sitter_typescript}, + {"typescriptreact", tree_sitter_tsx}, + {"tsx", tree_sitter_tsx}, + {"python", tree_sitter_python}, + {"rust", tree_sitter_rust}, + {"latex", tree_sitter_latex}}; + return map; +} + +std::string toLower(std::string input) { + std::transform( + input.begin(), input.end(), input.begin(), + [](unsigned char c) { return static_cast(std::tolower(c)); }); + return input; +} + +struct ParserDeleter { + void operator()(TSParser *parser) const { + if (parser) { + ts_parser_delete(parser); + } + } +}; + +} // namespace + +namespace MoZuku::treesitter { + +const TSLanguage *resolveLanguage(const std::string &languageId) { + const auto &map = languageMap(); + auto it = map.find(toLower(languageId)); + if (it == map.end()) { + return nullptr; + } + return it->second(); +} + +bool isLanguageSupported(const std::string &languageId) { + const auto &map = languageMap(); + return map.find(toLower(languageId)) != map.end(); +} + +ParsedDocument::ParsedDocument() : tree_(nullptr, &ts_tree_delete) {} + +ParsedDocument::ParsedDocument(const std::string &languageId, + const std::string &text) + : ParsedDocument(resolveLanguage(languageId), text) {} + +ParsedDocument::ParsedDocument(const TSLanguage *language, + const std::string &text) + : tree_(nullptr, &ts_tree_delete) { + if (!language) { + return; + } + + std::unique_ptr parser(ts_parser_new()); + if (!parser) { + return; + } + + if (!ts_parser_set_language(parser.get(), language)) { + return; + } + + tree_.reset( + ts_parser_parse_string(parser.get(), nullptr, text.c_str(), text.size())); +} + +bool ParsedDocument::isValid() const { return tree_ != nullptr; } + +TSNode ParsedDocument::root() const { + return tree_ ? ts_tree_root_node(tree_.get()) : TSNode{}; +} + +} // namespace MoZuku::treesitter diff --git a/mozuku-lsp/src/utf16.cpp b/mozuku-lsp/src/utf16.cpp index def910b..a85f84f 100644 --- a/mozuku-lsp/src/utf16.cpp +++ b/mozuku-lsp/src/utf16.cpp @@ -1,139 +1,162 @@ #include "utf16.hpp" +#include "encoding_utils.hpp" + namespace { -static inline int utf8SeqLen(unsigned char c) { - if (c < 0x80) + +size_t validatedSequenceLength(const std::string &text, size_t offset) { + if (offset >= text.size()) { + return 0; + } + + size_t seqLen = MoZuku::encoding::utf8SequenceLength( + static_cast(text[offset])); + if (seqLen == 0 || offset + seqLen > text.size()) { return 1; - if (c < 0xE0) - return 2; - if (c < 0xF0) - return 3; - return 4; + } + + for (size_t i = 1; i < seqLen; ++i) { + unsigned char c = static_cast(text[offset + i]); + if ((c & 0xC0) != 0x80) { + return 1; + } + } + + return seqLen; } -static inline unsigned int decodeCodePoint(const std::string &s, size_t &i) { - unsigned char c = static_cast(s[i]); - if (c < 0x80) { - return s[i++]; +unsigned int decodeCodePoint(const std::string &text, size_t offset, + size_t seqLen) { + unsigned char c = static_cast(text[offset]); + if (seqLen == 1) { + return c; } - if ((c >> 5) == 0x6) { - unsigned int cp = - ((c & 0x1F) << 6) | (static_cast(s[i + 1]) & 0x3F); - i += 2; - return cp; + if (seqLen == 2) { + return ((c & 0x1F) << 6) | + (static_cast(text[offset + 1]) & 0x3F); } - if ((c >> 4) == 0xE) { - unsigned int cp = ((c & 0x0F) << 12) | - ((static_cast(s[i + 1]) & 0x3F) << 6) | - (static_cast(s[i + 2]) & 0x3F); - i += 3; - return cp; + if (seqLen == 3) { + return ((c & 0x0F) << 12) | + ((static_cast(text[offset + 1]) & 0x3F) << 6) | + (static_cast(text[offset + 2]) & 0x3F); + } + return ((c & 0x07) << 18) | + ((static_cast(text[offset + 1]) & 0x3F) << 12) | + ((static_cast(text[offset + 2]) & 0x3F) << 6) | + (static_cast(text[offset + 3]) & 0x3F); +} + +int utf16UnitsAt(const std::string &text, size_t offset, size_t seqLen) { + if (seqLen < 4) { + return 1; } - unsigned int cp = ((c & 0x07) << 18) | - ((static_cast(s[i + 1]) & 0x3F) << 12) | - ((static_cast(s[i + 2]) & 0x3F) << 6) | - (static_cast(s[i + 3]) & 0x3F); - i += 4; - return cp; + + unsigned int cp = decodeCodePoint(text, offset, seqLen); + return cp <= 0xFFFF ? 1 : 2; } + } // namespace +TextOffsetMapper::TextOffsetMapper(const std::string &text) + : text_(text), line_starts_(computeLineStarts(text)) {} + +const std::vector &TextOffsetMapper::lineStarts() const { + return line_starts_; +} + +Position TextOffsetMapper::byteOffsetToPosition(size_t offset) const { + return ::byteOffsetToPosition(text_, line_starts_, offset); +} + +size_t TextOffsetMapper::positionToByteOffset(int line, int character) const { + return ::positionToByteOffset(text_, line_starts_, line, character); +} + +size_t TextOffsetMapper::positionToByteOffset(const Position &position) const { + return positionToByteOffset(position.line, position.character); +} + +size_t TextOffsetMapper::tokenStartByteOffset(const TokenData &token) const { + return positionToByteOffset(token.line, token.startChar); +} + std::vector computeLineStarts(const std::string &text) { std::vector lineStarts; lineStarts.reserve(64); lineStarts.push_back(0); - for (size_t i = 0; i < text.size(); ++i) - if (text[i] == '\n') + for (size_t i = 0; i < text.size(); ++i) { + if (text[i] == '\n') { lineStarts.push_back(i + 1); + } + } return lineStarts; } Position byteOffsetToPosition(const std::string &text, const std::vector &lineStarts, size_t offset) { - // オフセットをテキストサイズに制限 - if (offset > text.size()) + if (offset > text.size()) { offset = text.size(); + } - // オフセット以下の最後の開始位置を二分探索で検索 - size_t lo = 0, hi = lineStarts.size(); + size_t lo = 0; + size_t hi = lineStarts.size(); while (lo + 1 < hi) { size_t mid = (lo + hi) / 2; - if (lineStarts[mid] <= offset) + if (lineStarts[mid] <= offset) { lo = mid; - else + } else { hi = mid; + } + } + + size_t bytePos = lineStarts[lo]; + int utf16Pos = 0; + + while (bytePos < offset && bytePos < text.size() && text[bytePos] != '\n') { + size_t seqLen = validatedSequenceLength(text, bytePos); + utf16Pos += utf16UnitsAt(text, bytePos, seqLen); + bytePos += seqLen; } - size_t lineStart = lineStarts[lo]; + return Position{static_cast(lo), utf16Pos}; +} - // 行開始からオフセットまでのUTF-16コードユニット数をカウント - size_t i = lineStart; - unsigned int col16 = 0; +size_t positionToByteOffset(const std::string &text, + const std::vector &lineStarts, int line, + int character) { + if (line < 0 || lineStarts.empty()) { + return 0; + } + if (line >= static_cast(lineStarts.size())) { + return text.size(); + } - while (i < offset && i < text.size() && text[i] != '\n') { - unsigned char c = static_cast(text[i]); + size_t bytePos = lineStarts[line]; + int utf16Pos = 0; - // 効率性と正確性のためASCII文字を直接処理 - if (c < 0x80) { - // ASCII文字 (タブ、スペースを含む) は常に1つのUTF-16コードユニット - col16 += 1; - i += 1; - } else { - // マルチバイトUTF-8文字 - size_t prev = i; - unsigned int cp = decodeCodePoint(text, i); - - // UTF-16エンコーディング: - // BMP文字は1コードユニット、その他は2コードユニット (サロゲートペア) - if (cp <= 0xFFFF) { - col16 += 1; // BMP文字: 1 UTF-16コードユニット - } else { - col16 += 2; // 非BMP文字: 2 UTF-16コードユニット (サロゲートペア) - } - - // 無限ループを防ぐ安全性チェック - if (i == prev) { - i++; // 無効なバイトをスキップ - col16++; // 1コードユニットとしてカウント - } - } + while (bytePos < text.size() && utf16Pos < character && + text[bytePos] != '\n') { + size_t seqLen = validatedSequenceLength(text, bytePos); + utf16Pos += utf16UnitsAt(text, bytePos, seqLen); + bytePos += seqLen; } - return Position{static_cast(lo), static_cast(col16)}; + return bytePos; +} + +size_t positionToByteOffset(const std::string &text, int line, int character) { + return positionToByteOffset(text, computeLineStarts(text), line, character); } size_t utf8ToUtf16Length(const std::string &utf8Str) { - size_t i = 0; + size_t offset = 0; size_t utf16Length = 0; - while (i < utf8Str.size()) { - unsigned char c = static_cast(utf8Str[i]); - - // 効率性と正確性のためASCII文字を直接処理 - if (c < 0x80) { - // ASCII文字 (タブ、スペースを含む) は常に1つのUTF-16コードユニット - utf16Length += 1; - i += 1; - } else { - // マルチバイトUTF-8文字 - size_t prev = i; - unsigned int cp = decodeCodePoint(utf8Str, i); - - // UTF-16エンコーディング: - // BMP文字は1コードユニット、その他は2コードユニット (サロゲートペア) - if (cp <= 0xFFFF) { - utf16Length += 1; // BMP character - } else { - utf16Length += 2; // Non-BMP character (surrogate pair) - } - - // 無限ループを防ぐ安全性チェック - if (i == prev) { - i++; // 無効なバイトをスキップ - utf16Length++; // 1コードユニットとしてカウント - } - } + while (offset < utf8Str.size()) { + size_t seqLen = validatedSequenceLength(utf8Str, offset); + utf16Length += utf16UnitsAt(utf8Str, offset, seqLen); + offset += seqLen; } return utf16Length; diff --git a/vscode-mozuku/src/client.ts b/vscode-mozuku/src/client.ts index a0babb3..9159a5b 100644 --- a/vscode-mozuku/src/client.ts +++ b/vscode-mozuku/src/client.ts @@ -1,6 +1,5 @@ import * as vscode from "vscode"; import * as fs from "fs"; -import * as path from "path"; import { LanguageClient, LanguageClientOptions, @@ -8,6 +7,8 @@ import { TransportKind, State, } from "vscode-languageclient/node"; +import { buildDocumentSelector, buildInitializationOptions } from "./config"; +import { resolveServerPath as discoverServerPath } from "./server-discovery"; type CommentHighlightMessage = { uri: string; @@ -37,21 +38,6 @@ type SemanticHighlightMessage = { }>; }; -const supportedLanguages = [ - "japanese", - "c", - "cpp", - "html", - "python", - "javascript", - "javascriptreact", - "typescript", - "typescriptreact", - "rust", - "html", - "latex", -]; - export async function startClient( ctx: vscode.ExtensionContext, serverPath: string, @@ -60,7 +46,7 @@ export async function startClient( process.env.VSCODE_DEBUG_MODE === "true" || ctx.extensionMode === vscode.ExtensionMode.Development; - const resolved = resolveServerPath(ctx, serverPath); + const resolved = discoverServerPath(ctx, serverPath, isDebug); console.log("[MoZuku] 最終的に解決されたサーバーパス:", resolved); if (!fs.existsSync(resolved)) { @@ -85,87 +71,7 @@ export async function startClient( }, }; - const config = vscode.workspace.getConfiguration("mozuku"); - const initOptions = { - mozuku: { - mecab: { - dicdir: config.get("mecab.dicdir", ""), - charset: config.get("mecab.charset", "UTF-8"), - }, - analysis: { - enableCaboCha: config.get("analysis.enableCaboCha", true), - grammarCheck: config.get("analysis.grammarCheck", true), - minJapaneseRatio: config.get("analysis.minJapaneseRatio", 0.1), - warningMinSeverity: config.get( - "analysis.warningMinSeverity", - 2, - ), - warnings: { - particleDuplicate: config.get( - "analysis.warnings.particleDuplicate", - true, - ), - particleSequence: config.get( - "analysis.warnings.particleSequence", - true, - ), - particleMismatch: config.get( - "analysis.warnings.particleMismatch", - true, - ), - sentenceStructure: config.get( - "analysis.warnings.sentenceStructure", - false, - ), - styleConsistency: config.get( - "analysis.warnings.styleConsistency", - false, - ), - redundancy: config.get( - "analysis.warnings.redundancy", - false, - ), - }, - rules: { - commaLimit: config.get("analysis.rules.commaLimit", true), - adversativeGa: config.get( - "analysis.rules.adversativeGa", - true, - ), - duplicateParticleSurface: config.get( - "analysis.rules.duplicateParticleSurface", - true, - ), - adjacentParticles: config.get( - "analysis.rules.adjacentParticles", - true, - ), - conjunctionRepeat: config.get( - "analysis.rules.conjunctionRepeat", - true, - ), - raDropping: config.get("analysis.rules.raDropping", true), - commaLimitMax: config.get("analysis.rules.commaLimitMax", 3), - adversativeGaMax: config.get( - "analysis.rules.adversativeGaMax", - 1, - ), - duplicateParticleSurfaceMaxRepeat: config.get( - "analysis.rules.duplicateParticleSurfaceMaxRepeat", - 1, - ), - adjacentParticlesMaxRepeat: config.get( - "analysis.rules.adjacentParticlesMaxRepeat", - 1, - ), - conjunctionRepeatMax: config.get( - "analysis.rules.conjunctionRepeatMax", - 1, - ), - }, - }, - }, - }; + const initOptions = buildInitializationOptions(); if (isDebug) { console.log( @@ -174,14 +80,8 @@ export async function startClient( ); } - const documentSelector = [ - ...supportedLanguages.map((language) => ({ language })), - { scheme: "file", pattern: "**/*.ja.txt" }, - { scheme: "file", pattern: "**/*.ja.md" }, - ]; - const clientOptions: LanguageClientOptions = { - documentSelector, + documentSelector: buildDocumentSelector(), synchronize: { fileEvents: vscode.workspace.createFileSystemWatcher("**/*"), }, @@ -439,212 +339,12 @@ export async function startClient( return client; } -function resolveServerPath( +export function resolveServerPath( ctx: vscode.ExtensionContext, configured: string, ): string { - const isWindows = process.platform === "win32"; - const exeName = isWindows ? "mozuku-lsp.exe" : "mozuku-lsp"; const isDebug = process.env.VSCODE_DEBUG_MODE === "true" || ctx.extensionMode === vscode.ExtensionMode.Development; - const configuredValue = configured.trim(); - const envValue = process.env.MOZUKU_LSP?.trim() ?? ""; - const workspaceRoot = vscode.workspace.workspaceFolders?.[0]?.uri.fsPath; - const extensionRoot = ctx.extensionUri.fsPath; - const seen = new Set(); - - if (isDebug) { - console.log("[MoZuku] サーバーパスを解決中:", { - configured: configuredValue, - extensionPath: extensionRoot, - workspaceFolders: vscode.workspace.workspaceFolders?.map( - (f) => f.uri.fsPath, - ), - }); - } - - const candidates: { type: string; path: string }[] = []; - const add = (type: string, p: string | undefined) => { - if (!p || p.trim().length === 0) { - return; - } - const normalized = path.normalize(p); - if (seen.has(normalized)) { - return; - } - seen.add(normalized); - candidates.push({ type, path: p }); - }; - const addResolvedPath = (type: string, candidate: string | undefined) => { - if (!candidate) { - return; - } - if (path.isAbsolute(candidate)) { - add(type, candidate); - return; - } - if (workspaceRoot) { - add(`${type} (workspace)`, path.join(workspaceRoot, candidate)); - } - add(`${type} (extension)`, path.join(extensionRoot, candidate)); - add(`${type} (cwd)`, path.resolve(candidate)); - }; - const installDirs = (): string[] => { - const dirs: string[] = []; - const pathEnv = process.env.PATH || ""; - for (const dir of pathEnv.split(path.delimiter)) { - if (dir) { - dirs.push(dir); - } - } - - const home = process.env.HOME || process.env.USERPROFILE; - if (home) { - dirs.push(path.join(home, ".local", "bin")); - dirs.push(path.join(home, "bin")); - } - - if (isWindows) { - const localAppData = process.env.LOCALAPPDATA; - if (localAppData) { - dirs.push(path.join(localAppData, "Programs", "MoZuku", "bin")); - dirs.push(path.join(localAppData, "Programs", "mozuku-lsp", "bin")); - } - for (const base of [ - process.env.ProgramFiles, - process.env["ProgramFiles(x86)"], - ]) { - if (!base) { - continue; - } - dirs.push(path.join(base, "MoZuku", "bin")); - dirs.push(path.join(base, "mozuku-lsp", "bin")); - } - } else { - dirs.push("/usr/local/bin"); - dirs.push("/usr/bin"); - if (process.platform === "darwin") { - dirs.push("/opt/homebrew/bin"); - dirs.push("/opt/local/bin"); - } - } - - return dirs; - }; - const addCommandSearch = (type: string, commandName: string | undefined) => { - if (!commandName || hasPathSep(commandName)) { - return; - } - const names = - isWindows && !commandName.toLowerCase().endsWith(".exe") - ? [commandName, `${commandName}.exe`] - : [commandName]; - for (const dir of installDirs()) { - for (const name of names) { - add(type, path.join(dir, name)); - } - } - }; - - if (configuredValue && hasPathSep(configuredValue)) { - addResolvedPath("設定済み", configuredValue); - } - if (envValue && hasPathSep(envValue)) { - addResolvedPath("環境変数 MOZUKU_LSP", envValue); - } - - addCommandSearch( - "設定済みコマンド", - configuredValue && !hasPathSep(configuredValue) - ? configuredValue - : undefined, - ); - addCommandSearch( - "環境変数 MOZUKU_LSP", - envValue && !hasPathSep(envValue) ? envValue : undefined, - ); - addCommandSearch("デフォルトコマンド", exeName); - - add( - "パッケージ済み", - vscode.Uri.joinPath(ctx.extensionUri, "bin", exeName).fsPath, - ); - - const plat = process.platform; - const arch = process.arch; - add( - "パッケージ済み", - vscode.Uri.joinPath( - ctx.extensionUri, - "server", - "bin", - `${plat}-${arch}`, - exeName, - ).fsPath, - ); - - if (workspaceRoot) { - add( - "ワークスペース-install", - path.join(workspaceRoot, "build", "install", "bin", exeName), - ); - add("ワークスペース-build", path.join(workspaceRoot, "build", exeName)); - add( - "ワークスペース-install", - path.join( - workspaceRoot, - "mozuku-lsp", - "build", - "install", - "bin", - exeName, - ), - ); - add( - "ワークスペース-build", - path.join(workspaceRoot, "mozuku-lsp", "build", exeName), - ); - } - - add( - "開発-install", - path.join( - extensionRoot, - "..", - "mozuku-lsp", - "build", - "install", - "bin", - exeName, - ), - ); - add( - "開発-build", - path.join(extensionRoot, "..", "mozuku-lsp", "build", exeName), - ); - - for (const candidate of candidates) { - if (fs.existsSync(candidate.path)) { - if (isDebug) { - console.log(`[MoZuku] ${candidate.type}パスを使用:`, candidate.path); - } - return candidate.path; - } else if (isDebug) { - console.log( - `[MoZuku] ${candidate.type}パスが見つかりません:`, - candidate.path, - ); - } - } - - const fallback = configuredValue || envValue || exeName; - if (isDebug) { - console.log("[MoZuku] フォールバックパスを使用:", fallback); - } - return fallback; -} - -function hasPathSep(p: string): boolean { - return p.includes("/") || p.includes("\\"); + return discoverServerPath(ctx, configured, isDebug); } diff --git a/vscode-mozuku/src/config.ts b/vscode-mozuku/src/config.ts new file mode 100644 index 0000000..a821a9f --- /dev/null +++ b/vscode-mozuku/src/config.ts @@ -0,0 +1,112 @@ +import * as vscode from "vscode"; +import type { LanguageClientOptions } from "vscode-languageclient/node"; + +const supportedLanguages = [ + "japanese", + "c", + "cpp", + "html", + "python", + "javascript", + "javascriptreact", + "typescript", + "typescriptreact", + "rust", + "html", + "latex", +]; + +export function buildInitializationOptions() { + const config = vscode.workspace.getConfiguration("mozuku"); + + return { + mozuku: { + mecab: { + dicdir: config.get("mecab.dicdir", ""), + charset: config.get("mecab.charset", "UTF-8"), + }, + analysis: { + enableCaboCha: config.get("analysis.enableCaboCha", true), + grammarCheck: config.get("analysis.grammarCheck", true), + minJapaneseRatio: config.get("analysis.minJapaneseRatio", 0.1), + warningMinSeverity: config.get( + "analysis.warningMinSeverity", + 2, + ), + warnings: { + particleDuplicate: config.get( + "analysis.warnings.particleDuplicate", + true, + ), + particleSequence: config.get( + "analysis.warnings.particleSequence", + true, + ), + particleMismatch: config.get( + "analysis.warnings.particleMismatch", + true, + ), + sentenceStructure: config.get( + "analysis.warnings.sentenceStructure", + false, + ), + styleConsistency: config.get( + "analysis.warnings.styleConsistency", + false, + ), + redundancy: config.get( + "analysis.warnings.redundancy", + false, + ), + }, + rules: { + commaLimit: config.get("analysis.rules.commaLimit", true), + adversativeGa: config.get( + "analysis.rules.adversativeGa", + true, + ), + duplicateParticleSurface: config.get( + "analysis.rules.duplicateParticleSurface", + true, + ), + adjacentParticles: config.get( + "analysis.rules.adjacentParticles", + true, + ), + conjunctionRepeat: config.get( + "analysis.rules.conjunctionRepeat", + true, + ), + raDropping: config.get("analysis.rules.raDropping", true), + commaLimitMax: config.get("analysis.rules.commaLimitMax", 3), + adversativeGaMax: config.get( + "analysis.rules.adversativeGaMax", + 1, + ), + duplicateParticleSurfaceMaxRepeat: config.get( + "analysis.rules.duplicateParticleSurfaceMaxRepeat", + 1, + ), + adjacentParticlesMaxRepeat: config.get( + "analysis.rules.adjacentParticlesMaxRepeat", + 1, + ), + conjunctionRepeatMax: config.get( + "analysis.rules.conjunctionRepeatMax", + 1, + ), + }, + }, + }, + }; +} + +export function buildDocumentSelector(): NonNullable< + LanguageClientOptions["documentSelector"] +> { + return [ + ...supportedLanguages.map((language) => ({ language })), + { scheme: "file", pattern: "**/*.ja.txt" }, + { scheme: "file", pattern: "**/*.ja.md" }, + ]; +} diff --git a/vscode-mozuku/src/extension.ts b/vscode-mozuku/src/extension.ts index a7ed152..8b23f6b 100644 --- a/vscode-mozuku/src/extension.ts +++ b/vscode-mozuku/src/extension.ts @@ -1,16 +1,16 @@ -import * as vscode from 'vscode'; -import { startClient } from './client'; +import * as vscode from "vscode"; +import { startClient } from "./client"; export async function activate(context: vscode.ExtensionContext) { - console.log('[MoZuku] Extension activation started...'); + console.log("[MoZuku] Extension activation started..."); const serverPath = - vscode.workspace.getConfiguration('mozuku').get('serverPath') || - 'mozuku-lsp'; - console.log('[MoZuku] LSP client starting: server path =', serverPath); + vscode.workspace.getConfiguration("mozuku").get("serverPath") || + "mozuku-lsp"; + console.log("[MoZuku] LSP client starting: server path =", serverPath); const client = await startClient(context, serverPath); - console.log('[MoZuku] Extension activation completed'); + console.log("[MoZuku] Extension activation completed"); } -export function deactivate() { } +export function deactivate() {} diff --git a/vscode-mozuku/src/server-discovery.ts b/vscode-mozuku/src/server-discovery.ts new file mode 100644 index 0000000..511a5f2 --- /dev/null +++ b/vscode-mozuku/src/server-discovery.ts @@ -0,0 +1,213 @@ +import * as fs from "fs"; +import * as path from "path"; +import * as vscode from "vscode"; + +export function resolveServerPath( + ctx: vscode.ExtensionContext, + configured: string, + isDebug: boolean, +): string { + const isWindows = process.platform === "win32"; + const exeName = isWindows ? "mozuku-lsp.exe" : "mozuku-lsp"; + const configuredValue = configured.trim(); + const envValue = process.env.MOZUKU_LSP?.trim() ?? ""; + const workspaceRoot = vscode.workspace.workspaceFolders?.[0]?.uri.fsPath; + const extensionRoot = ctx.extensionUri.fsPath; + const seen = new Set(); + + if (isDebug) { + console.log("[MoZuku] サーバーパスを解決中:", { + configured: configuredValue, + extensionPath: extensionRoot, + workspaceFolders: vscode.workspace.workspaceFolders?.map( + (f) => f.uri.fsPath, + ), + }); + } + + const candidates: { type: string; path: string }[] = []; + const add = (type: string, candidatePath: string | undefined) => { + if (!candidatePath || candidatePath.trim().length === 0) { + return; + } + const normalized = path.normalize(candidatePath); + if (seen.has(normalized)) { + return; + } + seen.add(normalized); + candidates.push({ type, path: candidatePath }); + }; + + const addResolvedPath = (type: string, candidate: string | undefined) => { + if (!candidate) { + return; + } + if (path.isAbsolute(candidate)) { + add(type, candidate); + return; + } + if (workspaceRoot) { + add(`${type} (workspace)`, path.join(workspaceRoot, candidate)); + } + add(`${type} (extension)`, path.join(extensionRoot, candidate)); + add(`${type} (cwd)`, path.resolve(candidate)); + }; + + const installDirs = (): string[] => { + const dirs: string[] = []; + const pathEnv = process.env.PATH || ""; + for (const dir of pathEnv.split(path.delimiter)) { + if (dir) { + dirs.push(dir); + } + } + + const home = process.env.HOME || process.env.USERPROFILE; + if (home) { + dirs.push(path.join(home, ".local", "bin")); + dirs.push(path.join(home, "bin")); + } + + if (isWindows) { + const localAppData = process.env.LOCALAPPDATA; + if (localAppData) { + dirs.push(path.join(localAppData, "Programs", "MoZuku", "bin")); + dirs.push(path.join(localAppData, "Programs", "mozuku-lsp", "bin")); + } + for (const base of [ + process.env.ProgramFiles, + process.env["ProgramFiles(x86)"], + ]) { + if (!base) { + continue; + } + dirs.push(path.join(base, "MoZuku", "bin")); + dirs.push(path.join(base, "mozuku-lsp", "bin")); + } + } else { + dirs.push("/usr/local/bin"); + dirs.push("/usr/bin"); + if (process.platform === "darwin") { + dirs.push("/opt/homebrew/bin"); + dirs.push("/opt/local/bin"); + } + } + + return dirs; + }; + + const addCommandSearch = (type: string, commandName: string | undefined) => { + if (!commandName || hasPathSep(commandName)) { + return; + } + const names = + isWindows && !commandName.toLowerCase().endsWith(".exe") + ? [commandName, `${commandName}.exe`] + : [commandName]; + for (const dir of installDirs()) { + for (const name of names) { + add(type, path.join(dir, name)); + } + } + }; + + if (configuredValue && hasPathSep(configuredValue)) { + addResolvedPath("設定済み", configuredValue); + } + if (envValue && hasPathSep(envValue)) { + addResolvedPath("環境変数 MOZUKU_LSP", envValue); + } + + addCommandSearch( + "設定済みコマンド", + configuredValue && !hasPathSep(configuredValue) + ? configuredValue + : undefined, + ); + addCommandSearch( + "環境変数 MOZUKU_LSP", + envValue && !hasPathSep(envValue) ? envValue : undefined, + ); + addCommandSearch("デフォルトコマンド", exeName); + + add( + "パッケージ済み", + vscode.Uri.joinPath(ctx.extensionUri, "bin", exeName).fsPath, + ); + + add( + "パッケージ済み", + vscode.Uri.joinPath( + ctx.extensionUri, + "server", + "bin", + `${process.platform}-${process.arch}`, + exeName, + ).fsPath, + ); + + if (workspaceRoot) { + add( + "ワークスペース-install", + path.join(workspaceRoot, "build", "install", "bin", exeName), + ); + add("ワークスペース-build", path.join(workspaceRoot, "build", exeName)); + add( + "ワークスペース-install", + path.join( + workspaceRoot, + "mozuku-lsp", + "build", + "install", + "bin", + exeName, + ), + ); + add( + "ワークスペース-build", + path.join(workspaceRoot, "mozuku-lsp", "build", exeName), + ); + } + + add( + "開発-install", + path.join( + extensionRoot, + "..", + "mozuku-lsp", + "build", + "install", + "bin", + exeName, + ), + ); + add( + "開発-build", + path.join(extensionRoot, "..", "mozuku-lsp", "build", exeName), + ); + + for (const candidate of candidates) { + if (fs.existsSync(candidate.path)) { + if (isDebug) { + console.log(`[MoZuku] ${candidate.type}パスを使用:`, candidate.path); + } + return candidate.path; + } + if (isDebug) { + console.log( + `[MoZuku] ${candidate.type}パスが見つかりません:`, + candidate.path, + ); + } + } + + const fallback = configuredValue || envValue || exeName; + if (isDebug) { + console.log("[MoZuku] フォールバックパスを使用:", fallback); + } + return fallback; +} + +function hasPathSep(candidate: string): boolean { + return candidate.includes("/") || candidate.includes("\\"); +}