Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions mozuku-lsp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,9 @@ set(MOZUKU_SOURCES
src/grammar_checker.cpp
src/wikipedia.cpp
src/comment_extractor.cpp
src/document_preprocessor.cpp
src/presenter.cpp
src/tree_sitter_document.cpp
)

add_executable(mozuku-lsp ${MOZUKU_SOURCES})
Expand Down
112 changes: 13 additions & 99 deletions mozuku-lsp/include/analyzer.hpp
Original file line number Diff line number Diff line change
@@ -1,115 +1,19 @@
#pragma once

#include "mozuku/core/config.hpp"
#include "mozuku/core/types.hpp"

#include <memory>
#include <string>
#include <vector>

struct TokenData;
struct Diagnostic;

// Detailed part-of-speech (POS) record for a single morpheme. The fields
// appear to mirror the MeCab feature columns (see TokenData::feature) —
// confirm against the MeCab output format in use.
struct DetailedPOS {
std::string mainPOS; // main POS (noun, verb, particle, ...)
std::string subPOS1; // POS sub-classification 1 (case/adverbial/binding particle, ...)
std::string subPOS2; // POS sub-classification 2
std::string subPOS3; // POS sub-classification 3
std::string inflection; // inflection type
std::string conjugation; // conjugation form
std::string baseForm; // base (dictionary) form
std::string reading; // reading
std::string pronunciation; // pronunciation

// Convenience checks against the Japanese POS tag strings MeCab emits.
bool isParticle() const { return mainPOS == "助詞"; } // 助詞 = particle
bool isVerb() const { return mainPOS == "動詞"; } // 動詞 = verb
bool isNoun() const { return mainPOS == "名詞"; } // 名詞 = noun
};

// Information about a particle (助詞) token
struct ParticleInfo {
std::string surface; // surface form
std::string function; // particle class: case/adverbial/binding/conjunctive
std::string role; // more detailed role
size_t position; // position within the sentence (in bytes)
int tokenIndex; // index into the token array
int sentenceId; // ID of the sentence this particle belongs to
};

// Sentence boundary information
struct SentenceBoundary {
size_t start; // sentence start offset (in bytes)
size_t end; // sentence end offset (in bytes)
int sentenceId; // sentence ID
std::string text; // sentence text
};

// Dependency parsing information from CaboCha
struct DependencyInfo {
int chunkId; // chunk ID
int headId; // ID of the chunk this one depends on (head)
double score; // dependency score
std::string text; // chunk text
};

// Configuration structures (shared between LSP server and analyzer)
// MeCab (morphological analyzer) settings.
struct MeCabConfig {
std::string dicPath; // Dictionary directory path
std::string charset = "UTF-8"; // Character encoding
};

// Analysis and diagnostics settings for the Japanese grammar checker.
struct AnalysisConfig {
bool enableCaboCha = true; // Enable CaboCha dependency parsing
bool grammarCheck = true; // Enable grammar diagnostics
double minJapaneseRatio =
0.1; // Minimum Japanese character ratio for analysis

// Per-rule on/off switches and their numeric thresholds.
struct RuleToggles {
bool commaLimit = true;
bool adversativeGa = true;
bool duplicateParticleSurface = true;
bool adjacentParticles = true;
bool conjunctionRepeat = true;
bool raDropping = true;
int commaLimitMax = 3;
int adversativeGaMax = 1;
int duplicateParticleSurfaceMaxRepeat = 1;
int adjacentParticlesMaxRepeat = 1;
int conjunctionRepeatMax = 1;
} rules;

// Enhanced grammar warning settings
struct WarningLevels {
bool particleDuplicate = true; // double-particle warning
bool particleSequence = true; // improper particle sequences
bool particleMismatch = true; // verb-particle mismatch
bool sentenceStructure = false; // sentence-structure issues (experimental)
bool styleConsistency = false; // mixed writing styles (experimental)
bool redundancy = false; // redundant expressions (experimental)
} warnings;

int warningMinSeverity =
2; // minimum warning level (1=Error, 2=Warning, 3=Info, 4=Hint)
};

// Top-level configuration: MeCab settings plus analysis/diagnostic options.
struct MoZukuConfig {
MeCabConfig mecab;
AnalysisConfig analysis;
};

// Analyzes `text`, appending semantic tokens to `tokens` and any diagnostics
// to `diags`. A null `config` presumably selects built-in defaults — confirm
// in the implementation.
void analyzeText(const std::string &text, std::vector<TokenData> &tokens,
std::vector<Diagnostic> &diags,
const MoZukuConfig *config = nullptr);

// Runs grammar checks over `text`, appending the resulting diagnostics.
void performGrammarDiagnostics(const std::string &text,
std::vector<Diagnostic> &diags);

// Maps an LSP-style (line, character) position to a byte offset into `text`.
size_t computeByteOffset(const std::string &text, int line, int character);

// Semantic-token modifier bit flags; each trailing string is the modifier
// name as published to the LSP client.
//
// Uses C++17 `inline constexpr` (the file already relies on C++17, e.g.
// nested-namespace definitions): `static constexpr` at namespace scope in a
// header gives every translation unit its own internal-linkage copy, while
// `inline` guarantees a single entity across the program.
namespace MoZukuModifiers {
inline constexpr unsigned Proper = 1u << 0;  // "proper"
inline constexpr unsigned Numeric = 1u << 1; // "numeric"
inline constexpr unsigned Kana = 1u << 2;    // "kana"
inline constexpr unsigned Kanji = 1u << 3;   // "kanji"
} // namespace MoZukuModifiers

namespace MoZuku {

namespace mecab {
Expand All @@ -132,6 +36,16 @@ class Analyzer {
bool isCaboChaAvailable() const;

private:
struct PreparedText {
std::string cleanText;
double japaneseRatio{0.0};
bool belowMinJapaneseRatio{false};
};

PreparedText prepareText(const std::string &text,
bool enforceMinJapaneseRatio) const;
std::vector<TokenData> analyzePreparedText(const PreparedText &prepared);

std::unique_ptr<mecab::MeCabManager> mecab_manager_;
MoZukuConfig config_;
std::string system_charset_;
Expand Down
11 changes: 10 additions & 1 deletion mozuku-lsp/include/encoding_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,24 @@
namespace MoZuku {
namespace encoding {

// Options controlling charset conversion in convertEncoding().
struct ConversionOptions {
bool skipInvalidInput{false}; // presumably: skip (rather than fail on) invalid byte sequences — confirm in the implementation
};

std::string convertEncoding(const std::string &input,
const std::string &fromCharset,
const std::string &toCharset = "UTF-8");
const std::string &toCharset = "UTF-8",
ConversionOptions options = {});

// Converts `input` from the given system charset to UTF-8.
std::string systemToUtf8(const std::string &input,
const std::string &systemCharset);

// Converts UTF-8 `input` to the given system charset.
std::string utf8ToSystem(const std::string &input,
const std::string &systemCharset);

// Returns `input` with invalid UTF-8 sequences cleaned up (exact policy —
// drop vs. replace — lives in the implementation; confirm there).
std::string sanitizeUtf8(const std::string &input);

// Returns the byte length of the UTF-8 sequence whose lead byte is `c`.
size_t utf8SequenceLength(unsigned char c);

} // namespace encoding
} // namespace MoZuku
4 changes: 2 additions & 2 deletions mozuku-lsp/include/grammar_checker.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#pragma once

#include "analyzer.hpp"
#include "lsp.hpp"
#include "mozuku/core/config.hpp"
#include "mozuku/core/types.hpp"
#include <string>
#include <vector>

Expand Down
98 changes: 27 additions & 71 deletions mozuku-lsp/include/lsp.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
#pragma once

#include "analyzer.hpp"
#include "mozuku/analysis/document_preprocessor.hpp"
#include "mozuku/core/config.hpp"
#include "mozuku/core/types.hpp"
#include "mozuku/lsp/presenter.hpp"
#include <cstddef>
#include <istream>
#include <memory>
Expand All @@ -15,78 +19,35 @@

using json = nlohmann::json;

// A zero-based line/character position within a document (LSP convention).
struct Position {
int line{0};
int character{0};
};

// A half-open [start, end) span between two document positions.
struct Range {
Position start;
Position end;
};

// One diagnostic message to publish to the client.
struct Diagnostic {
Range range;
int severity{2}; // LSP DiagnosticSeverity: 1=Error, 2=Warning, 3=Info, 4=Hint
std::string message;
};

// One semantic token plus the morphological data behind it.
struct TokenData {
int line{0};
int startChar{0};
int endChar{0};
std::string tokenType; // e.g. "noun", "verb" ...
unsigned int tokenModifiers{0};

std::string surface; // surface form
std::string
feature; // POS,subPOS1,subPOS2,subPOS3,inflection type,conjugation form,base form,reading,pronunciation
std::string baseForm; // base (dictionary) form
std::string reading; // reading
std::string pronunciation; // pronunciation
};

// Bundled output of one analysis pass: tokens plus diagnostics.
struct AnalyzerResult {
std::vector<TokenData> tokens;
std::vector<Diagnostic> diags;
};

// A byte-offset span [startByte, endByte) within a document's text.
struct ByteRange {
size_t startByte{0};
size_t endByte{0};
};

// NOTE(review): this span is taken from a merged pull-request diff with the
// +/- markers stripped — members and method declarations removed by the PR
// (the per-URI maps and string-keyed helpers) are interleaved with their
// replacements (the consolidated DocumentState), so it is not one coherent
// revision of the class.
class LSPServer {
public:
// Binds the server to its input/output streams; run() drives the message loop.
LSPServer(std::istream &in, std::ostream &out);
void run();

private:
// All cached state for one open document (keyed by URI in `documents_`).
struct DocumentState {
std::string text;
std::string languageId;
std::vector<TokenData> tokens;
bool tokensCached{false};
std::unordered_map<int, std::vector<Diagnostic>> diagnosticsByLine;
std::vector<MoZuku::comments::CommentSegment> commentSegments;
std::vector<ByteRange> contentHighlightRanges;
};

std::istream &in_;
std::ostream &out_;

// In-memory text store: uri -> full text
std::unordered_map<std::string, std::string> docs_;
// Document language IDs: uri -> languageId
std::unordered_map<std::string, std::string> docLanguages_;
// Token info for hover: uri -> token data
std::unordered_map<std::string, std::vector<TokenData>> docTokens_;
// Line-based diagnostics cache: uri -> line number -> diagnostics
std::unordered_map<std::string,
std::unordered_map<int, std::vector<Diagnostic>>>
docDiagnostics_;
// Segments used for comment analysis
std::unordered_map<std::string, std::vector<MoZuku::comments::CommentSegment>>
docCommentSegments_;
// Byte ranges for HTML/LaTeX body highlighting
std::unordered_map<std::string, std::vector<ByteRange>>
docContentHighlightRanges_;
// Per-document state: uri -> text / analysis results / auxiliary metadata
std::unordered_map<std::string, DocumentState> documents_;
std::vector<std::string> tokenTypes_;
std::vector<std::string> tokenModifiers_;

MoZukuConfig config_;

std::unique_ptr<MoZuku::Analyzer> analyzer_;
MoZuku::analysis::DocumentPreprocessor preprocessor_;
MoZuku::lsp::Presenter presenter_;

// JSON-RPC framing: read one message payload / write one reply.
bool readMessage(std::string &jsonPayload);
void reply(const json &msg);
Expand All @@ -103,26 +64,21 @@ class LSPServer
json onSemanticTokensRange(const json &id, const json &params);
json onHover(const json &id, const json &params);

// NOTE(review): the two-argument analyzeAndPublish below is the pre-refactor
// signature; the one-argument form further down is its replacement.
void analyzeAndPublish(const std::string &uri, const std::string &text);
DocumentState &ensureDocument(const std::string &uri);
DocumentState *findDocument(const std::string &uri);
const DocumentState *findDocument(const std::string &uri) const;
static bool isJapaneseLanguage(const DocumentState &document);

void analyzeAndPublish(const std::string &uri);
void analyzeChangedLines(const std::string &uri, const std::string &newText,
const std::string &oldText);
std::string prepareAnalysisText(const std::string &uri,
const std::string &text);
void sendCommentHighlights(
const std::string &uri, const std::string &text,
const std::vector<MoZuku::comments::CommentSegment> &segments);
void sendSemanticHighlights(const std::string &uri,
const std::vector<TokenData> &tokens);
void sendContentHighlights(const std::string &uri, const std::string &text,
const std::vector<ByteRange> &ranges);
MoZuku::analysis::ProcessedDocument prepareDocument(DocumentState &document);
json buildSemanticTokens(const std::string &uri);
json buildSemanticTokensFromTokens(const std::vector<TokenData> &tokens);

// NOTE(review): in each of the next two pairs, the string-keyed head line is
// the removed signature and the DocumentState-keyed one is its replacement.
void cacheDiagnostics(const std::string &uri,
void cacheDiagnostics(DocumentState &document,
const std::vector<Diagnostic> &diags);
void removeDiagnosticsForLines(const std::string &uri,
void removeDiagnosticsForLines(DocumentState &document,
const std::set<int> &lines);
std::vector<Diagnostic> getAllDiagnostics(const std::string &uri) const;
std::set<int> findChangedLines(const std::string &oldText,
const std::string &newText) const;
};
23 changes: 23 additions & 0 deletions mozuku-lsp/include/mozuku/analysis/document_preprocessor.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#pragma once

#include "comment_extractor.hpp"
#include "mozuku/core/types.hpp"

#include <string>
#include <vector>

namespace MoZuku::analysis {

// Output of document preprocessing, consumed by the analyzer and the LSP
// server's highlight publishing.
struct ProcessedDocument {
std::string analysisText; // text to feed to linguistic analysis — presumably with markup/code stripped; confirm in prepare()
std::vector<comments::CommentSegment> commentSegments; // comment spans extracted from the document
std::vector<ByteRange> contentHighlightRanges; // byte ranges to highlight as body content (HTML/LaTeX)
};

// Stateless preprocessor: derives a ProcessedDocument from a language ID
// and the raw document text.
class DocumentPreprocessor {
public:
ProcessedDocument prepare(const std::string &languageId,
const std::string &text) const;
};

} // namespace MoZuku::analysis
52 changes: 52 additions & 0 deletions mozuku-lsp/include/mozuku/core/config.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#pragma once

#include <string>

namespace MoZuku::core {

// MeCab (morphological analyzer) settings.
struct MeCabConfig {
std::string dicPath; // dictionary directory path
std::string charset = "UTF-8"; // character encoding
};

// Analysis and diagnostics settings for the Japanese grammar checker.
struct AnalysisConfig {
bool enableCaboCha = true; // enable CaboCha dependency parsing
bool grammarCheck = true; // enable grammar diagnostics
double minJapaneseRatio = 0.1; // minimum Japanese character ratio for analysis

// Per-rule on/off switches and their numeric thresholds.
struct RuleToggles {
bool commaLimit = true;
bool adversativeGa = true;
bool duplicateParticleSurface = true;
bool adjacentParticles = true;
bool conjunctionRepeat = true;
bool raDropping = true;
int commaLimitMax = 3;
int adversativeGaMax = 1;
int duplicateParticleSurfaceMaxRepeat = 1;
int adjacentParticlesMaxRepeat = 1;
int conjunctionRepeatMax = 1;
} rules;

// Enhanced grammar warning toggles.
struct WarningLevels {
bool particleDuplicate = true; // double-particle warning
bool particleSequence = true; // improper particle sequences
bool particleMismatch = true; // verb-particle mismatch
bool sentenceStructure = false; // sentence-structure issues (experimental)
bool styleConsistency = false; // mixed writing styles (experimental)
bool redundancy = false; // redundant expressions (experimental)
} warnings;

int warningMinSeverity = 2; // minimum warning level (1=Error, 2=Warning, 3=Info, 4=Hint)
};

// Top-level configuration: MeCab settings plus analysis/diagnostic options.
struct MoZukuConfig {
MeCabConfig mecab;
AnalysisConfig analysis;
};

} // namespace MoZuku::core

// Backward-compatibility aliases: existing code refers to these names at
// global scope (they previously lived in analyzer.hpp).
using MeCabConfig = MoZuku::core::MeCabConfig;
using AnalysisConfig = MoZuku::core::AnalysisConfig;
using MoZukuConfig = MoZuku::core::MoZukuConfig;
Loading
Loading