Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions mozuku-lsp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,9 @@ set(MOZUKU_SOURCES
src/grammar_checker.cpp
src/wikipedia.cpp
src/comment_extractor.cpp
src/document_preprocessor.cpp
src/presenter.cpp
src/tree_sitter_document.cpp
)

add_executable(mozuku-lsp ${MOZUKU_SOURCES})
Expand Down
112 changes: 13 additions & 99 deletions mozuku-lsp/include/analyzer.hpp
Original file line number Diff line number Diff line change
@@ -1,115 +1,19 @@
#pragma once

#include "mozuku/core/config.hpp"
#include "mozuku/core/types.hpp"

#include <memory>
#include <string>
#include <vector>

struct TokenData;
struct Diagnostic;

// Detailed part-of-speech (POS) record for a single morpheme. The fields
// appear to mirror the MeCab feature columns (see TokenData::feature) —
// confirm against the MeCab output format in use.
struct DetailedPOS {
std::string mainPOS; // main POS (noun, verb, particle, ...)
std::string subPOS1; // POS sub-classification 1 (case/adverbial/binding particle, ...)
std::string subPOS2; // POS sub-classification 2
std::string subPOS3; // POS sub-classification 3
std::string inflection; // inflection type
std::string conjugation; // conjugation form
std::string baseForm; // base (dictionary) form
std::string reading; // reading
std::string pronunciation; // pronunciation

// Convenience checks against the Japanese POS tag strings MeCab emits.
bool isParticle() const { return mainPOS == "助詞"; } // 助詞 = particle
bool isVerb() const { return mainPOS == "動詞"; } // 動詞 = verb
bool isNoun() const { return mainPOS == "名詞"; } // 名詞 = noun
};

// Information about a particle (助詞) token
struct ParticleInfo {
std::string surface; // surface form
std::string function; // particle class: case/adverbial/binding/conjunctive
std::string role; // more detailed role
size_t position; // position within the sentence (in bytes)
int tokenIndex; // index into the token array
int sentenceId; // ID of the sentence this particle belongs to
};

// Sentence boundary information
struct SentenceBoundary {
size_t start; // sentence start offset (in bytes)
size_t end; // sentence end offset (in bytes)
int sentenceId; // sentence ID
std::string text; // sentence text
};

// Dependency parsing information from CaboCha
struct DependencyInfo {
int chunkId; // chunk ID
int headId; // ID of the chunk this one depends on (head)
double score; // dependency score
std::string text; // chunk text
};

// Configuration structures (shared between LSP server and analyzer)
// MeCab (morphological analyzer) settings.
struct MeCabConfig {
std::string dicPath; // Dictionary directory path
std::string charset = "UTF-8"; // Character encoding
};

// Analysis and diagnostics settings for the Japanese grammar checker.
struct AnalysisConfig {
bool enableCaboCha = true; // Enable CaboCha dependency parsing
bool grammarCheck = true; // Enable grammar diagnostics
double minJapaneseRatio =
0.1; // Minimum Japanese character ratio for analysis

// Per-rule on/off switches and their numeric thresholds.
struct RuleToggles {
bool commaLimit = true;
bool adversativeGa = true;
bool duplicateParticleSurface = true;
bool adjacentParticles = true;
bool conjunctionRepeat = true;
bool raDropping = true;
int commaLimitMax = 3;
int adversativeGaMax = 1;
int duplicateParticleSurfaceMaxRepeat = 1;
int adjacentParticlesMaxRepeat = 1;
int conjunctionRepeatMax = 1;
} rules;

// Enhanced grammar warning settings
struct WarningLevels {
bool particleDuplicate = true; // double-particle warning
bool particleSequence = true; // improper particle sequences
bool particleMismatch = true; // verb-particle mismatch
bool sentenceStructure = false; // sentence-structure issues (experimental)
bool styleConsistency = false; // mixed writing styles (experimental)
bool redundancy = false; // redundant expressions (experimental)
} warnings;

int warningMinSeverity =
2; // minimum warning level (1=Error, 2=Warning, 3=Info, 4=Hint)
};

// Top-level configuration: MeCab settings plus analysis/diagnostic options.
struct MoZukuConfig {
MeCabConfig mecab;
AnalysisConfig analysis;
};

// Analyzes `text`, appending semantic tokens to `tokens` and any diagnostics
// to `diags`. A null `config` presumably selects built-in defaults — confirm
// in the implementation.
void analyzeText(const std::string &text, std::vector<TokenData> &tokens,
std::vector<Diagnostic> &diags,
const MoZukuConfig *config = nullptr);

// Runs grammar checks over `text`, appending the resulting diagnostics.
void performGrammarDiagnostics(const std::string &text,
std::vector<Diagnostic> &diags);

// Maps an LSP-style (line, character) position to a byte offset into `text`.
size_t computeByteOffset(const std::string &text, int line, int character);

// Semantic-token modifier bit flags; each trailing string is the modifier
// name as published to the LSP client.
//
// Uses C++17 `inline constexpr` (the file already relies on C++17, e.g.
// nested-namespace definitions): `static constexpr` at namespace scope in a
// header gives every translation unit its own internal-linkage copy, while
// `inline` guarantees a single entity across the program.
namespace MoZukuModifiers {
inline constexpr unsigned Proper = 1u << 0;  // "proper"
inline constexpr unsigned Numeric = 1u << 1; // "numeric"
inline constexpr unsigned Kana = 1u << 2;    // "kana"
inline constexpr unsigned Kanji = 1u << 3;   // "kanji"
} // namespace MoZukuModifiers

namespace MoZuku {

namespace mecab {
Expand All @@ -132,6 +36,16 @@ class Analyzer {
bool isCaboChaAvailable() const;

private:
struct PreparedText {
std::string cleanText;
double japaneseRatio{0.0};
bool belowMinJapaneseRatio{false};
};

PreparedText prepareText(const std::string &text,
bool enforceMinJapaneseRatio) const;
std::vector<TokenData> analyzePreparedText(const PreparedText &prepared);

std::unique_ptr<mecab::MeCabManager> mecab_manager_;
MoZukuConfig config_;
std::string system_charset_;
Expand Down
11 changes: 10 additions & 1 deletion mozuku-lsp/include/encoding_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,24 @@
namespace MoZuku {
namespace encoding {

// Options controlling charset conversion in convertEncoding().
struct ConversionOptions {
bool skipInvalidInput{false}; // presumably: skip (rather than fail on) invalid byte sequences — confirm in the implementation
};

std::string convertEncoding(const std::string &input,
const std::string &fromCharset,
const std::string &toCharset = "UTF-8");
const std::string &toCharset = "UTF-8",
ConversionOptions options = {});

// Converts `input` from the given system charset to UTF-8.
std::string systemToUtf8(const std::string &input,
const std::string &systemCharset);

// Converts UTF-8 `input` to the given system charset.
std::string utf8ToSystem(const std::string &input,
const std::string &systemCharset);

// Returns `input` with invalid UTF-8 sequences cleaned up (exact policy —
// drop vs. replace — lives in the implementation; confirm there).
std::string sanitizeUtf8(const std::string &input);

// Returns the byte length of the UTF-8 sequence whose lead byte is `c`.
size_t utf8SequenceLength(unsigned char c);

} // namespace encoding
} // namespace MoZuku
4 changes: 2 additions & 2 deletions mozuku-lsp/include/grammar_checker.hpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#pragma once

#include "analyzer.hpp"
#include "lsp.hpp"
#include "mozuku/core/config.hpp"
#include "mozuku/core/types.hpp"
#include <string>
#include <vector>

Expand Down
98 changes: 27 additions & 71 deletions mozuku-lsp/include/lsp.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
#pragma once

#include "analyzer.hpp"
#include "mozuku/analysis/document_preprocessor.hpp"
#include "mozuku/core/config.hpp"
#include "mozuku/core/types.hpp"
#include "mozuku/lsp/presenter.hpp"
#include <cstddef>
#include <istream>
#include <memory>
Expand All @@ -15,78 +19,35 @@

using json = nlohmann::json;

// A zero-based line/character position within a document (LSP convention).
struct Position {
int line{0};
int character{0};
};

// A half-open [start, end) span between two document positions.
struct Range {
Position start;
Position end;
};

// One diagnostic message to publish to the client.
struct Diagnostic {
Range range;
int severity{2}; // LSP DiagnosticSeverity: 1=Error, 2=Warning, 3=Info, 4=Hint
std::string message;
};

// One semantic token plus the morphological data behind it.
struct TokenData {
int line{0};
int startChar{0};
int endChar{0};
std::string tokenType; // e.g. "noun", "verb" ...
unsigned int tokenModifiers{0};

std::string surface; // surface form
std::string
feature; // POS,subPOS1,subPOS2,subPOS3,inflection type,conjugation form,base form,reading,pronunciation
std::string baseForm; // base (dictionary) form
std::string reading; // reading
std::string pronunciation; // pronunciation
};

// Bundled output of one analysis pass: tokens plus diagnostics.
struct AnalyzerResult {
std::vector<TokenData> tokens;
std::vector<Diagnostic> diags;
};

// A byte-offset span [startByte, endByte) within a document's text.
struct ByteRange {
size_t startByte{0};
size_t endByte{0};
};

// NOTE(review): this span is taken from a merged pull-request diff with the
// +/- markers stripped — members and method declarations removed by the PR
// (the per-URI maps and string-keyed helpers) are interleaved with their
// replacements (the consolidated DocumentState), so it is not one coherent
// revision of the class.
class LSPServer {
public:
// Binds the server to its input/output streams; run() drives the message loop.
LSPServer(std::istream &in, std::ostream &out);
void run();

private:
// All cached state for one open document (keyed by URI in `documents_`).
struct DocumentState {
std::string text;
std::string languageId;
std::vector<TokenData> tokens;
bool tokensCached{false};
std::unordered_map<int, std::vector<Diagnostic>> diagnosticsByLine;
std::vector<MoZuku::comments::CommentSegment> commentSegments;
std::vector<ByteRange> contentHighlightRanges;
};

std::istream &in_;
std::ostream &out_;

// In-memory text store: uri -> full text
std::unordered_map<std::string, std::string> docs_;
// Document language IDs: uri -> languageId
std::unordered_map<std::string, std::string> docLanguages_;
// Token info for hover: uri -> token data
std::unordered_map<std::string, std::vector<TokenData>> docTokens_;
// Line-based diagnostics cache: uri -> line number -> diagnostics
std::unordered_map<std::string,
std::unordered_map<int, std::vector<Diagnostic>>>
docDiagnostics_;
// Segments used for comment analysis
std::unordered_map<std::string, std::vector<MoZuku::comments::CommentSegment>>
docCommentSegments_;
// Byte ranges for HTML/LaTeX body highlighting
std::unordered_map<std::string, std::vector<ByteRange>>
docContentHighlightRanges_;
// Per-document state: uri -> text / analysis results / auxiliary metadata
std::unordered_map<std::string, DocumentState> documents_;
std::vector<std::string> tokenTypes_;
std::vector<std::string> tokenModifiers_;

MoZukuConfig config_;

std::unique_ptr<MoZuku::Analyzer> analyzer_;
MoZuku::analysis::DocumentPreprocessor preprocessor_;
MoZuku::lsp::Presenter presenter_;

// JSON-RPC framing: read one message payload / write one reply.
bool readMessage(std::string &jsonPayload);
void reply(const json &msg);
Expand All @@ -103,26 +64,21 @@ class LSPServer
json onSemanticTokensRange(const json &id, const json &params);
json onHover(const json &id, const json &params);

// NOTE(review): the two-argument analyzeAndPublish below is the pre-refactor
// signature; the one-argument form further down is its replacement.
void analyzeAndPublish(const std::string &uri, const std::string &text);
DocumentState &ensureDocument(const std::string &uri);
DocumentState *findDocument(const std::string &uri);
const DocumentState *findDocument(const std::string &uri) const;
static bool isJapaneseLanguage(const DocumentState &document);

void analyzeAndPublish(const std::string &uri);
void analyzeChangedLines(const std::string &uri, const std::string &newText,
const std::string &oldText);
std::string prepareAnalysisText(const std::string &uri,
const std::string &text);
void sendCommentHighlights(
const std::string &uri, const std::string &text,
const std::vector<MoZuku::comments::CommentSegment> &segments);
void sendSemanticHighlights(const std::string &uri,
const std::vector<TokenData> &tokens);
void sendContentHighlights(const std::string &uri, const std::string &text,
const std::vector<ByteRange> &ranges);
MoZuku::analysis::ProcessedDocument prepareDocument(DocumentState &document);
json buildSemanticTokens(const std::string &uri);
json buildSemanticTokensFromTokens(const std::vector<TokenData> &tokens);

// NOTE(review): in each of the next two pairs, the string-keyed head line is
// the removed signature and the DocumentState-keyed one is its replacement.
void cacheDiagnostics(const std::string &uri,
void cacheDiagnostics(DocumentState &document,
const std::vector<Diagnostic> &diags);
void removeDiagnosticsForLines(const std::string &uri,
void removeDiagnosticsForLines(DocumentState &document,
const std::set<int> &lines);
std::vector<Diagnostic> getAllDiagnostics(const std::string &uri) const;
std::set<int> findChangedLines(const std::string &oldText,
const std::string &newText) const;
};
23 changes: 23 additions & 0 deletions mozuku-lsp/include/mozuku/analysis/document_preprocessor.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#pragma once

#include "comment_extractor.hpp"
#include "mozuku/core/types.hpp"

#include <string>
#include <vector>

namespace MoZuku::analysis {

// Output of document preprocessing, consumed by the analyzer and the LSP
// server's highlight publishing.
struct ProcessedDocument {
std::string analysisText; // text to feed to linguistic analysis — presumably with markup/code stripped; confirm in prepare()
std::vector<comments::CommentSegment> commentSegments; // comment spans extracted from the document
std::vector<ByteRange> contentHighlightRanges; // byte ranges to highlight as body content (HTML/LaTeX)
};

// Stateless preprocessor: derives a ProcessedDocument from a language ID
// and the raw document text.
class DocumentPreprocessor {
public:
ProcessedDocument prepare(const std::string &languageId,
const std::string &text) const;
};

} // namespace MoZuku::analysis
52 changes: 52 additions & 0 deletions mozuku-lsp/include/mozuku/core/config.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#pragma once

#include <string>

namespace MoZuku::core {

// MeCab (morphological analyzer) settings.
struct MeCabConfig {
std::string dicPath; // dictionary directory path
std::string charset = "UTF-8"; // character encoding
};

// Analysis and diagnostics settings for the Japanese grammar checker.
struct AnalysisConfig {
bool enableCaboCha = true; // enable CaboCha dependency parsing
bool grammarCheck = true; // enable grammar diagnostics
double minJapaneseRatio = 0.1; // minimum Japanese character ratio for analysis

// Per-rule on/off switches and their numeric thresholds.
struct RuleToggles {
bool commaLimit = true;
bool adversativeGa = true;
bool duplicateParticleSurface = true;
bool adjacentParticles = true;
bool conjunctionRepeat = true;
bool raDropping = true;
int commaLimitMax = 3;
int adversativeGaMax = 1;
int duplicateParticleSurfaceMaxRepeat = 1;
int adjacentParticlesMaxRepeat = 1;
int conjunctionRepeatMax = 1;
} rules;

// Enhanced grammar warning toggles.
struct WarningLevels {
bool particleDuplicate = true; // double-particle warning
bool particleSequence = true; // improper particle sequences
bool particleMismatch = true; // verb-particle mismatch
bool sentenceStructure = false; // sentence-structure issues (experimental)
bool styleConsistency = false; // mixed writing styles (experimental)
bool redundancy = false; // redundant expressions (experimental)
} warnings;

int warningMinSeverity = 2; // minimum warning level (1=Error, 2=Warning, 3=Info, 4=Hint)
};

// Top-level configuration: MeCab settings plus analysis/diagnostic options.
struct MoZukuConfig {
MeCabConfig mecab;
AnalysisConfig analysis;
};

} // namespace MoZuku::core

// Backward-compatibility aliases: existing code refers to these names at
// global scope (they previously lived in analyzer.hpp).
using MeCabConfig = MoZuku::core::MeCabConfig;
using AnalysisConfig = MoZuku::core::AnalysisConfig;
using MoZukuConfig = MoZuku::core::MoZukuConfig;
Loading
Loading