Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions mozuku-lsp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,13 @@ function(add_tree_sitter_language target_name source_root)
endfunction()

function(add_tree_sitter_language_with_fallback target_name package_name source_root)
# 可能ならthird-partyのソースを静的リンクする
if(EXISTS "${source_root}/src/parser.c")
message(STATUS "third-partyから${package_name}をビルドします")
add_tree_sitter_language(${target_name} "${source_root}")
return()
endif()

file(GLOB NIX_PARSER_PATHS "/nix/store/*-${package_name}-grammar-*/parser")
if(NIX_PARSER_PATHS)
list(GET NIX_PARSER_PATHS 0 NIX_PARSER_PATH)
Expand All @@ -90,13 +97,8 @@ function(add_tree_sitter_language_with_fallback target_name package_name source_
add_library(${target_name} INTERFACE IMPORTED GLOBAL)
target_link_libraries(${target_name} INTERFACE ${${target_name}_SYSTEM_LIB})
else()
if(EXISTS "${source_root}/src/parser.c")
message(STATUS "third-partyから${package_name}をビルドします")
add_tree_sitter_language(${target_name} "${source_root}")
else()
message(WARNING "${package_name}が見つかりません。システムにもthird-partyにもライブラリが存在しません。")
add_library(${target_name} INTERFACE IMPORTED GLOBAL)
endif()
message(WARNING "${package_name}が見つかりません。システムにもthird-partyにもライブラリが存在しません。")
add_library(${target_name} INTERFACE IMPORTED GLOBAL)
endif()
endfunction()

Expand Down
6 changes: 6 additions & 0 deletions mozuku-lsp/include/text_processor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,19 @@ class TextProcessor {
static std::vector<SentenceBoundary>
splitIntoSentences(const std::string &text);

/// Returns the share of Japanese code points among the non-whitespace code
/// points of `text`, as a value in [0.0, 1.0]. Returns 0.0 when `text` is
/// empty or contains no countable code point.
static double calculateJapaneseRatio(const std::string &text);

static bool isJapanesePunctuation(const std::string &text, size_t pos);

static size_t skipWhitespace(const std::string &text, size_t pos);

private:
static bool isValidUtf8Sequence(const std::string &input, size_t pos,
size_t seqLen);
/// Decodes the UTF-8 sequence of `seqLen` bytes starting at `text[pos]` into
/// a code point. Performs no validation or bounds checking itself.
static uint32_t decodeCodepoint(const std::string &text, size_t pos,
size_t seqLen);
/// True for tab, line feed, carriage return, space and U+3000 (ideographic
/// space).
static bool isWhitespaceCodepoint(uint32_t codepoint);
/// True when `codepoint` lies in one of the kana, CJK ideograph, CJK
/// punctuation or half-width katakana ranges treated as Japanese text.
static bool isJapaneseCodepoint(uint32_t codepoint);
};

} // namespace text
Expand Down
28 changes: 25 additions & 3 deletions mozuku-lsp/src/analyzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,7 @@ static bool isDebugEnabled() {
return debug;
}

Analyzer::Analyzer()
: mecab_manager_(std::make_unique<mecab::MeCabManager>(true)) {

Analyzer::Analyzer() {
if (isDebugEnabled()) {
std::cerr << "[DEBUG] Analyzer created" << std::endl;
}
Expand All @@ -35,6 +33,8 @@ Analyzer::~Analyzer() = default;

bool Analyzer::initialize(const MoZukuConfig &config) {
config_ = config;
mecab_manager_ =
std::make_unique<mecab::MeCabManager>(config.analysis.enableCaboCha);

if (isDebugEnabled()) {
std::cerr << "[DEBUG] Initializing analyzer with config" << std::endl;
Expand Down Expand Up @@ -73,6 +73,16 @@ std::vector<TokenData> Analyzer::analyzeText(const std::string &text) {
}

std::string cleanText = text::TextProcessor::sanitizeUTF8(text);
double japaneseRatio = text::TextProcessor::calculateJapaneseRatio(cleanText);
if (config_.analysis.minJapaneseRatio > 0.0 &&
japaneseRatio < config_.analysis.minJapaneseRatio) {
if (isDebugEnabled()) {
std::cerr << "[DEBUG] Skipping analysis due to low Japanese ratio: "
<< japaneseRatio << " < " << config_.analysis.minJapaneseRatio
<< std::endl;
}
return tokens;
}

std::string systemText = encoding::utf8ToSystem(cleanText, system_charset_);

Expand Down Expand Up @@ -153,6 +163,18 @@ std::vector<Diagnostic> Analyzer::checkGrammar(const std::string &text) {
return diagnostics;
}

std::string cleanText = text::TextProcessor::sanitizeUTF8(text);
double japaneseRatio = text::TextProcessor::calculateJapaneseRatio(cleanText);
if (config_.analysis.minJapaneseRatio > 0.0 &&
japaneseRatio < config_.analysis.minJapaneseRatio) {
if (isDebugEnabled()) {
std::cerr << "[DEBUG] Skipping grammar check due to low Japanese ratio: "
<< japaneseRatio << " < " << config_.analysis.minJapaneseRatio
<< std::endl;
}
return diagnostics;
}

if (isDebugEnabled()) {
std::cerr << "[DEBUG] Starting grammar check" << std::endl;
}
Expand Down
106 changes: 46 additions & 60 deletions mozuku-lsp/src/lsp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,25 @@ struct LocalByteRange {
size_t endByte{0};
};

bool readBoolOption(const json &obj, const char *key, bool &out) {
if (!obj.contains(key)) {
return false;
}

const auto &value = obj[key];
if (value.is_boolean()) {
out = value.get<bool>();
return true;
}

if (value.is_number_integer()) {
out = value.get<int>() != 0;
return true;
}

return false;
}

bool isEscaped(const std::string &text, size_t pos) {
size_t count = 0;
while (pos > count && text[pos - count - 1] == '\\') {
Expand Down Expand Up @@ -411,6 +430,9 @@ json LSPServer::onInitialize(const json &id, const json &params) {
// initializationOptionsから設定を抽出
if (params.contains("initializationOptions")) {
auto opts = params["initializationOptions"];
if (opts.contains("mozuku") && opts["mozuku"].is_object()) {
opts = opts["mozuku"];
}

// MeCab設定
if (opts.contains("mecab")) {
Expand All @@ -426,14 +448,8 @@ json LSPServer::onInitialize(const json &id, const json &params) {
// 解析設定
if (opts.contains("analysis")) {
auto analysis = opts["analysis"];
if (analysis.contains("enableCaboCha") &&
analysis["enableCaboCha"].is_boolean()) {
config_.analysis.enableCaboCha = analysis["enableCaboCha"];
}
if (analysis.contains("grammarCheck") &&
analysis["grammarCheck"].is_boolean()) {
config_.analysis.grammarCheck = analysis["grammarCheck"];
}
readBoolOption(analysis, "enableCaboCha", config_.analysis.enableCaboCha);
readBoolOption(analysis, "grammarCheck", config_.analysis.grammarCheck);
if (analysis.contains("minJapaneseRatio") &&
analysis["minJapaneseRatio"].is_number()) {
config_.analysis.minJapaneseRatio = analysis["minJapaneseRatio"];
Expand All @@ -446,63 +462,33 @@ json LSPServer::onInitialize(const json &id, const json &params) {
// 警告レベル設定
if (analysis.contains("warnings") && analysis["warnings"].is_object()) {
auto warnings = analysis["warnings"];
if (warnings.contains("particleDuplicate") &&
warnings["particleDuplicate"].is_boolean()) {
config_.analysis.warnings.particleDuplicate =
warnings["particleDuplicate"];
}
if (warnings.contains("particleSequence") &&
warnings["particleSequence"].is_boolean()) {
config_.analysis.warnings.particleSequence =
warnings["particleSequence"];
}
if (warnings.contains("particleMismatch") &&
warnings["particleMismatch"].is_boolean()) {
config_.analysis.warnings.particleMismatch =
warnings["particleMismatch"];
}
if (warnings.contains("sentenceStructure") &&
warnings["sentenceStructure"].is_boolean()) {
config_.analysis.warnings.sentenceStructure =
warnings["sentenceStructure"];
}
if (warnings.contains("styleConsistency") &&
warnings["styleConsistency"].is_boolean()) {
config_.analysis.warnings.styleConsistency =
warnings["styleConsistency"];
}
if (warnings.contains("redundancy") &&
warnings["redundancy"].is_boolean()) {
config_.analysis.warnings.redundancy = warnings["redundancy"];
}
readBoolOption(warnings, "particleDuplicate",
config_.analysis.warnings.particleDuplicate);
readBoolOption(warnings, "particleSequence",
config_.analysis.warnings.particleSequence);
readBoolOption(warnings, "particleMismatch",
config_.analysis.warnings.particleMismatch);
readBoolOption(warnings, "sentenceStructure",
config_.analysis.warnings.sentenceStructure);
readBoolOption(warnings, "styleConsistency",
config_.analysis.warnings.styleConsistency);
readBoolOption(warnings, "redundancy",
config_.analysis.warnings.redundancy);
}

// ルールの有効/無効設定
if (analysis.contains("rules") && analysis["rules"].is_object()) {
auto rules = analysis["rules"];
if (rules.contains("commaLimit") && rules["commaLimit"].is_boolean()) {
config_.analysis.rules.commaLimit = rules["commaLimit"];
}
if (rules.contains("adversativeGa") &&
rules["adversativeGa"].is_boolean()) {
config_.analysis.rules.adversativeGa = rules["adversativeGa"];
}
if (rules.contains("duplicateParticleSurface") &&
rules["duplicateParticleSurface"].is_boolean()) {
config_.analysis.rules.duplicateParticleSurface =
rules["duplicateParticleSurface"];
}
if (rules.contains("adjacentParticles") &&
rules["adjacentParticles"].is_boolean()) {
config_.analysis.rules.adjacentParticles = rules["adjacentParticles"];
}
if (rules.contains("conjunctionRepeat") &&
rules["conjunctionRepeat"].is_boolean()) {
config_.analysis.rules.conjunctionRepeat = rules["conjunctionRepeat"];
}
if (rules.contains("raDropping") && rules["raDropping"].is_boolean()) {
config_.analysis.rules.raDropping = rules["raDropping"];
}
readBoolOption(rules, "commaLimit", config_.analysis.rules.commaLimit);
readBoolOption(rules, "adversativeGa",
config_.analysis.rules.adversativeGa);
readBoolOption(rules, "duplicateParticleSurface",
config_.analysis.rules.duplicateParticleSurface);
readBoolOption(rules, "adjacentParticles",
config_.analysis.rules.adjacentParticles);
readBoolOption(rules, "conjunctionRepeat",
config_.analysis.rules.conjunctionRepeat);
readBoolOption(rules, "raDropping", config_.analysis.rules.raDropping);
if (rules.contains("commaLimitMax") &&
rules["commaLimitMax"].is_number_integer()) {
config_.analysis.rules.commaLimitMax = rules["commaLimitMax"];
Expand Down
88 changes: 88 additions & 0 deletions mozuku-lsp/src/text_processor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,52 @@ TextProcessor::splitIntoSentences(const std::string &text) {
return sentences;
}

/// Computes how much of `text` is Japanese.
///
/// The text is walked one UTF-8 sequence at a time. Each decoded code point
/// that is not whitespace counts towards the total, and those that are also
/// Japanese count towards the numerator. A byte that cannot start a sequence,
/// or whose sequence fails validation, is stepped over one byte at a time
/// without being counted.
///
/// @param text UTF-8 encoded input.
/// @return Japanese code points divided by non-whitespace code points, or
///         0.0 when there is nothing to count.
double TextProcessor::calculateJapaneseRatio(const std::string &text) {
  size_t totalVisible = 0;
  size_t totalJapanese = 0;

  const size_t length = text.size();
  size_t offset = 0;
  while (offset < length) {
    const unsigned char lead = static_cast<unsigned char>(text[offset]);

    // Sequence length implied by the lead byte; 0 marks a byte that cannot
    // start a sequence (continuation byte or invalid lead).
    size_t width = 0;
    if (lead < 0x80) {
      width = 1;
    } else if ((lead & 0xE0) == 0xC0) {
      width = 2;
    } else if ((lead & 0xF0) == 0xE0) {
      width = 3;
    } else if ((lead & 0xF8) == 0xF0) {
      width = 4;
    }

    if (width == 0 || !isValidUtf8Sequence(text, offset, width)) {
      ++offset; // resynchronise on the next byte
      continue;
    }

    const uint32_t decoded = decodeCodepoint(text, offset, width);
    if (!isWhitespaceCodepoint(decoded)) {
      ++totalVisible;
      if (isJapaneseCodepoint(decoded)) {
        ++totalJapanese;
      }
    }

    offset += width;
  }

  if (totalVisible == 0) {
    return 0.0;
  }

  return static_cast<double>(totalJapanese) /
         static_cast<double>(totalVisible);
}

bool TextProcessor::isJapanesePunctuation(const std::string &text, size_t pos) {
if (pos + 2 >= text.size())
return false;
Expand Down Expand Up @@ -237,5 +283,47 @@ bool TextProcessor::isValidUtf8Sequence(const std::string &input, size_t pos,
return true;
}

/// Assembles the code point encoded by the UTF-8 sequence at `text[pos]`.
///
/// No validation or bounds checking happens here. Any `seqLen` other than
/// 1, 2 or 3 is decoded as a four-byte sequence.
///
/// @param text   UTF-8 encoded input.
/// @param pos    Index of the sequence's lead byte.
/// @param seqLen Number of bytes in the sequence.
/// @return The decoded code point.
uint32_t TextProcessor::decodeCodepoint(const std::string &text, size_t pos,
                                        size_t seqLen) {
  // Payload bits of the byte `index` positions into the sequence.
  const auto payload = [&text, pos](size_t index, unsigned char mask) {
    const unsigned char byte = static_cast<unsigned char>(text[pos + index]);
    return static_cast<uint32_t>(byte & mask);
  };

  switch (seqLen) {
  case 1:
    return payload(0, 0xFF);
  case 2:
    return (payload(0, 0x1F) << 6) | payload(1, 0x3F);
  case 3:
    return (payload(0, 0x0F) << 12) | (payload(1, 0x3F) << 6) |
           payload(2, 0x3F);
  default:
    return (payload(0, 0x07) << 18) | (payload(1, 0x3F) << 12) |
           (payload(2, 0x3F) << 6) | payload(3, 0x3F);
  }
}

/// Reports whether `codepoint` is one of the whitespace characters that are
/// left out of the Japanese-ratio count.
bool TextProcessor::isWhitespaceCodepoint(uint32_t codepoint) {
  switch (codepoint) {
  case 0x09:   // horizontal tab
  case 0x0A:   // line feed
  case 0x0D:   // carriage return
  case 0x20:   // space
  case 0x3000: // ideographic (full-width) space
    return true;
  default:
    return false;
  }
}

/// Reports whether `codepoint` falls inside one of the inclusive ranges that
/// are treated as Japanese text.
bool TextProcessor::isJapaneseCodepoint(uint32_t codepoint) {
  struct CodepointRange {
    uint32_t first;
    uint32_t last;
  };

  static constexpr CodepointRange kJapaneseRanges[] = {
      {0x3040, 0x309F}, // Hiragana
      {0x30A0, 0x30FF}, // Katakana
      {0x31F0, 0x31FF}, // Katakana Phonetic Extensions
      {0x3400, 0x4DBF}, // CJK Unified Ideographs Extension A
      {0x4E00, 0x9FFF}, // CJK Unified Ideographs
      {0x3000, 0x303F}, // CJK Symbols and Punctuation
      {0xFF66, 0xFF9F}, // Half-width katakana
  };

  for (const CodepointRange &range : kJapaneseRanges) {
    if (codepoint >= range.first && codepoint <= range.last) {
      return true;
    }
  }
  return false;
}

} // namespace text
} // namespace MoZuku
3 changes: 3 additions & 0 deletions vim-mozuku/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# vim-mozuku

MoZuku LSP を Vim/Neovim で使うための軽量プラグインです。`vim-mozuku/` を runtimepath に追加することで有効化できます。
Loading