-
Notifications
You must be signed in to change notification settings - Fork 29
Open
Description
如果语言模型是基于词的(is_character_based_ == false),那么只有在遇到 space_id 时才会进行语言模型打分。
make_ngram[4] 返回的是截至当前帧、长度固定(最多 max_order_ 个)的窗口内的词语。
如果输出的 token 中一直没有出现 SPACE_ID_,而每次又只取当前帧固定长度窗口内的词语,那么更早的词不就始终没有机会被取到并参与打分了吗?
// language model scoring
// Computes ngram_score for the current candidate symbol. The score stays 0.0
// when no external scorer is configured, or when neither branch below decides
// to score at this step.
float ngram_score = 0.0;
if (ext_scorer != nullptr ) {
// Hotword path: taken only when a hotword scorer is present, its dictionary is
// non-empty, it agrees with ext_scorer on character-vs-word mode (the XOR is
// false exactly when both flags match), and its window length is at least the
// language model's maximum order.
if (hotwords_scorer != nullptr &&
!hotwords_scorer->hotwords_dict.empty() &&
!(hotwords_scorer->is_character_based ^ ext_scorer->is_character_based()) &&
hotwords_scorer->window_length >= ext_scorer->get_max_order()) {
// Score only the trailing get_max_order() entries of `ngram`.
// NOTE(review): `ngram` is populated outside this excerpt; this iterator
// arithmetic assumes it already holds at least get_max_order() entries --
// confirm against the surrounding code.
std::vector<std::string>::const_iterator first = ngram.end() - ext_scorer->get_max_order();
std::vector<std::string>::const_iterator last = ngram.end();
std::vector<std::string> slice_ngram(first, last);
// Weighted LM score: log conditional probability scaled by alpha, plus beta.
ngram_score = ext_scorer->get_log_cond_prob(slice_ngram) * ext_scorer->alpha + ext_scorer->beta;
} else {
// Regular path: a character-based scorer scores on every symbol, while a
// word-based scorer scores only when the current symbol `c` is the space id.
if (c == space_id || ext_scorer->is_character_based()) {
// skip scoring the space
// Character mode scores `prefix_new`; word mode scores `prefix` instead
// (presumably the prefix without the space just emitted -- TODO confirm,
// both are defined outside this excerpt).
if (ext_scorer->is_character_based()) {
prefix_to_score = prefix_new;
} else {
prefix_to_score = prefix;
}
ngram = ext_scorer->make_ngram(prefix_to_score);// build the n-gram for the chosen prefix by calling make_ngram
ngram_score = ext_scorer->get_log_cond_prob(ngram) * ext_scorer->alpha + ext_scorer->beta; // query the language model for the conditional probability, then apply the alpha weight and beta offset
}
}
}
// Assembles the n-gram that ends at `prefix`, ordered earliest entry first.
//
// Starting at `prefix`, up to max_order_ entries are gathered by repeated calls
// to PathTrie::get_path_vec with SPACE_ID_ as its second argument. In character
// mode the call also receives 1 as a third argument and the walk resumes at the
// node it returns; otherwise the walk resumes at that node's parent so the
// space itself is skipped. As soon as a returned node has character == -1, the
// slots still missing are filled with START_TOKEN, so the result always holds
// max_order_ entries.
//
// @param prefix  trie node at which the n-gram ends.
// @return        the entries in forward order (padding, if any, comes first).
std::vector<std::string> Scorer::make_ngram(PathTrie* prefix) {
  std::vector<std::string> collected;  // gathered newest-first, flipped at the end
  PathTrie* cursor = prefix;

  for (int depth = 0; depth < max_order_; ++depth) {
    std::vector<int> labels;
    PathTrie* stop_node = nullptr;

    if (!is_character_based_) {
      stop_node = cursor->get_path_vec(labels, SPACE_ID_);
      cursor = stop_node->parent;  // step over the space
    } else {
      stop_node = cursor->get_path_vec(labels, SPACE_ID_, 1);
      cursor = stop_node;
    }

    // Turn the collected label ids back into a string entry.
    collected.push_back(vec2str(labels));

    if (stop_node->character != -1) {
      continue;
    }

    // History is exhausted before reaching the full order: pad what is left.
    for (int filled = depth + 1; filled < max_order_; ++filled) {
      collected.push_back(START_TOKEN);
    }
    break;
  }

  std::reverse(collected.begin(), collected.end());
  return collected;
}
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels