Skip to content
Snippets Groups Projects

community/lucene++: upgrade to 3.0.9

Merged Celeste requested to merge Celeste/aports:upgrade-lucene++ into master
Files
5
+ 0
72
From fd9eaf10c49239d700af848062acc1d5efd54aa8 Mon Sep 17 00:00:00 2001
From: liuzhangjian <liuzhangjian@uniontech.com>
Date: Fri, 4 Dec 2020 15:41:31 +0800
Subject: [PATCH] Title:fix a bug of ChineseTokenizer
Description:When I use ChineseAnalyzer for Chinese word segmentation, I find that English and numbers are treated as one word and I think they should be separated.
RootCause:Null
Solution:
---
.../common/analysis/cn/ChineseFilter.cpp | 2 +-
.../common/analysis/cn/ChineseTokenizer.cpp | 22 ++++++++++++++++++-
2 files changed, 22 insertions(+), 2 deletions(-)
diff --git a/src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp b/src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp
index d2a19f3f..83134454 100644
--- a/src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp
+++ b/src/contrib/analyzers/common/analysis/cn/ChineseFilter.cpp
@@ -38,7 +38,7 @@ bool ChineseFilter::incrementToken() {
if (text.length() > 1) {
return true;
}
- } else if (UnicodeUtil::isOther(text[0])) {
+ } else if (UnicodeUtil::isOther(text[0]) || UnicodeUtil::isDigit(text[0])) {
// One Chinese character as one Chinese word.
// Chinese word extraction to be added later here.
return true;
diff --git a/src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp b/src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp
index 38bf9875..3b4de742 100644
--- a/src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp
+++ b/src/contrib/analyzers/common/analysis/cn/ChineseTokenizer.cpp
@@ -65,6 +65,7 @@ bool ChineseTokenizer::incrementToken() {
length = 0;
start = offset;
+ bool last_is_en = false, last_is_num = false;
while (true) {
wchar_t c;
@@ -82,11 +83,30 @@ bool ChineseTokenizer::incrementToken() {
c = ioBuffer[bufferIndex++];
}
- if (UnicodeUtil::isDigit(c) || UnicodeUtil::isLower(c) || UnicodeUtil::isUpper(c)) {
+ if (UnicodeUtil::isLower(c) || UnicodeUtil::isUpper(c)) {
+ if (last_is_num) {
+ --bufferIndex;
+ --offset;
+ return flush();
+ }
+
+ push(c);
+ if (length == MAX_WORD_LEN) {
+ return flush();
+ }
+ last_is_en = true;
+ } else if (UnicodeUtil::isDigit(c)) {
+ if (last_is_en) {
+ --bufferIndex;
+ --offset;
+ return flush();
+ }
+
push(c);
if (length == MAX_WORD_LEN) {
return flush();
}
+ last_is_num = true;
} else if (UnicodeUtil::isOther(c)) {
if (length > 0) {
--bufferIndex;
Loading