diff -uNr xapian.vanilla/xapian-core/queryparser/cjk/cjk-tokenizer.cc cjk-support-patch/xapian-core/queryparser/cjk/cjk-tokenizer.cc --- xapian.vanilla/xapian-core/queryparser/cjk/cjk-tokenizer.cc 1969-12-31 16:00:00.000000000 -0800 +++ cjk-support-patch/xapian-core/queryparser/cjk/cjk-tokenizer.cc 2011-07-25 02:40:21.748725825 -0700 @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com) + * Copyright (c) 2011 Richard Boulton (richard@tartarus.org) + * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE.include + */ +#include +#include +#include "cjk-tokenizer.h" + +using namespace std; +using namespace Xapian::CJK; + +// 2E80..2EFF; CJK Radicals Supplement +// 3000..303F; CJK Symbols and Punctuation +// 3040..309F; Hiragana +// 30A0..30FF; Katakana +// 3100..312F; Bopomofo +// 3130..318F; Hangul Compatibility Jamo +// 3190..319F; Kanbun +// 31A0..31BF; Bopomofo Extended +// 31C0..31EF; CJK Strokes +// 31F0..31FF; Katakana Phonetic Extensions +// 3200..32FF; Enclosed CJK Letters and Months +// 3300..33FF; CJK Compatibility +// 3400..4DBF; CJK Unified Ideographs Extension A +// 4DC0..4DFF; Yijing Hexagram Symbols +// 4E00..9FFF; CJK Unified Ideographs +// A700..A71F; Modifier Tone Letters +// AC00..D7AF; Hangul Syllables +// F900..FAFF; CJK Compatibility Ideographs +// FE30..FE4F; CJK Compatibility Forms +// FF00..FFEF; Halfwidth and Fullwidth Forms +// 20000..2A6DF; CJK Unified Ideographs Extension B +// 2F800..2FA1F; CJK Compatibility Ideographs Supplement +bool +Xapian::CJK::codepoint_is_cjk(unsigned p) { + if (p < 0x2E80) return false; + return (((p) >= 0x2E80 && (p) <= 0x2EFF) + || ((p) >= 0x3000 && (p) <= 0x9FFF) + || ((p) >= 0xA700 && (p) <= 0xA71F) + || ((p) >= 0xAC00 && (p) <= 0xD7AF) + || ((p) >= 0xF900 && (p) <= 0xFAFF) + || ((p) >= 0xFE30 && (p) <= 0xFE4F) + || ((p) >= 0xFF00 && (p) <= 0xFFEF) + || ((p) >= 0x20000 && (p) <= 0x2A6DF) + || ((p) >= 0x2F800 && (p) <= 0x2FA1F)); +} + +std::string +Xapian::CJK::get_cjk(Xapian::Utf8Iterator &it) { + string str; + while (codepoint_is_cjk(*it) && it != Xapian::Utf8Iterator()) { + Xapian::Unicode::append_utf8(str, *it); + ++it; + } + return str; +} + +tokenizer::tokenizer() : ngram_size(2), + max_token_count(0) { +} + +void +tokenizer::tokenize(const string &str, vector &token_list) { + Xapian::Utf8Iterator it(str), p, end; + string new_token; + while (it != Xapian::Utf8Iterator()) { + if (max_token_count > 0 + && token_list.size() >= max_token_count) { + break; + } + new_token.resize(0); + p = it; + for (unsigned i = 0; i < ngram_size; ++i) { + if (max_token_count > 0 + && token_list.size() >= max_token_count) { + break; + } + if (p == end) break; + if (codepoint_is_cjk(*p)) { + Xapian::Unicode::append_utf8(new_token, *p); + token_list.push_back(new_token); + } + ++p; + } + ++it; + } +} diff -uNr xapian.vanilla/xapian-core/queryparser/cjk/cjk-tokenizer.h cjk-support-patch/xapian-core/queryparser/cjk/cjk-tokenizer.h --- xapian.vanilla/xapian-core/queryparser/cjk/cjk-tokenizer.h 1969-12-31 16:00:00.000000000 -0800 +++ cjk-support-patch/xapian-core/queryparser/cjk/cjk-tokenizer.h 2011-07-25 02:40:43.796725321 -0700 @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com) + * Copyright (c) 2011 Richard Boulton (richard@tartarus.org) + * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com) + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE.include + */ +#ifndef __CJK_TOKENIZER_H__ +#define __CJK_TOKENIZER_H__ + +#include +#include +#include + +namespace Xapian { +namespace CJK { + bool codepoint_is_cjk(unsigned codepoint); + std::string get_cjk(Xapian::Utf8Iterator &it); + + class tokenizer { + unsigned int ngram_size; + unsigned int max_token_count; + + public: + tokenizer(); + void tokenize(const std::string &str, + std::vector &token_list); + }; +}; /* namespace CJK end */ +}; /* namespace Xapian end */ + +#endif diff -uNr xapian.vanilla/xapian-core/queryparser/Makefile.mk cjk-support-patch/xapian-core/queryparser/Makefile.mk --- xapian.vanilla/xapian-core/queryparser/Makefile.mk 2011-07-18 16:17:17.841682000 -0700 +++ cjk-support-patch/xapian-core/queryparser/Makefile.mk 2011-07-25 02:33:14.232725479 -0700 @@ -7,7 +7,8 @@ noinst_HEADERS +=\ queryparser/queryparser_internal.h\ queryparser/queryparser_token.h\ - queryparser/termgenerator_internal.h + queryparser/termgenerator_internal.h\ + queryparser/cjk/cjk-tokenizer.h lemon_built_sources =\ queryparser/queryparser_internal.cc\ @@ -60,4 +61,5 @@ queryparser/queryparser.cc\ queryparser/queryparser_internal.cc\ queryparser/termgenerator.cc\ - queryparser/termgenerator_internal.cc + queryparser/termgenerator_internal.cc\ + queryparser/cjk/cjk-tokenizer.cc diff -uNr xapian.vanilla/xapian-core/queryparser/queryparser.lemony cjk-support-patch/xapian-core/queryparser/queryparser.lemony --- xapian.vanilla/xapian-core/queryparser/queryparser.lemony 2011-07-18 16:17:17.841682000 -0700 +++ cjk-support-patch/xapian-core/queryparser/queryparser.lemony 2011-07-25 02:37:48.560725686 -0700 @@ -40,6 +40,8 @@ #include +#include "cjk/cjk-tokenizer.h" + using namespace std; using namespace Xapian; @@ -192,6 +194,8 @@ */ Query * as_partial_query(State * state_) const; + Query * as_cjk_query() const; + /// Value range query. Query as_value_range_query() const; @@ -430,6 +434,28 @@ return q; } +Query * +Term::as_cjk_query() const +{ + vector prefix_cjk; + vector cjk_tokens; + vector::iterator cjk_it; + const list & prefixes = prefix_info->prefixes; + list::const_iterator piter; + CJK::tokenizer tk; + tk.tokenize(name, cjk_tokens); + for (cjk_it = cjk_tokens.begin(); cjk_it != cjk_tokens.end(); ++cjk_it) { + for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) { + string cjk = *piter; + cjk += *cjk_it; + prefix_cjk.push_back(Query(cjk, 1, pos)); + } + } + Query * q = new Query(Query::OP_AND, prefix_cjk.begin(), prefix_cjk.end()); + delete this; + return q; +} + Query Term::as_value_range_query() const { @@ -562,10 +588,15 @@ } was_acronym = !term.empty(); + if (term.empty() && CJK::codepoint_is_cjk(*it)) { + term = CJK::get_cjk(it); + } + if (term.empty()) { unsigned prevch = *it; Unicode::append_utf8(term, prevch); while (++it != end) { + if (CJK::codepoint_is_cjk(*it)) break; unsigned ch = *it; if (!is_wordchar(ch)) { // Treat a single embedded '&' or "'" or similar as a word @@ -1075,6 +1106,13 @@ Term * term_obj = new Term(&state, term, prefix_info, unstemmed_term, stem_term, term_pos++); + Utf8Iterator tmp_it(term); + if (CJK::codepoint_is_cjk(*tmp_it)) { + Parse(pParser, CJKTERM, term_obj, &state); + if (it == end) break; + continue; + } + if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) { if (it != end) { if ((flags & FLAG_WILDCARD) && *it == '*') { @@ -1935,6 +1973,10 @@ delete U; } +compound_term(T) ::= CJKTERM(U). { + { T = U->as_cjk_query(); } +} + // phrase - The "inside the quotes" part of a double-quoted phrase. %type phrase {Terms *} diff -uNr xapian.vanilla/xapian-core/queryparser/termgenerator_internal.cc cjk-support-patch/xapian-core/queryparser/termgenerator_internal.cc --- xapian.vanilla/xapian-core/queryparser/termgenerator_internal.cc 2011-07-18 16:17:17.841682000 -0700 +++ cjk-support-patch/xapian-core/queryparser/termgenerator_internal.cc 2011-07-25 02:35:35.552725536 -0700 @@ -31,6 +31,8 @@ #include #include +#include "cjk/cjk-tokenizer.h" + using namespace std; namespace Xapian { @@ -164,11 +166,50 @@ } while (true) { - unsigned prevch; + if (CJK::codepoint_is_cjk(*itor)) { + CJK::tokenizer tk; + vector cjk_tokens; + vector::iterator cjk_it; + tk.tokenize(CJK::get_cjk(itor), cjk_tokens); + for (cjk_it = cjk_tokens.begin(); cjk_it != cjk_tokens.end(); ++cjk_it) { + if ((*cjk_it).size() > MAX_PROB_TERM_LENGTH) continue; + + if (stop_mode == STOPWORDS_IGNORE && (*stopper)((*cjk_it))) continue; + + if (with_positions) { + doc.add_posting(prefix + (*cjk_it), ++termpos, weight); + } else { + doc.add_term(prefix + (*cjk_it), weight); + } + if ((flags & FLAG_SPELLING) && prefix.empty()) db.add_spelling(*cjk_it); + + if (!stemmer.internal.get()) continue; + + if (stop_mode == STOPWORDS_INDEX_UNSTEMMED_ONLY && (*stopper)(*cjk_it)) + continue; + + // Note, this uses the lowercased term, but that's OK as we only + // want to avoid stemming terms starting with a digit. + if (!should_stem(*cjk_it)) continue; + + // Add stemmed form without positional information. + string stem("Z"); + stem += prefix; + stem += stemmer(*cjk_it); + doc.add_term(stem, weight); + } + while (true) { + if (itor == Utf8Iterator()) return; + ch = check_wordchar(*itor); + if (ch) break; + ++itor; + } + } + unsigned prevch; do { Unicode::append_utf8(term, ch); prevch = ch; - if (++itor == Utf8Iterator()) goto endofterm; + if (++itor == Utf8Iterator() || CJK::codepoint_is_cjk(*itor)) goto endofterm; ch = check_wordchar(*itor); } while (ch); diff -uNr xapian.vanilla/xapian-core/tests/queryparsertest.cc cjk-support-patch/xapian-core/tests/queryparsertest.cc --- xapian.vanilla/xapian-core/tests/queryparsertest.cc 2011-07-18 16:17:17.841682000 -0700 +++ cjk-support-patch/xapian-core/tests/queryparsertest.cc 2011-07-25 01:47:07.892723694 -0700 @@ -639,7 +639,15 @@ { "multisite:xapian.org site:www.xapian.org author:richard authortitle:richard", "((ZArichard:(pos=1) OR ZArichard:(pos=2) OR ZXTrichard:(pos=2)) FILTER (Hwww.xapian.org AND (Hxapian.org OR Jxapian.org)))"}, { "authortitle:richard-boulton", "((Arichard:(pos=1) PHRASE 2 Aboulton:(pos=2)) OR (XTrichard:(pos=1) PHRASE 2 XTboulton:(pos=2)))"}, { "authortitle:\"richard boulton\"", "((Arichard:(pos=1) PHRASE 2 Aboulton:(pos=2)) OR (XTrichard:(pos=1) PHRASE 2 XTboulton:(pos=2)))"}, + // Some CJK Test added + { "久有归天愿", "(久:(pos=1) AND 久有:(pos=1) AND 有:(pos=1) AND 有归:(pos=1) AND 归:(pos=1) AND 归天:(pos=1) AND 天:(pos=1) AND 天愿:(pos=1) AND 愿:(pos=1))" }, + { "title:久有 归 天愿", "((XT久:(pos=1) AND XT久有:(pos=1) AND XT有:(pos=1)) OR 归:(pos=2) OR (天:(pos=3) AND 天愿:(pos=3) AND 愿:(pos=3)))" }, + { "h众ello万众", "(Zh:(pos=1) OR 众:(pos=2) OR Zello:(pos=3) OR (万:(pos=4) AND 万众:(pos=4) AND 众:(pos=4)))" }, + { "世(の中)TEST_tm", "(世:(pos=1) OR (の:(pos=2) AND の中:(pos=2) AND 中:(pos=2)) OR test_tm:(pos=3))" }, + { "다녀 AND 와야", "(다:(pos=1) AND 다녀:(pos=1) AND 녀:(pos=1) AND 와:(pos=2) AND 와야:(pos=2) AND 야:(pos=2))" }, + { "authortitle:학술 OR 연구를", "((A학:(pos=1) AND XT학:(pos=1) AND A학술:(pos=1) AND XT학술:(pos=1) AND A술:(pos=1) AND XT술:(pos=1)) OR (연:(pos=2) AND 연구:(pos=2) AND 구:(pos=2) AND 구를:(pos=2) AND 를:(pos=2)))" }, { NULL, NULL } + }; static bool test_queryparser1() @@ -709,6 +717,9 @@ // Add coverage for other cases similar to the above. { "a b site:xapian.org", "((Za:(pos=1) AND Zb:(pos=2)) FILTER Hxapian.org)" }, { "site:xapian.org a b", "((Za:(pos=1) AND Zb:(pos=2)) FILTER Hxapian.org)" }, + // Some CJK Test added + { "author:험가 OR subject:万众 hello world!", "((A험:(pos=1) AND A험가:(pos=1) AND A가:(pos=1)) OR (XT万:(pos=2) AND XT万众:(pos=2) AND XT众:(pos=2) AND Zhello:(pos=3) AND Zworld:(pos=4)))" }, + { "洛伊one儿差点two脸three", "(洛:(pos=1) AND 洛伊:(pos=1) AND 伊:(pos=1) AND Zone:(pos=2) AND 儿:(pos=3) AND 儿差:(pos=3) AND 差:(pos=3) AND 差点:(pos=3) AND 点:(pos=3) AND Ztwo:(pos=4) AND 脸:(pos=5) AND Zthree:(pos=6))" }, { NULL, NULL } }; @@ -761,6 +772,8 @@ TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((ZAme:(pos=1) OR ZXTstuff:(pos=2)))"); qobj = qp.parse_query("title:(stuff) me", Xapian::QueryParser::FLAG_BOOLEAN, "A"); TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((ZXTstuff:(pos=1) OR ZAme:(pos=2)))"); + qobj = qp.parse_query("英国 title:文森hello", 0, "A"); + TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query(((A英:(pos=1) AND A英国:(pos=1) AND A国:(pos=1)) OR (XT文:(pos=2) AND XT文森:(pos=2) AND XT森:(pos=2)) OR ZAhello:(pos=3)))"); return true; } diff -uNr xapian.vanilla/xapian-core/tests/termgentest.cc cjk-support-patch/xapian-core/tests/termgentest.cc --- xapian.vanilla/xapian-core/tests/termgentest.cc 2011-07-18 16:17:17.841682000 -0700 +++ cjk-support-patch/xapian-core/tests/termgentest.cc 2011-07-22 13:41:11.083983990 -0700 @@ -61,6 +61,20 @@ }; static const test test_simple[] = { + + // Basic CJK Test + {"", "久有归天", "久[1] 久有[2] 天[7] 归[5] 归天[6] 有[3] 有归[4]" }, + {"", "극지라", "극[1] 극지[2] 라[5] 지[3] 지라[4]" }, + {"", "ウルス アップ", "ア[6] ウ[1] ウル[2] ス[5] ッ[7] ップ[8] プ[9] ル[3] ルス[4]" }, + + // CJK With Prefix + {"prefix=XA", "发送从", "XA从[5] XA发[1] XA发送[2] XA送[3] XA送从[4]"}, + {"prefix=XA", "点卡思考", "XA卡[3] XA卡思[4] XA思[5] XA思考[6] XA点[1] XA点卡[2] XA考[7]"}, + + // CJK Mix + {"prefix=", "インtestタ", "test[4] イ[1] イン[2] タ[5] ン[3]"}, + {"", "配this is合a个 test!", "a[5] is[3] test[7] this[2] 个[6] 合[4] 配[1]"}, + // A basic test with a hyphen { "", "simple-example", "example[2] simple[1]" }, { "cont,weight=2",