Index: xapian-core/queryparser/Makefile.mk =================================================================== --- xapian-core/queryparser/Makefile.mk (revision 16053) +++ xapian-core/queryparser/Makefile.mk (revision 16055) @@ -5,6 +5,7 @@ endif noinst_HEADERS +=\ + queryparser/cjk-tokenizer.h\ queryparser/queryparser_internal.h\ queryparser/queryparser_token.h\ queryparser/termgenerator_internal.h @@ -57,6 +58,7 @@ endif lib_src +=\ + queryparser/cjk-tokenizer.cc\ queryparser/queryparser.cc\ queryparser/queryparser_internal.cc\ queryparser/termgenerator.cc\ Index: xapian-core/queryparser/cjk-tokenizer.cc =================================================================== --- xapian-core/queryparser/cjk-tokenizer.cc (revision 0) +++ xapian-core/queryparser/cjk-tokenizer.cc (revision 16055) @@ -0,0 +1,124 @@ +/** @file cjk-tokenizer.cc + * @brief Tokenise CJK text as n-grams + */ +/* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com) + * Copyright (c) 2011 Richard Boulton (richard@tartarus.org) + * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com) + * Copyright (c) 2011 Olly Betts + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include + +#include "cjk-tokenizer.h" + +#include "omassert.h" +#include "xapian/unicode.h" + +#include +#include + +using namespace std; + +static unsigned NGRAM_SIZE = 2; + +bool +CJK::is_cjk_enabled() +{ + const char * p; + static bool result = ((p = getenv("XAPIAN_CJK_NGRAM")) != NULL && *p); + return result; +} + +// 2E80..2EFF; CJK Radicals Supplement +// 3000..303F; CJK Symbols and Punctuation +// 3040..309F; Hiragana +// 30A0..30FF; Katakana +// 3100..312F; Bopomofo +// 3130..318F; Hangul Compatibility Jamo +// 3190..319F; Kanbun +// 31A0..31BF; Bopomofo Extended +// 31C0..31EF; CJK Strokes +// 31F0..31FF; Katakana Phonetic Extensions +// 3200..32FF; Enclosed CJK Letters and Months +// 3300..33FF; CJK Compatibility +// 3400..4DBF; CJK Unified Ideographs Extension A +// 4DC0..4DFF; Yijing Hexagram Symbols +// 4E00..9FFF; CJK Unified Ideographs +// A700..A71F; Modifier Tone Letters +// AC00..D7AF; Hangul Syllables +// F900..FAFF; CJK Compatibility Ideographs +// FE30..FE4F; CJK Compatibility Forms +// FF00..FFEF; Halfwidth and Fullwidth Forms +// 20000..2A6DF; CJK Unified Ideographs Extension B +// 2F800..2FA1F; CJK Compatibility Ideographs Supplement +bool +CJK::codepoint_is_cjk(unsigned p) +{ + if (p < 0x2E80) return false; + return ((p >= 0x2E80 && p <= 0x2EFF) || + (p >= 0x3000 && p <= 0x9FFF) || + (p >= 0xA700 && p <= 0xA71F) || + (p >= 0xAC00 && p <= 0xD7AF) || + (p >= 0xF900 && p <= 0xFAFF) || + (p >= 0xFE30 && p <= 0xFE4F) || + (p >= 0xFF00 && p <= 0xFFEF) || + (p >= 0x20000 && p <= 0x2A6DF) || + (p >= 0x2F800 && p <= 0x2FA1F)); +} + +string +CJK::get_cjk(Xapian::Utf8Iterator &it) +{ + string str; + while (it != Xapian::Utf8Iterator() && codepoint_is_cjk(*it)) { + 
Xapian::Unicode::append_utf8(str, *it); + ++it; + } + return str; +} + +const string & +CJKTokenIterator::operator*() const +{ + if (current_token.empty()) { + Assert(it != Xapian::Utf8Iterator()); + p = it; + Xapian::Unicode::append_utf8(current_token, *p); + ++p; + len = 1; + } + return current_token; +} + +CJKTokenIterator & +CJKTokenIterator::operator++() +{ + if (len < NGRAM_SIZE && p != Xapian::Utf8Iterator()) { + Xapian::Unicode::append_utf8(current_token, *p); + ++p; + ++len; + } else { + Assert(it != Xapian::Utf8Iterator()); + ++it; + current_token.resize(0); + } + return *this; +} Index: xapian-core/queryparser/queryparser.lemony =================================================================== --- xapian-core/queryparser/queryparser.lemony (revision 16053) +++ xapian-core/queryparser/queryparser.lemony (revision 16055) @@ -33,6 +33,8 @@ // Include the list of token values lemon generates. #include "queryparser_token.h" +#include "cjk-tokenizer.h" + #include #include #include @@ -136,6 +138,8 @@ } }; +class Terms; + /** Class used to pass information about a token from lexer to parser. * * Generally an instance of this class carries term information, but it can be @@ -192,6 +196,12 @@ */ Query * as_partial_query(State * state_) const; + /** Build a query for a string of CJK characters. */ + Query * as_cjk_query() const; + + /** Handle a CJK character string in a positional context. */ + void as_positional_cjk_term(Terms * terms) const; + /// Value range query. 
Query as_value_range_query() const; @@ -430,6 +440,24 @@ return q; } +Query * +Term::as_cjk_query() const +{ + vector prefix_cjk; + const list & prefixes = prefix_info->prefixes; + list::const_iterator piter; + for (CJKTokenIterator tk(name); tk != CJKTokenIterator(); ++tk) { + for (piter = prefixes.begin(); piter != prefixes.end(); ++piter) { + string cjk = *piter; + cjk += *tk; + prefix_cjk.push_back(Query(cjk, 1, pos)); + } + } + Query * q = new Query(Query::OP_AND, prefix_cjk.begin(), prefix_cjk.end()); + delete this; + return q; +} + Query Term::as_value_range_query() const { @@ -537,6 +565,7 @@ string QueryParser::Internal::parse_term(Utf8Iterator &it, const Utf8Iterator &end, + bool cjk_ngram, bool & is_cjk_term, bool &was_acronym) { string term; @@ -562,10 +591,16 @@ } was_acronym = !term.empty(); + if (cjk_ngram && term.empty() && CJK::codepoint_is_cjk(*it)) { + term = CJK::get_cjk(it); + is_cjk_term = true; + } + if (term.empty()) { unsigned prevch = *it; Unicode::append_utf8(term, prevch); while (++it != end) { + if (cjk_ngram && CJK::codepoint_is_cjk(*it)) break; unsigned ch = *it; if (!is_wordchar(ch)) { // Treat a single embedded '&' or "'" or similar as a word @@ -634,6 +669,8 @@ QueryParser::Internal::parse_query(const string &qs, unsigned flags, const string &default_prefix) { + bool cjk_ngram = CJK::is_cjk_enabled(); + // Set value_ranges if we may have to handle value ranges in the query. bool value_ranges; value_ranges = !valrangeprocs.empty() && (qs.find("..") != string::npos); @@ -975,7 +1012,8 @@ phrased_term: bool was_acronym; - string term = parse_term(it, end, was_acronym); + bool is_cjk_term = false; + string term = parse_term(it, end, cjk_ngram, is_cjk_term, was_acronym); // Boolean operators. 
if ((mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) && @@ -1075,6 +1113,12 @@ Term * term_obj = new Term(&state, term, prefix_info, unstemmed_term, stem_term, term_pos++); + if (is_cjk_term) { + Parse(pParser, CJKTERM, term_obj, &state); + if (it == end) break; + continue; + } + if (mode == DEFAULT || mode == IN_GROUP || mode == IN_GROUP2) { if (it != end) { if ((flags & FLAG_WILDCARD) && *it == '*') { @@ -1552,6 +1596,23 @@ } }; +void +Term::as_positional_cjk_term(Terms * terms) const +{ + // Add each individual CJK character to the phrase. + string t; + for (Utf8Iterator it(name); it != Utf8Iterator(); ++it) { + Unicode::append_utf8(t, *it); + Term * c = new Term(state, t, prefix_info, unstemmed, stem, pos); + terms->add_positional_term(c); + t.resize(0); + } + + // FIXME: we want to add the n-grams as filters too for efficiency. + + delete this; +} + // Helper macro for converting a boolean operation into a Xapian::Query. #define BOOL_OP_TO_QUERY(E, A, OP, B, OP_TXT) \ do {\ @@ -1935,6 +1996,10 @@ delete U; } +compound_term(T) ::= CJKTERM(U). { + { T = U->as_cjk_query(); } +} + // phrase - The "inside the quotes" part of a double-quoted phrase. %type phrase {Terms *} @@ -1946,11 +2011,21 @@ P->add_positional_term(T); } +phrase(P) ::= CJKTERM(T). { + P = new Terms; + T->as_positional_cjk_term(P); +} + phrase(P) ::= phrase(Q) TERM(T). { P = Q; P->add_positional_term(T); } +phrase(P) ::= phrase(Q) CJKTERM(T). { + P = Q; + T->as_positional_cjk_term(P); +} + // phrased_term - A phrased term works like a single term, but is actually // 2 or more terms linked together into a phrase by punctuation. There must be // at least 2 terms in order to be able to have punctuation between the terms! 
Index: xapian-core/queryparser/queryparser_internal.h =================================================================== --- xapian-core/queryparser/queryparser_internal.h (revision 16053) +++ xapian-core/queryparser/queryparser_internal.h (revision 16055) @@ -1,7 +1,7 @@ /* queryparser_internal.h: The non-lemon-generated parts of the QueryParser * class. * - * Copyright (C) 2005,2006,2007,2010 Olly Betts + * Copyright (C) 2005,2006,2007,2010,2011 Olly Betts * Copyright (C) 2010 Adam Sjøgren * * This program is free software; you can redistribute it and/or @@ -83,6 +83,7 @@ filter_type type); std::string parse_term(Utf8Iterator &it, const Utf8Iterator &end, + bool cjk_ngram, bool &is_cjk_term, bool &was_acronym); public: Index: xapian-core/queryparser/cjk-tokenizer.h =================================================================== --- xapian-core/queryparser/cjk-tokenizer.h (revision 0) +++ xapian-core/queryparser/cjk-tokenizer.h (revision 16055) @@ -0,0 +1,94 @@ +/** @file cjk-tokenizer.h + * @brief Tokenise CJK text as n-grams + */ +/* Copyright (c) 2007, 2008 Yung-chung Lin (henearkrxern@gmail.com) + * Copyright (c) 2011 Richard Boulton (richard@tartarus.org) + * Copyright (c) 2011 Brandon Schaefer (brandontschaefer@gmail.com) + * Copyright (c) 2011 Olly Betts + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef XAPIAN_INCLUDED_CJK_TOKENIZER_H +#define XAPIAN_INCLUDED_CJK_TOKENIZER_H + +#include "xapian/unicode.h" + +#include + +namespace CJK { + +/** Should we use the CJK n-gram code? + * + * The first time this is called it reads the environment variable + * XAPIAN_CJK_NGRAM and returns true if it is set to a non-empty value. + * The result is cached, so subsequent calls return the same value. + */ +bool is_cjk_enabled(); + +bool codepoint_is_cjk(unsigned codepoint); + +std::string get_cjk(Xapian::Utf8Iterator &it); + +} + +class CJKTokenIterator { + Xapian::Utf8Iterator it; + + mutable Xapian::Utf8Iterator p; + + mutable unsigned len; + + mutable std::string current_token; + + public: + CJKTokenIterator(const std::string & s) + : it(s) { } + + CJKTokenIterator(const Xapian::Utf8Iterator & it_) + : it(it_) { } + + CJKTokenIterator() + : it() { } + + const std::string & operator*() const; + + CJKTokenIterator & operator++(); + + /// Get the length of the current token in Unicode characters. + unsigned get_length() const { return len; } + + friend bool operator==(const CJKTokenIterator &, const CJKTokenIterator &); +}; + +inline bool +operator==(const CJKTokenIterator & a, const CJKTokenIterator & b) +{ + // We only really care about comparisons where one or other is an end + // iterator. 
+ return a.it == b.it; +} + +inline bool +operator!=(const CJKTokenIterator & a, const CJKTokenIterator & b) +{ + return !(a == b); +} + +#endif // XAPIAN_INCLUDED_CJK_TOKENIZER_H Index: xapian-core/queryparser/termgenerator_internal.cc =================================================================== --- xapian-core/queryparser/termgenerator_internal.cc (revision 16053) +++ xapian-core/queryparser/termgenerator_internal.cc (revision 16055) @@ -1,7 +1,7 @@ /** @file termgenerator_internal.cc * @brief TermGenerator class internals */ -/* Copyright (C) 2007,2010 Olly Betts +/* Copyright (C) 2007,2010,2011 Olly Betts * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -31,6 +31,8 @@ #include #include +#include "cjk-tokenizer.h" + using namespace std; namespace Xapian { @@ -127,6 +129,8 @@ TermGenerator::Internal::index_text(Utf8Iterator itor, termcount wdf_inc, const string & prefix, bool with_positions) { + bool cjk_ngram = CJK::is_cjk_enabled(); + int stop_mode = STOPWORDS_INDEX_UNSTEMMED_ONLY; if (!stopper) stop_mode = STOPWORDS_NONE; @@ -164,11 +168,53 @@ } while (true) { + if (cjk_ngram && CJK::codepoint_is_cjk(*itor)) { + const string & cjk = CJK::get_cjk(itor); + for (CJKTokenIterator tk(cjk); tk != CJKTokenIterator(); ++tk) { + const string & cjk_token = *tk; + if (cjk_token.size() > MAX_PROB_TERM_LENGTH) continue; + + if (stop_mode == STOPWORDS_IGNORE && (*stopper)(cjk_token)) + continue; + + if (with_positions && tk.get_length() == 1) { + doc.add_posting(prefix + cjk_token, ++termpos, wdf_inc); + } else { + doc.add_term(prefix + cjk_token, wdf_inc); + } + if ((flags & FLAG_SPELLING) && prefix.empty()) + db.add_spelling(cjk_token); + + if (!stemmer.internal.get()) continue; + + if (stop_mode == STOPWORDS_INDEX_UNSTEMMED_ONLY && + (*stopper)(cjk_token)) + continue; + + // Note, this uses the lowercased term, but that's OK as we + // only want to avoid stemming 
terms starting with a digit. + if (!should_stem(cjk_token)) continue; + + // Add stemmed form without positional information. + string stem("Z"); + stem += prefix; + stem += stemmer(cjk_token); + doc.add_term(stem, wdf_inc); + } + while (true) { + if (itor == Utf8Iterator()) return; + ch = check_wordchar(*itor); + if (ch) break; + ++itor; + } + } unsigned prevch; do { Unicode::append_utf8(term, ch); prevch = ch; - if (++itor == Utf8Iterator()) goto endofterm; + if (++itor == Utf8Iterator() || + (cjk_ngram && CJK::codepoint_is_cjk(*itor))) + goto endofterm; ch = check_wordchar(*itor); } while (ch); Index: xapian-core/tests/termgentest.cc =================================================================== --- xapian-core/tests/termgentest.cc (revision 16053) +++ xapian-core/tests/termgentest.cc (revision 16055) @@ -31,6 +31,8 @@ #include "testutils.h" #include "utils.h" +#include // For setenv() or putenv() + using namespace std; #define TESTCASE(S) {#S, test_##S} @@ -106,12 +108,26 @@ "Z\xe1\x80\x9d\xe1\x80\xae\xe1\x80\x80\xe1\x80\xae\xe1\x80\x95\xe1\x80\xad\xe1\x80\x9e\xe1\x80\xaf\xe1\x80\xb6\xe1\x80\xb8\xe1\x80\x85\xe1\x80\xbd\xe1\x80\xb2\xe1\x80\x9e\xe1\x80\xb0\xe1\x80\x99\xe1\x80\xbb\xe1\x80\xac\xe1\x80\xb8\xe1\x80\x80:1 \xe1\x80\x9d\xe1\x80\xae\xe1\x80\x80\xe1\x80\xae\xe1\x80\x95\xe1\x80\xad\xe1\x80\x9e\xe1\x80\xaf\xe1\x80\xb6\xe1\x80\xb8\xe1\x80\x85\xe1\x80\xbd\xe1\x80\xb2\xe1\x80\x9e\xe1\x80\xb0\xe1\x80\x99\xe1\x80\xbb\xe1\x80\xac\xe1\x80\xb8\xe1\x80\x80[1]" }, { "", "fish+chips", "Zchip:1 Zfish:1 chips[2] fish[1]" }, + + // Basic CJK tests: + { "stem=", "久有归天", "久[1] 久有:1 天[4] 归[3] 归天:1 有[2] 有归:1" }, + { "", "극지라", "극[1] 극지:1 라[3] 지[2] 지라:1" }, + { "", "ウルス アップ", "ア[4] ウ[1] ウル:1 ス[3] ッ[5] ップ:1 プ[6] ル[2] ルス:1" }, + + // CJK with prefix: + { "prefix=XA", "发送从", "XA从[3] XA发[1] XA发送:1 XA送[2] XA送从:1" }, + { "prefix=XA", "点卡思考", "XA卡[2] XA卡思:1 XA思[3] XA思考:1 XA点[1] XA点卡:1 XA考[4]" }, + + // CJK mixed with non-CJK: + { "prefix=", "インtestタ", "test[3] イ[1] イン:1 タ[4] 
ン[2]" }, + { "", "配this is合a个 test!", "a[5] is[3] test[7] this[2] 个[6] 合[4] 配[1]" }, + // All following tests are for things which we probably don't really want to // behave as they currently do, but we haven't found a sufficiently general // way to implement them yet. // Test number like things - { "", "11:59", "11[1] 59[2]" }, + { "stem=en", "11:59", "11[1] 59[2]" }, { "", "11:59am", "11[1] 59am[2]" }, { NULL, NULL, NULL } @@ -770,6 +786,14 @@ int main(int argc, char **argv) try { + // FIXME: It would be better to test with and without XAPIAN_CJK_NGRAM set. +#ifdef __WIN32__ + _putenv_s("XAPIAN_CJK_NGRAM", "1"); +#elif defined HAVE_SETENV + setenv("XAPIAN_CJK_NGRAM", "1", 1); +#else + putenv(const_cast("XAPIAN_CJK_NGRAM=1")); +#endif test_driver::parse_command_line(argc, argv); return test_driver::run(tests); } catch (const char * e) { Index: xapian-core/tests/queryparsertest.cc =================================================================== --- xapian-core/tests/queryparsertest.cc (revision 16053) +++ xapian-core/tests/queryparsertest.cc (revision 16055) @@ -33,6 +33,8 @@ #include #include +#include // For setenv() or putenv() + using namespace std; #define TESTCASE(S) {#S, test_##S} @@ -639,6 +641,17 @@ { "multisite:xapian.org site:www.xapian.org author:richard authortitle:richard", "((ZArichard:(pos=1) OR ZArichard:(pos=2) OR ZXTrichard:(pos=2)) FILTER (Hwww.xapian.org AND (Hxapian.org OR Jxapian.org)))"}, { "authortitle:richard-boulton", "((Arichard:(pos=1) PHRASE 2 Aboulton:(pos=2)) OR (XTrichard:(pos=1) PHRASE 2 XTboulton:(pos=2)))"}, { "authortitle:\"richard boulton\"", "((Arichard:(pos=1) PHRASE 2 Aboulton:(pos=2)) OR (XTrichard:(pos=1) PHRASE 2 XTboulton:(pos=2)))"}, + // Some CJK tests. 
+ { "久有归天愿", "(久:(pos=1) AND 久有:(pos=1) AND 有:(pos=1) AND 有归:(pos=1) AND 归:(pos=1) AND 归天:(pos=1) AND 天:(pos=1) AND 天愿:(pos=1) AND 愿:(pos=1))" }, + { "title:久有 归 天愿", "((XT久:(pos=1) AND XT久有:(pos=1) AND XT有:(pos=1)) OR 归:(pos=2) OR (天:(pos=3) AND 天愿:(pos=3) AND 愿:(pos=3)))" }, + { "h众ello万众", "(Zh:(pos=1) OR 众:(pos=2) OR Zello:(pos=3) OR (万:(pos=4) AND 万众:(pos=4) AND 众:(pos=4)))" }, + { "世(の中)TEST_tm", "(世:(pos=1) OR (の:(pos=2) AND の中:(pos=2) AND 中:(pos=2)) OR test_tm:(pos=3))" }, + { "다녀 AND 와야", "(다:(pos=1) AND 다녀:(pos=1) AND 녀:(pos=1) AND 와:(pos=2) AND 와야:(pos=2) AND 야:(pos=2))" }, + { "authortitle:학술 OR 연구를", "((A학:(pos=1) AND XT학:(pos=1) AND A학술:(pos=1) AND XT학술:(pos=1) AND A술:(pos=1) AND XT술:(pos=1)) OR (연:(pos=2) AND 연구:(pos=2) AND 구:(pos=2) AND 구를:(pos=2) AND 를:(pos=2)))" }, + // FIXME: These should really filter by bigrams to accelerate: + { "\"久有归\"", "(久:(pos=1) PHRASE 3 有:(pos=1) PHRASE 3 归:(pos=1))" }, + { "\"久有test归\"", "(久:(pos=1) PHRASE 4 有:(pos=1) PHRASE 4 test:(pos=2) PHRASE 4 归:(pos=3))" }, + // FIXME: this should work: { "久 NEAR 有", "(久:(pos=1) NEAR 11 有:(pos=2))" }, { NULL, NULL } }; @@ -709,6 +722,9 @@ // Add coverage for other cases similar to the above. { "a b site:xapian.org", "((Za:(pos=1) AND Zb:(pos=2)) FILTER Hxapian.org)" }, { "site:xapian.org a b", "((Za:(pos=1) AND Zb:(pos=2)) FILTER Hxapian.org)" }, + // Some CJK tests. 
+ { "author:험가 OR subject:万众 hello world!", "((A험:(pos=1) AND A험가:(pos=1) AND A가:(pos=1)) OR (XT万:(pos=2) AND XT万众:(pos=2) AND XT众:(pos=2) AND Zhello:(pos=3) AND Zworld:(pos=4)))" }, + { "洛伊one儿差点two脸three", "(洛:(pos=1) AND 洛伊:(pos=1) AND 伊:(pos=1) AND Zone:(pos=2) AND 儿:(pos=3) AND 儿差:(pos=3) AND 差:(pos=3) AND 差点:(pos=3) AND 点:(pos=3) AND Ztwo:(pos=4) AND 脸:(pos=5) AND Zthree:(pos=6))" }, { NULL, NULL } }; @@ -761,6 +777,8 @@ TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((ZAme:(pos=1) OR ZXTstuff:(pos=2)))"); qobj = qp.parse_query("title:(stuff) me", Xapian::QueryParser::FLAG_BOOLEAN, "A"); TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query((ZXTstuff:(pos=1) OR ZAme:(pos=2)))"); + qobj = qp.parse_query("英国 title:文森hello", 0, "A"); + TEST_STRINGS_EQUAL(qobj.get_description(), "Xapian::Query(((A英:(pos=1) AND A英国:(pos=1) AND A国:(pos=1)) OR (XT文:(pos=2) AND XT文森:(pos=2) AND XT森:(pos=2)) OR ZAhello:(pos=3)))"); return true; } @@ -2507,6 +2525,14 @@ int main(int argc, char **argv) try { + // FIXME: It would be better to test with and without XAPIAN_CJK_NGRAM set. +#ifdef __WIN32__ + _putenv_s("XAPIAN_CJK_NGRAM", "1"); +#elif defined HAVE_SETENV + setenv("XAPIAN_CJK_NGRAM", "1", 1); +#else + putenv(const_cast("XAPIAN_CJK_NGRAM=1")); +#endif test_driver::parse_command_line(argc, argv); return test_driver::run(tests); } catch (const char * e) { Index: xapian-core/ChangeLog =================================================================== --- xapian-core/ChangeLog (revision 16053) +++ xapian-core/ChangeLog (revision 16055) @@ -1,3 +1,17 @@ +Wed Aug 24 14:25:21 GMT 2011 Olly Betts + + * Backport change from trunk: + * queryparser/queryparser.lemony: Fix memory leak (caught by existing + testcase queryparser1 when run under valgrind). + +Wed Aug 24 14:13:24 GMT 2011 Olly Betts + + * Backport change from trunk: + * queryparser/,tests/queryparsertest.cc,tests/termgentest.cc: Add + support for indexing and searching CJK text using n-grams. 
Currently + this is only enabled if the environment variable XAPIAN_CJK_NGRAM is + set to a non-empty value. + Wed Aug 10 06:09:39 GMT 2011 Olly Betts * NEWS: Finalise 1.2.7.