# Optional replacement of invalid UTF-8 while parsing XML.
#
# Summary of the changes carried by this patch:
#  - SimpleXMLReader gains enum Flags { FLAG_REPLACE_INVALID_UTF8 = 0x01 } and an
#    `aFlags` constructor argument (default 0) stored in the const member `flags`.
#  - SimpleXML::fromXML gains an `aFlags` argument (default 0) that is forwarded
#    to the SimpleXMLReader it constructs; existing callers are unaffected.
#  - SimpleXMLReader::decodeString: when the encoding is empty or UTF-8 and
#    Text::validateUtf8 fails, the string is passed through Text::sanitizeUtf8 if
#    FLAG_REPLACE_INVALID_UTF8 is set; otherwise error("Malformed UTF-8 data") is
#    reported (the message was previously "UTF-8 validation failed").
#  - Text::sanitizeUtf8 is added; it returns wideToUtf8(utf8ToWide(str)).
#  - The _WIN32-only WideCharToMultiByte / MultiByteToWideChar branches of
#    wideToUtf8 and utf8ToWide are removed, leaving the per-character loops as
#    the only implementation.
#  - FavoriteManager passes FLAG_REPLACE_INVALID_UTF8 when reading its config file.
#  - test/testxml.cpp gains test_utf_validation.
#
# NOTE(review): in the copy of this patch that was reviewed, the XML literal in
# test_utf_validation was empty (""), so the assertion on attribValues["_Name"]
# could never hold. It is replaced here by a minimal document whose attribute
# value is a lone 0x80 byte followed by "Name". This assumes the sanitizer emits
# exactly one '_' per invalid byte, as the expected value "_Name" suggests --
# confirm against Text::utf8ToWide and the original changeset.
# NOTE(review): line breaks and indentation inside the hunks below were already
# collapsed in the reviewed copy; they must be restored before the patch can be
# applied. Hunk line counts are unchanged by the fix above.
diff -r e0433a73c6c7 dcpp/FavoriteManager.cpp --- a/dcpp/FavoriteManager.cpp Sun Dec 18 23:39:05 2016 +0100 +++ b/dcpp/FavoriteManager.cpp Thu Dec 22 13:45:51 2016 +0200 @@ -471,7 +471,7 @@ try { SimpleXML xml; Util::migrate(getConfigFile()); - xml.fromXML(File(getConfigFile(), File::READ, File::OPEN).read()); + xml.fromXML(File(getConfigFile(), File::READ, File::OPEN).read(), SimpleXMLReader::FLAG_REPLACE_INVALID_UTF8); if(xml.findChild("Favorites")) { xml.stepIn(); diff -r e0433a73c6c7 dcpp/SimpleXML.cpp --- a/dcpp/SimpleXML.cpp Sun Dec 18 23:39:05 2016 +0100 +++ b/dcpp/SimpleXML.cpp Thu Dec 22 13:45:51 2016 +0200 @@ -182,14 +182,14 @@ (*currentChild)->attribs.emplace_back(aName, aData); } -void SimpleXML::fromXML(const string& aXML) { +void SimpleXML::fromXML(const string& aXML, int aFlags) { if(!root.children.empty()) { delete root.children[0]; root.children.clear(); } TagReader t(&root); - SimpleXMLReader(&t).parse(aXML); + SimpleXMLReader(&t, aFlags).parse(aXML); if(root.children.size() != 1) { throw SimpleXMLException("Invalid XML file, missing or multiple root tags"); diff -r e0433a73c6c7 dcpp/SimpleXML.h --- a/dcpp/SimpleXML.h Sun Dec 18 23:39:05 2016 +0100 +++ b/dcpp/SimpleXML.h Thu Dec 22 13:45:51 2016 +0200 @@ -136,7 +136,7 @@ return (!tmp.empty()) && tmp[0] == '1'; } - void fromXML(const string& aXML); + void fromXML(const string& aXML, int aFlags = 0); string toXML(); void toXML(OutputStream* f) { if(!root.children.empty()) root.children[0]->toXML(0, f); } diff -r e0433a73c6c7 dcpp/SimpleXMLReader.cpp --- a/dcpp/SimpleXMLReader.cpp Sun Dec 18 23:39:05 2016 +0100 +++ b/dcpp/SimpleXMLReader.cpp Thu Dec 22 13:45:51 2016 +0200 @@ -67,8 +67,8 @@ ; } -SimpleXMLReader::SimpleXMLReader(SimpleXMLReader::CallBack* callback) : - bufPos(0), pos(0), cb(callback), state(STATE_START) +SimpleXMLReader::SimpleXMLReader(SimpleXMLReader::CallBack* callback, int aFlags) : + bufPos(0), pos(0), cb(callback), state(STATE_START), flags(aFlags) { elements.reserve(64); 
attribs.reserve(16); @@ -748,14 +748,16 @@ } void SimpleXMLReader::decodeString(string& str_) { - auto isUtf8 = encoding.empty() || compare(encoding, Text::utf8) == 0; - - if (isUtf8) { - if (!Text::validateUtf8(str_)) { - error("UTF-8 validation failed"); - } - } else { - str_ = Text::toUtf8(str_, encoding); + auto isUtf8 = encoding.empty() || compare(encoding, Text::utf8) == 0; + + if (!isUtf8) { + str_ = Text::toUtf8(str_, encoding); + } else if (!Text::validateUtf8(str_)) { + if (flags & FLAG_REPLACE_INVALID_UTF8) { + str_ = Text::sanitizeUtf8(str_); + } else { + error("Malformed UTF-8 data"); + } } } diff -r e0433a73c6c7 dcpp/SimpleXMLReader.h --- a/dcpp/SimpleXMLReader.h Sun Dec 18 23:39:05 2016 +0100 +++ b/dcpp/SimpleXMLReader.h Thu Dec 22 13:45:51 2016 +0200 @@ -52,7 +52,12 @@ static const std::string& getAttrib(StringPairList& attribs, const std::string& name, size_t hint); }; - SimpleXMLReader(CallBack* callback); + enum Flags { + // Replace invalid UTF-8 data with placeholder characters + FLAG_REPLACE_INVALID_UTF8 = 0x01, + }; + + SimpleXMLReader(CallBack* callback, int aFlags = 0); virtual ~SimpleXMLReader() { } void parse(InputStream& is, size_t maxSize = 0); @@ -191,6 +196,8 @@ bool error(const char* message); void decodeString(string& str_); + + const int flags; }; diff -r e0433a73c6c7 dcpp/Text.cpp --- a/dcpp/Text.cpp Sun Dec 18 23:39:05 2016 +0100 +++ b/dcpp/Text.cpp Thu Dec 22 13:45:51 2016 +0200 @@ -203,27 +203,13 @@ if(str.empty()) { return Util::emptyString; } -#ifdef _WIN32 - int size = 0; - tgt.resize( str.length() * 2 ); - while( ( size = WideCharToMultiByte(CP_UTF8, 0, str.c_str(), str.length(), &tgt[0], tgt.length(), NULL, NULL) ) == 0 ){ - if( GetLastError() == ERROR_INSUFFICIENT_BUFFER ) - tgt.resize( tgt.size() * 2 ); - else - break; - } - - tgt.resize( size ); - return tgt; -#else string::size_type n = str.length(); tgt.clear(); for(string::size_type i = 0; i < n; ++i) { wcToUtf8(str[i], tgt); } return tgt; -#endif } const string& 
wideToAcp(const wstring& str, string& tmp, const string& toCharset) noexcept { @@ -269,26 +255,16 @@ return true; } +string sanitizeUtf8(const string& str) noexcept { + return wideToUtf8(utf8ToWide(str)); +} + const string& utf8ToAcp(const string& str, string& tmp, const string& toCharset) noexcept { wstring wtmp; return wideToAcp(utf8ToWide(str, wtmp), tmp, toCharset); } const wstring& utf8ToWide(const string& str, wstring& tgt) noexcept { -#ifdef _WIN32 - int size = 0; - tgt.resize( str.length()+1 ); - while( ( size = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), str.length(), &tgt[0], (int)tgt.length()) ) == 0 ){ - if( GetLastError() == ERROR_INSUFFICIENT_BUFFER ) { - tgt.resize( tgt.size()*2 ); - } else { - break; - } - - } - tgt.resize( size ); - return tgt; -#else tgt.reserve(str.length()); string::size_type n = str.length(); for(string::size_type i = 0; i < n; ) { @@ -303,7 +279,6 @@ } } return tgt; -#endif } wchar_t toLower(wchar_t c) noexcept { diff -r e0433a73c6c7 dcpp/Text.h --- a/dcpp/Text.h Sun Dec 18 23:39:05 2016 +0100 +++ b/dcpp/Text.h Thu Dec 22 13:45:51 2016 +0200 @@ -81,6 +81,7 @@ int utf8ToWc(const char* str, wchar_t& c); void wcToUtf8(wchar_t c, string& str); + string sanitizeUtf8(const string& str) noexcept; #ifdef UNICODE inline const tstring& toT(const string& str, tstring& tmp) noexcept { return utf8ToWide(str, tmp); } inline tstring toT(const string& str) noexcept { return utf8ToWide(str); } diff -r e0433a73c6c7 test/testxml.cpp --- a/test/testxml.cpp Sun Dec 18 23:39:05 2016 +0100 +++ b/test/testxml.cpp Thu Dec 22 13:45:51 2016 +0200 @@ -120,3 +120,17 @@ } } +TEST(testxml, test_utf_validation) +{ + const char xml[] = "<?xml version='1.0' encoding='utf-8' ?><complex><simple a='\x80" "Name'/></complex>"; + + Collector collector; + SimpleXMLReader reader(&collector, SimpleXMLReader::FLAG_REPLACE_INVALID_UTF8); + + for (size_t i = 0, iend = sizeof(xml); i < iend; ++i) { + reader.parse(xml + i, 1); + } + + ASSERT_EQ(collector.attribValues["_Name"], 1); +} +