|
Boost-Commit : |
Subject: [Boost-commit] svn:boost r75594 - in trunk/libs/locale: src/encoding test
From: artyomtnk_at_[hidden]
Date: 2011-11-21 08:23:31
Author: artyom
Date: 2011-11-21 08:23:30 EST (Mon, 21 Nov 2011)
New Revision: 75594
URL: http://svn.boost.org/trac/boost/changeset/75594
Log:
Fixed incorrect use of MultiByteToWideChar in detection of invalid
input sequences
Text files modified:
trunk/libs/locale/src/encoding/wconv_codepage.ipp | 87 ++++++++++++++++++++++++++++++---------
trunk/libs/locale/test/test_codepage.cpp | 45 ++++++++++++++++++++
2 files changed, 111 insertions(+), 21 deletions(-)
Modified: trunk/libs/locale/src/encoding/wconv_codepage.ipp
==============================================================================
--- trunk/libs/locale/src/encoding/wconv_codepage.ipp (original)
+++ trunk/libs/locale/src/encoding/wconv_codepage.ipp 2011-11-21 08:23:30 EST (Mon, 21 Nov 2011)
@@ -86,12 +86,6 @@
{ "windows932", 932, 0 },
};
- size_t remove_substitutions(std::vector<wchar_t> &v)
- {
- v.erase(std::remove(v.begin(), v.end(), wchar_t(0xFFFD)), v.end());
- return v.size();
- }
-
size_t remove_substitutions(std::vector<char> &v)
{
if(std::find(v.begin(),v.end(),0) == v.end()) {
@@ -107,23 +101,39 @@
return v.size();
}
+ // Converts [begin,end) from the given Windows code page to wide characters
+ // one character at a time, appending every character that converts
+ // successfully to buf and dropping every character that does not.
+ //
+ // A lead byte (as reported by IsDBCSLeadByteEx) that is the last byte of the
+ // input is an incomplete character: conversion stops there.
+ void multibyte_to_wide_one_by_one(int codepage,char const *begin,char const *end,std::vector<wchar_t> &buf)
+ {
+     buf.reserve(end-begin);
+     // Some code pages (e.g. 50220-50229) reject MB_ERR_INVALID_CHARS with
+     // ERROR_INVALID_FLAGS. For those, fall back to flags == 0 instead of
+     // treating every single character as invalid.
+     DWORD flags = MB_ERR_INVALID_CHARS;
+     while(begin!=end) {
+         wchar_t wide_buf[4];
+         int const len = IsDBCSLeadByteEx(codepage,*begin) ? 2 : 1;
+         if(len == 2 && begin+1==end)
+             return;
+         int n = MultiByteToWideChar(codepage,flags,begin,len,wide_buf,4);
+         if(n == 0 && flags != 0 && GetLastError() == ERROR_INVALID_FLAGS) {
+             flags = 0;
+             n = MultiByteToWideChar(codepage,flags,begin,len,wide_buf,4);
+         }
+         // n == 0 at this point means the character is invalid: nothing is appended.
+         for(int i=0;i<n;i++)
+             buf.push_back(wide_buf[i]);
+         begin+=len;
+     }
+ }
+
void multibyte_to_wide(int codepage,char const *begin,char const *end,bool do_skip,std::vector<wchar_t> &buf)
{
if(begin==end)
return;
- DWORD flags = do_skip ? 0 : MB_ERR_INVALID_CHARS;
- if(50220 <= codepage && codepage <= 50229)
- flags = 0;
-
- int n = MultiByteToWideChar(codepage,flags,begin,end-begin,0,0);
- if(n == 0)
+ int n = MultiByteToWideChar(codepage,MB_ERR_INVALID_CHARS,begin,end-begin,0,0);
+ if(n == 0) {
+ if(do_skip) {
+ multibyte_to_wide_one_by_one(codepage,begin,end,buf);
+ return;
+ }
throw conversion_error();
+ }
+
buf.resize(n,0);
- if(MultiByteToWideChar(codepage,flags,begin,end-begin,&buf.front(),buf.size())==0)
+ if(MultiByteToWideChar(codepage,MB_ERR_INVALID_CHARS,begin,end-begin,&buf.front(),buf.size())==0)
throw conversion_error();
- if(do_skip)
- remove_substitutions(buf);
}
void wide_to_multibyte_non_zero(int codepage,wchar_t const *begin,wchar_t const *end,bool do_skip,std::vector<char> &buf)
@@ -256,13 +266,37 @@
}
virtual std::string convert(char const *begin,char const *end)
{
+ if(to_code_page_ == 65001 && from_code_page_ == 65001)
+ return utf_to_utf<char>(begin,end,how_);
+
std::string res;
- std::vector<wchar_t> tmp;
- multibyte_to_wide(from_code_page_,begin,end,how_ == skip,tmp);
- if(tmp.empty())
- return res;
+
+ std::vector<wchar_t> tmp; // buffer for mb2w
+ std::wstring tmps; // buffer for utf_to_utf
+ wchar_t const *wbegin=0;
+ wchar_t const *wend=0;
+
+ if(from_code_page_ == 65001) {
+ tmps = utf_to_utf<wchar_t>(begin,end,how_);
+ if(tmps.empty())
+ return res;
+ wbegin = tmps.c_str();
+ wend = wbegin + tmps.size();
+ }
+ else {
+ multibyte_to_wide(from_code_page_,begin,end,how_ == skip,tmp);
+ if(tmp.empty())
+ return res;
+ wbegin = &tmp[0];
+ wend = wbegin + tmp.size();
+ }
+
+ if(to_code_page_ == 65001) {
+ return utf_to_utf<char>(wbegin,wend,how_);
+ }
+
std::vector<char> ctmp;
- wide_to_multibyte(to_code_page_,&tmp.front(),&tmp.front()+tmp.size(),how_ == skip,ctmp);
+ wide_to_multibyte(to_code_page_,wbegin,wend,how_ == skip,ctmp);
if(ctmp.empty())
return res;
res.assign(&ctmp.front(),ctmp.size());
@@ -328,6 +362,9 @@
virtual string_type convert(char const *begin,char const *end)
{
+ if(code_page_ == 65001) {
+ return utf_to_utf<char_type>(begin,end,how_);
+ }
std::vector<wchar_t> tmp;
multibyte_to_wide(code_page_,begin,end,how_ == skip,tmp);
string_type res;
@@ -363,6 +400,9 @@
virtual std::string convert(CharType const *begin,CharType const *end)
{
+ if(code_page_ == 65001) {
+ return utf_to_utf<char>(begin,end,how_);
+ }
wchar_t const *wbegin = 0;
wchar_t const *wend = 0;
std::vector<wchar_t> buffer; // if needed
@@ -424,9 +464,11 @@
virtual string_type convert(char const *begin,char const *end)
{
+ if(code_page_ == 65001) {
+ return utf_to_utf<char_type>(begin,end,how_);
+ }
std::vector<wchar_t> buf;
multibyte_to_wide(code_page_,begin,end,how_ == skip,buf);
- remove_substitutions(buf);
if(buf.empty())
return string_type();
@@ -460,6 +502,9 @@
virtual std::string convert(CharType const *begin,CharType const *end)
{
+ if(code_page_ == 65001) {
+ return utf_to_utf<char>(begin,end,how_);
+ }
std::wstring tmp = utf_to_utf<wchar_t>(begin,end,how_);
std::vector<char> ctmp;
Modified: trunk/libs/locale/test/test_codepage.cpp
==============================================================================
--- trunk/libs/locale/test/test_codepage.cpp (original)
+++ trunk/libs/locale/test/test_codepage.cpp 2011-11-21 08:23:30 EST (Mon, 21 Nov 2011)
@@ -333,6 +333,49 @@
}
+ // Checks that converting enc from the charset `name` to UTF yields `utf` for
+ // char, wchar_t and, where available, char16_t and char32_t.
+ //
+ // `opt`, when given, is an acceptable alternative result: if the char-based
+ // conversion produces `opt`, all checks are run against `opt` instead of `utf`.
+ void test_skip(char const *enc,char const *utf,char const *name,char const *opt=0)
+ {
+     if(opt!=0) {
+         if(boost::locale::conv::to_utf<char>(enc,name) == opt) {
+             test_skip(enc,opt,name);
+             // The alternative matched and has been fully verified; falling
+             // through would also compare against `utf` and report a false failure.
+             return;
+         }
+     }
+     TEST(boost::locale::conv::to_utf<char>(enc,name) == utf);
+     TEST(boost::locale::conv::to_utf<wchar_t>(enc,name) == boost::locale::conv::utf_to_utf<wchar_t>(utf));
+     #ifdef BOOST_HAS_CHAR16_T
+     TEST(boost::locale::conv::to_utf<char16_t>(enc,name) == boost::locale::conv::utf_to_utf<char16_t>(utf));
+     #endif
+     #ifdef BOOST_HAS_CHAR32_T
+     TEST(boost::locale::conv::to_utf<char32_t>(enc,name) == boost::locale::conv::utf_to_utf<char32_t>(utf));
+     #endif
+ }
+
+ // Runs test_skip() over two groups of inputs (ISO-8859-8 and cp932).
+ // If a group throws invalid_charset_error, the rest of that group is
+ // abandoned and "--- not supported" is printed.
+ void test_simple_conversions()
+ {
+     namespace conv = boost::locale::conv;
+
+     // One row per test_skip() call: raw input, expected UTF-8 and an
+     // accepted alternative result (0 when there is none).
+     struct skip_case {
+         char const *input;
+         char const *expected;
+         char const *alternative;
+     };
+     skip_case const iso_8859_8_cases[] = {
+         { "test \xE0\xE1\xFB-", "test \xd7\x90\xd7\x91-", 0 },
+         { "\xFB", "", 0 },
+         { "test \xE0\xE1\xFB", "test \xd7\x90\xd7\x91", 0 },
+         { "\xFB-", "-", 0 }
+     };
+     skip_case const cp932_cases[] = {
+         { "test\xE0\xA0 \x83\xF8-", "test\xe7\x87\xbf -", "test\xe7\x87\xbf " },
+         { "\x83\xF8", "", 0 },
+         { "test\xE0\xA0 \x83\xF8", "test\xe7\x87\xbf ", 0 },
+         { "\x83\xF8-", "-", "" }
+     };
+     skip_case const *const iso_end = iso_8859_8_cases + sizeof(iso_8859_8_cases)/sizeof(iso_8859_8_cases[0]);
+     skip_case const *const cp932_end = cp932_cases + sizeof(cp932_cases)/sizeof(cp932_cases[0]);
+
+     std::cout << "- Testing correct invalid bytes skipping" << std::endl;
+
+     try {
+         std::cout << "-- ISO-8859-8" << std::endl;
+         for(skip_case const *c = iso_8859_8_cases; c != iso_end; ++c)
+             test_skip(c->input, c->expected, "ISO-8859-8", c->alternative);
+     }
+     catch(conv::invalid_charset_error const &) {
+         std::cout <<"--- not supported" << std::endl;
+     }
+
+     try {
+         std::cout << "-- cp932" << std::endl;
+         for(skip_case const *c = cp932_cases; c != cp932_end; ++c)
+             test_skip(c->input, c->expected, "cp932", c->alternative);
+     }
+     catch(conv::invalid_charset_error const &) {
+         std::cout <<"--- not supported" << std::endl;
+     }
+ }
+
+
int main()
{
try {
@@ -353,6 +396,8 @@
#if !defined(BOOST_LOCALE_WITH_ICU) && !defined(BOOST_LOCALE_WITH_ICONV) && (defined(BOOST_WINDOWS) || defined(__CYGWIN__))
test_iso_8859_8 = IsValidCodePage(28598)!=0;
#endif
+
+ test_simple_conversions();
for(int type = 0; type < int(def.size()); type ++ ) {
Boost-Commit list run by bdawes at acm.org, david.abrahams at rcn.com, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk