Boost logo

Boost-Commit :

Subject: [Boost-commit] svn:boost r76302 - in branches/release/libs/locale: . src src/encoding test
From: artyomtnk_at_[hidden]
Date: 2012-01-04 05:58:36


Author: artyom
Date: 2012-01-04 05:58:35 EST (Wed, 04 Jan 2012)
New Revision: 76302
URL: http://svn.boost.org/trac/boost/changeset/76302

Log:
Merged changesets 75594,75601 from trunk: fixes incorrect use of
MultiByteToWideChar in detection of invalid sequences.

Properties modified:
   branches/release/libs/locale/ (props changed)
   branches/release/libs/locale/src/ (props changed)
Text files modified:
   branches/release/libs/locale/src/encoding/wconv_codepage.ipp | 87 ++++++++++++++++++++++++++++++---------
   branches/release/libs/locale/test/test_codepage.cpp | 47 +++++++++++++++++++++
   2 files changed, 113 insertions(+), 21 deletions(-)

Modified: branches/release/libs/locale/src/encoding/wconv_codepage.ipp
==============================================================================
--- branches/release/libs/locale/src/encoding/wconv_codepage.ipp (original)
+++ branches/release/libs/locale/src/encoding/wconv_codepage.ipp 2012-01-04 05:58:35 EST (Wed, 04 Jan 2012)
@@ -86,12 +86,6 @@
         { "windows932", 932, 0 },
     };
 
- size_t remove_substitutions(std::vector<wchar_t> &v)
- {
- v.erase(std::remove(v.begin(), v.end(), wchar_t(0xFFFD)), v.end());
- return v.size();
- }
-
     size_t remove_substitutions(std::vector<char> &v)
     {
         if(std::find(v.begin(),v.end(),0) == v.end()) {
@@ -107,23 +101,39 @@
         return v.size();
     }
 
+ void multibyte_to_wide_one_by_one(int codepage,char const *begin,char const *end,std::vector<wchar_t> &buf)
+ {
+ buf.reserve(end-begin);
+ while(begin!=end) {
+ wchar_t wide_buf[4];
+ int n = 0;
+ int len = IsDBCSLeadByteEx(codepage,*begin) ? 2 : 1;
+ if(len == 2 && begin+1==end)
+ return;
+ n = MultiByteToWideChar(codepage,MB_ERR_INVALID_CHARS,begin,len,wide_buf,4);
+ for(int i=0;i<n;i++)
+ buf.push_back(wide_buf[i]);
+ begin+=len;
+ }
+ }
+
     
     void multibyte_to_wide(int codepage,char const *begin,char const *end,bool do_skip,std::vector<wchar_t> &buf)
     {
         if(begin==end)
             return;
- DWORD flags = do_skip ? 0 : MB_ERR_INVALID_CHARS;
- if(50220 <= codepage && codepage <= 50229)
- flags = 0;
-
- int n = MultiByteToWideChar(codepage,flags,begin,end-begin,0,0);
- if(n == 0)
+ int n = MultiByteToWideChar(codepage,MB_ERR_INVALID_CHARS,begin,end-begin,0,0);
+ if(n == 0) {
+ if(do_skip) {
+ multibyte_to_wide_one_by_one(codepage,begin,end,buf);
+ return;
+ }
             throw conversion_error();
+ }
+
         buf.resize(n,0);
- if(MultiByteToWideChar(codepage,flags,begin,end-begin,&buf.front(),buf.size())==0)
+ if(MultiByteToWideChar(codepage,MB_ERR_INVALID_CHARS,begin,end-begin,&buf.front(),buf.size())==0)
             throw conversion_error();
- if(do_skip)
- remove_substitutions(buf);
     }
 
     void wide_to_multibyte_non_zero(int codepage,wchar_t const *begin,wchar_t const *end,bool do_skip,std::vector<char> &buf)
@@ -256,13 +266,37 @@
         }
         virtual std::string convert(char const *begin,char const *end)
         {
+ if(to_code_page_ == 65001 && from_code_page_ == 65001)
+ return utf_to_utf<char>(begin,end,how_);
+
             std::string res;
- std::vector<wchar_t> tmp;
- multibyte_to_wide(from_code_page_,begin,end,how_ == skip,tmp);
- if(tmp.empty())
- return res;
+
+ std::vector<wchar_t> tmp; // buffer for mb2w
+ std::wstring tmps; // buffer for utf_to_utf
+ wchar_t const *wbegin=0;
+ wchar_t const *wend=0;
+
+ if(from_code_page_ == 65001) {
+ tmps = utf_to_utf<wchar_t>(begin,end,how_);
+ if(tmps.empty())
+ return res;
+ wbegin = tmps.c_str();
+ wend = wbegin + tmps.size();
+ }
+ else {
+ multibyte_to_wide(from_code_page_,begin,end,how_ == skip,tmp);
+ if(tmp.empty())
+ return res;
+ wbegin = &tmp[0];
+ wend = wbegin + tmp.size();
+ }
+
+ if(to_code_page_ == 65001) {
+ return utf_to_utf<char>(wbegin,wend,how_);
+ }
+
             std::vector<char> ctmp;
- wide_to_multibyte(to_code_page_,&tmp.front(),&tmp.front()+tmp.size(),how_ == skip,ctmp);
+ wide_to_multibyte(to_code_page_,wbegin,wend,how_ == skip,ctmp);
             if(ctmp.empty())
                 return res;
             res.assign(&ctmp.front(),ctmp.size());
@@ -328,6 +362,9 @@
 
         virtual string_type convert(char const *begin,char const *end)
         {
+ if(code_page_ == 65001) {
+ return utf_to_utf<char_type>(begin,end,how_);
+ }
             std::vector<wchar_t> tmp;
             multibyte_to_wide(code_page_,begin,end,how_ == skip,tmp);
             string_type res;
@@ -363,6 +400,9 @@
 
         virtual std::string convert(CharType const *begin,CharType const *end)
         {
+ if(code_page_ == 65001) {
+ return utf_to_utf<char>(begin,end,how_);
+ }
             wchar_t const *wbegin = 0;
             wchar_t const *wend = 0;
             std::vector<wchar_t> buffer; // if needed
@@ -424,9 +464,11 @@
 
         virtual string_type convert(char const *begin,char const *end)
         {
+ if(code_page_ == 65001) {
+ return utf_to_utf<char_type>(begin,end,how_);
+ }
             std::vector<wchar_t> buf;
             multibyte_to_wide(code_page_,begin,end,how_ == skip,buf);
- remove_substitutions(buf);
 
             if(buf.empty())
                 return string_type();
@@ -460,6 +502,9 @@
 
         virtual std::string convert(CharType const *begin,CharType const *end)
         {
+ if(code_page_ == 65001) {
+ return utf_to_utf<char>(begin,end,how_);
+ }
             std::wstring tmp = utf_to_utf<wchar_t>(begin,end,how_);
 
             std::vector<char> ctmp;

Modified: branches/release/libs/locale/test/test_codepage.cpp
==============================================================================
--- branches/release/libs/locale/test/test_codepage.cpp (original)
+++ branches/release/libs/locale/test/test_codepage.cpp 2012-01-04 05:58:35 EST (Wed, 04 Jan 2012)
@@ -333,6 +333,51 @@
 }
 
 
+void test_skip(char const *enc,char const *utf,char const *name,char const *opt=0)
+{
+ if(opt!=0) {
+ if(boost::locale::conv::to_utf<char>(enc,name) == opt) {
+ test_skip(enc,opt,name);
+ return;
+ }
+ }
+ TEST(boost::locale::conv::to_utf<char>(enc,name) == utf);
+ TEST(boost::locale::conv::to_utf<wchar_t>(enc,name) == boost::locale::conv::utf_to_utf<wchar_t>(utf));
+ #ifdef BOOST_HAS_CHAR16_T
+ TEST(boost::locale::conv::to_utf<char16_t>(enc,name) == boost::locale::conv::utf_to_utf<char16_t>(utf));
+ #endif
+ #ifdef BOOST_HAS_CHAR32_T
+ TEST(boost::locale::conv::to_utf<char32_t>(enc,name) == boost::locale::conv::utf_to_utf<char32_t>(utf));
+ #endif
+}
+
+void test_simple_conversions()
+{
+ namespace blc=boost::locale::conv;
+ std::cout << "- Testing correct invalid bytes skipping" << std::endl;
+ try {
+ std::cout << "-- ISO-8859-8" << std::endl;
+ test_skip("test \xE0\xE1\xFB-","test \xd7\x90\xd7\x91-","ISO-8859-8");
+ test_skip("\xFB","","ISO-8859-8");
+ test_skip("test \xE0\xE1\xFB","test \xd7\x90\xd7\x91","ISO-8859-8");
+ test_skip("\xFB-","-","ISO-8859-8");
+ }
+ catch(blc::invalid_charset_error const &) {
+ std::cout <<"--- not supported" << std::endl;
+ }
+ try {
+ std::cout << "-- cp932" << std::endl;
+ test_skip("test\xE0\xA0 \x83\xF8-","test\xe7\x87\xbf -","cp932","test\xe7\x87\xbf ");
+ test_skip("\x83\xF8","","cp932");
+ test_skip("test\xE0\xA0 \x83\xF8","test\xe7\x87\xbf ","cp932");
+ test_skip("\x83\xF8-","-","cp932","");
+ }
+ catch(blc::invalid_charset_error const &) {
+ std::cout <<"--- not supported" << std::endl;
+ }
+}
+
+
 int main()
 {
     try {
@@ -353,6 +398,8 @@
         #if !defined(BOOST_LOCALE_WITH_ICU) && !defined(BOOST_LOCALE_WITH_ICONV) && (defined(BOOST_WINDOWS) || defined(__CYGWIN__))
         test_iso_8859_8 = IsValidCodePage(28598)!=0;
         #endif
+
+ test_simple_conversions();
         
         
         for(int type = 0; type < int(def.size()); type ++ ) {


Boost-Commit list run by bdawes at acm.org, david.abrahams at rcn.com, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk