Boost-Commit :

Date view	Thread view	Subject view	Author view

Subject: [Boost-commit] svn:boost r59677 - in trunk/tools/quickbook: detail test
From: daniel_james_at_[hidden]
Date: 2010-02-14 08:09:53

Author: danieljames
Date: 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
New Revision: 59677
URL: http://svn.boost.org/trac/boost/changeset/59677

Log:
Support UTF-8 BOM if present.

I'm not sure if the utf-16/32 checks are the right thing to do, but as
quickbook only supports utf-8 for now, they'll work okay. Hopefully if
it ever supports other encodings this should be offloaded to an
appropriate library.
Added:
   trunk/tools/quickbook/test/utf-16be-bom.quickbook (contents, props changed)
   trunk/tools/quickbook/test/utf-16le-bom.quickbook (contents, props changed)
   trunk/tools/quickbook/test/utf-8-bom.gold (contents, props changed)
   trunk/tools/quickbook/test/utf-8-bom.quickbook (contents, props changed)
   trunk/tools/quickbook/test/utf-8.gold (contents, props changed)
   trunk/tools/quickbook/test/utf-8.quickbook (contents, props changed)
Text files modified:
   trunk/tools/quickbook/detail/utils.cpp | 75 +++++++++++++++++++++++++++++++++++++--
   trunk/tools/quickbook/test/Jamfile.v2 | 7 ++-
   2 files changed, 75 insertions(+), 7 deletions(-)

Modified: trunk/tools/quickbook/detail/utils.cpp
==============================================================================
--- trunk/tools/quickbook/detail/utils.cpp (original)
+++ trunk/tools/quickbook/detail/utils.cpp 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
@@ -181,13 +181,74 @@
         }
     }

+ // Read the first few bytes in a file to see if it starts with a byte order
+ // mark. If it doesn't, then write the characters we've already read in.
+ // Although, given how UTF-8 works, if we've read anything in, the file's
+ // probably broken.
+
+ template <class InputIterator, class OutputIterator>
+ bool check_bom(InputIterator& begin, InputIterator end,
+ OutputIterator out, char const* chars, int length)
+ {
+ char const* ptr = chars;
+
+ while(begin != end && *begin == *ptr) {
+ ++begin;
+ ++ptr;
+ --length;
+ if(length == 0) return true;
+ }
+
+ // Failed to match, so write the skipped characters to storage:
+ while(chars != ptr) *out++ = *chars++;
+
+ return false;
+ }
+
+ template <class InputIterator, class OutputIterator>
+ std::string read_bom(InputIterator& begin, InputIterator end,
+ OutputIterator out)
+ {
+ if(begin == end) return "";
+
+ const char utf8[] = {0xef, 0xbb, 0xbf};
+ const char utf32be[] = {0, 0, 0xfe, 0xff};
+ const char utf32le[] = {0xff, 0xfe, 0, 0};
+
+ unsigned char c = *begin;
+ switch(c)
+ {
+ case 0xEF: { // UTF-8
+ return check_bom(begin, end, out, utf8, 3) ? "UTF-8" : "";
+ }
+ case 0xFF: // UTF-16/UTF-32 little endian
+ return !check_bom(begin, end, out, utf32le, 2) ? "" :
+ check_bom(begin, end, out, utf32le + 2, 2) ? "UTF-32" : "UTF-16";
+ case 0: // UTF-32 big endian
+ return check_bom(begin, end, out, utf32be, 4) ? "UTF-32" : "";
+ case 0xFE: // UTF-16 big endian
+ return check_bom(begin, end, out, utf32be + 2, 2) ? "UTF-16" : "";
+ default:
+ return "";
+ }
+ }
+
     // Copy a string, converting mac and windows style newlines to unix
     // newlines.

     template <class InputIterator, class OutputIterator>
- void normalize_newlines(InputIterator begin, InputIterator end,
- OutputIterator out)
+ bool normalize(InputIterator begin, InputIterator end,
+ OutputIterator out, std::string const& filename)
     {
+ std::string encoding = read_bom(begin, end, out);
+
+ if(encoding != "UTF-8" && encoding != "") {
+ outerr(filename) << encoding << " is not supported. Please use UTF-8."
+ << std::endl;
+
+ return false;
+ }
+
         while(begin != end) {
             if(*begin == '\r') {
                 *out++ = '\n';
@@ -198,6 +259,8 @@
                 *out++ = *begin++;
             }
         }
+
+ return true;
     }

     int load(std::string const& filename, std::string& storage)
@@ -219,10 +282,14 @@
         // Turn off white space skipping on the stream
         in.unsetf(ios::skipws);

- normalize_newlines(
+ if(!normalize(
             istream_iterator<char>(in),
             istream_iterator<char>(),
- std::back_inserter(storage));
+ std::back_inserter(storage),
+ filename))
+ {
+ return 1;
+ }

         // ensure that we have enough trailing newlines to eliminate
         // the need to check for end of file in the grammar.

Modified: trunk/tools/quickbook/test/Jamfile.v2
==============================================================================
--- trunk/tools/quickbook/test/Jamfile.v2 (original)
+++ trunk/tools/quickbook/test/Jamfile.v2 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
@@ -43,7 +43,8 @@
     [ quickbook-fail-test fail-parse-error1 ]
     [ quickbook-fail-test fail-parse-error2 ]
     [ quickbook-fail-test fail-template-lookup1 ]
+ [ quickbook-test utf-8 ]
+ [ quickbook-test utf-8-bom ]
+ [ quickbook-fail-test utf-16be-bom ]
+ [ quickbook-fail-test utf-16le-bom ]
     ;
-
-
-

Added: trunk/tools/quickbook/test/utf-16be-bom.quickbook
==============================================================================
Binary files (empty file) and trunk/tools/quickbook/test/utf-16be-bom.quickbook 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010) differ

Added: trunk/tools/quickbook/test/utf-16le-bom.quickbook
==============================================================================
Binary files (empty file) and trunk/tools/quickbook/test/utf-16le-bom.quickbook 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010) differ

Added: trunk/tools/quickbook/test/utf-8-bom.gold
==============================================================================
--- (empty file)
+++ trunk/tools/quickbook/test/utf-8-bom.gold 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
@@ -0,0 +1,85 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE library PUBLIC "-//Boost//DTD BoostBook XML V1.0//EN" "http://www.boost.org/tools/boostbook/dtd/boostbook.dtd">
+<article id="utf_8_test" last-revision="DEBUG MODE Date: 2000/12/20 12:00:00 $" xmlns:xi="http://www.w3.org/2001/XInclude">
+ <title>UTF-8 test</title>
+ <articleinfo>
+ </articleinfo>
+ <anchor id="utf_8_test.i__t__rn__ti__n__liz__ti__n"/>
+ <bridgehead renderas="sect2">
+ <link linkend="utf_8_test.i__t__rn__ti__n__liz__ti__n">Iñtërnâtiônàlizætiøn</link>
+ </bridgehead>
+ <itemizedlist>
+ <listitem>
+ Αα Alpha
+ </listitem>
+ <listitem>
+ Ββ Beta
+ </listitem>
+ <listitem>
+ Γγ Gamma
+ </listitem>
+ <listitem>
+ Δδ Delta
+ </listitem>
+ <listitem>
+ Εε Epsilon
+ </listitem>
+ <listitem>
+ Ζζ Zeta
+ </listitem>
+ <listitem>
+ Ηη Eta
+ </listitem>
+ <listitem>
+ Θθ Theta
+ </listitem>
+ <listitem>
+ Ιι Iota
+ </listitem>
+ <listitem>
+ Κκ Kappa
+ </listitem>
+ <listitem>
+ Λλ Lambda
+ </listitem>
+ <listitem>
+ Μμ Mu
+ </listitem>
+ <listitem>
+ Νν Nu
+ </listitem>
+ <listitem>
+ Ξξ Xi
+ </listitem>
+ <listitem>
+ Οο Omicron
+ </listitem>
+ <listitem>
+ Ππ Pi
+ </listitem>
+ <listitem>
+ Ρρ Rho
+ </listitem>
+ <listitem>
+ Σσς Sigma
+ </listitem>
+ <listitem>
+ Ττ Tau
+ </listitem>
+ <listitem>
+ Υυ Upsilon
+ </listitem>
+ <listitem>
+ Φφ Phi
+ </listitem>
+ <listitem>
+ Χχ Chi
+ </listitem>
+ <listitem>
+ Ψψ Psi
+ </listitem>
+ <listitem>
+ Ωω Omega
+ </listitem>
+ </itemizedlist>
+</article>

Added: trunk/tools/quickbook/test/utf-8-bom.quickbook
==============================================================================
--- (empty file)
+++ trunk/tools/quickbook/test/utf-8-bom.quickbook 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
@@ -0,0 +1,30 @@
+ï»¿[article UTF-8 test
+ [quickbook 1.5]
+]
+
+[heading Iñtërnâtiônàlizætiøn]
+
+* Αα Alpha
+* Ββ Beta
+* Γγ Gamma
+* Δδ Delta
+* Εε Epsilon
+* Ζζ Zeta
+* Ηη Eta
+* Θθ Theta
+* Ιι Iota
+* Κκ Kappa
+* Λλ Lambda
+* Μμ Mu
+* Νν Nu
+* Ξξ Xi
+* Οο Omicron
+* Ππ Pi
+* Ρρ Rho
+* Σσς Sigma
+* Ττ Tau
+* Υυ Upsilon
+* Φφ Phi
+* Χχ Chi
+* Ψψ Psi
+* Ωω Omega
\ No newline at end of file

Added: trunk/tools/quickbook/test/utf-8.gold
==============================================================================
--- (empty file)
+++ trunk/tools/quickbook/test/utf-8.gold 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
@@ -0,0 +1,85 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE library PUBLIC "-//Boost//DTD BoostBook XML V1.0//EN" "http://www.boost.org/tools/boostbook/dtd/boostbook.dtd">
+<article id="utf_8_test" last-revision="DEBUG MODE Date: 2000/12/20 12:00:00 $" xmlns:xi="http://www.w3.org/2001/XInclude">
+ <title>UTF-8 test</title>
+ <articleinfo>
+ </articleinfo>
+ <anchor id="utf_8_test.i__t__rn__ti__n__liz__ti__n"/>
+ <bridgehead renderas="sect2">
+ <link linkend="utf_8_test.i__t__rn__ti__n__liz__ti__n">Iñtërnâtiônàlizætiøn</link>
+ </bridgehead>
+ <itemizedlist>
+ <listitem>
+ Αα Alpha
+ </listitem>
+ <listitem>
+ Ββ Beta
+ </listitem>
+ <listitem>
+ Γγ Gamma
+ </listitem>
+ <listitem>
+ Δδ Delta
+ </listitem>
+ <listitem>
+ Εε Epsilon
+ </listitem>
+ <listitem>
+ Ζζ Zeta
+ </listitem>
+ <listitem>
+ Ηη Eta
+ </listitem>
+ <listitem>
+ Θθ Theta
+ </listitem>
+ <listitem>
+ Ιι Iota
+ </listitem>
+ <listitem>
+ Κκ Kappa
+ </listitem>
+ <listitem>
+ Λλ Lambda
+ </listitem>
+ <listitem>
+ Μμ Mu
+ </listitem>
+ <listitem>
+ Νν Nu
+ </listitem>
+ <listitem>
+ Ξξ Xi
+ </listitem>
+ <listitem>
+ Οο Omicron
+ </listitem>
+ <listitem>
+ Ππ Pi
+ </listitem>
+ <listitem>
+ Ρρ Rho
+ </listitem>
+ <listitem>
+ Σσς Sigma
+ </listitem>
+ <listitem>
+ Ττ Tau
+ </listitem>
+ <listitem>
+ Υυ Upsilon
+ </listitem>
+ <listitem>
+ Φφ Phi
+ </listitem>
+ <listitem>
+ Χχ Chi
+ </listitem>
+ <listitem>
+ Ψψ Psi
+ </listitem>
+ <listitem>
+ Ωω Omega
+ </listitem>
+ </itemizedlist>
+</article>

Added: trunk/tools/quickbook/test/utf-8.quickbook
==============================================================================
--- (empty file)
+++ trunk/tools/quickbook/test/utf-8.quickbook 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
@@ -0,0 +1,30 @@
+[article UTF-8 test
+ [quickbook 1.5]
+]
+
+[heading Iñtërnâtiônàlizætiøn]
+
+* Αα Alpha
+* Ββ Beta
+* Γγ Gamma
+* Δδ Delta
+* Εε Epsilon
+* Ζζ Zeta
+* Ηη Eta
+* Θθ Theta
+* Ιι Iota
+* Κκ Kappa
+* Λλ Lambda
+* Μμ Mu
+* Νν Nu
+* Ξξ Xi
+* Οο Omicron
+* Ππ Pi
+* Ρρ Rho
+* Σσς Sigma
+* Ττ Tau
+* Υυ Upsilon
+* Φφ Phi
+* Χχ Chi
+* Ψψ Psi
+* Ωω Omega
\ No newline at end of file

Date view	Thread view	Subject view	Author view

Boost-Commit list run by bdawes at acm.org, david.abrahams at rcn.com, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk