Boost logo

Boost-Commit :

Subject: [Boost-commit] svn:boost r59677 - in trunk/tools/quickbook: detail test
From: daniel_james_at_[hidden]
Date: 2010-02-14 08:09:53


Author: danieljames
Date: 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
New Revision: 59677
URL: http://svn.boost.org/trac/boost/changeset/59677

Log:
Support UTF-8 BOM if present.

I'm not sure if the utf-16/32 checks are the right thing to do, but as
quickbook only supports utf-8 for now, they'll work okay. Hopefully, if
it ever supports other encodings, this will be offloaded to an
appropriate library.
Added:
   trunk/tools/quickbook/test/utf-16be-bom.quickbook (contents, props changed)
   trunk/tools/quickbook/test/utf-16le-bom.quickbook (contents, props changed)
   trunk/tools/quickbook/test/utf-8-bom.gold (contents, props changed)
   trunk/tools/quickbook/test/utf-8-bom.quickbook (contents, props changed)
   trunk/tools/quickbook/test/utf-8.gold (contents, props changed)
   trunk/tools/quickbook/test/utf-8.quickbook (contents, props changed)
Text files modified:
   trunk/tools/quickbook/detail/utils.cpp | 75 +++++++++++++++++++++++++++++++++++++--
   trunk/tools/quickbook/test/Jamfile.v2 | 7 ++-
   2 files changed, 75 insertions(+), 7 deletions(-)

Modified: trunk/tools/quickbook/detail/utils.cpp
==============================================================================
--- trunk/tools/quickbook/detail/utils.cpp (original)
+++ trunk/tools/quickbook/detail/utils.cpp 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
@@ -181,13 +181,74 @@
         }
     }
 
+ // Read the first few bytes in a file to see if it starts with a byte order
+ // mark. If it doesn't, then write the characters we've already read in.
+ // Although, given how UTF-8 works, if we've read anything in, the file's
+ // probably broken.
+
+ // Try to consume the `length`-byte sequence `chars` from the front of
+ // [begin, end).
+ //
+ // `begin` is advanced past every byte that matched. Returns true as soon
+ // as all `length` bytes have matched. On a mismatch, or if the input ends
+ // early, the bytes that did match (and so have already been consumed from
+ // the input) are written to `out` so they are not lost, and false is
+ // returned. A non-positive `length` never matches and consumes nothing.
+ template <class InputIterator, class OutputIterator>
+ bool check_bom(InputIterator& begin, InputIterator end,
+     OutputIterator out, char const* chars, int length)
+ {
+     char const* ptr = chars;
+
+     // Compare as unsigned bytes so that mark values >= 0x80 match whether
+     // the iterator's value type is a signed or an unsigned character. The
+     // `length > 0` guard stops a zero or negative length from walking past
+     // the end of `chars`.
+     while(length > 0 && begin != end &&
+         static_cast<unsigned char>(*begin) ==
+             static_cast<unsigned char>(*ptr))
+     {
+         ++begin;
+         ++ptr;
+         if(--length == 0) return true;
+     }
+
+     // Failed to match, so write the skipped characters to storage:
+     while(chars != ptr) *out++ = *chars++;
+
+     return false;
+ }
+
+ // Detect a byte order mark at the front of [begin, end).
+ //
+ // Returns "UTF-8", "UTF-16" or "UTF-32" when the corresponding mark is
+ // found, in which case `begin` has been advanced past it. Returns "" when
+ // the input is empty or does not start with a recognised mark. Bytes that
+ // were consumed while testing for a mark that turned out not to be there
+ // are written to `out` by check_bom.
+ template <class InputIterator, class OutputIterator>
+ std::string read_bom(InputIterator& begin, InputIterator end,
+     OutputIterator out)
+ {
+     if(begin == end) return "";
+
+     // Character escapes rather than integer literals: values above 0x7f do
+     // not fit in a signed char, so a braced list of ints is a narrowing
+     // conversion (ill-formed since C++11).
+     const char utf8[] = {'\xef', '\xbb', '\xbf'};
+     const char utf32be[] = {'\0', '\0', '\xfe', '\xff'};
+     const char utf32le[] = {'\xff', '\xfe', '\0', '\0'};
+
+     // Dispatch on the first byte, read as unsigned so the case labels match.
+     unsigned char c = *begin;
+     switch(c)
+     {
+     case 0xEF: // UTF-8
+         return check_bom(begin, end, out, utf8, 3) ? "UTF-8" : "";
+     case 0xFF: // UTF-16/UTF-32 little endian
+         // FF FE on its own is UTF-16; FF FE 00 00 is UTF-32.
+         return !check_bom(begin, end, out, utf32le, 2) ? "" :
+             check_bom(begin, end, out, utf32le + 2, 2) ? "UTF-32" : "UTF-16";
+     case 0: // UTF-32 big endian
+         return check_bom(begin, end, out, utf32be, 4) ? "UTF-32" : "";
+     case 0xFE: // UTF-16 big endian (the tail of the UTF-32 BE mark)
+         return check_bom(begin, end, out, utf32be + 2, 2) ? "UTF-16" : "";
+     default:
+         return "";
+     }
+ }
+
     // Copy a string, converting mac and windows style newlines to unix
     // newlines.
 
     template <class InputIterator, class OutputIterator>
- void normalize_newlines(InputIterator begin, InputIterator end,
- OutputIterator out)
+ bool normalize(InputIterator begin, InputIterator end,
+ OutputIterator out, std::string const& filename)
     {
+ std::string encoding = read_bom(begin, end, out);
+
+ if(encoding != "UTF-8" && encoding != "") {
+ outerr(filename) << encoding << " is not supported. Please use UTF-8."
+ << std::endl;
+
+ return false;
+ }
+
         while(begin != end) {
             if(*begin == '\r') {
                 *out++ = '\n';
@@ -198,6 +259,8 @@
                 *out++ = *begin++;
             }
         }
+
+ return true;
     }
 
     int load(std::string const& filename, std::string& storage)
@@ -219,10 +282,14 @@
         // Turn off white space skipping on the stream
         in.unsetf(ios::skipws);
 
- normalize_newlines(
+ if(!normalize(
             istream_iterator<char>(in),
             istream_iterator<char>(),
- std::back_inserter(storage));
+ std::back_inserter(storage),
+ filename))
+ {
+ return 1;
+ }
 
         // ensure that we have enough trailing newlines to eliminate
         // the need to check for end of file in the grammar.

Modified: trunk/tools/quickbook/test/Jamfile.v2
==============================================================================
--- trunk/tools/quickbook/test/Jamfile.v2 (original)
+++ trunk/tools/quickbook/test/Jamfile.v2 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
@@ -43,7 +43,8 @@
     [ quickbook-fail-test fail-parse-error1 ]
     [ quickbook-fail-test fail-parse-error2 ]
     [ quickbook-fail-test fail-template-lookup1 ]
+ [ quickbook-test utf-8 ]
+ [ quickbook-test utf-8-bom ]
+ [ quickbook-fail-test utf-16be-bom ]
+ [ quickbook-fail-test utf-16le-bom ]
     ;
-
-
-

Added: trunk/tools/quickbook/test/utf-16be-bom.quickbook
==============================================================================
Binary files (empty file) and trunk/tools/quickbook/test/utf-16be-bom.quickbook 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010) differ

Added: trunk/tools/quickbook/test/utf-16le-bom.quickbook
==============================================================================
Binary files (empty file) and trunk/tools/quickbook/test/utf-16le-bom.quickbook 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010) differ

Added: trunk/tools/quickbook/test/utf-8-bom.gold
==============================================================================
--- (empty file)
+++ trunk/tools/quickbook/test/utf-8-bom.gold 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
@@ -0,0 +1,85 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE library PUBLIC "-//Boost//DTD BoostBook XML V1.0//EN" "http://www.boost.org/tools/boostbook/dtd/boostbook.dtd">
+<article id="utf_8_test" last-revision="DEBUG MODE Date: 2000/12/20 12:00:00 $" xmlns:xi="http://www.w3.org/2001/XInclude">
+ <title>UTF-8 test</title>
+ <articleinfo>
+ </articleinfo>
+ <anchor id="utf_8_test.i__t__rn__ti__n__liz__ti__n"/>
+ <bridgehead renderas="sect2">
+ <link linkend="utf_8_test.i__t__rn__ti__n__liz__ti__n">Iñtërnâtiônàlizætiøn</link>
+ </bridgehead>
+ <itemizedlist>
+ <listitem>
+ Αα Alpha
+ </listitem>
+ <listitem>
+ Ββ Beta
+ </listitem>
+ <listitem>
+ Γγ Gamma
+ </listitem>
+ <listitem>
+ Δδ Delta
+ </listitem>
+ <listitem>
+ Εε Epsilon
+ </listitem>
+ <listitem>
+ Ζζ Zeta
+ </listitem>
+ <listitem>
+ Ηη Eta
+ </listitem>
+ <listitem>
+ Θθ Theta
+ </listitem>
+ <listitem>
+ Ιι Iota
+ </listitem>
+ <listitem>
+ Κκ Kappa
+ </listitem>
+ <listitem>
+ Λλ Lambda
+ </listitem>
+ <listitem>
+ Μμ Mu
+ </listitem>
+ <listitem>
+ Νν Nu
+ </listitem>
+ <listitem>
+ Ξξ Xi
+ </listitem>
+ <listitem>
+ Οο Omicron
+ </listitem>
+ <listitem>
+ Ππ Pi
+ </listitem>
+ <listitem>
+ Ρρ Rho
+ </listitem>
+ <listitem>
+ Σσς Sigma
+ </listitem>
+ <listitem>
+ Ττ Tau
+ </listitem>
+ <listitem>
+ Υυ Upsilon
+ </listitem>
+ <listitem>
+ Φφ Phi
+ </listitem>
+ <listitem>
+ Χχ Chi
+ </listitem>
+ <listitem>
+ Ψψ Psi
+ </listitem>
+ <listitem>
+ Ωω Omega
+ </listitem>
+ </itemizedlist>
+</article>

Added: trunk/tools/quickbook/test/utf-8-bom.quickbook
==============================================================================
--- (empty file)
+++ trunk/tools/quickbook/test/utf-8-bom.quickbook 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
@@ -0,0 +1,30 @@
+[article UTF-8 test
+ [quickbook 1.5]
+]
+
+[heading Iñtërnâtiônàlizætiøn]
+
+* Αα Alpha
+* Ββ Beta
+* Γγ Gamma
+* Δδ Delta
+* Εε Epsilon
+* Ζζ Zeta
+* Ηη Eta
+* Θθ Theta
+* Ιι Iota
+* Κκ Kappa
+* Λλ Lambda
+* Μμ Mu
+* Νν Nu
+* Ξξ Xi
+* Οο Omicron
+* Ππ Pi
+* Ρρ Rho
+* Σσς Sigma
+* Ττ Tau
+* Υυ Upsilon
+* Φφ Phi
+* Χχ Chi
+* Ψψ Psi
+* Ωω Omega
\ No newline at end of file

Added: trunk/tools/quickbook/test/utf-8.gold
==============================================================================
--- (empty file)
+++ trunk/tools/quickbook/test/utf-8.gold 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
@@ -0,0 +1,85 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE library PUBLIC "-//Boost//DTD BoostBook XML V1.0//EN" "http://www.boost.org/tools/boostbook/dtd/boostbook.dtd">
+<article id="utf_8_test" last-revision="DEBUG MODE Date: 2000/12/20 12:00:00 $" xmlns:xi="http://www.w3.org/2001/XInclude">
+ <title>UTF-8 test</title>
+ <articleinfo>
+ </articleinfo>
+ <anchor id="utf_8_test.i__t__rn__ti__n__liz__ti__n"/>
+ <bridgehead renderas="sect2">
+ <link linkend="utf_8_test.i__t__rn__ti__n__liz__ti__n">Iñtërnâtiônàlizætiøn</link>
+ </bridgehead>
+ <itemizedlist>
+ <listitem>
+ Αα Alpha
+ </listitem>
+ <listitem>
+ Ββ Beta
+ </listitem>
+ <listitem>
+ Γγ Gamma
+ </listitem>
+ <listitem>
+ Δδ Delta
+ </listitem>
+ <listitem>
+ Εε Epsilon
+ </listitem>
+ <listitem>
+ Ζζ Zeta
+ </listitem>
+ <listitem>
+ Ηη Eta
+ </listitem>
+ <listitem>
+ Θθ Theta
+ </listitem>
+ <listitem>
+ Ιι Iota
+ </listitem>
+ <listitem>
+ Κκ Kappa
+ </listitem>
+ <listitem>
+ Λλ Lambda
+ </listitem>
+ <listitem>
+ Μμ Mu
+ </listitem>
+ <listitem>
+ Νν Nu
+ </listitem>
+ <listitem>
+ Ξξ Xi
+ </listitem>
+ <listitem>
+ Οο Omicron
+ </listitem>
+ <listitem>
+ Ππ Pi
+ </listitem>
+ <listitem>
+ Ρρ Rho
+ </listitem>
+ <listitem>
+ Σσς Sigma
+ </listitem>
+ <listitem>
+ Ττ Tau
+ </listitem>
+ <listitem>
+ Υυ Upsilon
+ </listitem>
+ <listitem>
+ Φφ Phi
+ </listitem>
+ <listitem>
+ Χχ Chi
+ </listitem>
+ <listitem>
+ Ψψ Psi
+ </listitem>
+ <listitem>
+ Ωω Omega
+ </listitem>
+ </itemizedlist>
+</article>

Added: trunk/tools/quickbook/test/utf-8.quickbook
==============================================================================
--- (empty file)
+++ trunk/tools/quickbook/test/utf-8.quickbook 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
@@ -0,0 +1,30 @@
+[article UTF-8 test
+ [quickbook 1.5]
+]
+
+[heading Iñtërnâtiônàlizætiøn]
+
+* Αα Alpha
+* Ββ Beta
+* Γγ Gamma
+* Δδ Delta
+* Εε Epsilon
+* Ζζ Zeta
+* Ηη Eta
+* Θθ Theta
+* Ιι Iota
+* Κκ Kappa
+* Λλ Lambda
+* Μμ Mu
+* Νν Nu
+* Ξξ Xi
+* Οο Omicron
+* Ππ Pi
+* Ρρ Rho
+* Σσς Sigma
+* Ττ Tau
+* Υυ Upsilon
+* Φφ Phi
+* Χχ Chi
+* Ψψ Psi
+* Ωω Omega
\ No newline at end of file


Boost-Commit list run by bdawes at acm.org, david.abrahams at rcn.com, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk