|
Boost-Commit : |
Subject: [Boost-commit] svn:boost r59677 - in trunk/tools/quickbook: detail test
From: daniel_james_at_[hidden]
Date: 2010-02-14 08:09:53
Author: danieljames
Date: 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
New Revision: 59677
URL: http://svn.boost.org/trac/boost/changeset/59677
Log:
Support UTF-8 BOM if present.
I'm not sure if the utf-16/32 checks are the right thing to do, but as
quickbook only supports utf-8 for now, they'll work okay. Hopefully if
it ever supports other encodings, this should be offloaded to an
appropriate library.
Added:
trunk/tools/quickbook/test/utf-16be-bom.quickbook (contents, props changed)
trunk/tools/quickbook/test/utf-16le-bom.quickbook (contents, props changed)
trunk/tools/quickbook/test/utf-8-bom.gold (contents, props changed)
trunk/tools/quickbook/test/utf-8-bom.quickbook (contents, props changed)
trunk/tools/quickbook/test/utf-8.gold (contents, props changed)
trunk/tools/quickbook/test/utf-8.quickbook (contents, props changed)
Text files modified:
trunk/tools/quickbook/detail/utils.cpp | 75 +++++++++++++++++++++++++++++++++++++--
trunk/tools/quickbook/test/Jamfile.v2 | 7 ++-
2 files changed, 75 insertions(+), 7 deletions(-)
Modified: trunk/tools/quickbook/detail/utils.cpp
==============================================================================
--- trunk/tools/quickbook/detail/utils.cpp (original)
+++ trunk/tools/quickbook/detail/utils.cpp 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
@@ -181,13 +181,74 @@
}
}
+ // Read the first few bytes in a file to see if it starts with a byte order
+ // mark. If it doesn't, then write the characters we've already read in.
+ // Although, given how UTF-8 works, if we've read anything in, the file's
+ // probably broken.
+
+ template <class InputIterator, class OutputIterator>
+ bool check_bom(InputIterator& begin, InputIterator end,
+ OutputIterator out, char const* chars, int length)
+ {
+ char const* ptr = chars;
+
+ while(begin != end && *begin == *ptr) {
+ ++begin;
+ ++ptr;
+ --length;
+ if(length == 0) return true;
+ }
+
+ // Failed to match, so write the skipped characters to storage:
+ while(chars != ptr) *out++ = *chars++;
+
+ return false;
+ }
+
+ template <class InputIterator, class OutputIterator>
+ std::string read_bom(InputIterator& begin, InputIterator end,
+ OutputIterator out)
+ {
+ if(begin == end) return "";
+
+ const char utf8[] = {0xef, 0xbb, 0xbf};
+ const char utf32be[] = {0, 0, 0xfe, 0xff};
+ const char utf32le[] = {0xff, 0xfe, 0, 0};
+
+ unsigned char c = *begin;
+ switch(c)
+ {
+ case 0xEF: { // UTF-8
+ return check_bom(begin, end, out, utf8, 3) ? "UTF-8" : "";
+ }
+ case 0xFF: // UTF-16/UTF-32 little endian
+ return !check_bom(begin, end, out, utf32le, 2) ? "" :
+ check_bom(begin, end, out, utf32le + 2, 2) ? "UTF-32" : "UTF-16";
+ case 0: // UTF-32 big endian
+ return check_bom(begin, end, out, utf32be, 4) ? "UTF-32" : "";
+ case 0xFE: // UTF-16 big endian
+ return check_bom(begin, end, out, utf32be + 2, 2) ? "UTF-16" : "";
+ default:
+ return "";
+ }
+ }
+
// Copy a string, converting mac and windows style newlines to unix
// newlines.
template <class InputIterator, class OutputIterator>
- void normalize_newlines(InputIterator begin, InputIterator end,
- OutputIterator out)
+ bool normalize(InputIterator begin, InputIterator end,
+ OutputIterator out, std::string const& filename)
{
+ std::string encoding = read_bom(begin, end, out);
+
+ if(encoding != "UTF-8" && encoding != "") {
+ outerr(filename) << encoding << " is not supported. Please use UTF-8."
+ << std::endl;
+
+ return false;
+ }
+
while(begin != end) {
if(*begin == '\r') {
*out++ = '\n';
@@ -198,6 +259,8 @@
*out++ = *begin++;
}
}
+
+ return true;
}
int load(std::string const& filename, std::string& storage)
@@ -219,10 +282,14 @@
// Turn off white space skipping on the stream
in.unsetf(ios::skipws);
- normalize_newlines(
+ if(!normalize(
istream_iterator<char>(in),
istream_iterator<char>(),
- std::back_inserter(storage));
+ std::back_inserter(storage),
+ filename))
+ {
+ return 1;
+ }
// ensure that we have enough trailing newlines to eliminate
// the need to check for end of file in the grammar.
Modified: trunk/tools/quickbook/test/Jamfile.v2
==============================================================================
--- trunk/tools/quickbook/test/Jamfile.v2 (original)
+++ trunk/tools/quickbook/test/Jamfile.v2 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
@@ -43,7 +43,8 @@
[ quickbook-fail-test fail-parse-error1 ]
[ quickbook-fail-test fail-parse-error2 ]
[ quickbook-fail-test fail-template-lookup1 ]
+ [ quickbook-test utf-8 ]
+ [ quickbook-test utf-8-bom ]
+ [ quickbook-fail-test utf-16be-bom ]
+ [ quickbook-fail-test utf-16le-bom ]
;
-
-
-
Added: trunk/tools/quickbook/test/utf-16be-bom.quickbook
==============================================================================
Binary files (empty file) and trunk/tools/quickbook/test/utf-16be-bom.quickbook 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010) differ
Added: trunk/tools/quickbook/test/utf-16le-bom.quickbook
==============================================================================
Binary files (empty file) and trunk/tools/quickbook/test/utf-16le-bom.quickbook 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010) differ
Added: trunk/tools/quickbook/test/utf-8-bom.gold
==============================================================================
--- (empty file)
+++ trunk/tools/quickbook/test/utf-8-bom.gold 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
@@ -0,0 +1,85 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE library PUBLIC "-//Boost//DTD BoostBook XML V1.0//EN" "http://www.boost.org/tools/boostbook/dtd/boostbook.dtd">
+<article id="utf_8_test" last-revision="DEBUG MODE Date: 2000/12/20 12:00:00 $" xmlns:xi="http://www.w3.org/2001/XInclude">
+ <title>UTF-8 test</title>
+ <articleinfo>
+ </articleinfo>
+ <anchor id="utf_8_test.i__t__rn__ti__n__liz__ti__n"/>
+ <bridgehead renderas="sect2">
+ <link linkend="utf_8_test.i__t__rn__ti__n__liz__ti__n">Iñtërnâtiônàlizætiøn</link>
+ </bridgehead>
+ <itemizedlist>
+ <listitem>
+ Αα Alpha
+ </listitem>
+ <listitem>
+ Ββ Beta
+ </listitem>
+ <listitem>
+ Γγ Gamma
+ </listitem>
+ <listitem>
+ Δδ Delta
+ </listitem>
+ <listitem>
+ Εε Epsilon
+ </listitem>
+ <listitem>
+ Ζζ Zeta
+ </listitem>
+ <listitem>
+ Ηη Eta
+ </listitem>
+ <listitem>
+ Θθ Theta
+ </listitem>
+ <listitem>
+ Ιι Iota
+ </listitem>
+ <listitem>
+ Κκ Kappa
+ </listitem>
+ <listitem>
+ Λλ Lambda
+ </listitem>
+ <listitem>
+ Μμ Mu
+ </listitem>
+ <listitem>
+ Νν Nu
+ </listitem>
+ <listitem>
+ Ξξ Xi
+ </listitem>
+ <listitem>
+ Οο Omicron
+ </listitem>
+ <listitem>
+ Ππ Pi
+ </listitem>
+ <listitem>
+ Ρρ Rho
+ </listitem>
+ <listitem>
+ Σσς Sigma
+ </listitem>
+ <listitem>
+ Ττ Tau
+ </listitem>
+ <listitem>
+ Υυ Upsilon
+ </listitem>
+ <listitem>
+ Φφ Phi
+ </listitem>
+ <listitem>
+ Χχ Chi
+ </listitem>
+ <listitem>
+ Ψψ Psi
+ </listitem>
+ <listitem>
+ Ωω Omega
+ </listitem>
+ </itemizedlist>
+</article>
Added: trunk/tools/quickbook/test/utf-8-bom.quickbook
==============================================================================
--- (empty file)
+++ trunk/tools/quickbook/test/utf-8-bom.quickbook 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
@@ -0,0 +1,30 @@
+[article UTF-8 test
+ [quickbook 1.5]
+]
+
+[heading Iñtërnâtiônàlizætiøn]
+
+* Αα Alpha
+* Ββ Beta
+* Γγ Gamma
+* Δδ Delta
+* Εε Epsilon
+* Ζζ Zeta
+* Ηη Eta
+* Θθ Theta
+* Ιι Iota
+* Κκ Kappa
+* Λλ Lambda
+* Μμ Mu
+* Νν Nu
+* Ξξ Xi
+* Οο Omicron
+* Ππ Pi
+* Ρρ Rho
+* Σσς Sigma
+* Ττ Tau
+* Υυ Upsilon
+* Φφ Phi
+* Χχ Chi
+* Ψψ Psi
+* Ωω Omega
\ No newline at end of file
Added: trunk/tools/quickbook/test/utf-8.gold
==============================================================================
--- (empty file)
+++ trunk/tools/quickbook/test/utf-8.gold 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
@@ -0,0 +1,85 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE library PUBLIC "-//Boost//DTD BoostBook XML V1.0//EN" "http://www.boost.org/tools/boostbook/dtd/boostbook.dtd">
+<article id="utf_8_test" last-revision="DEBUG MODE Date: 2000/12/20 12:00:00 $" xmlns:xi="http://www.w3.org/2001/XInclude">
+ <title>UTF-8 test</title>
+ <articleinfo>
+ </articleinfo>
+ <anchor id="utf_8_test.i__t__rn__ti__n__liz__ti__n"/>
+ <bridgehead renderas="sect2">
+ <link linkend="utf_8_test.i__t__rn__ti__n__liz__ti__n">Iñtërnâtiônàlizætiøn</link>
+ </bridgehead>
+ <itemizedlist>
+ <listitem>
+ Αα Alpha
+ </listitem>
+ <listitem>
+ Ββ Beta
+ </listitem>
+ <listitem>
+ Γγ Gamma
+ </listitem>
+ <listitem>
+ Δδ Delta
+ </listitem>
+ <listitem>
+ Εε Epsilon
+ </listitem>
+ <listitem>
+ Ζζ Zeta
+ </listitem>
+ <listitem>
+ Ηη Eta
+ </listitem>
+ <listitem>
+ Θθ Theta
+ </listitem>
+ <listitem>
+ Ιι Iota
+ </listitem>
+ <listitem>
+ Κκ Kappa
+ </listitem>
+ <listitem>
+ Λλ Lambda
+ </listitem>
+ <listitem>
+ Μμ Mu
+ </listitem>
+ <listitem>
+ Νν Nu
+ </listitem>
+ <listitem>
+ Ξξ Xi
+ </listitem>
+ <listitem>
+ Οο Omicron
+ </listitem>
+ <listitem>
+ Ππ Pi
+ </listitem>
+ <listitem>
+ Ρρ Rho
+ </listitem>
+ <listitem>
+ Σσς Sigma
+ </listitem>
+ <listitem>
+ Ττ Tau
+ </listitem>
+ <listitem>
+ Υυ Upsilon
+ </listitem>
+ <listitem>
+ Φφ Phi
+ </listitem>
+ <listitem>
+ Χχ Chi
+ </listitem>
+ <listitem>
+ Ψψ Psi
+ </listitem>
+ <listitem>
+ Ωω Omega
+ </listitem>
+ </itemizedlist>
+</article>
Added: trunk/tools/quickbook/test/utf-8.quickbook
==============================================================================
--- (empty file)
+++ trunk/tools/quickbook/test/utf-8.quickbook 2010-02-14 08:09:52 EST (Sun, 14 Feb 2010)
@@ -0,0 +1,30 @@
+[article UTF-8 test
+ [quickbook 1.5]
+]
+
+[heading Iñtërnâtiônàlizætiøn]
+
+* Αα Alpha
+* Ββ Beta
+* Γγ Gamma
+* Δδ Delta
+* Εε Epsilon
+* Ζζ Zeta
+* Ηη Eta
+* Θθ Theta
+* Ιι Iota
+* Κκ Kappa
+* Λλ Lambda
+* Μμ Mu
+* Νν Nu
+* Ξξ Xi
+* Οο Omicron
+* Ππ Pi
+* Ρρ Rho
+* Σσς Sigma
+* Ττ Tau
+* Υυ Upsilon
+* Φφ Phi
+* Χχ Chi
+* Ψψ Psi
+* Ωω Omega
\ No newline at end of file
Boost-Commit list run by bdawes at acm.org, david.abrahams at rcn.com, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk