Boost logo

Boost-Commit :

Subject: [Boost-commit] svn:boost r76790 - branches/quickbook-dev/tools/quickbook/src
From: dnljms_at_[hidden]
Date: 2012-01-29 19:06:37


Author: danieljames
Date: 2012-01-29 19:06:36 EST (Sun, 29 Jan 2012)
New Revision: 76790
URL: http://svn.boost.org/trac/boost/changeset/76790

Log:
Quickbook: Fix handling UTF-8 characters in the syntax highlighter.

So far, quickbook has just pretended that it's dealing with a
single-byte encoding. For the most part this works quite well due to the
design of UTF-8, but the syntax highlighter needs to pick out individual
characters correctly, so implement just enough to do that. Note that
this does odd things with invalid UTF-8.
Text files modified:
   branches/quickbook-dev/tools/quickbook/src/parsers.hpp | 42 ++++++++++++++++++++++++++++++++++++++++
   branches/quickbook-dev/tools/quickbook/src/syntax_highlight.cpp | 10 ++++----
   2 files changed, 47 insertions(+), 5 deletions(-)

Modified: branches/quickbook-dev/tools/quickbook/src/parsers.hpp
==============================================================================
--- branches/quickbook-dev/tools/quickbook/src/parsers.hpp (original)
+++ branches/quickbook-dev/tools/quickbook/src/parsers.hpp 2012-01-29 19:06:36 EST (Sun, 29 Jan 2012)
@@ -253,6 +253,48 @@
     };
     
     lookback_gen const lookback = lookback_gen();
+
+ ///////////////////////////////////////////////////////////////////////////
+ //
+ // UTF-8 code point
+ //
+ // Very crude: it doesn't check that the code point is in any way valid,
+ // it just looks for the beginning of the next character. This is only for
+ // implementing some crude fixes, rather than full Unicode support. I'm
+ // sure experts would be appalled.
+ //
+ ///////////////////////////////////////////////////////////////////////////
+
+ struct utf8_char_parser : public cl::parser<utf8_char_parser>
+ {
+ typedef utf8_char_parser self_t;
+
+ template <typename Scanner>
+ struct result
+ {
+ typedef cl::match<> type;
+ };
+
+ template <typename Scanner>
+ typename result<Scanner>::type parse(Scanner const& scan) const
+ {
+ typedef typename Scanner::iterator_t iterator_t;
+
+ if (scan.at_end()) return scan.no_match();
+
+ iterator_t save(scan.first);
+
+ do {
+ ++scan.first;
+ } while (!scan.at_end() &&
+ ((unsigned char) *scan.first & 0xc0) == 0x80);
+
+ return scan.create_match(scan.first.base() - save.base(),
+ cl::nil_t(), save, scan.first);
+ }
+ };
+
+ utf8_char_parser const utf8_char_p = utf8_char_parser();
 }
 
 #endif // BOOST_QUICKBOOK_SCOPED_BLOCK_HPP

Modified: branches/quickbook-dev/tools/quickbook/src/syntax_highlight.cpp
==============================================================================
--- branches/quickbook-dev/tools/quickbook/src/syntax_highlight.cpp (original)
+++ branches/quickbook-dev/tools/quickbook/src/syntax_highlight.cpp 2012-01-29 19:06:36 EST (Sun, 29 Jan 2012)
@@ -287,7 +287,7 @@
                     | string_ [span("string")]
                     | char_ [span("char")]
                     | number [span("number")]
- | cl::repeat_p(1)[cl::anychar_p] [unexpected_char]
+ | utf8_char_p [unexpected_char]
                     )
                     ;
 
@@ -362,7 +362,7 @@
                     = +cl::chset_p("~!%^&*()+={[}]:;,<.>?/|\\-")
                     ;
 
- string_char = ('\\' >> cl::anychar_p) | (cl::anychar_p - '\\');
+ string_char = ('\\' >> utf8_char_p) | (cl::anychar_p - '\\');
 
                 string_
                     = !cl::as_lower_d['l'] >> cl::confix_p('"', *string_char, '"')
@@ -442,7 +442,7 @@
                     | special [span("special")]
                     | string_ [span("string")]
                     | number [span("number")]
- | cl::repeat_p(1)[cl::anychar_p] [unexpected_char]
+ | utf8_char_p [unexpected_char]
                     )
                     ;
 
@@ -498,7 +498,7 @@
                     = ! string_prefix >> (long_string | short_string)
                     ;
 
- string_char = ('\\' >> cl::anychar_p) | (cl::anychar_p - '\\');
+ string_char = ('\\' >> utf8_char_p) | (cl::anychar_p - '\\');
             
                 short_string
                     = cl::confix_p('\'', * string_char, '\'') |
@@ -564,7 +564,7 @@
                     =
                     *( macro
                     | escape
- | cl::repeat_p(1)[cl::anychar_p] [plain_char]
+ | utf8_char_p [plain_char]
                     )
                     ;
 


Boost-Commit list run by bdawes at acm.org, david.abrahams at rcn.com, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk