Boost logo

Boost :

Subject: [boost] [string] Yet another Unicode string class
From: Anders Dalvander (boost_at_[hidden])
Date: 2011-02-10 15:38:11


I'm working on yet another Unicode string class/library, built around a
different set of features and requirements.

* It is designed around the codepoint concept.
* It uses (currently forward-) iterators for encoding and decoding.
* It has a minimal interface, mostly constructors and iterator access.
* Most other functions can (hopefully) be free functions.
* It uses basic_string as its backend.
* It has fast access to the underlying basic_string.
* It is (currently) using some C++0X features (mainly decltype).
* It is (currently) immutable and shares data, and thus fast to copy.

Some of these features and requirements may be unacceptable to some of
you, but I'm open to suggestions and comments.

// Copyright (c) 2011 Anders Dalvander.
//
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)

// Immutable text stored as a shared std::basic_string of code units.
//
// The `encoding` policy type must provide:
//   - codeunit_type and codepoint_type typedefs;
//   - a member template decode_iterator<It>, constructible from
//     (position, first, last) over code-unit iterators;
//   - a member template encode_iterator<It>, constructible from
//     (position, first, last) over the source iterators, and usable as an
//     iterator pair for std::basic_string's range constructor.
//
// The string is held through std::shared_ptr<const string_type>, so copying a
// basic_text copies the pointer, not the characters.
template <typename encoding>
class basic_text
{
public:
    typedef encoding encoding_type;
    typedef typename encoding_type::codeunit_type codeunit_type;
    typedef typename encoding_type::codepoint_type codepoint_type;
    typedef std::basic_string<codeunit_type> string_type;
    typedef typename string_type::const_iterator codeunit_iterator;
    // `template` is required here: decode_iterator is a member template of a
    // dependent type, and without the disambiguator `<` parses as less-than.
    typedef typename encoding_type::template decode_iterator<codeunit_iterator>
        codepoint_iterator;
    typedef codepoint_iterator const_iterator;
    typedef codepoint_iterator iterator;

    // Constructs an empty text.
    basic_text()
        : s(std::make_shared<string_type>())
    {
    }

    // Converting constructor: re-encodes the range [begin(text), end(text))
    // of another basic_text through this encoding's encode_iterator.
    template <typename other_encoding>
    basic_text(const basic_text<other_encoding>& text)
        : s(encode_range(std::begin(text), std::end(text)))
    {
    }

    // Constructs from any container usable with std::begin/std::end; every
    // element is fed to encode_iterator.
    // TODO: Use some default_encoding traits type.
    template <typename container>
    explicit basic_text(const container& c)
        : s(encode_range(std::begin(c), std::end(c)))
    {
    }

    // Constructs from the iterator range [first, last).
    // The template parameter is deliberately not called codepoint_iterator,
    // so it does not hide the class typedef of that name.
    template <typename input_iterator>
    basic_text(input_iterator first, input_iterator last)
        : s(encode_range(first, last))
    {
    }

    // Decoding iterator positioned at the first code unit.
    codepoint_iterator begin() const
    {
        return codepoint_iterator
            (codeunit_begin(), codeunit_begin(), codeunit_end());
    }

    // Decoding iterator positioned one past the last code unit.
    codepoint_iterator end() const
    {
        return codepoint_iterator
            (codeunit_end(), codeunit_begin(), codeunit_end());
    }

    // Raw code-unit access into the underlying string.
    codeunit_iterator codeunit_begin() const
    {
        return std::begin(*s);
    }

    codeunit_iterator codeunit_end() const
    {
        return std::end(*s);
    }

    // Read-only access to the underlying string.
    const string_type& str() const
    {
        return *s;
    }

    // Null-terminated code-unit pointer, as returned by string_type::c_str().
    const codeunit_type* c_str() const
    {
        return s->c_str();
    }

private:
    typedef std::shared_ptr<const string_type> pointer_type;

    // Builds the shared string by wrapping [first, last) in this encoding's
    // encode_iterator and handing the pair to string_type's range constructor.
    template <typename source_iterator>
    static std::shared_ptr<const string_type>
    encode_range(source_iterator first, source_iterator last)
    {
        typedef typename encoding_type::template encode_iterator<source_iterator>
            encoder;
        return std::make_shared<string_type>(
            encoder(first, first, last),
            encoder(last, first, last));
    }

    pointer_type s;
};

// Convenience typedefs pairing each encoding policy with basic_text.
// NOTE: `undefined-type` is a placeholder in this sketch, not valid C++; each
// *_encoding name stands for an encoding policy class that is not shown here.
typedef undefined-type utf8_encoding;
typedef basic_text<utf8_encoding> u8text;
typedef undefined-type utf16_encoding;
typedef basic_text<utf16_encoding> u16text;
typedef undefined-type utf32_encoding;
typedef basic_text<utf32_encoding> u32text;
typedef undefined-type wchar_encoding;
typedef basic_text<wchar_encoding> wtext;
typedef undefined-type ascii_encoding;
typedef basic_text<ascii_encoding> ascii_text;

Usage:

// Usage walk-through for the basic_text constructors.
int main()
{
    // Sample code points handed to the iterator-range constructors below.
    const uint32_t cps[] = {0x41,0x42,0x80,0x800,0x10000,0x10ffff};

    // construct from codepoint range (iterator-pair constructor)
    u8text u8txt(std::begin(cps), std::end(cps));

    // construct from encoded container,
    // currently treats each element as a codepoint
    // NOTE(review): "test" is a const char[5], so std::begin/std::end in the
    // container constructor also cover the terminating '\0' -- confirm
    // whether that is intended.
    u8text u8txt2("test");

    // sharing is caring: same-encoding copy uses the implicit copy
    // constructor, which copies the shared_ptr rather than the string
    u8text u8txt3 = u8txt;

    // construct from codepoint range
    u16text u16txt(std::begin(cps), std::end(cps));

    // construct from text, transcodes range (converting constructor)
    u16text u16txt2 = u8txt;

    // construct from text, transcodes range (converting constructor)
    u32text u32txt = u8txt;

    // using policy (possible extension); basic_text as shown has no
    // two-argument (text, policy) constructor and replace_policy is not
    // defined in this sketch
    ascii_text ascii(u8txt, replace_policy(0xff));
}

void OpenFileWin32(const u8text& txt)
{
    CloseHandle(CreateFileW(wtext(txt).c_str(), ...))
}

// Encoding policy for strings passed to POSIX calls.
// NOTE: `undefined-type` is a placeholder in this sketch, not valid C++.
typedef undefined-type posix_encoding;
typedef basic_text<posix_encoding> posixtext;

void OpenFilePosix(const u8text& txt)
{
    close(open(posixtext(txt).c_str(), ...))
}

Regards,
Anders Dalvander

-- 
WWFSMD?

Boost list run by bdawes at acm.org, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk