Boost logo

Boost :

Subject: [boost] [string] Realistic API proposal
From: Artyom (artyomtnk_at_[hidden])
Date: 2011-01-28 05:41:42


Hello All,

I'd like to provide a realistic string API proposal:

1. It is a Unicode-aware extension of std::string
2. It is fully compatible with std::string, which gives it
   a chance of being included in C++XYZ
3. It can be implemented today!

Basically it extends std::string with

  a) const_code_point_iterator - for iterating over string - bidirectional
  b) code_point_iterator - back inserter
  c) provides Unicode and Locale functionality:
     - normalization
     - case handling
     - comparison
     - search
  d) marks the non-const operator[] and at() as deprecated

Advantages:
-----------

1. It can be implemented easily (given Boost.Locale)
2. It **is** a compatible replacement for std::string
3. It allows std::string to be used under the hood as storage in the
   meantime, which makes assigning a boost::string to a std::string
   highly efficient when the implementation is COW (almost all
   implementations, with the exception of MSVC)
4. It is fully Unicode-aware
5. It pushes the "UTF-8" idea into standard C++
6. You don't pay for what you do not need.

Proposed API:
-------------

namespace boost {

    // Fully bidirectional iterator.
    //
    // Read-only iterator adaptor, parameterized on the underlying iterator
    // type (UnitsIterator); dereferencing it yields a const_code_point_type.
    // NOTE(review): presumably UnitsIterator walks the encoded code units of
    // the string (basic_string instantiates it with its const_iterator) --
    // the sketch does not spell this out.
    template<typename UnitsIterator>
    class const_code_point_iterator {
    public:
        
        // Constructors. The trailing comment on each declaration names the
        // position the newly built iterator refers to.
        // NOTE(review): the mail archive wrapped the next two declarations;
        // the stray line break is kept as-is.
        const_code_point_iterator(UnitsIterator begin,UnitsIterator end); //
begin
        const_code_point_iterator(UnitsIterator begin,UnitsIterator
end,UnitsIterator location); // current pos
        const_code_point_iterator(); // end

        // Value type produced by operator*(): char32_t in the C++0x branch,
        // plain unsigned otherwise.
        // NOTE(review): `C++0x` is pseudo-code, not a valid macro name.
        #ifdef C++0x
        typedef char32_t const_code_point_type;
        #else
        typedef unsigned const_code_point_type;
        #endif

        // Returns the code point at the current position, by value.
        const_code_point_type operator*() const;
        // Remaining members are elided in the original sketch.
        ...
 
    };

    /// Output iterator
    ///
    /// Adaptor parameterized on a back-inserter type (BackInserter); it is
    /// constructed from such an inserter and exposes a code_point_type.
    /// NOTE(review): presumably each code point written through this iterator
    /// is encoded and forwarded to the wrapped inserter -- confirm, the
    /// sketch only shows the declarations below.
    template<typename BackInserter>
    class code_point_iterator {
    public:
        
        // Constructors. The trailing comment on each declaration is the
        // author's label for the iterator being built.
        code_point_iterator(BackInserter out); // begin
        code_point_iterator(); // end

        // Code point type: char32_t in the C++0x branch, plain unsigned
        // otherwise.
        // NOTE(review): `C++0x` is pseudo-code, not a valid macro name.
        #ifdef C++0x
        typedef char32_t code_point_type;
        #else
        typedef unsigned code_point_type;
        #endif

        // NOTE(review): declared const and returning by value, which is
        // unusual for an output iterator -- confirm how assignment through
        // the iterator is meant to work.
        code_point_type operator*() const;
        // Remaining members are elided in the original sketch.
        ...
 
    };

    // Proposed Unicode-aware string class.
    //
    // Stores its contents in a std::basic_string<Char,Traits,Alloc> member
    // (data_) and converts to and from that type. On top of the std::string
    // interface it declares code point iteration, case handling,
    // normalization, UTF validation, and collation-level search/comparison.
    // This is an API sketch: `...` marks elided members, and several
    // declarations were hard-wrapped by the mail archive (line breaks kept).
    template<typename Char,typename Traits=std::char_traits<Char>, typename
Alloc=std::allocator<Char> >
    class basic_string {
    public:
        // { boost specific
        // The std::basic_string instantiation used as underlying storage.
        typedef std::basic_string<Char,Traits,Alloc> std_string_type;
        // } boost specific

        // All std::string standard functions based

        // Deprecated interfaces that exist for backward compatibility
        // as they are not Unicode aware

        value_type &at(size_type indx);
        value_type &operator[](size_type indx);
        iterator begin();
        iterator end();

        // { boost specific compatibility functions with std::string, they would
go
        // as std::string becomes extended with boost::string new interfaces
        // (presumably: these members would be dropped once std::string itself
        // gains the new interfaces -- the wrapped word "go" above belongs to
        // the first comment line)
        //
        // Construct from a whole std string, or from the sub-range
        // [index, index+len) of one; both forward to data_'s constructors.
        basic_string(std_string_type const &other) : data_(other) {}
        basic_string(std_string_type const &other,size_type index,size_type len)
: data_(other,index,len) {}

        // Further compatibility members are elided in the original sketch.
        ...

        // Implicit conversion back to the std string type; returns a copy of
        // the stored data_.
        operator std_string_type() const
        {
            return data_;
        }

        // } boost specific compatibility functions

        //
        // Unicode Support
        //
        // ------------------------
        //

        //
        // UTF Codepoint iteration
        //

        // Code point type: char32_t in the C++0x branch, plain unsigned
        // otherwise.
        // NOTE(review): `C++0x` is pseudo-code, not a valid macro name.
        #ifdef C++0x
        typedef char32_t code_point_type;
        #else
        typedef unsigned code_point_type;
        #endif

        // Read-only code point iterator over this string's const_iterator.
        typedef boost::const_code_point_iterator<const_iterator>
const_code_point_iterator;

        // Iterator built from the (begin, end) pair of the string.
        const_code_point_iterator code_point_begin() const
        {
            return const_code_point_iterator(begin(),end());
        }
        // Iterator built with its current location set to end().
        const_code_point_iterator code_point_end() const
        {
            return const_code_point_iterator(begin(),end(),end());
        }
        
        // NOTE(review): std::back_inserter is a function template, not a
        // type; std::back_insert_iterator<basic_string> is probably what is
        // meant here and in back_inserter() below -- confirm.
        typedef boost::code_point_iterator<std::back_inserter<basic_string> >
code_point_iterator;

        // Returns a code_point_iterator wrapping a back inserter on *this.
        code_point_iterator back_inserter()
        {
            return code_point_iterator(std::back_inserter<basic_string>(*this));
        }

        // Overloads taking a single code point rather than a Char.
        basic_string &operator+=(code_point_type code_point);
        basic_string operator+(code_point_type code_point) const;
        void append(code_point_type code_point);

        //
        // Lexical operations on string
        //

        // Case handling
        // Each function is const and returns a new basic_string; the locale
        // argument defaults to a default-constructed std::locale.

        basic_string upper_case(std::locale const &l=std::locale()) const;
        basic_string lower_case(std::locale const &l=std::locale()) const;
        basic_string title_case(std::locale const &l=std::locale()) const;
        basic_string fold_case() const; // locale independent
        
        // Unicode normalization
        // The enumerator names mirror the Unicode normalization forms
        // NFC, NFKC, NFD and NFKD.

        typedef enum {
            nfc,
            nfkc,
            nfd,
            nfkd
        } normalization_mode;

        // Returns a new string; the mode defaults to nfc.
        basic_string normalize(normalization_mode mode = nfc) const;
         
        // normalized string constructor
        // NOTE(review): `template<Iterator>` lacks the `typename`/`class`
        // keyword (the same applies to the other iterator-range constructors
        // below).

        basic_string(basic_string const &,normalization_mode mode);
        basic_string(Char const *,normalization_mode mode);
        basic_string(Char const *,size_t n,normalization_mode mode);
        template<Iterator>
        basic_string(Iterator begin,Iterator end,normalization_mode mode);
        
        // In-place append taking a normalization mode (default nfc).
        void append_normalized(basic_string const &other,normalization_mode mode
= nfc);
        void append_normalized(Char const *,normalization_mode mode = nfc);
        void append_normalized(Char const *,size_t n,normalization_mode mode =
nfc);
        
        // Const counterparts of append_normalized that return a new string.
        basic_string concat_normalized(basic_string const
&other,normalization_mode mode = nfc) const;
        basic_string concat_normalized(Char const *,normalization_mode mode =
nfc) const;
        basic_string concat_normalized(Char const *,size_t n,normalization_mode
mode = nfc) const;

        // Unicode validation
        // NOTE(review): is_valid_utf() is also declared at the end of the
        // class; one of the two looks like a duplicate.

        bool valid_utf() const;

        // NOTE(review): `typedef` with no declarator -- a plain
        // `struct validate_utf {};` is probably intended.
        typedef struct validate_utf{}; // Unicode validation tag

        // Create string validating it
        basic_string(basic_string const &,validate_utf const &);
        basic_string(Char const *,validate_utf const &);
        basic_string(Char const *,size_t n,validate_utf const &);
        template<Iterator>
        basic_string(Iterator begin,Iterator end,validate_utf const &);

        // Create string validating and normalizing it

        basic_string(basic_string const &,validate_utf const
&,normalization_mode mode);
        basic_string(Char const *,validate_utf const &,normalization_mode mode);
        basic_string(Char const *,size_t n,validate_utf const
&,normalization_mode mode);
        template<Iterator>
        basic_string(Iterator begin,Iterator end,validate_utf const
&,normalization_mode mode);

        // Search and comparison
        
        // Collation strength accepted by search() and compare_to().
        // NOTE(review): two enumerator comments below were wrapped by the
        // mail archive, leaving their tails outside the comment.
        typedef enum {
            primary = 0, ///< 1st collation level: base letters
            secondary = 1, ///< 2nd collation level: letters and accents
            tertiary = 2, ///< 3rd collation level: letters, accents and case
            quaternary = 3, ///< 4th collation level: letters, accents, case
and punctuation
            identical = 4 ///< identical collation level: include code-point
comparison
        } level_type;

        //
        // search(...) return pair of index and size as string you search for
may have different size
        //
        // Overload set: the needle is either a basic_string (optionally with
        // an index/size sub-range) or a Char pointer (optionally with a
        // size); each form optionally takes a level_type, a std::locale, or
        // both.
        std::pair<size_type,size_type> search(basic_string const &other) const;
        std::pair<size_type,size_type> search(basic_string const
&other,level_type level) const;
        std::pair<size_type,size_type> search(basic_string const
&other,std::locale const &l) const ;
        std::pair<size_type,size_type> search(basic_string const
&other,level_type level,std::locale const &l) const;
        std::pair<size_type,size_type> search(basic_string const &other,size_t
index,size_t size) const;
        std::pair<size_type,size_type> search(basic_string const &other,size_t
index,size_t size,level_type level) const;
        std::pair<size_type,size_type> search(basic_string const &other,size_t
index,size_t size,std::locale const &l) const;
        std::pair<size_type,size_type> search(basic_string const &other,size_t
index,size_t size,level_type level,std::locale const &l) const;

        std::pair<size_type,size_type> search(Char const *) const;
        std::pair<size_type,size_type> search(Char const *,level_type level)
const;
        std::pair<size_type,size_type> search(Char const *,std::locale const &l)
const ;
        std::pair<size_type,size_type> search(Char const *,level_type
level,std::locale const &l) const;
        std::pair<size_type,size_type> search(Char const *,size_t size) const;
        std::pair<size_type,size_type> search(Char const *,size_t
size,level_type level) const;
        std::pair<size_type,size_type> search(Char const *,size_t
size,std::locale const &l) const;
        std::pair<size_type,size_type> search(Char const *,size_t
size,level_type level,std::locale const &l) const;

        
        // compare_to(...): comparison with optional level_type and/or
        // std::locale arguments.
        // NOTE(review): presumably returns negative/zero/positive like
        // std::string::compare -- the sketch does not state the convention.

        // Whole string against another basic_string or a sub-range of it.
        int compare_to(basic_string const &other) const;
        int compare_to(basic_string const &other,level_type level) const;
        int compare_to(basic_string const &other,std::locale const &l) const ;
        int compare_to(basic_string const &other,level_type level,std::locale
const &l) const;
        int compare_to(basic_string const &other,size_t index,size_t size)
const;
        int compare_to(basic_string const &other,size_t index,size_t
size,level_type level) const;
        int compare_to(basic_string const &other,size_t index,size_t
size,std::locale const &l) const;
        int compare_to(basic_string const &other,size_t index,size_t
size,level_type level,std::locale const &l) const;

        // Leading index/size pair before the other string.
        // NOTE(review): the last four overloads reuse the parameter names
        // `index` and `size` within a single declaration.
        int compare_to(size_t index,size_t size,basic_string const &other)
const;
        int compare_to(size_t index,size_t size,basic_string const
&other,level_type level) const;
        int compare_to(size_t index,size_t size,basic_string const
&other,std::locale const &l) const ;
        int compare_to(size_t index,size_t size,basic_string const
&other,level_type level,std::locale const &l) const;
        int compare_to(size_t index,size_t size,basic_string const &other,size_t
index,size_t size) const;
        int compare_to(size_t index,size_t size,basic_string const &other,size_t
index,size_t size,level_type level) const;
        int compare_to(size_t index,size_t size,basic_string const &other,size_t
index,size_t size,std::locale const &l) const;
        int compare_to(size_t index,size_t size,basic_string const &other,size_t
index,size_t size,level_type level,std::locale const &l) const;

        // Comparison against a Char pointer, optionally with a size.
        int compare_to(Char const *) const;
        int compare_to(Char const *,level_type level) const;
        int compare_to(Char const *,std::locale const &l) const ;
        int compare_to(Char const *,level_type level,std::locale const &l)
const;
        int compare_to(Char const *,size_t size) const;
        int compare_to(Char const *,size_t size,level_type level) const;
        int compare_to(Char const *,size_t size,std::locale const &l) const;
        int compare_to(Char const *,size_t size,level_type level,std::locale
const &l) const;

        // Leading size argument before the Char pointer.
        // NOTE(review): the basic_string overloads above take an index/size
        // pair in this position, here only `size` appears; the last four
        // overloads also repeat the parameter name `size`.
        int compare_to(size_t size,Char const *) const;
        int compare_to(size_t size,Char const *,level_type level) const;
        int compare_to(size_t size,Char const *,std::locale const &l) const ;
        int compare_to(size_t size,Char const *,level_type level,std::locale
const &l) const;
        int compare_to(size_t size,Char const *,size_t size) const;
        int compare_to(size_t size,Char const *,size_t size,level_type level)
const;
        int compare_to(size_t size,Char const *,size_t size,std::locale const
&l) const;
        int compare_to(size_t size,Char const *,size_t size,level_type
level,std::locale const &l) const;

        // UTF validation
        // NOTE(review): valid_utf() is already declared in the "Unicode
        // validation" section above.

        bool is_valid_utf() const;

    private:
        // Underlying storage for the string contents.
        std_string_type data_;
    };

} // boost

      


Boost list run by bdawes at acm.org, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk