Planet
navi homePPSaboutscreenshotsdownloaddevelopmentforum

source: downloads/boost_1_34_1/boost/detail/utf8_codecvt_facet.hpp @ 58

Last change on this file since 58 was 29, checked in by landauf, 17 years ago

updated boost from 1_33_1 to 1_34_1

File size: 6.9 KB
Line 
1// Copyright © 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
2// Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
3// Distributed under the Boost Software License, Version 1.0. (See accompany-
4// ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
5
6#ifndef BOOST_UTF8_CODECVT_FACET_HPP
7#define BOOST_UTF8_CODECVT_FACET_HPP
8
9// MS compatible compilers support #pragma once
10#if defined(_MSC_VER) && (_MSC_VER >= 1020)
11# pragma once
12#endif
13
14/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
15// utf8_codecvt_facet.hpp
16
17// This header defines class utf8_codecvt_facet, derived from
18// std::codecvt<wchar_t, char>, which can be used to convert utf8 data in
19// files into wchar_t strings in the application.
20//
21// The header is NOT STANDALONE, and is not to be included by the USER.
22// There are at least two libraries which want to use this functionality, and
23// we want to avoid code duplication. It would be possible to create utf8
24// library, but:
25// - this requires review process first
26// - in that case, when linking a library which uses utf8
27//   (say 'program_options'), user should also link to the utf8 library.
28//   This seems inconvenient, and asking a user to link to an unreviewed
29//   library is strange.
30// Until the above points are fixed, a library which wants to use utf8 must:
31// - include this header from one of its headers or sources
32// - include the corresponding .cpp file from one of the sources
33// - before including either file, the library must define
34//   - BOOST_UTF8_BEGIN_NAMESPACE to the namespace declaration that must be used
35//   - BOOST_UTF8_END_NAMESPACE to the code to close the previous namespace
36//     declaration.
37//   - BOOST_UTF8_DECL -- to the code which must be used for all 'exportable'
38//     symbols.
39//
40// For example, program_options library might contain:
41//    #define BOOST_UTF8_BEGIN_NAMESPACE <backslash character>
42//             namespace boost { namespace program_options {
43//    #define BOOST_UTF8_END_NAMESPACE }}
44//    #define BOOST_UTF8_DECL BOOST_PROGRAM_OPTIONS_DECL
45//    #include "../../detail/utf8/utf8_codecvt.cpp"
46//
47// Essentially, each library will have its own copy of utf8 code, in
48// different namespaces.
49
50// Note:(Robert Ramey).  I have made the following alterations in the original
51// code.
52// a) Rendered utf8_codecvt<wchar_t, char>  with using templates
53// b) Move longer functions outside class definition to prevent inlining
54// and make code smaller
55// c) added on a derived class to permit translation to/from current
56// locale to utf8
57
58//  See http://www.boost.org for updates, documentation, and revision history.
59
60// archives stored as text - note these are templated on the basic
61// stream templates to accommodate wide (and other?) kind of characters
62//
63// note the fact that on libraries without wide characters, ostream is
64// not a specialization of basic_ostream which in fact is not defined
65// in such cases.   So we can't use basic_ostream<OStream::char_type> but rather
66// use two template parameters
67//
68// utf8_codecvt_facet
69//   This is an implementation of a std::codecvt facet for translating
70//   from UTF-8 externally to UCS-4.  Note that this is not tied to
71//   any specific types in order to allow customization on platforms
72//   where wchar_t is not big enough.
73//
74// NOTES:  The current implementation jumps through some unpleasant hoops in
75// order to deal with signed character types.  As a std::codecvt_base::result,
76// it is necessary  for the ExternType to be convertible to unsigned  char.
77// I chose not to tie the extern_type explicitly to char. But if any combination
78// of types other than <wchar_t,char_t> is used, then std::codecvt must be
79// specialized on those types for this to work.
80
81#include <locale>
82// for mbstate_t
83#include <wchar.h>
84// for std::size_t
85#include <cstddef>
86
87#include <boost/config.hpp>
88#include <boost/detail/workaround.hpp>
89
// Some standard libraries declare mbstate_t (and, when the C library names
// are not placed in namespace std at all, codecvt) only in the global
// namespace.  Pull those names into std so the rest of this header can spell
// them uniformly as std::mbstate_t / std::codecvt.
namespace std {
#if defined(__LIBCOMO__) \
    || (defined(BOOST_DINKUMWARE_STDLIB) && !defined(__BORLANDC__))
    // Comeau libcomo, or Dinkumware on anything but Borland: only mbstate_t
    // needs importing.
    using ::mbstate_t;
#elif defined(__SGI_STL_PORT)
    // STLport: nothing is imported.
#elif defined(BOOST_NO_STDC_NAMESPACE)
    // C library names are not in std: import both names.
    using ::mbstate_t;
    using ::codecvt;
#endif
} // namespace std
101
// Qualifier applied to the mbstate_t& parameter of do_length().  It expands
// to nothing when __MSL_CPP__ or __LIBCOMO__ is defined, and to 'const' on
// every other library.
#if defined(__MSL_CPP__) || defined(__LIBCOMO__)
    #define BOOST_CODECVT_DO_LENGTH_CONST
#else
    #define BOOST_CODECVT_DO_LENGTH_CONST const
#endif

// maximum length of a multibyte string
#define MB_LENGTH_MAX 8
110
111BOOST_UTF8_BEGIN_NAMESPACE
112
113struct BOOST_UTF8_DECL utf8_codecvt_facet :
114    public std::codecvt<wchar_t, char, std::mbstate_t> 
115{
116public:
117    explicit utf8_codecvt_facet(std::size_t no_locale_manage=0)
118        : std::codecvt<wchar_t, char, std::mbstate_t>(no_locale_manage) 
119    {}
120protected:
121    virtual std::codecvt_base::result do_in(
122        std::mbstate_t& state, 
123        const char * from,
124        const char * from_end, 
125        const char * & from_next,
126        wchar_t * to, 
127        wchar_t * to_end, 
128        wchar_t*& to_next
129    ) const;
130
131    virtual std::codecvt_base::result do_out(
132        std::mbstate_t & state, const wchar_t * from,
133        const wchar_t * from_end, const wchar_t*  & from_next,
134        char * to, char * to_end, char * & to_next
135    ) const;
136
137    bool invalid_continuing_octet(unsigned char octet_1) const {
138        return (octet_1 < 0x80|| 0xbf< octet_1);
139    }
140
141    bool invalid_leading_octet(unsigned char octet_1)   const {
142        return (0x7f < octet_1 && octet_1 < 0xc0) ||
143            (octet_1 > 0xfd);
144    }
145
146    // continuing octets = octets except for the leading octet
147    static unsigned int get_cont_octet_count(unsigned   char lead_octet) {
148        return get_octet_count(lead_octet) - 1;
149    }
150
151    static unsigned int get_octet_count(unsigned char   lead_octet);
152
153    // How many "continuing octets" will be needed for this word
154    // ==   total octets - 1.
155    int get_cont_octet_out_count(wchar_t word) const ;
156
157    virtual bool do_always_noconv() const throw() { return false; }
158
159    // UTF-8 isn't really stateful since we rewind on partial conversions
160    virtual std::codecvt_base::result do_unshift(
161        std::mbstate_t&,
162        char * from,
163        char * /*to*/,
164        char * & next
165    ) const 
166    {
167        next = from;
168        return ok;
169    }
170
171    virtual int do_encoding() const throw() {
172        const int variable_byte_external_encoding=0;
173        return variable_byte_external_encoding;
174    }
175
176    // How many char objects can I process to get <= max_limit
177    // wchar_t objects?
178    virtual int do_length(
179        BOOST_CODECVT_DO_LENGTH_CONST std::mbstate_t &,
180        const char * from,
181        const char * from_end, 
182        std::size_t max_limit
183#if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600))
184        ) const throw();
185#else
186        ) const;
187#endif
188
189    // Largest possible value do_length(state,from,from_end,1) could return.
190    virtual int do_max_length() const throw () {
191        return 6; // largest UTF-8 encoding of a UCS-4 character
192    }
193};
194
195BOOST_UTF8_END_NAMESPACE
196
197#endif // BOOST_UTF8_CODECVT_FACET_HPP
Note: See TracBrowser for help on using the repository browser.