navi homePPSaboutscreenshotsdownloaddevelopmentforum

source: downloads/boost_1_34_1/libs/regex/doc/icu_strings.html @ 29

Last change on this file since 29 was 29, checked in by landauf, 17 years ago

updated boost from 1_33_1 to 1_34_1

File size: 24.2 KB
1<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
3   <head>
4      <title>Boost.Regex: Working With Unicode and ICU String Types</title>
5      <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
6      <LINK href="../../../boost.css" type="text/css" rel="stylesheet"></head>
7   <body>
8      <P>
9         <TABLE id="Table1" cellSpacing="1" cellPadding="1" width="100%" border="0">
10            <TR>
11               <td vAlign="top" width="300">
12                  <h3><A href="../../../index.htm"><IMG height="86" alt="C++ Boost" src="../../../boost.png" width="277" border="0"></A></h3>
13               </td>
14               <TD width="353">
15                  <H1 align="center">Boost.Regex</H1>
16                  <H2 align="center">Working With Unicode and ICU String Types.</H2>
17               </TD>
18               <td width="50">
19                  <h3><A href="index.html"><IMG height="45" alt="Boost.Regex Index" src="uarrow.gif" width="43" border="0"></A></h3>
20               </td>
21            </TR>
22         </TABLE>
23      </P>
24      <HR>
25      <p></p>
26      <H3>Contents</H3>
27      <dl class="index">
28         <dt><a href="#introduction">Introduction</a></dt> 
29         <dt><a href="#types">Unicode regular expression types</a></dt> 
30         <dt><a href="#algo">Regular Expression Algorithms</a>
31            <dd>
32               <dl class="index">
33                  <dt><a href="#u32regex_match">u32regex_match</a></dt> 
34                  <dt><a href="#u32regex_search">u32regex_search</a></dt> 
35                  <dt><a href="#u32regex_replace">u32regex_replace</a></dt> 
36               </dl>
37            </dd>
38         </dt>
39         <dt><a href="#iterators">Iterators</a>
40            <dd>
41               <dl class="index">
42                  <dt><a href="#u32regex_iterator">u32regex_iterator</a></dt> 
43                  <dt><a href="#u32regex_token_iterator">u32regex_token_iterator</a></dt> 
44               </dl>
45            </dd>
46         </dt>
47      </dl>
48      <H3><A name="introduction"></A>Introduction</H3>
49      <P>The header:</P>
50      <PRE>&lt;boost/regex/icu.hpp&gt;</PRE>
51      <P>contains the data types and algorithms necessary for working with regular
52         expressions in a Unicode aware environment.&nbsp;
53      </P>
54      <P>In order to use this header you will need <A href="https://icu.unicode.org/">
55            the ICU library</A>, and you will need to have built the Boost.Regex library
56         with <A href="install.html#unicode">ICU support enabled</A>.</P>
57      <P>The header will enable you to:</P>
58      <UL>
59         <LI>
60         Create regular expressions that treat Unicode strings as sequences of UTF-32
61         code points.
62         <LI>
63         Create regular expressions that support various Unicode data properties,
64         including character classification.
65         <LI>
66            Transparently search Unicode strings that are encoded as either UTF-8, UTF-16
67            or UTF-32.</LI></UL>
68      <H3><A name="types"></A>Unicode regular expression types</H3>
69      <P>Header &lt;boost/regex/icu.hpp&gt; provides a regular expression&nbsp;traits
70         class that handles UTF-32 characters:</P>
71      <PRE>class icu_regex_traits;</PRE>
72      <P>and a regular expression type based upon that:</P>
73      <PRE>typedef basic_regex&lt;UChar32,icu_regex_traits&gt; u32regex;</PRE>
74      <P>The type <EM>u32regex</EM> is the regular expression type to use for all Unicode
75         regular expressions; internally it uses UTF-32 code points, but can be created
76         from, and used to search, either UTF-8, or UTF-16 encoded strings as well as
77         UTF-32 ones.</P>
78      <P>The <A href="basic_regex.html#c2">constructors</A>, and <A href="basic_regex.html#a1">
79            assign</A> member functions of u32regex, require UTF-32 encoded strings, but
80         there are a series of overloaded algorithms called make_u32regex which allow
81         regular expressions to be created from UTF-8, UTF-16, or UTF-32 encoded
82         strings:</P>
83      <PRE>template &lt;class InputIterator&gt; 
84u32regex make_u32regex(InputIterator i, InputIterator j, boost::regex_constants::syntax_option_type opt);</PRE>
86      <P><STRONG>Effects:</STRONG> Creates a regular expression object from the iterator
87         sequence [i,j). The character encoding of the sequence is determined based upon <code>
88            sizeof(*i)</code>: 1 implies UTF-8, 2 implies UTF-16, and 4 implies UTF-32.</P>
89      <PRE>u32regex make_u32regex(const char* p, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl);</PRE>
91      <P><STRONG>Effects:</STRONG> Creates a regular expression object from the
92         Null-terminated UTF-8 character sequence <EM>p</EM>.</P>
93      <PRE>u32regex make_u32regex(const unsigned char* p, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl);</PRE>
94      <P><STRONG>Effects:</STRONG> Creates a regular expression object from the
95         Null-terminated UTF-8 character sequence <EM>p</EM>.</P>
96      <PRE>u32regex make_u32regex(const wchar_t* p, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl);</PRE>
98      <P><STRONG>Effects:</STRONG> Creates a regular expression object from the
99         Null-terminated character sequence <EM>p</EM>.&nbsp; The character encoding of
100         the sequence is determined based upon <CODE>sizeof(wchar_t)</CODE>: 1 implies
101         UTF-8, 2 implies UTF-16, and 4 implies UTF-32.</P>
102      <PRE>u32regex make_u32regex(const UChar* p, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl);</PRE>
103      <P><STRONG>Effects:</STRONG> Creates a regular expression object from the
104         Null-terminated UTF-16 character sequence <EM>p</EM>.</P>
105      <PRE>template&lt;class C, class T, class A&gt;
106u32regex make_u32regex(const std::basic_string&lt;C, T, A&gt;&amp; s, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl);</PRE>
107      <P><STRONG>Effects:</STRONG> Creates a regular expression object from the string <EM>s</EM>.&nbsp; 
108         The character encoding of the string is determined based upon <CODE>sizeof(C)</CODE>:
109         1 implies UTF-8, 2 implies UTF-16, and 4 implies UTF-32.</P>
110      <PRE>u32regex make_u32regex(const UnicodeString&amp; s, boost::regex_constants::syntax_option_type opt = boost::regex_constants::perl);</PRE>
111      <P><STRONG>Effects:</STRONG> Creates a regular expression object from the UTF-16
112         encoded string <EM>s</EM>.</P>
113      <H3><A name="algo"></A>Regular Expression Algorithms</H3>
114      <P>The regular expression algorithms <A href="regex_match.html">regex_match</A>, <A href="regex_search.html">
115            regex_search</A> and <A href="regex_replace.html">regex_replace</A> all
116         expect that the character sequence upon which they operate, is encoded in the
117         same character encoding as the regular expression object with which they are
118         used.&nbsp; For Unicode regular expressions that behavior is undesirable: while
119         we may want to process the data in UTF-32 "chunks", the actual data is much
120         more likely to be encoded as either UTF-8 or UTF-16.&nbsp; Therefore the header
121         &lt;boost/regex/icu.hpp&gt; provides a series of thin wrappers around these
122         algorithms, called u32regex_match, u32regex_search, and u32regex_replace.&nbsp; 
123         These wrappers use iterator-adapters internally to make external UTF-8 or
124         UTF-16 data look as though it's really a UTF-32 sequence, that can then be
125         passed on to the "real" algorithm.</P>
126      <H4><A name="u32regex_match"></A>u32regex_match</H4>
127      <P>For each <A href="regex_match.html">regex_match</A> algorithm defined by
128         &lt;boost/regex.hpp&gt;, then &lt;boost/regex/icu.hpp&gt; defines an overloaded
129         algorithm that takes the same arguments, but which is called <EM>u32regex_match</EM>,
130         and which will accept UTF-8, UTF-16 or UTF-32 encoded data, as well as an
131         ICU&nbsp;UnicodeString as input.</P>
132      <P><STRONG>Example: </STRONG>match a password, encoded in a UTF-16 UnicodeString:</P>
133      <PRE>//
134// Find out if *password* meets our password requirements,
135// as defined by the regular expression *requirements*.
137bool is_valid_password(const UnicodeString&amp; password, const UnicodeString&amp; requirements)
138{
139   return boost::u32regex_match(password, boost::make_u32regex(requirements));
140}
141</PRE>
142      <P>
143      <P><STRONG>Example: </STRONG>match a UTF-8 encoded filename:</P>
144      <PRE>//
145// Extract filename part of a path from a UTF-8 encoded std::string and return the result
146// as another std::string:
148std::string get_filename(const std::string&amp; path)
150   boost::u32regex r = boost::make_u32regex("(?:\\A|.*\\\\)([^\\\\]+)");
151   boost::smatch what;
152   if(boost::u32regex_match(path, what, r))
153   {
154      // extract $1 as a std::string:
155      return what.str(1);
156   }
157   else
158   {
159      throw std::runtime_error("Invalid pathname");
160   }
163      <H4><A name="u32regex_search"></A>u32regex_search</H4>
164      <P>For each <A href="regex_search.html">regex_search</A> algorithm defined by
165         &lt;boost/regex.hpp&gt;, then &lt;boost/regex/icu.hpp&gt; defines an overloaded
166         algorithm that takes the same arguments, but which is called <EM>u32regex_search</EM>,
167         and which will accept UTF-8, UTF-16 or UTF-32 encoded data, as well as an
168         ICU&nbsp;UnicodeString as input.</P>
169      <P><STRONG>Example: </STRONG>search for a character sequence in a specific
170         language block:
171      </P>
172      <PRE>UnicodeString extract_greek(const UnicodeString&amp; text)
174   // searches through some UTF-16 encoded text for a block encoded in Greek,
175   // this expression is imperfect, but the best we can do for now - searching
176   // for specific scripts is actually pretty hard to do right.
177   //
178   // Here we search for a character sequence that begins with a Greek letter,
179   // and continues with characters that are either not-letters ( [^[:L*:]] )
180   // or are characters in the Greek character block ( [\\x{370}-\\x{3FF}] ).
181   //
182   boost::u32regex r = boost::make_u32regex(L"[\\x{370}-\\x{3FF}](?:[^[:L*:]]|[\\x{370}-\\x{3FF}])*");
183   boost::u16match what;
184   if(boost::u32regex_search(text, what, r))
185   {
186      // extract $0 as a UnicodeString:
187      return UnicodeString(what[0].first, what.length(0));
188   }
189   else
190   {
191      throw std::runtime_error("No Greek found!");
192   }
194      <H4><A name="u32regex_replace"></A>u32regex_replace</H4>
195      <P>For each <A href="regex_replace.html">regex_replace</A> algorithm defined by
196         &lt;boost/regex.hpp&gt;, then &lt;boost/regex/icu.hpp&gt; defines an overloaded
197         algorithm that takes the same arguments, but which is called <EM>u32regex_replace</EM>,
198         and which will accept UTF-8, UTF-16 or UTF-32 encoded data, as well as an
199         ICU&nbsp;UnicodeString as input.&nbsp; The input sequence and the format string
200         specifier passed to the algorithm, can be encoded independently (for example
201         one can be UTF-8, the other in UTF-16), but the result string / output iterator
202         argument must use the same character encoding as the text being searched.</P>
203      <P><STRONG>Example: </STRONG>Credit card number reformatting:</P>
204      <PRE>//
205// Take a credit card number as a string of digits,
206// and reformat it as a human readable string with "-"
207// separating each group of four digits;
208// note that we're mixing a UTF-32 regex, with a UTF-16
209// string and a UTF-8 format specifier, and it still all
210// just works:
212const boost::u32regex e = boost::make_u32regex("\\A(\\d{3,4})[- ]?(\\d{4})[- ]?(\\d{4})[- ]?(\\d{4})\\z");
213const char* human_format = "$1-$2-$3-$4";
215UnicodeString human_readable_card_number(const UnicodeString&amp; s)
216{
217   return boost::u32regex_replace(s, e, human_format);
218}</PRE>
219      <P>
220         <H2><A name="iterators"></A>Iterators</H2>
221         <H3><A name="u32regex_iterator"></A>u32regex_iterator</H3>
222      <P>Type u32regex_iterator is in all respects the same as <A href="regex_iterator.html">
223            regex_iterator</A> except that since the regular expression type is always
224         u32regex it only takes one template parameter (the iterator type). It also
225         calls u32regex_search internally, allowing it to interface correctly with
226         UTF-8, UTF-16, and UTF-32 data:</P>
227      <PRE>
228template &lt;class BidirectionalIterator&gt;
229class u32regex_iterator
231   // for members see <A href="regex_iterator.html">regex_iterator</A>
234typedef u32regex_iterator&lt;const char*&gt;     utf8regex_iterator;
235typedef u32regex_iterator&lt;const UChar*&gt;    utf16regex_iterator;
236typedef u32regex_iterator&lt;const UChar32*&gt;  utf32regex_iterator;
238      <P>In order to simplify the construction of a u32regex_iterator from a string,
239         there are a series of non-member helper functions called
240         make_u32regex_iterator:</P>
241      <PRE>
242u32regex_iterator&lt;const char*&gt; 
243   make_u32regex_iterator(const char* s,
244                          const u32regex&amp; e,
245                          regex_constants::match_flag_type m = regex_constants::match_default);
247u32regex_iterator&lt;const wchar_t*&gt; 
248   make_u32regex_iterator(const wchar_t* s,
249                          const u32regex&amp; e,
250                          regex_constants::match_flag_type m = regex_constants::match_default);
252u32regex_iterator&lt;const UChar*&gt; 
253   make_u32regex_iterator(const UChar* s,
254                          const u32regex&amp; e,
255                          regex_constants::match_flag_type m = regex_constants::match_default);
257template &lt;class charT, class Traits, class Alloc&gt;
258u32regex_iterator&lt;typename std::basic_string&lt;charT, Traits, Alloc&gt;::const_iterator&gt; 
259   make_u32regex_iterator(const std::basic_string&lt;charT, Traits, Alloc&gt;&amp; s,
260                          const u32regex&amp; e,
261                          regex_constants::match_flag_type m = regex_constants::match_default);
263u32regex_iterator&lt;const UChar*&gt; 
264   make_u32regex_iterator(const UnicodeString&amp; s,
265                          const u32regex&amp; e,
266                          regex_constants::match_flag_type m = regex_constants::match_default);</PRE>
267      <P>
268      <P>Each of these overloads returns an iterator that enumerates all occurrences of
269         expression <EM>e</EM>, in text <EM>s</EM>, using match_flags <EM>m.</EM></P>
270      <P><STRONG>Example</STRONG>: search for international currency symbols, along with
271         their associated numeric value:</P>
272      <PRE>
273void enumerate_currencies(const std::string&amp; text)
275   // enumerate and print all the currency symbols, along
276   // with any associated numeric values:
277   const char* re =
278      "([[:Sc:]][[:Cf:][:Cc:][:Z*:]]*)?"
279      "([[:Nd:]]+(?:[[:Po:]][[:Nd:]]+)?)?"
280      "(?(1)"
281         "|(?(2)"
282            "[[:Cf:][:Cc:][:Z*:]]*"
283         ")"
284         "[[:Sc:]]"
285      ")";
286   boost::u32regex r = boost::make_u32regex(re);
287   boost::u32regex_iterator&lt;std::string::const_iterator&gt; i(boost::make_u32regex_iterator(text, r)), j;
288   while(i != j)
289   {
290      std::cout &lt;&lt; (*i)[0] &lt;&lt; std::endl;
291      ++i;
292   }
294      <P>
295      <P>Calling
296      </P>
297      <PRE>enumerate_currencies(" $100.23 or £198.12 ");</PRE>
298      <P>Yields the output:</P>
299      <PRE>$100.23<BR>£198.12</PRE>
300      <P>Provided of course that the input is encoded as UTF-8.</P>
301      <H3><A name="u32regex_token_iterator"></A>u32regex_token_iterator</H3>
302      <P>Type u32regex_token_iterator is in all respects the same as <A href="regex_token_iterator.html">
303            regex_token_iterator</A> except that since the regular expression type is
304         always u32regex it only takes one template parameter (the iterator type).&nbsp; 
305         It also calls u32regex_search internally, allowing it to interface correctly
306         with UTF-8, UTF-16, and UTF-32 data:</P>
307      <PRE>template &lt;class BidirectionalIterator&gt;
308class u32regex_token_iterator
310   // for members see <A href="regex_token_iterator.html">regex_token_iterator</A>
313typedef u32regex_token_iterator&lt;const char*&gt;     utf8regex_token_iterator;
314typedef u32regex_token_iterator&lt;const UChar*&gt;    utf16regex_token_iterator;
315typedef u32regex_token_iterator&lt;const UChar32*&gt;  utf32regex_token_iterator;
317      <P>In order to simplify the construction of a u32regex_token_iterator from a
318         string, there are a series of non-member helper functions called
319         make_u32regex_token_iterator:</P>
320      <PRE>
321u32regex_token_iterator&lt;const char*&gt; 
322   make_u32regex_token_iterator(const char* s,
323                                const u32regex&amp; e,
324                                int sub,
325                                regex_constants::match_flag_type m = regex_constants::match_default);
327u32regex_token_iterator&lt;const wchar_t*&gt; 
328   make_u32regex_token_iterator(const wchar_t* s,
329                                const u32regex&amp; e,
330                                int sub,
331                                regex_constants::match_flag_type m = regex_constants::match_default);
333u32regex_token_iterator&lt;const UChar*&gt; 
334   make_u32regex_token_iterator(const UChar* s,
335                                const u32regex&amp; e,
336                                int sub,
337                                regex_constants::match_flag_type m = regex_constants::match_default);
339template &lt;class charT, class Traits, class Alloc&gt;
340u32regex_token_iterator&lt;typename std::basic_string&lt;charT, Traits, Alloc&gt;::const_iterator&gt; 
341   make_u32regex_token_iterator(const std::basic_string&lt;charT, Traits, Alloc&gt;&amp; s,
342                                const u32regex&amp; e,
343                                int sub,
344                                regex_constants::match_flag_type m = regex_constants::match_default);
346u32regex_token_iterator&lt;const UChar*&gt; 
347   make_u32regex_token_iterator(const UnicodeString&amp; s,
348                                const u32regex&amp; e,
349                                int sub,
350                                regex_constants::match_flag_type m = regex_constants::match_default);</PRE>
351      <P>
352      <P>Each of these overloads returns an iterator that enumerates all occurrences of
353         marked sub-expression <EM>sub</EM> in regular expression&nbsp;<EM>e</EM>, found
354         in text <EM>s</EM>, using match_flags <EM>m.</EM></P>
355      <PRE>
356template &lt;std::size_t N&gt;
357u32regex_token_iterator&lt;const char*&gt; 
358   make_u32regex_token_iterator(const char* p,
359                                const u32regex&amp; e,
360                                const int (&amp;submatch)[N],
361                                regex_constants::match_flag_type m = regex_constants::match_default);
363template &lt;std::size_t N&gt;
364u32regex_token_iterator&lt;const wchar_t*&gt; 
365   make_u32regex_token_iterator(const wchar_t* p,
366                                const u32regex&amp; e,
367                                const int (&amp;submatch)[N],
368                                regex_constants::match_flag_type m = regex_constants::match_default);
370template &lt;std::size_t N&gt;
371u32regex_token_iterator&lt;const UChar*&gt; 
372   make_u32regex_token_iterator(const UChar* p,
373                                const u32regex&amp; e,
374                                const int (&amp;submatch)[N],
375                                regex_constants::match_flag_type m = regex_constants::match_default);
377template &lt;class charT, class Traits, class Alloc, std::size_t N&gt;
378u32regex_token_iterator&lt;typename std::basic_string&lt;charT, Traits, Alloc&gt;::const_iterator&gt; 
379   make_u32regex_token_iterator(const std::basic_string&lt;charT, Traits, Alloc&gt;&amp; p,
380                                const u32regex&amp; e,
381                                const int (&amp;submatch)[N],
382                                regex_constants::match_flag_type m = regex_constants::match_default);
384template &lt;std::size_t N&gt;
385u32regex_token_iterator&lt;const UChar*&gt; 
386   make_u32regex_token_iterator(const UnicodeString&amp; s,
387                                const u32regex&amp; e,
388                                const int (&amp;submatch)[N],
389                                regex_constants::match_flag_type m = regex_constants::match_default);</PRE>
391      <P>Each of these overloads returns an iterator that enumerates one sub-expression
392         for each&nbsp;<EM>submatch</EM> in regular expression&nbsp;<EM>e</EM>, found in
393         text <EM>s</EM>, using match_flags <EM>m.</EM></P>
394      <PRE>
395u32regex_token_iterator&lt;const char*&gt; 
396   make_u32regex_token_iterator(const char* p,
397                                const u32regex&amp; e,
398                                const std::vector&lt;int&gt;&amp; submatch,
399                                regex_constants::match_flag_type m = regex_constants::match_default);
401u32regex_token_iterator&lt;const wchar_t*&gt; 
402   make_u32regex_token_iterator(const wchar_t* p,
403                                const u32regex&amp; e,
404                                const std::vector&lt;int&gt;&amp; submatch,
405                                regex_constants::match_flag_type m = regex_constants::match_default);
407u32regex_token_iterator&lt;const UChar*&gt; 
408   make_u32regex_token_iterator(const UChar* p,
409                                const u32regex&amp; e,
410                                const std::vector&lt;int&gt;&amp; submatch,
411                                regex_constants::match_flag_type m = regex_constants::match_default);
413template &lt;class charT, class Traits, class Alloc&gt;
414u32regex_token_iterator&lt;typename std::basic_string&lt;charT, Traits, Alloc&gt;::const_iterator&gt; 
415   make_u32regex_token_iterator(const std::basic_string&lt;charT, Traits, Alloc&gt;&amp; p,
416                                const u32regex&amp; e,
417                                const std::vector&lt;int&gt;&amp; submatch,
418                                regex_constants::match_flag_type m = regex_constants::match_default);
420u32regex_token_iterator&lt;const UChar*&gt; 
421   make_u32regex_token_iterator(const UnicodeString&amp; s,
422                                const u32regex&amp; e,
423                                const std::vector&lt;int&gt;&amp; submatch,
424                                regex_constants::match_flag_type m = regex_constants::match_default);</PRE>
426      <P>Each of these overloads returns an iterator that enumerates one sub-expression
427         for each&nbsp;<EM>submatch</EM> in regular expression&nbsp;<EM>e</EM>, found in
428         text <EM>s</EM>, using match_flags <EM>m.</EM></P>
429      <P><STRONG>Example</STRONG>: search for international currency symbols, along with
430         their associated numeric value:</P>
431      <PRE>
432void enumerate_currencies2(const std::string&amp; text)
434   // enumerate and print all the currency symbols, along
435   // with any associated numeric values:
436   const char* re =
437      "([[:Sc:]][[:Cf:][:Cc:][:Z*:]]*)?"
438      "([[:Nd:]]+(?:[[:Po:]][[:Nd:]]+)?)?"
439      "(?(1)"
440         "|(?(2)"
441            "[[:Cf:][:Cc:][:Z*:]]*"
442         ")"
443         "[[:Sc:]]"
444      ")";
445   boost::u32regex r = boost::make_u32regex(re);
446   boost::u32regex_token_iterator&lt;std::string::const_iterator&gt; 
447      i(boost::make_u32regex_token_iterator(text, r, 1)), j;
448   while(i != j)
449   {
450      std::cout &lt;&lt; *i &lt;&lt; std::endl;
451      ++i;
452   }
455      <P>
456         <HR>
457      <p>Revised&nbsp; 
458         <!--webbot bot="Timestamp" S-Type="EDITED" S-Format="%d %B, %Y" startspan -->
459         05 Jan 2005&nbsp; 
460         <!--webbot bot="Timestamp" endspan i-checksum="39359" --></p>
461      <p><i>© Copyright John Maddock&nbsp;2005</i></p>
462      <P><I>Use, modification and distribution are subject to the Boost Software License,
463            Version 1.0. (See accompanying file <A href="../../../LICENSE_1_0.txt">LICENSE_1_0.txt</A>
464            or copy at <A href="http://www.boost.org/LICENSE_1_0.txt">http://www.boost.org/LICENSE_1_0.txt</A>)</I></P>
465   </body>
Note: See TracBrowser for help on using the repository browser.