KiCad PCB EDA Suite
utf8.cpp
Go to the documentation of this file.
1 /*
2  * This program source code file is part of KiCad, a free EDA CAD application.
3  *
4  * Copyright (C) 2013 SoftPLC Corporation, Dick Hollenbeck <dick@softplc.com>
5  * Copyright (C) 2013 KiCad Developers, see CHANGELOG.TXT for contributors.
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * as published by the Free Software Foundation; either version 2
10  * of the License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, you may find one here:
19  * http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
20  * or you may search the http://www.gnu.org website for the version 2 license,
21  * or you may write to the Free Software Foundation, Inc.,
22  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23  */
24 
25 #include <utf8.h>
26 
27 /* THROW_IO_ERROR needs this, but it includes this file, so until some
28  factoring of THROW_IO_ERROR into a separate header, defer and use the asserts.
29 #include <richio.h>
30 */
31 
32 #include <assert.h>
33 
34 /*
35  These are not inlined so that code space is saved by encapsulating the
36  creation of intermediate objects and the referencing of wxConvUTF8.
37 */
38 
39 
40 UTF8::UTF8( const wxString& o ) :
41  std::string( (const char*) o.utf8_str() )
42 {
43 }
44 
45 
46 UTF8::operator wxString () const
47 {
48  return wxString( c_str(), wxConvUTF8 );
49 }
50 
51 
52 UTF8& UTF8::operator=( const wxString& o )
53 {
54  std::string::operator=( (const char*) o.utf8_str() );
55  return *this;
56 }
57 
58 
// Fallback used when no THROW_IO_ERROR is already defined at this point: the
// macro then expands to nothing, so the error checks below that invoke it do
// not stop execution and only the assert()s remain as a safety net.
#if !defined( THROW_IO_ERROR )
    #define THROW_IO_ERROR( x )     // nothing
#endif
62 
// wxWidgets provides no function that does this, because wchar_t is only 16 bits
// wide on Windows and wx wants to encode its output as UTF16 there.
65 
66 int UTF8::uni_forward( const unsigned char* aSequence, unsigned* aResult )
67 {
68  unsigned ch = *aSequence;
69 
70  if( ch < 0x80 )
71  {
72  if( aResult )
73  *aResult = ch;
74  return 1;
75  }
76 
77  const unsigned char* s = aSequence;
78 
79  static const unsigned char utf8_len[] = {
80  // Map encoded prefix byte to sequence length. Zero means
81  // illegal prefix. See RFC 3629 for details
82  /*
83  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 00-0F
84  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
85  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
86  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
87  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
88  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
89  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
90  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 70-7F
91  */
92  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 80-8F
93  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
94  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
95  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B0-BF
96  0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0-C1 + C2-CF
97  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0-DF
98  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E0-EF
99  4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // F0-F4 + F5-FF
100  };
101 
102  int len = utf8_len[ *s - 0x80 /* top half of table is missing */ ];
103 
104  switch( len )
105  {
106  default:
107  case 0:
108  THROW_IO_ERROR( "invalid start byte" );
109  break;
110 
111  case 2:
112  if( ( s[1] & 0xc0 ) != 0x80 )
113  {
114  THROW_IO_ERROR( "invalid continuation byte" );
115  }
116 
117  ch = ((s[0] & 0x1f) << 6) +
118  ((s[1] & 0x3f) << 0);
119 
120  assert( ch > 0x007F && ch <= 0x07FF );
121  break;
122 
123  case 3:
124  if( (s[1] & 0xc0) != 0x80 ||
125  (s[2] & 0xc0) != 0x80 ||
126  (s[0] == 0xE0 && s[1] < 0xA0)
127  // || (s[0] == 0xED && s[1] > 0x9F)
128  )
129  {
130  THROW_IO_ERROR( "invalid continuation byte" );
131  }
132 
133  ch = ((s[0] & 0x0f) << 12) +
134  ((s[1] & 0x3f) << 6 ) +
135  ((s[2] & 0x3f) << 0 );
136 
137  assert( ch > 0x07FF && ch <= 0xFFFF );
138  break;
139 
140  case 4:
141  if( (s[1] & 0xc0) != 0x80 ||
142  (s[2] & 0xc0) != 0x80 ||
143  (s[3] & 0xc0) != 0x80 ||
144  (s[0] == 0xF0 && s[1] < 0x90) ||
145  (s[0] == 0xF4 && s[1] > 0x8F) )
146  {
147  THROW_IO_ERROR( "invalid continuation byte" );
148  }
149 
150  ch = ((s[0] & 0x7) << 18) +
151  ((s[1] & 0x3f) << 12) +
152  ((s[2] & 0x3f) << 6 ) +
153  ((s[3] & 0x3f) << 0 );
154 
155  assert( ch > 0xFFFF && ch <= 0x10ffff );
156  break;
157  }
158 
159  if( aResult )
160  *aResult = ch;
161 
162  return len;
163 }
164 
165 
166 UTF8::UTF8( const wchar_t* txt ) :
167  // size initial string safely large enough, then shrink to known size later.
168  std::string( wcslen( txt ) * 4, 0 )
169 {
170  /*
171 
172  "this" string was sized to hold the worst case UTF8 encoded byte
173  sequence, and was initialized with all nul bytes. Overwrite some of
174  those nuls, then resize, shrinking down to actual size.
175 
176  Use the wx 2.8 function, not new FromWChar(). It knows about wchar_t
177  possibly being 16 bits wide on Windows and holding UTF16 input.
178 
179  */
180 
181  int sz = wxConvUTF8.WC2MB( (char*) data(), txt, size() );
182 
183  resize( sz );
184 }
185 
186 
187 #if 0 // some unit tests:
188 
189 #include <stdio.h>
190 
191 wxString wxFunctionTaking_wxString( const wxString& wx )
192 {
193  printf( "%s:'%s'\n", __func__, (char*) UTF8( wx ) );
194  printf( "%s:'%s'\n", __func__, (const char*) UTF8( wx ) );
195  printf( "%s:'%s'\n", __func__, UTF8( wx ).c_str() );
196 
197  return wx;
198 }
199 
200 int main()
201 {
202  std::string str = "input";
203 
204  UTF8 u0 = L"wide string";
205  UTF8 u1 = "initial";
206  wxString wx = wxT( "input2" );
207 
208  printf( "u0:'%s'\n", u0.c_str() );
209  printf( "u1:'%s'\n", u1.c_str() );
210 
211  u1 = str;
212 
213  wxString wx2 = u1;
214 
215  // force a std::string into a UTF8, then into a wxString, then copy construct:
216  wxString wx3 = (UTF8&) u1;
217 
218  UTF8 u2 = wx2;
219 
220  u2 += 'X';
221 
222  printf( "u2:'%s'\n", u2.c_str() );
223 
224  // key accomplishments here:
225  // 1) passing a UTF8 to a function which normally takes a wxString.
226  // 2) return a wxString back into a UTF8.
227  UTF8 result = wxFunctionTaking_wxString( u2 );
228 
229  printf( "result:'%s'\n", result.c_str() );
230 
231  // test the unicode iterator:
232  for( UTF8::uni_iter it = u2.ubegin(); it < u2.uend(); )
233  {
234  // test post-increment:
235  printf( " _%02x_", *it++ );
236  }
237 
238  printf( "\n" );
239 
240  UTF8::uni_iter it = u2.ubegin();
241 
242  UTF8::uni_iter it2 = it++;
243 
244  printf( "post_inc:'%c' should be 'i'\n", *it2 );
245 
246  it2 = ++it;
247 
248  printf( "pre_inc:'%c' should be 'p'\n", *it2 );
249 
250  printf( "u[1]:'%c' should be 'n'\n", u2[1] );
251 
252  return 0;
253 }
254 
255 #endif
Class UTF8 is an 8 bit std::string that is assuredly encoded in UTF8, and supplies special conversion...
Definition: utf8.h:53
UTF8 & operator=(const wxString &o)
Definition: utf8.cpp:52
UTF8()
Definition: utf8.h:75
uni_iter ubegin() const
Function ubegin returns a uni_iter initialized to the start of "this" UTF8 byte sequence.
Definition: utf8.h:216
int main(int argc, char **argv)
#define THROW_IO_ERROR(x)
Definition: utf8.cpp:60
class uni_iter is a non-mutating iterator that walks through unicode code points in the UTF8 encoded st...
Definition: utf8.h:137
uni_iter uend() const
Function uend returns a uni_iter initialized to the end of "this" UTF8 byte sequence.
Definition: utf8.h:225
static int uni_forward(const unsigned char *aSequence, unsigned *aResult=NULL)
Function uni_forward advances over a single UTF8 encoded multibyte character, capturing the unicode c...
Definition: utf8.cpp:66