KiCad PCB EDA Suite
utf8.h
Go to the documentation of this file.
1 /*
2  * This program source code file is part of KiCad, a free EDA CAD application.
3  *
4  * Copyright (C) 2013 SoftPLC Corporation, Dick Hollenbeck <dick@softplc.com>
5  * Copyright (C) 2013 KiCad Developers, see CHANGELOG.TXT for contributors.
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * as published by the Free Software Foundation; either version 2
10  * of the License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, you may find one here:
19  * http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
20  * or you may search the http://www.gnu.org website for the version 2 license,
21  * or you may write to the Free Software Foundation, Inc.,
22  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
23  */
24 
25 #ifndef UTF8_H_
26 #define UTF8_H_
27 
28 #include <string>
29 #include <wx/string.h>
30 
31 #if defined(DEBUG)
32  #define UTF8_VERIFY // Might someday be a hidden cmake config option
33 #endif
34 
35 
/**
 * Function IsUTF8
 * tests a c-string to see if it is UTF8 encoded.
 *
 * @param aString is the c-string to test.
 * @return bool - presumably true when aString is UTF8 encoded; the implementation
 *  lives outside this header, so confirm the exact contract there.
 */
bool IsUTF8( const char* aString );
42 
43 
44 #if defined(UTF8_VERIFY)
45  #define MAYBE_VERIFY_UTF8(x) wxASSERT( IsUTF8(x) )
46 #else
47  #define MAYBE_VERIFY_UTF8(x) // nothing
48 #endif
49 
50 
73 class UTF8
74 {
75 public:
76 
77  UTF8( const wxString& o );
78 
81  UTF8( const char* txt ) :
82  m_s( txt )
83  {
85  }
86 
89  UTF8( const wchar_t* txt );
90 
91  UTF8( const std::string& o ) :
92  m_s( o )
93  {
95  }
96 
97  UTF8()
98  {
99  }
100 
101  ~UTF8() // Needed mainly to build python wrapper
102  {
103  }
104 
105  // expose some std::string functions publicly, since base class must be private.
106 
107  const char* c_str() const { return m_s.c_str(); }
108  bool empty() const { return m_s.empty(); }
109 
110  std::string::size_type find( char c ) const { return m_s.find( c ); }
111  std::string::size_type find( char c, size_t& s ) const { return m_s.find( c, s ); }
112 
113  void clear() { m_s.clear(); }
114  std::string::size_type length() const { return m_s.length(); }
115  std::string::size_type size() const { return m_s.size(); }
116  int compare( const std::string& s ) const { return m_s.compare( s ); }
117 
118  bool operator==( const UTF8& rhs ) const { return m_s == rhs.m_s; }
119  bool operator==( const std::string& rhs ) const { return m_s == rhs; }
120  bool operator==( const char* s ) const { return m_s == s; }
121 
122  std::string::size_type find_first_of( const std::string& str, std::string::size_type pos = 0 ) const
123  {
124  return m_s.find_first_of( str, pos );
125  }
126 
127  UTF8& operator+=( const UTF8& str )
128  {
129  m_s += str.m_s;
131  return (UTF8&) *this;
132  }
133 
134  UTF8& operator+=( char ch )
135  {
136  m_s.operator+=( ch );
138  return (UTF8&) *this;
139  }
140 
141  UTF8& operator+=( const char* s )
142  {
143  m_s.operator+=( s );
145  return (UTF8&) *this;
146  }
147 
148  static const std::string::size_type npos = std::string::npos;
149 
150  UTF8& operator=( const wxString& o );
151 
152  UTF8& operator=( const std::string& o )
153  {
154  m_s = o;
156  return *this;
157  }
158 
159  UTF8& operator=( const char* s )
160  {
161  m_s = s;
163  return *this;
164  }
165 
166  UTF8& operator=( char c )
167  {
168  m_s = c;
170  return *this;
171  }
172 
173  // a substring of a UTF8 is not necessarily a UTF8 if a multibyte character
174  // was split, so return std::string not UTF8
175  std::string substr( size_t pos = 0, size_t len = npos ) const
176  {
177  return m_s.substr( pos, len );
178  }
179 
180  operator const std::string& () const { return m_s; }
181  //operator std::string& () { return m_s; }
182  //operator std::string () const { return m_s; }
183 
184  wxString wx_str() const;
185  operator wxString () const;
186 
187  // "Read only" iterating over bytes is done with these, use the uni_iter to iterate
188  // over UTF8 (multi-byte) characters
189  std::string::const_iterator begin() const { return m_s.begin(); }
190  std::string::const_iterator end() const { return m_s.end(); }
191 
192 #ifndef SWIG
193 
200  class uni_iter
201  {
202  friend class UTF8;
203 
204  const unsigned char* it;
205 
206  // private constructor
207  uni_iter( const char* start ) :
208  it( (const unsigned char*) start )
209  {
210  }
211 
212 
213  public:
214 
215  uni_iter() // Needed only to build python wrapper, not used outside the wrapper
216  {
217  it = NULL;
218  }
219 
220  uni_iter( const uni_iter& o )
221  {
222  it = o.it;
223  }
224 
227  {
228  it += uni_forward( it );
229  return *this;
230  }
231 
234  {
235  uni_iter ret = *this;
236 
237  it += uni_forward( it );
238  return ret;
239  }
240 
242  unsigned operator->() const
243  {
244  unsigned result;
245 
246  // grab the result, do not advance
247  uni_forward( it, &result );
248  return result;
249  }
250 
252  unsigned operator*() const
253  {
254  unsigned result;
255 
256  // grab the result, do not advance
257  uni_forward( it, &result );
258  return result;
259  }
260 
261  uni_iter operator-( int aVal ) const { return uni_iter( (char*) it - aVal ); }
262 
263  bool operator==( const uni_iter& other ) const { return it == other.it; }
264  bool operator!=( const uni_iter& other ) const { return it != other.it; }
265 
268  bool operator< ( const uni_iter& other ) const { return it < other.it; }
269  bool operator<=( const uni_iter& other ) const { return it <= other.it; }
270  bool operator> ( const uni_iter& other ) const { return it > other.it; }
271  bool operator>=( const uni_iter& other ) const { return it >= other.it; }
272  };
273 
278  uni_iter ubegin() const
279  {
280  return uni_iter( m_s.data() );
281  }
282 
287  uni_iter uend() const
288  {
289  return uni_iter( m_s.data() + m_s.size() );
290  }
291 
301  static int uni_forward( const unsigned char* aSequence, unsigned* aResult = NULL );
302 #endif // SWIG
303 
304 protected:
305  std::string m_s;
306 };
307 
308 
309 #endif // UTF8_H_
bool operator!=(const uni_iter &other) const
Definition: utf8.h:264
Class UTF8 is an 8 bit string that is assuredly encoded in UTF8, and supplies special conversion supp...
Definition: utf8.h:73
uni_iter operator-(int aVal) const
Definition: utf8.h:261
UTF8 & operator=(const wxString &o)
Definition: utf8.cpp:60
bool operator<(const uni_iter &other) const
Since the ++ operators advance more than one byte, this is your best loop termination test...
Definition: utf8.h:268
bool operator==(const std::string &rhs) const
Definition: utf8.h:119
std::string substr(size_t pos=0, size_t len=npos) const
Definition: utf8.h:175
uni_iter(const uni_iter &o)
Definition: utf8.h:220
std::string::size_type size() const
Definition: utf8.h:115
bool IsUTF8(const char *aString)
Function IsUTF8 tests a c-string to see if it is UTF8 encoded.
Definition: utf8.cpp:170
UTF8()
Definition: utf8.h:97
uni_iter ubegin() const
Function ubegin returns a uni_iter initialized to the start of "this" UTF8 byte sequence.
Definition: utf8.h:278
bool operator==(const UTF8 &rhs) const
Definition: utf8.h:118
bool operator<=(const uni_iter &other) const
Definition: utf8.h:269
unsigned operator*() const
return unicode at current position
Definition: utf8.h:252
wxString wx_str() const
Definition: utf8.cpp:48
unsigned operator->() const
return unicode at current position
Definition: utf8.h:242
bool operator==(const uni_iter &other) const
Definition: utf8.h:263
bool empty() const
Definition: utf8.h:108
UTF8 & operator=(const std::string &o)
Definition: utf8.h:152
std::string::size_type find_first_of(const std::string &str, std::string::size_type pos=0) const
Definition: utf8.h:122
UTF8(const char *txt)
This is a constructor for which you could end up with non-UTF8 encoding, but that would be your fault...
Definition: utf8.h:81
std::string m_s
Definition: utf8.h:305
std::string::size_type find(char c) const
Definition: utf8.h:110
int compare(const std::string &s) const
Definition: utf8.h:116
UTF8 & operator=(const char *s)
Definition: utf8.h:159
std::string::size_type length() const
Definition: utf8.h:114
std::string::const_iterator end() const
Definition: utf8.h:190
const uni_iter & operator++()
pre-increment and return uni_iter at new position
Definition: utf8.h:226
bool operator>(const uni_iter &other) const
Definition: utf8.h:270
UTF8 & operator+=(const char *s)
Definition: utf8.h:141
UTF8 & operator+=(const UTF8 &str)
Definition: utf8.h:127
class uni_iter is a non-mutating iterator that walks through unicode code points in the UTF8 encoded ...
Definition: utf8.h:200
std::string::const_iterator begin() const
Definition: utf8.h:189
uni_iter uend() const
Function uend returns a uni_iter initialized to the end of "this" UTF8 byte sequence.
Definition: utf8.h:287
UTF8 & operator=(char c)
Definition: utf8.h:166
uni_iter(const char *start)
Definition: utf8.h:207
UTF8 & operator+=(char ch)
Definition: utf8.h:134
uni_iter operator++(int)
post-increment and return uni_iter at initial position
Definition: utf8.h:233
static const std::string::size_type npos
Definition: utf8.h:148
const char * c_str() const
Definition: utf8.h:107
UTF8(const std::string &o)
Definition: utf8.h:91
static int uni_forward(const unsigned char *aSequence, unsigned *aResult=NULL)
Function uni_forward advances over a single UTF8 encoded multibyte character, capturing the unicode c...
Definition: utf8.cpp:70
bool operator>=(const uni_iter &other) const
Definition: utf8.h:271
bool operator==(const char *s) const
Definition: utf8.h:120
const unsigned char * it
Definition: utf8.h:204
void clear()
Definition: utf8.h:113
~UTF8()
Definition: utf8.h:101
#define MAYBE_VERIFY_UTF8(x)
Definition: utf8.h:47
std::string::size_type find(char c, size_t &s) const
Definition: utf8.h:111