encodings.h
1 // Tencent is pleased to support the open source community by making RapidJSON available.
2 //
3 // Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. All rights reserved.
4 //
5 // Licensed under the MIT License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // http://opensource.org/licenses/MIT
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #ifndef RAPIDJSON_ENCODINGS_H_
16 #define RAPIDJSON_ENCODINGS_H_
17 
18 #include "rapidjson.h"
19 
20 #ifdef _MSC_VER
21 RAPIDJSON_DIAG_PUSH
22 RAPIDJSON_DIAG_OFF(4244) // conversion from 'type1' to 'type2', possible loss of data
23 RAPIDJSON_DIAG_OFF(4702) // unreachable code
24 #elif defined(__GNUC__)
25 RAPIDJSON_DIAG_PUSH
26 RAPIDJSON_DIAG_OFF(effc++)
27 RAPIDJSON_DIAG_OFF(overflow)
28 #endif
29 
30 RAPIDJSON_NAMESPACE_BEGIN
31 
32 ///////////////////////////////////////////////////////////////////////////////
33 // Encoding
34 
35 /*! \class rapidjson::Encoding
36  \brief Concept for encoding of Unicode characters.
37 
38 \code
39 concept Encoding {
40  typename Ch; //! Type of character. A "character" is actually a code unit in unicode's definition.
41 
42  enum { supportUnicode = 1 }; // or 0 if not supporting unicode
43 
44  //! \brief Encode a Unicode codepoint to an output stream.
45  //! \param os Output stream.
46  //! \param codepoint A Unicode codepoint, ranging from 0x0 to 0x10FFFF inclusive.
47  template<typename OutputStream>
48  static void Encode(OutputStream& os, unsigned codepoint);
49 
50  //! \brief Decode a Unicode codepoint from an input stream.
51  //! \param is Input stream.
52  //! \param codepoint Output of the unicode codepoint.
53  //! \return true if a valid codepoint can be decoded from the stream.
54  template <typename InputStream>
55  static bool Decode(InputStream& is, unsigned* codepoint);
56 
57  //! \brief Validate one Unicode codepoint from an encoded stream.
58  //! \param is Input stream to obtain codepoint.
59  //! \param os Output for copying one codepoint.
60  //! \return true if it is valid.
61  //! \note This function only validates and copies the codepoint; it does not actually decode it.
62  template <typename InputStream, typename OutputStream>
63  static bool Validate(InputStream& is, OutputStream& os);
64 
65  // The following functions deal with byte streams.
66 
67  //! Take a character from input byte stream, skipping the BOM if it exists.
68  template <typename InputByteStream>
69  static Ch TakeBOM(InputByteStream& is);
70 
71  //! Take a character from input byte stream.
72  template <typename InputByteStream>
73  static Ch Take(InputByteStream& is);
74 
75  //! Put BOM to output byte stream.
76  template <typename OutputByteStream>
77  static void PutBOM(OutputByteStream& os);
78 
79  //! Put a character to output byte stream.
80  template <typename OutputByteStream>
81  static void Put(OutputByteStream& os, Ch c);
82 };
83 \endcode
84 */
85 
86 ///////////////////////////////////////////////////////////////////////////////
87 // UTF8
88 
89 //! UTF-8 encoding.
90 /*! http://en.wikipedia.org/wiki/UTF-8
91  http://tools.ietf.org/html/rfc3629
92  \tparam CharType Code unit for storing 8-bit UTF-8 data. Default is char.
93  \note implements Encoding concept
94 */
95 template<typename CharType = char>
96 struct UTF8 {
97  typedef CharType Ch;
98 
99  enum { supportUnicode = 1 };
100 
101  template<typename OutputStream>
102  static void Encode(OutputStream& os, unsigned codepoint) {
103  if (codepoint <= 0x7F)
104  os.Put(static_cast<Ch>(codepoint & 0xFF));
105  else if (codepoint <= 0x7FF) {
106  os.Put(static_cast<Ch>(0xC0 | ((codepoint >> 6) & 0xFF)));
107  os.Put(static_cast<Ch>(0x80 | ((codepoint & 0x3F))));
108  }
109  else if (codepoint <= 0xFFFF) {
110  os.Put(static_cast<Ch>(0xE0 | ((codepoint >> 12) & 0xFF)));
111  os.Put(static_cast<Ch>(0x80 | ((codepoint >> 6) & 0x3F)));
112  os.Put(static_cast<Ch>(0x80 | (codepoint & 0x3F)));
113  }
114  else {
115  RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
116  os.Put(static_cast<Ch>(0xF0 | ((codepoint >> 18) & 0xFF)));
117  os.Put(static_cast<Ch>(0x80 | ((codepoint >> 12) & 0x3F)));
118  os.Put(static_cast<Ch>(0x80 | ((codepoint >> 6) & 0x3F)));
119  os.Put(static_cast<Ch>(0x80 | (codepoint & 0x3F)));
120  }
121  }
122 
123  template<typename OutputStream>
124  static void EncodeUnsafe(OutputStream& os, unsigned codepoint) {
125  if (codepoint <= 0x7F)
126  PutUnsafe(os, static_cast<Ch>(codepoint & 0xFF));
127  else if (codepoint <= 0x7FF) {
128  PutUnsafe(os, static_cast<Ch>(0xC0 | ((codepoint >> 6) & 0xFF)));
129  PutUnsafe(os, static_cast<Ch>(0x80 | ((codepoint & 0x3F))));
130  }
131  else if (codepoint <= 0xFFFF) {
132  PutUnsafe(os, static_cast<Ch>(0xE0 | ((codepoint >> 12) & 0xFF)));
133  PutUnsafe(os, static_cast<Ch>(0x80 | ((codepoint >> 6) & 0x3F)));
134  PutUnsafe(os, static_cast<Ch>(0x80 | (codepoint & 0x3F)));
135  }
136  else {
137  RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
138  PutUnsafe(os, static_cast<Ch>(0xF0 | ((codepoint >> 18) & 0xFF)));
139  PutUnsafe(os, static_cast<Ch>(0x80 | ((codepoint >> 12) & 0x3F)));
140  PutUnsafe(os, static_cast<Ch>(0x80 | ((codepoint >> 6) & 0x3F)));
141  PutUnsafe(os, static_cast<Ch>(0x80 | (codepoint & 0x3F)));
142  }
143  }
144 
145  template <typename InputStream>
146  static bool Decode(InputStream& is, unsigned* codepoint) {
147 #define COPY() c = is.Take(); *codepoint = (*codepoint << 6) | (static_cast<unsigned char>(c) & 0x3Fu)
148 #define TRANS(mask) result &= ((GetRange(static_cast<unsigned char>(c)) & mask) != 0)
149 #define TAIL() COPY(); TRANS(0x70)
150  typename InputStream::Ch c = is.Take();
151  if (!(c & 0x80)) {
152  *codepoint = static_cast<unsigned char>(c);
153  return true;
154  }
155 
156  unsigned char type = GetRange(static_cast<unsigned char>(c));
157  if (type >= 32) {
158  *codepoint = 0;
159  } else {
160  *codepoint = (0xFF >> type) & static_cast<unsigned char>(c);
161  }
162  bool result = true;
163  switch (type) {
164  case 2: TAIL(); return result;
165  case 3: TAIL(); TAIL(); return result;
166  case 4: COPY(); TRANS(0x50); TAIL(); return result;
167  case 5: COPY(); TRANS(0x10); TAIL(); TAIL(); return result;
168  case 6: TAIL(); TAIL(); TAIL(); return result;
169  case 10: COPY(); TRANS(0x20); TAIL(); return result;
170  case 11: COPY(); TRANS(0x60); TAIL(); TAIL(); return result;
171  default: return false;
172  }
173 #undef COPY
174 #undef TRANS
175 #undef TAIL
176  }
177 
178  template <typename InputStream, typename OutputStream>
179  static bool Validate(InputStream& is, OutputStream& os) {
180 #define COPY() os.Put(c = is.Take())
181 #define TRANS(mask) result &= ((GetRange(static_cast<unsigned char>(c)) & mask) != 0)
182 #define TAIL() COPY(); TRANS(0x70)
183  Ch c;
184  COPY();
185  if (!(c & 0x80))
186  return true;
187 
188  bool result = true;
189  switch (GetRange(static_cast<unsigned char>(c))) {
190  case 2: TAIL(); return result;
191  case 3: TAIL(); TAIL(); return result;
192  case 4: COPY(); TRANS(0x50); TAIL(); return result;
193  case 5: COPY(); TRANS(0x10); TAIL(); TAIL(); return result;
194  case 6: TAIL(); TAIL(); TAIL(); return result;
195  case 10: COPY(); TRANS(0x20); TAIL(); return result;
196  case 11: COPY(); TRANS(0x60); TAIL(); TAIL(); return result;
197  default: return false;
198  }
199 #undef COPY
200 #undef TRANS
201 #undef TAIL
202  }
203 
204  static unsigned char GetRange(unsigned char c) {
205  // Referring to DFA of http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
206  // With new mapping 1 -> 0x10, 7 -> 0x20, 9 -> 0x40, such that AND operation can test multiple types.
207  static const unsigned char type[] = {
208  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
209  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
210  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
211  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
212  0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,0x10,
213  0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,0x40,
214  0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
215  0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
216  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
217  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
218  };
219  return type[c];
220  }
221 
222  template <typename InputByteStream>
223  static CharType TakeBOM(InputByteStream& is) {
224  RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
225  typename InputByteStream::Ch c = Take(is);
226  if (static_cast<unsigned char>(c) != 0xEFu) return c;
227  c = is.Take();
228  if (static_cast<unsigned char>(c) != 0xBBu) return c;
229  c = is.Take();
230  if (static_cast<unsigned char>(c) != 0xBFu) return c;
231  c = is.Take();
232  return c;
233  }
234 
235  template <typename InputByteStream>
236  static Ch Take(InputByteStream& is) {
237  RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
238  return static_cast<Ch>(is.Take());
239  }
240 
241  template <typename OutputByteStream>
242  static void PutBOM(OutputByteStream& os) {
243  RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
244  os.Put(static_cast<typename OutputByteStream::Ch>(0xEFu));
245  os.Put(static_cast<typename OutputByteStream::Ch>(0xBBu));
246  os.Put(static_cast<typename OutputByteStream::Ch>(0xBFu));
247  }
248 
249  template <typename OutputByteStream>
250  static void Put(OutputByteStream& os, Ch c) {
251  RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
252  os.Put(static_cast<typename OutputByteStream::Ch>(c));
253  }
254 };
255 
256 ///////////////////////////////////////////////////////////////////////////////
257 // UTF16
258 
259 //! UTF-16 encoding.
260 /*! http://en.wikipedia.org/wiki/UTF-16
261  http://tools.ietf.org/html/rfc2781
262  \tparam CharType Type for storing 16-bit UTF-16 data. Default is wchar_t. C++11 may use char16_t instead.
263  \note implements Encoding concept
264 
265  \note For in-memory access, no need to concern endianness. The code units and code points are represented by CPU's endianness.
266  For streaming, use UTF16LE and UTF16BE, which handle endianness.
267 */
268 template<typename CharType = wchar_t>
269 struct UTF16 {
270  typedef CharType Ch;
271  RAPIDJSON_STATIC_ASSERT(sizeof(Ch) >= 2);
272 
273  enum { supportUnicode = 1 };
274 
275  template<typename OutputStream>
276  static void Encode(OutputStream& os, unsigned codepoint) {
277  RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 2);
278  if (codepoint <= 0xFFFF) {
279  RAPIDJSON_ASSERT(codepoint < 0xD800 || codepoint > 0xDFFF); // Code point itself cannot be surrogate pair
280  os.Put(static_cast<typename OutputStream::Ch>(codepoint));
281  }
282  else {
283  RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
284  unsigned v = codepoint - 0x10000;
285  os.Put(static_cast<typename OutputStream::Ch>((v >> 10) | 0xD800));
286  os.Put((v & 0x3FF) | 0xDC00);
287  }
288  }
289 
290 
291  template<typename OutputStream>
292  static void EncodeUnsafe(OutputStream& os, unsigned codepoint) {
293  RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 2);
294  if (codepoint <= 0xFFFF) {
295  RAPIDJSON_ASSERT(codepoint < 0xD800 || codepoint > 0xDFFF); // Code point itself cannot be surrogate pair
296  PutUnsafe(os, static_cast<typename OutputStream::Ch>(codepoint));
297  }
298  else {
299  RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
300  unsigned v = codepoint - 0x10000;
301  PutUnsafe(os, static_cast<typename OutputStream::Ch>((v >> 10) | 0xD800));
302  PutUnsafe(os, (v & 0x3FF) | 0xDC00);
303  }
304  }
305 
306  template <typename InputStream>
307  static bool Decode(InputStream& is, unsigned* codepoint) {
308  RAPIDJSON_STATIC_ASSERT(sizeof(typename InputStream::Ch) >= 2);
309  typename InputStream::Ch c = is.Take();
310  if (c < 0xD800 || c > 0xDFFF) {
311  *codepoint = static_cast<unsigned>(c);
312  return true;
313  }
314  else if (c <= 0xDBFF) {
315  *codepoint = (static_cast<unsigned>(c) & 0x3FF) << 10;
316  c = is.Take();
317  *codepoint |= (static_cast<unsigned>(c) & 0x3FF);
318  *codepoint += 0x10000;
319  return c >= 0xDC00 && c <= 0xDFFF;
320  }
321  return false;
322  }
323 
324  template <typename InputStream, typename OutputStream>
325  static bool Validate(InputStream& is, OutputStream& os) {
326  RAPIDJSON_STATIC_ASSERT(sizeof(typename InputStream::Ch) >= 2);
327  RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 2);
328  typename InputStream::Ch c;
329  os.Put(static_cast<typename OutputStream::Ch>(c = is.Take()));
330  if (c < 0xD800 || c > 0xDFFF)
331  return true;
332  else if (c <= 0xDBFF) {
333  os.Put(c = is.Take());
334  return c >= 0xDC00 && c <= 0xDFFF;
335  }
336  return false;
337  }
338 };
339 
340 //! UTF-16 little endian encoding.
341 template<typename CharType = wchar_t>
342 struct UTF16LE : UTF16<CharType> {
343  template <typename InputByteStream>
344  static CharType TakeBOM(InputByteStream& is) {
345  RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
346  CharType c = Take(is);
347  return static_cast<uint16_t>(c) == 0xFEFFu ? Take(is) : c;
348  }
349 
350  template <typename InputByteStream>
351  static CharType Take(InputByteStream& is) {
352  RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
353  unsigned c = static_cast<uint8_t>(is.Take());
354  c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 8;
355  return static_cast<CharType>(c);
356  }
357 
358  template <typename OutputByteStream>
359  static void PutBOM(OutputByteStream& os) {
360  RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
361  os.Put(static_cast<typename OutputByteStream::Ch>(0xFFu));
362  os.Put(static_cast<typename OutputByteStream::Ch>(0xFEu));
363  }
364 
365  template <typename OutputByteStream>
366  static void Put(OutputByteStream& os, CharType c) {
367  RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
368  os.Put(static_cast<typename OutputByteStream::Ch>(static_cast<unsigned>(c) & 0xFFu));
369  os.Put(static_cast<typename OutputByteStream::Ch>((static_cast<unsigned>(c) >> 8) & 0xFFu));
370  }
371 };
372 
373 //! UTF-16 big endian encoding.
374 template<typename CharType = wchar_t>
375 struct UTF16BE : UTF16<CharType> {
376  template <typename InputByteStream>
377  static CharType TakeBOM(InputByteStream& is) {
378  RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
379  CharType c = Take(is);
380  return static_cast<uint16_t>(c) == 0xFEFFu ? Take(is) : c;
381  }
382 
383  template <typename InputByteStream>
384  static CharType Take(InputByteStream& is) {
385  RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
386  unsigned c = static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 8;
387  c |= static_cast<uint8_t>(is.Take());
388  return static_cast<CharType>(c);
389  }
390 
391  template <typename OutputByteStream>
392  static void PutBOM(OutputByteStream& os) {
393  RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
394  os.Put(static_cast<typename OutputByteStream::Ch>(0xFEu));
395  os.Put(static_cast<typename OutputByteStream::Ch>(0xFFu));
396  }
397 
398  template <typename OutputByteStream>
399  static void Put(OutputByteStream& os, CharType c) {
400  RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
401  os.Put(static_cast<typename OutputByteStream::Ch>((static_cast<unsigned>(c) >> 8) & 0xFFu));
402  os.Put(static_cast<typename OutputByteStream::Ch>(static_cast<unsigned>(c) & 0xFFu));
403  }
404 };
405 
406 ///////////////////////////////////////////////////////////////////////////////
407 // UTF32
408 
409 //! UTF-32 encoding.
410 /*! http://en.wikipedia.org/wiki/UTF-32
411  \tparam CharType Type for storing 32-bit UTF-32 data. Default is unsigned. C++11 may use char32_t instead.
412  \note implements Encoding concept
413 
414  \note For in-memory access, no need to concern endianness. The code units and code points are represented by CPU's endianness.
415  For streaming, use UTF32LE and UTF32BE, which handle endianness.
416 */
417 template<typename CharType = unsigned>
418 struct UTF32 {
419  typedef CharType Ch;
420  RAPIDJSON_STATIC_ASSERT(sizeof(Ch) >= 4);
421 
422  enum { supportUnicode = 1 };
423 
424  template<typename OutputStream>
425  static void Encode(OutputStream& os, unsigned codepoint) {
426  RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 4);
427  RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
428  os.Put(codepoint);
429  }
430 
431  template<typename OutputStream>
432  static void EncodeUnsafe(OutputStream& os, unsigned codepoint) {
433  RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputStream::Ch) >= 4);
434  RAPIDJSON_ASSERT(codepoint <= 0x10FFFF);
435  PutUnsafe(os, codepoint);
436  }
437 
438  template <typename InputStream>
439  static bool Decode(InputStream& is, unsigned* codepoint) {
440  RAPIDJSON_STATIC_ASSERT(sizeof(typename InputStream::Ch) >= 4);
441  Ch c = is.Take();
442  *codepoint = c;
443  return c <= 0x10FFFF;
444  }
445 
446  template <typename InputStream, typename OutputStream>
447  static bool Validate(InputStream& is, OutputStream& os) {
448  RAPIDJSON_STATIC_ASSERT(sizeof(typename InputStream::Ch) >= 4);
449  Ch c;
450  os.Put(c = is.Take());
451  return c <= 0x10FFFF;
452  }
453 };
454 
455 //! UTF-32 little endian enocoding.
456 template<typename CharType = unsigned>
457 struct UTF32LE : UTF32<CharType> {
458  template <typename InputByteStream>
459  static CharType TakeBOM(InputByteStream& is) {
460  RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
461  CharType c = Take(is);
462  return static_cast<uint32_t>(c) == 0x0000FEFFu ? Take(is) : c;
463  }
464 
465  template <typename InputByteStream>
466  static CharType Take(InputByteStream& is) {
467  RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
468  unsigned c = static_cast<uint8_t>(is.Take());
469  c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 8;
470  c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 16;
471  c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 24;
472  return static_cast<CharType>(c);
473  }
474 
475  template <typename OutputByteStream>
476  static void PutBOM(OutputByteStream& os) {
477  RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
478  os.Put(static_cast<typename OutputByteStream::Ch>(0xFFu));
479  os.Put(static_cast<typename OutputByteStream::Ch>(0xFEu));
480  os.Put(static_cast<typename OutputByteStream::Ch>(0x00u));
481  os.Put(static_cast<typename OutputByteStream::Ch>(0x00u));
482  }
483 
484  template <typename OutputByteStream>
485  static void Put(OutputByteStream& os, CharType c) {
486  RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
487  os.Put(static_cast<typename OutputByteStream::Ch>(c & 0xFFu));
488  os.Put(static_cast<typename OutputByteStream::Ch>((c >> 8) & 0xFFu));
489  os.Put(static_cast<typename OutputByteStream::Ch>((c >> 16) & 0xFFu));
490  os.Put(static_cast<typename OutputByteStream::Ch>((c >> 24) & 0xFFu));
491  }
492 };
493 
494 //! UTF-32 big endian encoding.
495 template<typename CharType = unsigned>
496 struct UTF32BE : UTF32<CharType> {
497  template <typename InputByteStream>
498  static CharType TakeBOM(InputByteStream& is) {
499  RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
500  CharType c = Take(is);
501  return static_cast<uint32_t>(c) == 0x0000FEFFu ? Take(is) : c;
502  }
503 
504  template <typename InputByteStream>
505  static CharType Take(InputByteStream& is) {
506  RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
507  unsigned c = static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 24;
508  c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 16;
509  c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take())) << 8;
510  c |= static_cast<unsigned>(static_cast<uint8_t>(is.Take()));
511  return static_cast<CharType>(c);
512  }
513 
514  template <typename OutputByteStream>
515  static void PutBOM(OutputByteStream& os) {
516  RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
517  os.Put(static_cast<typename OutputByteStream::Ch>(0x00u));
518  os.Put(static_cast<typename OutputByteStream::Ch>(0x00u));
519  os.Put(static_cast<typename OutputByteStream::Ch>(0xFEu));
520  os.Put(static_cast<typename OutputByteStream::Ch>(0xFFu));
521  }
522 
523  template <typename OutputByteStream>
524  static void Put(OutputByteStream& os, CharType c) {
525  RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
526  os.Put(static_cast<typename OutputByteStream::Ch>((c >> 24) & 0xFFu));
527  os.Put(static_cast<typename OutputByteStream::Ch>((c >> 16) & 0xFFu));
528  os.Put(static_cast<typename OutputByteStream::Ch>((c >> 8) & 0xFFu));
529  os.Put(static_cast<typename OutputByteStream::Ch>(c & 0xFFu));
530  }
531 };
532 
533 ///////////////////////////////////////////////////////////////////////////////
534 // ASCII
535 
536 //! ASCII encoding.
537 /*! http://en.wikipedia.org/wiki/ASCII
538  \tparam CharType Code unit for storing 7-bit ASCII data. Default is char.
539  \note implements Encoding concept
540 */
541 template<typename CharType = char>
542 struct ASCII {
543  typedef CharType Ch;
544 
545  enum { supportUnicode = 0 };
546 
547  template<typename OutputStream>
548  static void Encode(OutputStream& os, unsigned codepoint) {
549  RAPIDJSON_ASSERT(codepoint <= 0x7F);
550  os.Put(static_cast<Ch>(codepoint & 0xFF));
551  }
552 
553  template<typename OutputStream>
554  static void EncodeUnsafe(OutputStream& os, unsigned codepoint) {
555  RAPIDJSON_ASSERT(codepoint <= 0x7F);
556  PutUnsafe(os, static_cast<Ch>(codepoint & 0xFF));
557  }
558 
559  template <typename InputStream>
560  static bool Decode(InputStream& is, unsigned* codepoint) {
561  uint8_t c = static_cast<uint8_t>(is.Take());
562  *codepoint = c;
563  return c <= 0X7F;
564  }
565 
566  template <typename InputStream, typename OutputStream>
567  static bool Validate(InputStream& is, OutputStream& os) {
568  uint8_t c = static_cast<uint8_t>(is.Take());
569  os.Put(static_cast<typename OutputStream::Ch>(c));
570  return c <= 0x7F;
571  }
572 
573  template <typename InputByteStream>
574  static CharType TakeBOM(InputByteStream& is) {
575  RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
576  uint8_t c = static_cast<uint8_t>(Take(is));
577  return static_cast<Ch>(c);
578  }
579 
580  template <typename InputByteStream>
581  static Ch Take(InputByteStream& is) {
582  RAPIDJSON_STATIC_ASSERT(sizeof(typename InputByteStream::Ch) == 1);
583  return static_cast<Ch>(is.Take());
584  }
585 
586  template <typename OutputByteStream>
587  static void PutBOM(OutputByteStream& os) {
588  RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
589  (void)os;
590  }
591 
592  template <typename OutputByteStream>
593  static void Put(OutputByteStream& os, Ch c) {
594  RAPIDJSON_STATIC_ASSERT(sizeof(typename OutputByteStream::Ch) == 1);
595  os.Put(static_cast<typename OutputByteStream::Ch>(c));
596  }
597 };
598 
599 ///////////////////////////////////////////////////////////////////////////////
600 // AutoUTF
601 
//! Runtime-specified UTF encoding type of a stream.
/*! The numeric values are spelled out explicitly; AutoUTF in this file uses the value
    returned by a stream's GetType() as an index into its function tables.
*/
enum UTFType {
    //! UTF-8.
    kUTF8 = 0,
    //! UTF-16 little endian.
    kUTF16LE = 1,
    //! UTF-16 big endian.
    kUTF16BE = 2,
    //! UTF-32 little endian.
    kUTF32LE = 3,
    //! UTF-32 big endian.
    kUTF32BE = 4
};
610 
611 //! Dynamically select encoding according to stream's runtime-specified UTF encoding type.
612 /*! \note This class can be used with AutoUTFInputtStream and AutoUTFOutputStream, which provides GetType().
613 */
614 template<typename CharType>
615 struct AutoUTF {
616  typedef CharType Ch;
617 
618  enum { supportUnicode = 1 };
619 
620 #define RAPIDJSON_ENCODINGS_FUNC(x) UTF8<Ch>::x, UTF16LE<Ch>::x, UTF16BE<Ch>::x, UTF32LE<Ch>::x, UTF32BE<Ch>::x
621 
622  template<typename OutputStream>
623  RAPIDJSON_FORCEINLINE static void Encode(OutputStream& os, unsigned codepoint) {
624  typedef void (*EncodeFunc)(OutputStream&, unsigned);
625  static const EncodeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Encode) };
626  (*f[os.GetType()])(os, codepoint);
627  }
628 
629  template<typename OutputStream>
630  RAPIDJSON_FORCEINLINE static void EncodeUnsafe(OutputStream& os, unsigned codepoint) {
631  typedef void (*EncodeFunc)(OutputStream&, unsigned);
632  static const EncodeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(EncodeUnsafe) };
633  (*f[os.GetType()])(os, codepoint);
634  }
635 
636  template <typename InputStream>
637  RAPIDJSON_FORCEINLINE static bool Decode(InputStream& is, unsigned* codepoint) {
638  typedef bool (*DecodeFunc)(InputStream&, unsigned*);
639  static const DecodeFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Decode) };
640  return (*f[is.GetType()])(is, codepoint);
641  }
642 
643  template <typename InputStream, typename OutputStream>
644  RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) {
645  typedef bool (*ValidateFunc)(InputStream&, OutputStream&);
646  static const ValidateFunc f[] = { RAPIDJSON_ENCODINGS_FUNC(Validate) };
647  return (*f[is.GetType()])(is, os);
648  }
649 
650 #undef RAPIDJSON_ENCODINGS_FUNC
651 };
652 
653 ///////////////////////////////////////////////////////////////////////////////
654 // Transcoder
655 
656 //! Encoding conversion.
657 template<typename SourceEncoding, typename TargetEncoding>
658 struct Transcoder {
659  //! Take one Unicode codepoint from source encoding, convert it to target encoding and put it to the output stream.
660  template<typename InputStream, typename OutputStream>
661  RAPIDJSON_FORCEINLINE static bool Transcode(InputStream& is, OutputStream& os) {
662  unsigned codepoint;
663  if (!SourceEncoding::Decode(is, &codepoint))
664  return false;
665  TargetEncoding::Encode(os, codepoint);
666  return true;
667  }
668 
669  template<typename InputStream, typename OutputStream>
670  RAPIDJSON_FORCEINLINE static bool TranscodeUnsafe(InputStream& is, OutputStream& os) {
671  unsigned codepoint;
672  if (!SourceEncoding::Decode(is, &codepoint))
673  return false;
674  TargetEncoding::EncodeUnsafe(os, codepoint);
675  return true;
676  }
677 
678  //! Validate one Unicode codepoint from an encoded stream.
679  template<typename InputStream, typename OutputStream>
680  RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) {
681  return Transcode(is, os); // Since source/target encoding is different, must transcode.
682  }
683 };
684 
685 // Forward declaration.
686 template<typename Stream>
687 inline void PutUnsafe(Stream& stream, typename Stream::Ch c);
688 
689 //! Specialization of Transcoder with same source and target encoding.
690 template<typename Encoding>
692  template<typename InputStream, typename OutputStream>
693  RAPIDJSON_FORCEINLINE static bool Transcode(InputStream& is, OutputStream& os) {
694  os.Put(is.Take()); // Just copy one code unit. This semantic is different from primary template class.
695  return true;
696  }
697 
698  template<typename InputStream, typename OutputStream>
699  RAPIDJSON_FORCEINLINE static bool TranscodeUnsafe(InputStream& is, OutputStream& os) {
700  PutUnsafe(os, is.Take()); // Just copy one code unit. This semantic is different from primary template class.
701  return true;
702  }
703 
704  template<typename InputStream, typename OutputStream>
705  RAPIDJSON_FORCEINLINE static bool Validate(InputStream& is, OutputStream& os) {
706  return Encoding::Validate(is, os); // source/target encoding are the same
707  }
708 };
709 
710 RAPIDJSON_NAMESPACE_END
711 
712 #if defined(__GNUC__) || defined(_MSC_VER)
713 RAPIDJSON_DIAG_POP
714 #endif
715 
716 #endif // RAPIDJSON_ENCODINGS_H_
rapidjson::kUTF16LE
@ kUTF16LE
UTF-16 little endian.
Definition: encodings.h:605
Stream
Concept for reading and writing characters.
rapidjson.h
common definitions and configuration
rapidjson::UTF16
UTF-16 encoding.
Definition: encodings.h:269
rapidjson::kUTF32LE
@ kUTF32LE
UTF-32 little endian.
Definition: encodings.h:607
rapidjson::ASCII
ASCII encoding.
Definition: encodings.h:542
rapidjson::Transcoder
Encoding conversion.
Definition: encodings.h:658
RAPIDJSON_ASSERT
#define RAPIDJSON_ASSERT(x)
Assertion.
Definition: rapidjson.h:402
rapidjson::Transcoder::Transcode
static RAPIDJSON_FORCEINLINE bool Transcode(InputStream &is, OutputStream &os)
Take one Unicode codepoint from source encoding, convert it to target encoding and put it to the outp...
Definition: encodings.h:661
rapidjson::AutoUTF
Dynamically select encoding according to stream's runtime-specified UTF encoding type.
Definition: encodings.h:615
rapidjson::PutUnsafe
void PutUnsafe(Stream &stream, typename Stream::Ch c)
Write character to a stream, presuming buffer is reserved.
Definition: stream.h:91
rapidjson::kUTF16BE
@ kUTF16BE
UTF-16 big endian.
Definition: encodings.h:606
rapidjson::Transcoder::Validate
static RAPIDJSON_FORCEINLINE bool Validate(InputStream &is, OutputStream &os)
Validate one Unicode codepoint from an encoded stream.
Definition: encodings.h:680
rapidjson::UTF32LE
UTF-32 little endian enocoding.
Definition: encodings.h:457
rapidjson::UTFType
UTFType
Runtime-specified UTF encoding type of a stream.
Definition: encodings.h:603
rapidjson::kUTF8
@ kUTF8
UTF-8.
Definition: encodings.h:604
rapidjson::UTF8
UTF-8 encoding.
Definition: encodings.h:96
rapidjson::UTF32
UTF-32 encoding.
Definition: encodings.h:418
rapidjson::UTF32BE
UTF-32 big endian encoding.
Definition: encodings.h:496
rapidjson::kUTF32BE
@ kUTF32BE
UTF-32 big endian.
Definition: encodings.h:608
rapidjson::UTF16LE
UTF-16 little endian encoding.
Definition: encodings.h:342
Encoding
Concept for encoding of Unicode characters.
rapidjson::UTF16BE
UTF-16 big endian encoding.
Definition: encodings.h:375
RAPIDJSON_STATIC_ASSERT
#define RAPIDJSON_STATIC_ASSERT(x)
(Internal) macro to check for conditions at compile-time
Definition: rapidjson.h:437