Added pugixml in preparation for new config system.
[skeleton.git] / libs / pugixml / pugixml.cpp
1 /**
2  * pugixml parser - version 1.2
3  * --------------------------------------------------------
4  * Copyright (C) 2006-2012, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
5  * Report bugs and download new versions at http://pugixml.org/
6  *
7  * This library is distributed under the MIT License. See notice at the end
8  * of this file.
9  *
10  * This work is based on the pugxml parser, which is:
11  * Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
12  */
13
14 #ifndef SOURCE_PUGIXML_CPP
15 #define SOURCE_PUGIXML_CPP
16
17 #include "pugixml.hpp"
18
19 #include <stdlib.h>
20 #include <stdio.h>
21 #include <string.h>
22 #include <assert.h>
23 #include <wchar.h>
24
25 #ifndef PUGIXML_NO_XPATH
26 #       include <math.h>
27 #       include <float.h>
28 #       ifdef PUGIXML_NO_EXCEPTIONS
29 #               include <setjmp.h>
30 #       endif
31 #endif
32
33 #ifndef PUGIXML_NO_STL
34 #       include <istream>
35 #       include <ostream>
36 #       include <string>
37 #endif
38
39 // For placement new
40 #include <new>
41
42 #ifdef _MSC_VER
43 #       pragma warning(push)
44 #       pragma warning(disable: 4127) // conditional expression is constant
45 #       pragma warning(disable: 4324) // structure was padded due to __declspec(align())
46 #       pragma warning(disable: 4611) // interaction between '_setjmp' and C++ object destruction is non-portable
47 #       pragma warning(disable: 4702) // unreachable code
48 #       pragma warning(disable: 4996) // this function or variable may be unsafe
49 #       pragma warning(disable: 4793) // function compiled as native: presence of '_setjmp' makes a function unmanaged
50 #endif
51
52 #ifdef __INTEL_COMPILER
53 #       pragma warning(disable: 177) // function was declared but never referenced 
54 #       pragma warning(disable: 279) // controlling expression is constant
55 #       pragma warning(disable: 1478 1786) // function was declared "deprecated"
56 #       pragma warning(disable: 1684) // conversion from pointer to same-sized integral type
57 #endif
58
59 #if defined(__BORLANDC__) && defined(PUGIXML_HEADER_ONLY)
60 #       pragma warn -8080 // symbol is declared but never used; disabling this inside push/pop bracket does not make the warning go away
61 #endif
62
63 #ifdef __BORLANDC__
64 #       pragma option push
65 #       pragma warn -8008 // condition is always false
66 #       pragma warn -8066 // unreachable code
67 #endif
68
69 #ifdef __SNC__
70 // Using diag_push/diag_pop does not disable the warnings inside templates due to a compiler bug
71 #       pragma diag_suppress=178 // function was declared but never referenced
72 #       pragma diag_suppress=237 // controlling expression is constant
73 #endif
74
75 // Inlining controls
76 #if defined(_MSC_VER) && _MSC_VER >= 1300
77 #       define PUGI__NO_INLINE __declspec(noinline)
78 #elif defined(__GNUC__)
79 #       define PUGI__NO_INLINE __attribute__((noinline))
80 #else
81 #       define PUGI__NO_INLINE 
82 #endif
83
84 // Simple static assertion
85 #define PUGI__STATIC_ASSERT(cond) { static const char condition_failed[(cond) ? 1 : -1] = {0}; (void)condition_failed[0]; }
86
87 // Digital Mars C++ bug workaround for passing char loaded from memory via stack
88 #ifdef __DMC__
89 #       define PUGI__DMC_VOLATILE volatile
90 #else
91 #       define PUGI__DMC_VOLATILE
92 #endif
93
94 // Borland C++ bug workaround for not defining ::memcpy depending on header include order (can't always use std::memcpy because some compilers don't have it at all)
95 #if defined(__BORLANDC__) && !defined(__MEM_H_USING_LIST)
96 using std::memcpy;
97 using std::memmove;
98 #endif
99
100 // In some environments MSVC is a compiler but the CRT lacks certain MSVC-specific features
101 #if defined(_MSC_VER) && !defined(__S3E__)
102 #       define PUGI__MSVC_CRT_VERSION _MSC_VER
103 #endif
104
105 #ifdef PUGIXML_HEADER_ONLY
106 #       define PUGI__NS_BEGIN namespace pugi { namespace impl {
107 #       define PUGI__NS_END } }
108 #       define PUGI__FN inline
109 #       define PUGI__FN_NO_INLINE inline
110 #else
111 #       if defined(_MSC_VER) && _MSC_VER < 1300 // MSVC6 seems to have an amusing bug with anonymous namespaces inside namespaces
112 #               define PUGI__NS_BEGIN namespace pugi { namespace impl {
113 #               define PUGI__NS_END } }
114 #       else
115 #               define PUGI__NS_BEGIN namespace pugi { namespace impl { namespace {
116 #               define PUGI__NS_END } } }
117 #       endif
118 #       define PUGI__FN
119 #       define PUGI__FN_NO_INLINE PUGI__NO_INLINE
120 #endif
121
122 // uintptr_t
123 #if !defined(_MSC_VER) || _MSC_VER >= 1600
124 #       include <stdint.h>
125 #else
126 #       ifndef _UINTPTR_T_DEFINED
127 // No native uintptr_t in MSVC6 and in some WinCE versions
128 typedef size_t uintptr_t;
129 #define _UINTPTR_T_DEFINED
130 #       endif
131 PUGI__NS_BEGIN
132         typedef unsigned __int8 uint8_t;
133         typedef unsigned __int16 uint16_t;
134         typedef unsigned __int32 uint32_t;
135 PUGI__NS_END
136 #endif
137
138 // Memory allocation
139 PUGI__NS_BEGIN
140         PUGI__FN void* default_allocate(size_t size)
141         {
142                 return malloc(size);
143         }
144
145         PUGI__FN void default_deallocate(void* ptr)
146         {
147                 free(ptr);
148         }
149
150         template <typename T>
151         struct xml_memory_management_function_storage
152         {
153                 static allocation_function allocate;
154                 static deallocation_function deallocate;
155         };
156
157         template <typename T> allocation_function xml_memory_management_function_storage<T>::allocate = default_allocate;
158         template <typename T> deallocation_function xml_memory_management_function_storage<T>::deallocate = default_deallocate;
159
160         typedef xml_memory_management_function_storage<int> xml_memory;
161 PUGI__NS_END
162
163 // String utilities
164 PUGI__NS_BEGIN
165         // Get string length
166         PUGI__FN size_t strlength(const char_t* s)
167         {
168                 assert(s);
169
170         #ifdef PUGIXML_WCHAR_MODE
171                 return wcslen(s);
172         #else
173                 return strlen(s);
174         #endif
175         }
176
177         // Compare two strings
178         PUGI__FN bool strequal(const char_t* src, const char_t* dst)
179         {
180                 assert(src && dst);
181
182         #ifdef PUGIXML_WCHAR_MODE
183                 return wcscmp(src, dst) == 0;
184         #else
185                 return strcmp(src, dst) == 0;
186         #endif
187         }
188
189         // Compare lhs with [rhs_begin, rhs_end)
190         PUGI__FN bool strequalrange(const char_t* lhs, const char_t* rhs, size_t count)
191         {
192                 for (size_t i = 0; i < count; ++i)
193                         if (lhs[i] != rhs[i])
194                                 return false;
195         
196                 return lhs[count] == 0;
197         }
198         
#ifdef PUGIXML_WCHAR_MODE
// Copies the null-terminated narrow string `source` into `dest`, converting each char to
// wchar_t by plain assignment, then writes the terminator. Intended for ASCII-only input;
// `dest` must have room for the source length plus one.
PUGI__FN void widen_ascii(wchar_t* dest, const char* source)
{
	while (*source)
	{
		*dest = *source;

		++dest;
		++source;
	}

	*dest = 0;
}
#endif
207 PUGI__NS_END
208
209 #if !defined(PUGIXML_NO_STL) || !defined(PUGIXML_NO_XPATH)
210 // auto_ptr-like buffer holder for exception recovery
211 PUGI__NS_BEGIN
// Scope guard that owns a raw buffer and releases it with a caller-supplied function.
// The destructor calls deleter(data) unless the buffer is null or ownership was taken
// back with release().
struct buffer_holder
{
	void* data;
	void (*deleter)(void*);

	// Takes ownership of data_; deleter_ is invoked on it from the destructor.
	buffer_holder(void* data_, void (*deleter_)(void*)): data(data_), deleter(deleter_)
	{
	}

	~buffer_holder()
	{
		if (data) deleter(data);
	}

	// Gives up ownership: returns the buffer and leaves the holder empty so the
	// destructor does nothing.
	void* release()
	{
		void* result = data;
		data = 0;
		return result;
	}

private:
	// A copy would make two holders free the same buffer, so copying is disabled.
	// Declared but intentionally not defined (works on pre-C++11 compilers too).
	buffer_holder(const buffer_holder&);
	buffer_holder& operator=(const buffer_holder&);
};
233 PUGI__NS_END
234 #endif
235
236 PUGI__NS_BEGIN
// Capacity threshold used by the allocator for a regular page; can be overridden at
// build time by defining PUGIXML_MEMORY_PAGE_SIZE.
#ifndef PUGIXML_MEMORY_PAGE_SIZE
static const size_t xml_memory_page_size = 32768;
#else
static const size_t xml_memory_page_size = PUGIXML_MEMORY_PAGE_SIZE;
#endif

// Pages are aligned to this many bytes, which leaves the low bits of a page address free.
// Node/attribute headers store the page address in the upper bits and flags in the low ones.
static const uintptr_t xml_memory_page_alignment = 32;

// Extracts the page address from a header word.
static const uintptr_t xml_memory_page_pointer_mask = ~(xml_memory_page_alignment - 1);

// Header flag: the name string was allocated via the allocator and must be freed.
static const uintptr_t xml_memory_page_name_allocated_mask = 16;

// Header flag: the value string was allocated via the allocator and must be freed.
static const uintptr_t xml_memory_page_value_allocated_mask = 8;

// Low header bits that hold the node type (stored as type - 1 by xml_node_struct).
static const uintptr_t xml_memory_page_type_mask = 7;
250
struct xml_allocator;

// Bookkeeping header that sits at the start of every memory page; the usable storage
// begins at `data` and extends past the end of the struct.
struct xml_memory_page
{
	// Initializes a page header in place at `memory` with all fields cleared.
	// Returns the page, or 0 when `memory` is null.
	static xml_memory_page* construct(void* memory)
	{
		xml_memory_page* page = static_cast<xml_memory_page*>(memory);

		if (page)
		{
			page->allocator = 0;
			page->memory = 0;
			page->prev = 0;
			page->next = 0;
			page->busy_size = 0;
			page->freed_size = 0;
		}

		return page;
	}

	// Allocator that owns this page
	xml_allocator* allocator;

	// Pointer originally returned by the allocation hook (may precede the aligned page)
	void* memory;

	// Neighbours in the doubly-linked page list
	xml_memory_page* prev;
	xml_memory_page* next;

	// Bytes handed out from `data`, and bytes returned so far
	size_t busy_size;
	size_t freed_size;

	// First byte of the page storage
	char data[1];
};

// Header stored immediately in front of every string produced by xml_allocator::allocate_string.
struct xml_memory_string_header
{
	// Offset of this header from page->data
	uint16_t page_offset;

	// Total block size including the header; 0 means the string occupies the whole page
	uint16_t full_size;
};
289
290         struct xml_allocator
291         {
292                 xml_allocator(xml_memory_page* root): _root(root), _busy_size(root->busy_size)
293                 {
294                 }
295
296                 xml_memory_page* allocate_page(size_t data_size)
297                 {
298                         size_t size = offsetof(xml_memory_page, data) + data_size;
299
300                         // allocate block with some alignment, leaving memory for worst-case padding
301                         void* memory = xml_memory::allocate(size + xml_memory_page_alignment);
302                         if (!memory) return 0;
303
304                         // align upwards to page boundary
305                         void* page_memory = reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(memory) + (xml_memory_page_alignment - 1)) & ~(xml_memory_page_alignment - 1));
306
307                         // prepare page structure
308                         xml_memory_page* page = xml_memory_page::construct(page_memory);
309
310                         page->memory = memory;
311                         page->allocator = _root->allocator;
312
313                         return page;
314                 }
315
316                 static void deallocate_page(xml_memory_page* page)
317                 {
318                         xml_memory::deallocate(page->memory);
319                 }
320
321                 void* allocate_memory_oob(size_t size, xml_memory_page*& out_page);
322
323                 void* allocate_memory(size_t size, xml_memory_page*& out_page)
324                 {
325                         if (_busy_size + size > xml_memory_page_size) return allocate_memory_oob(size, out_page);
326
327                         void* buf = _root->data + _busy_size;
328
329                         _busy_size += size;
330
331                         out_page = _root;
332
333                         return buf;
334                 }
335
336                 void deallocate_memory(void* ptr, size_t size, xml_memory_page* page)
337                 {
338                         if (page == _root) page->busy_size = _busy_size;
339
340                         assert(ptr >= page->data && ptr < page->data + page->busy_size);
341                         (void)!ptr;
342
343                         page->freed_size += size;
344                         assert(page->freed_size <= page->busy_size);
345
346                         if (page->freed_size == page->busy_size)
347                         {
348                                 if (page->next == 0)
349                                 {
350                                         assert(_root == page);
351
352                                         // top page freed, just reset sizes
353                                         page->busy_size = page->freed_size = 0;
354                                         _busy_size = 0;
355                                 }
356                                 else
357                                 {
358                                         assert(_root != page);
359                                         assert(page->prev);
360
361                                         // remove from the list
362                                         page->prev->next = page->next;
363                                         page->next->prev = page->prev;
364
365                                         // deallocate
366                                         deallocate_page(page);
367                                 }
368                         }
369                 }
370
371                 char_t* allocate_string(size_t length)
372                 {
373                         // allocate memory for string and header block
374                         size_t size = sizeof(xml_memory_string_header) + length * sizeof(char_t);
375                         
376                         // round size up to pointer alignment boundary
377                         size_t full_size = (size + (sizeof(void*) - 1)) & ~(sizeof(void*) - 1);
378
379                         xml_memory_page* page;
380                         xml_memory_string_header* header = static_cast<xml_memory_string_header*>(allocate_memory(full_size, page));
381
382                         if (!header) return 0;
383
384                         // setup header
385                         ptrdiff_t page_offset = reinterpret_cast<char*>(header) - page->data;
386
387                         assert(page_offset >= 0 && page_offset < (1 << 16));
388                         header->page_offset = static_cast<uint16_t>(page_offset);
389
390                         // full_size == 0 for large strings that occupy the whole page
391                         assert(full_size < (1 << 16) || (page->busy_size == full_size && page_offset == 0));
392                         header->full_size = static_cast<uint16_t>(full_size < (1 << 16) ? full_size : 0);
393
394                         // round-trip through void* to avoid 'cast increases required alignment of target type' warning
395                         // header is guaranteed a pointer-sized alignment, which should be enough for char_t
396                         return static_cast<char_t*>(static_cast<void*>(header + 1));
397                 }
398
399                 void deallocate_string(char_t* string)
400                 {
401                         // this function casts pointers through void* to avoid 'cast increases required alignment of target type' warnings
402                         // we're guaranteed the proper (pointer-sized) alignment on the input string if it was allocated via allocate_string
403
404                         // get header
405                         xml_memory_string_header* header = static_cast<xml_memory_string_header*>(static_cast<void*>(string)) - 1;
406
407                         // deallocate
408                         size_t page_offset = offsetof(xml_memory_page, data) + header->page_offset;
409                         xml_memory_page* page = reinterpret_cast<xml_memory_page*>(static_cast<void*>(reinterpret_cast<char*>(header) - page_offset));
410
411                         // if full_size == 0 then this string occupies the whole page
412                         size_t full_size = header->full_size == 0 ? page->busy_size : header->full_size;
413
414                         deallocate_memory(header, full_size, page);
415                 }
416
417                 xml_memory_page* _root;
418                 size_t _busy_size;
419         };
420
421         PUGI__FN_NO_INLINE void* xml_allocator::allocate_memory_oob(size_t size, xml_memory_page*& out_page)
422         {
423                 const size_t large_allocation_threshold = xml_memory_page_size / 4;
424
425                 xml_memory_page* page = allocate_page(size <= large_allocation_threshold ? xml_memory_page_size : size);
426                 out_page = page;
427
428                 if (!page) return 0;
429
430                 if (size <= large_allocation_threshold)
431                 {
432                         _root->busy_size = _busy_size;
433
434                         // insert page at the end of linked list
435                         page->prev = _root;
436                         _root->next = page;
437                         _root = page;
438
439                         _busy_size = size;
440                 }
441                 else
442                 {
443                         // insert page before the end of linked list, so that it is deleted as soon as possible
444                         // the last page is not deleted even if it's empty (see deallocate_memory)
445                         assert(_root->prev);
446
447                         page->prev = _root->prev;
448                         page->next = _root;
449
450                         _root->prev->next = page;
451                         _root->prev = page;
452                 }
453
454                 // allocate inside page
455                 page->busy_size = size;
456
457                 return page->data;
458         }
459 PUGI__NS_END
460
461 namespace pugi
462 {
463         /// A 'name=value' XML attribute structure.
464         struct xml_attribute_struct
465         {
466                 /// Default ctor
467                 xml_attribute_struct(impl::xml_memory_page* page): header(reinterpret_cast<uintptr_t>(page)), name(0), value(0), prev_attribute_c(0), next_attribute(0)
468                 {
469                 }
470
471                 uintptr_t header;
472
473                 char_t* name;   ///< Pointer to attribute name.
474                 char_t* value;  ///< Pointer to attribute value.
475
476                 xml_attribute_struct* prev_attribute_c; ///< Previous attribute (cyclic list)
477                 xml_attribute_struct* next_attribute;   ///< Next attribute
478         };
479
480         /// An XML document tree node.
481         struct xml_node_struct
482         {
483                 /// Default ctor
484                 /// \param type - node type
485                 xml_node_struct(impl::xml_memory_page* page, xml_node_type type): header(reinterpret_cast<uintptr_t>(page) | (type - 1)), parent(0), name(0), value(0), first_child(0), prev_sibling_c(0), next_sibling(0), first_attribute(0)
486                 {
487                 }
488
489                 uintptr_t header;
490
491                 xml_node_struct*                parent;                                 ///< Pointer to parent
492
493                 char_t*                                 name;                                   ///< Pointer to element name.
494                 char_t*                                 value;                                  ///< Pointer to any associated string data.
495
496                 xml_node_struct*                first_child;                    ///< First child
497                 
498                 xml_node_struct*                prev_sibling_c;                 ///< Left brother (cyclic list)
499                 xml_node_struct*                next_sibling;                   ///< Right brother
500                 
501                 xml_attribute_struct*   first_attribute;                ///< First attribute
502         };
503 }
504
505 PUGI__NS_BEGIN
506         struct xml_document_struct: public xml_node_struct, public xml_allocator
507         {
508                 xml_document_struct(xml_memory_page* page): xml_node_struct(page, node_document), xml_allocator(page), buffer(0)
509                 {
510                 }
511
512                 const char_t* buffer;
513         };
514
515         inline xml_allocator& get_allocator(const xml_node_struct* node)
516         {
517                 assert(node);
518
519                 return *reinterpret_cast<xml_memory_page*>(node->header & xml_memory_page_pointer_mask)->allocator;
520         }
521 PUGI__NS_END
522
523 // Low-level DOM operations
524 PUGI__NS_BEGIN
525         inline xml_attribute_struct* allocate_attribute(xml_allocator& alloc)
526         {
527                 xml_memory_page* page;
528                 void* memory = alloc.allocate_memory(sizeof(xml_attribute_struct), page);
529
530                 return new (memory) xml_attribute_struct(page);
531         }
532
533         inline xml_node_struct* allocate_node(xml_allocator& alloc, xml_node_type type)
534         {
535                 xml_memory_page* page;
536                 void* memory = alloc.allocate_memory(sizeof(xml_node_struct), page);
537
538                 return new (memory) xml_node_struct(page, type);
539         }
540
541         inline void destroy_attribute(xml_attribute_struct* a, xml_allocator& alloc)
542         {
543                 uintptr_t header = a->header;
544
545                 if (header & impl::xml_memory_page_name_allocated_mask) alloc.deallocate_string(a->name);
546                 if (header & impl::xml_memory_page_value_allocated_mask) alloc.deallocate_string(a->value);
547
548                 alloc.deallocate_memory(a, sizeof(xml_attribute_struct), reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask));
549         }
550
551         inline void destroy_node(xml_node_struct* n, xml_allocator& alloc)
552         {
553                 uintptr_t header = n->header;
554
555                 if (header & impl::xml_memory_page_name_allocated_mask) alloc.deallocate_string(n->name);
556                 if (header & impl::xml_memory_page_value_allocated_mask) alloc.deallocate_string(n->value);
557
558                 for (xml_attribute_struct* attr = n->first_attribute; attr; )
559                 {
560                         xml_attribute_struct* next = attr->next_attribute;
561
562                         destroy_attribute(attr, alloc);
563
564                         attr = next;
565                 }
566
567                 for (xml_node_struct* child = n->first_child; child; )
568                 {
569                         xml_node_struct* next = child->next_sibling;
570
571                         destroy_node(child, alloc);
572
573                         child = next;
574                 }
575
576                 alloc.deallocate_memory(n, sizeof(xml_node_struct), reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask));
577         }
578
579         PUGI__FN_NO_INLINE xml_node_struct* append_node(xml_node_struct* node, xml_allocator& alloc, xml_node_type type = node_element)
580         {
581                 xml_node_struct* child = allocate_node(alloc, type);
582                 if (!child) return 0;
583
584                 child->parent = node;
585
586                 xml_node_struct* first_child = node->first_child;
587                         
588                 if (first_child)
589                 {
590                         xml_node_struct* last_child = first_child->prev_sibling_c;
591
592                         last_child->next_sibling = child;
593                         child->prev_sibling_c = last_child;
594                         first_child->prev_sibling_c = child;
595                 }
596                 else
597                 {
598                         node->first_child = child;
599                         child->prev_sibling_c = child;
600                 }
601                         
602                 return child;
603         }
604
605         PUGI__FN_NO_INLINE xml_attribute_struct* append_attribute_ll(xml_node_struct* node, xml_allocator& alloc)
606         {
607                 xml_attribute_struct* a = allocate_attribute(alloc);
608                 if (!a) return 0;
609
610                 xml_attribute_struct* first_attribute = node->first_attribute;
611
612                 if (first_attribute)
613                 {
614                         xml_attribute_struct* last_attribute = first_attribute->prev_attribute_c;
615
616                         last_attribute->next_attribute = a;
617                         a->prev_attribute_c = last_attribute;
618                         first_attribute->prev_attribute_c = a;
619                 }
620                 else
621                 {
622                         node->first_attribute = a;
623                         a->prev_attribute_c = a;
624                 }
625                         
626                 return a;
627         }
628 PUGI__NS_END
629
630 // Helper classes for code generation
631 PUGI__NS_BEGIN
// Compile-time boolean tag whose `value` is 0; used as a template argument
struct opt_false { enum { value = 0 }; };

// Compile-time boolean tag whose `value` is 1; used as a template argument
struct opt_true { enum { value = 1 }; };
641 PUGI__NS_END
642
643 // Unicode utilities
644 PUGI__NS_BEGIN
// Reverses the byte order of a 16-bit value.
inline uint16_t endian_swap(uint16_t value)
{
	const unsigned int low_byte = value & 0xff;
	const unsigned int high_byte = value >> 8;

	return static_cast<uint16_t>((low_byte << 8) | high_byte);
}

// Reverses the byte order of a 32-bit value.
inline uint32_t endian_swap(uint32_t value)
{
	const uint32_t byte0 = value & 0xff;
	const uint32_t byte1 = value & 0xff00;
	const uint32_t byte2 = value & 0xff0000;
	const uint32_t byte3 = value >> 24;

	return (byte0 << 24) | (byte1 << 8) | (byte2 >> 8) | byte3;
}
654
// Counting sink: accumulates the number of UTF-8 bytes needed to encode code points.
struct utf8_counter
{
	typedef size_t value_type;

	// Code points below U+10000: 1 byte for U+0000..U+007F, 2 bytes for U+0080..U+07FF,
	// 3 bytes for U+0800..U+FFFF
	static value_type low(value_type result, uint32_t ch)
	{
		const value_type length = (ch < 0x80) ? 1 : (ch < 0x800) ? 2 : 3;

		return result + length;
	}

	// Code points U+10000..U+10FFFF always take 4 bytes
	static value_type high(value_type result, uint32_t)
	{
		const value_type length = 4;

		return result + length;
	}
};
675
// Writing sink: encodes code points as UTF-8 into a byte buffer and returns the
// position just past the bytes written. The caller provides sufficient space.
struct utf8_writer
{
	typedef uint8_t* value_type;

	// Encodes a code point below U+10000 using 1 to 3 bytes
	static value_type low(value_type result, uint32_t ch)
	{
		// U+0000..U+007F
		if (ch < 0x80)
		{
			*result++ = static_cast<uint8_t>(ch);
			return result;
		}

		// U+0080..U+07FF
		if (ch < 0x800)
		{
			*result++ = static_cast<uint8_t>(0xC0 | (ch >> 6));
			*result++ = static_cast<uint8_t>(0x80 | (ch & 0x3F));
			return result;
		}

		// U+0800..U+FFFF
		*result++ = static_cast<uint8_t>(0xE0 | (ch >> 12));
		*result++ = static_cast<uint8_t>(0x80 | ((ch >> 6) & 0x3F));
		*result++ = static_cast<uint8_t>(0x80 | (ch & 0x3F));
		return result;
	}

	// Encodes a code point in U+10000..U+10FFFF using 4 bytes
	static value_type high(value_type result, uint32_t ch)
	{
		*result++ = static_cast<uint8_t>(0xF0 | (ch >> 18));
		*result++ = static_cast<uint8_t>(0x80 | ((ch >> 12) & 0x3F));
		*result++ = static_cast<uint8_t>(0x80 | ((ch >> 6) & 0x3F));
		*result++ = static_cast<uint8_t>(0x80 | (ch & 0x3F));
		return result;
	}

	// Picks low() or high() depending on the code point
	static value_type any(value_type result, uint32_t ch)
	{
		if (ch < 0x10000) return low(result, ch);

		return high(result, ch);
	}
};
720
// Counting sink: accumulates the number of UTF-16 code units needed to encode code points.
struct utf16_counter
{
	typedef size_t value_type;

	// Code points below U+10000 take one code unit
	static value_type low(value_type result, uint32_t)
	{
		const value_type units = 1;

		return result + units;
	}

	// Code points from U+10000 upwards take a surrogate pair
	static value_type high(value_type result, uint32_t)
	{
		const value_type units = 2;

		return result + units;
	}
};
735
// Writing sink: encodes code points as UTF-16 code units and returns the position
// just past the units written. The caller provides sufficient space.
struct utf16_writer
{
	typedef uint16_t* value_type;

	// Stores a code point below U+10000 as a single code unit
	static value_type low(value_type result, uint32_t ch)
	{
		result[0] = static_cast<uint16_t>(ch);

		return result + 1;
	}

	// Stores a code point from U+10000 upwards as a surrogate pair
	static value_type high(value_type result, uint32_t ch)
	{
		const uint32_t offset = static_cast<uint32_t>(ch - 0x10000);

		// upper bits go into the lead surrogate, the low 10 bits into the trail surrogate
		const uint32_t lead = 0xD800 + (offset >> 10);
		const uint32_t trail = 0xDC00 + (offset & 0x3ff);

		result[0] = static_cast<uint16_t>(lead);
		result[1] = static_cast<uint16_t>(trail);

		return result + 2;
	}

	// Picks low() or high() depending on the code point
	static value_type any(value_type result, uint32_t ch)
	{
		if (ch < 0x10000) return low(result, ch);

		return high(result, ch);
	}
};
763
// Counting sink: accumulates the number of UTF-32 code units, which is one per code point.
struct utf32_counter
{
	typedef size_t value_type;

	static value_type low(value_type result, uint32_t)
	{
		const value_type units = 1;

		return result + units;
	}

	static value_type high(value_type result, uint32_t)
	{
		const value_type units = 1;

		return result + units;
	}
};
778
// Writing sink: stores each code point as one UTF-32 code unit and returns the
// position just past it. The caller provides sufficient space.
struct utf32_writer
{
	typedef uint32_t* value_type;

	// Every code point is stored verbatim, so low() and high() share any()
	static value_type low(value_type result, uint32_t ch)
	{
		return any(result, ch);
	}

	static value_type high(value_type result, uint32_t ch)
	{
		return any(result, ch);
	}

	static value_type any(value_type result, uint32_t ch)
	{
		result[0] = ch;

		return result + 1;
	}
};
804
// Writing sink: stores each code point as one Latin-1 byte, substituting '?' for
// anything above 255, and returns the position just past the byte written.
struct latin1_writer
{
	typedef uint8_t* value_type;

	// Code points below U+10000: kept if they fit into a byte, otherwise replaced
	static value_type low(value_type result, uint32_t ch)
	{
		if (ch > 255)
			*result = static_cast<uint8_t>('?');
		else
			*result = static_cast<uint8_t>(ch);

		return result + 1;
	}

	// Code points from U+10000 upwards never fit and are always replaced
	static value_type high(value_type result, uint32_t ch)
	{
		(void)ch;

		*result = '?';

		return result + 1;
	}
};
825
826         template <size_t size> struct wchar_selector;
827
828         template <> struct wchar_selector<2>
829         {
830                 typedef uint16_t type;
831                 typedef utf16_counter counter;
832                 typedef utf16_writer writer;
833         };
834
835         template <> struct wchar_selector<4>
836         {
837                 typedef uint32_t type;
838                 typedef utf32_counter counter;
839                 typedef utf32_writer writer;
840         };
841
842         typedef wchar_selector<sizeof(wchar_t)>::counter wchar_counter;
843         typedef wchar_selector<sizeof(wchar_t)>::writer wchar_writer;
844
845         template <typename Traits, typename opt_swap = opt_false> struct utf_decoder
846         {
847                 static inline typename Traits::value_type decode_utf8_block(const uint8_t* data, size_t size, typename Traits::value_type result)
848                 {
849                         const uint8_t utf8_byte_mask = 0x3f;
850
851                         while (size)
852                         {
853                                 uint8_t lead = *data;
854
855                                 // 0xxxxxxx -> U+0000..U+007F
856                                 if (lead < 0x80)
857                                 {
858                                         result = Traits::low(result, lead);
859                                         data += 1;
860                                         size -= 1;
861
862                                         // process aligned single-byte (ascii) blocks
863                                         if ((reinterpret_cast<uintptr_t>(data) & 3) == 0)
864                                         {
865                                                 // round-trip through void* to silence 'cast increases required alignment of target type' warnings
866                                                 while (size >= 4 && (*static_cast<const uint32_t*>(static_cast<const void*>(data)) & 0x80808080) == 0)
867                                                 {
868                                                         result = Traits::low(result, data[0]);
869                                                         result = Traits::low(result, data[1]);
870                                                         result = Traits::low(result, data[2]);
871                                                         result = Traits::low(result, data[3]);
872                                                         data += 4;
873                                                         size -= 4;
874                                                 }
875                                         }
876                                 }
877                                 // 110xxxxx -> U+0080..U+07FF
878                                 else if (static_cast<unsigned int>(lead - 0xC0) < 0x20 && size >= 2 && (data[1] & 0xc0) == 0x80)
879                                 {
880                                         result = Traits::low(result, ((lead & ~0xC0) << 6) | (data[1] & utf8_byte_mask));
881                                         data += 2;
882                                         size -= 2;
883                                 }
884                                 // 1110xxxx -> U+0800-U+FFFF
885                                 else if (static_cast<unsigned int>(lead - 0xE0) < 0x10 && size >= 3 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80)
886                                 {
887                                         result = Traits::low(result, ((lead & ~0xE0) << 12) | ((data[1] & utf8_byte_mask) << 6) | (data[2] & utf8_byte_mask));
888                                         data += 3;
889                                         size -= 3;
890                                 }
891                                 // 11110xxx -> U+10000..U+10FFFF
892                                 else if (static_cast<unsigned int>(lead - 0xF0) < 0x08 && size >= 4 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80 && (data[3] & 0xc0) == 0x80)
893                                 {
894                                         result = Traits::high(result, ((lead & ~0xF0) << 18) | ((data[1] & utf8_byte_mask) << 12) | ((data[2] & utf8_byte_mask) << 6) | (data[3] & utf8_byte_mask));
895                                         data += 4;
896                                         size -= 4;
897                                 }
898                                 // 10xxxxxx or 11111xxx -> invalid
899                                 else
900                                 {
901                                         data += 1;
902                                         size -= 1;
903                                 }
904                         }
905
906                         return result;
907                 }
908
909                 static inline typename Traits::value_type decode_utf16_block(const uint16_t* data, size_t size, typename Traits::value_type result)
910                 {
911                         const uint16_t* end = data + size;
912
913                         while (data < end)
914                         {
915                                 uint16_t lead = opt_swap::value ? endian_swap(*data) : *data;
916
917                                 // U+0000..U+D7FF
918                                 if (lead < 0xD800)
919                                 {
920                                         result = Traits::low(result, lead);
921                                         data += 1;
922                                 }
923                                 // U+E000..U+FFFF
924                                 else if (static_cast<unsigned int>(lead - 0xE000) < 0x2000)
925                                 {
926                                         result = Traits::low(result, lead);
927                                         data += 1;
928                                 }
929                                 // surrogate pair lead
930                                 else if (static_cast<unsigned int>(lead - 0xD800) < 0x400 && data + 1 < end)
931                                 {
932                                         uint16_t next = opt_swap::value ? endian_swap(data[1]) : data[1];
933
934                                         if (static_cast<unsigned int>(next - 0xDC00) < 0x400)
935                                         {
936                                                 result = Traits::high(result, 0x10000 + ((lead & 0x3ff) << 10) + (next & 0x3ff));
937                                                 data += 2;
938                                         }
939                                         else
940                                         {
941                                                 data += 1;
942                                         }
943                                 }
944                                 else
945                                 {
946                                         data += 1;
947                                 }
948                         }
949
950                         return result;
951                 }
952
953                 static inline typename Traits::value_type decode_utf32_block(const uint32_t* data, size_t size, typename Traits::value_type result)
954                 {
955                         const uint32_t* end = data + size;
956
957                         while (data < end)
958                         {
959                                 uint32_t lead = opt_swap::value ? endian_swap(*data) : *data;
960
961                                 // U+0000..U+FFFF
962                                 if (lead < 0x10000)
963                                 {
964                                         result = Traits::low(result, lead);
965                                         data += 1;
966                                 }
967                                 // U+10000..U+10FFFF
968                                 else
969                                 {
970                                         result = Traits::high(result, lead);
971                                         data += 1;
972                                 }
973                         }
974
975                         return result;
976                 }
977
978                 static inline typename Traits::value_type decode_latin1_block(const uint8_t* data, size_t size, typename Traits::value_type result)
979                 {
980                         for (size_t i = 0; i < size; ++i)
981                         {
982                                 result = Traits::low(result, data[i]);
983                         }
984
985                         return result;
986                 }
987
988                 static inline typename Traits::value_type decode_wchar_block_impl(const uint16_t* data, size_t size, typename Traits::value_type result)
989                 {
990                         return decode_utf16_block(data, size, result);
991                 }
992
993                 static inline typename Traits::value_type decode_wchar_block_impl(const uint32_t* data, size_t size, typename Traits::value_type result)
994                 {
995                         return decode_utf32_block(data, size, result);
996                 }
997
998                 static inline typename Traits::value_type decode_wchar_block(const wchar_t* data, size_t size, typename Traits::value_type result)
999                 {
1000                         return decode_wchar_block_impl(reinterpret_cast<const wchar_selector<sizeof(wchar_t)>::type*>(data), size, result);
1001                 }
1002         };
1003
1004         template <typename T> PUGI__FN void convert_utf_endian_swap(T* result, const T* data, size_t length)
1005         {
1006                 for (size_t i = 0; i < length; ++i) result[i] = endian_swap(data[i]);
1007         }
1008
1009 #ifdef PUGIXML_WCHAR_MODE
1010         PUGI__FN void convert_wchar_endian_swap(wchar_t* result, const wchar_t* data, size_t length)
1011         {
1012                 for (size_t i = 0; i < length; ++i) result[i] = static_cast<wchar_t>(endian_swap(static_cast<wchar_selector<sizeof(wchar_t)>::type>(data[i])));
1013         }
1014 #endif
1015 PUGI__NS_END
1016
1017 PUGI__NS_BEGIN
// Bit flags stored in chartype_table; one bit per character class.
enum chartype_t
{
    ct_parse_pcdata  = 1 << 0, // \0, &, \r, <
    ct_parse_attr    = 1 << 1, // \0, &, \r, ', "
    ct_parse_attr_ws = 1 << 2, // \0, &, \r, ', ", \n, tab
    ct_space         = 1 << 3, // \r, \n, space, tab
    ct_parse_cdata   = 1 << 4, // \0, ], >, \r
    ct_parse_comment = 1 << 5, // \0, -, >, \r
    ct_symbol        = 1 << 6, // Any symbol > 127, a-z, A-Z, 0-9, _, :, -, .
    ct_start_symbol  = 1 << 7  // Any symbol > 127, a-z, A-Z, _, :
};
1029
// Per-character class flags, indexed by character value; each entry is a sum of chartype_t bits.
static const unsigned char chartype_table[256] =
{
    // 0-15: \0 (55), tab and \n (12), \r (63)
    55, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 0, 63, 0, 0,
    // 16-31
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 32-47: space (8), double quote (6), ampersand (7), single quote (6), minus (96), dot (64)
    8, 0, 6, 0, 0, 0, 7, 6, 0, 0, 0, 0, 0, 96, 64, 0,
    // 48-63: digits (64), colon (192), less-than (1), greater-than (48)
    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 192, 0, 1, 0, 48, 0,
    // 64-79: at sign (0), A-O (192)
    0, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
    // 80-95: P-Z (192), closing bracket (16), underscore (192)
    192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 16, 0, 192,
    // 96-111: backtick (0), a-o (192)
    0, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
    // 112-127: p-z (192)
    192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 0, 0, 0, 0, 0,

    // 128-255: every value is both a symbol and a start symbol (192)
    192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
    192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
    192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
    192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
    192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
    192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
    192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192,
    192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192
};
1050
// Bit flags stored in chartypex_table; one bit per character class.
enum chartypex_t
{
    ctx_special_pcdata = 1 << 0, // Any symbol >= 0 and < 32 (except \t, \r, \n), &, <, >
    ctx_special_attr   = 1 << 1, // Any symbol >= 0 and < 32 (except \t), &, <, >, "
    ctx_start_symbol   = 1 << 2, // Any symbol > 127, a-z, A-Z, _
    ctx_digit          = 1 << 3, // 0-9
    ctx_symbol         = 1 << 4  // Any symbol > 127, a-z, A-Z, 0-9, _, -, .
};
1059         
// Per-character class flags, indexed by character value; each entry is a sum of chartypex_t bits.
static const unsigned char chartypex_table[256] =
{
    // 0-15: control characters (3), except tab (0), \n (2) and \r (2)
    3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 2, 3, 3, 2, 3, 3,
    // 16-31: control characters (3)
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    // 32-47: double quote (2), ampersand (3), minus and dot (16)
    0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 16, 16, 0,
    // 48-63: digits (24), less-than and greater-than (3)
    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 0, 0, 3, 0, 3, 0,

    // 64-79: at sign (0), A-O (20)
    0, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
    // 80-95: P-Z (20), underscore (20)
    20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 20,
    // 96-111: backtick (0), a-o (20)
    0, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
    // 112-127: p-z (20)
    20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 0,

    // 128-255: every value is both a symbol and a start symbol (20)
    20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
    20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
    20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
    20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
    20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
    20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
    20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
    20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20
};
1081         
// Character class tests: the result is non-zero when character c has any of the ct bits
// set in the lookup table.
#ifndef PUGIXML_WCHAR_MODE
    // narrow characters: the table is indexed with the unsigned char value of c
    #define PUGI__IS_CHARTYPE_IMPL(c, ct, table) (table[static_cast<unsigned char>(c)] & (ct))
#else
    // wide characters: values below 128 index the table directly, all others share entry 128
    #define PUGI__IS_CHARTYPE_IMPL(c, ct, table) ((static_cast<unsigned int>(c) < 128 ? table[static_cast<unsigned int>(c)] : table[128]) & (ct))
#endif

    // test against chartype_table (chartype_t bits)
    #define PUGI__IS_CHARTYPE(c, ct) PUGI__IS_CHARTYPE_IMPL(c, ct, chartype_table)
    // test against chartypex_table (chartypex_t bits)
    #define PUGI__IS_CHARTYPEX(c, ct) PUGI__IS_CHARTYPE_IMPL(c, ct, chartypex_table)
1090
1091         PUGI__FN bool is_little_endian()
1092         {
1093                 unsigned int ui = 1;
1094
1095                 return *reinterpret_cast<unsigned char*>(&ui) == 1;
1096         }
1097
1098         PUGI__FN xml_encoding get_wchar_encoding()
1099         {
1100                 PUGI__STATIC_ASSERT(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4);
1101
1102                 if (sizeof(wchar_t) == 2)
1103                         return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
1104                 else 
1105                         return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
1106         }
1107
1108         PUGI__FN xml_encoding guess_buffer_encoding(uint8_t d0, uint8_t d1, uint8_t d2, uint8_t d3)
1109         {
1110                 // look for BOM in first few bytes
1111                 if (d0 == 0 && d1 == 0 && d2 == 0xfe && d3 == 0xff) return encoding_utf32_be;
1112                 if (d0 == 0xff && d1 == 0xfe && d2 == 0 && d3 == 0) return encoding_utf32_le;
1113                 if (d0 == 0xfe && d1 == 0xff) return encoding_utf16_be;
1114                 if (d0 == 0xff && d1 == 0xfe) return encoding_utf16_le;
1115                 if (d0 == 0xef && d1 == 0xbb && d2 == 0xbf) return encoding_utf8;
1116
1117                 // look for <, <? or <?xm in various encodings
1118                 if (d0 == 0 && d1 == 0 && d2 == 0 && d3 == 0x3c) return encoding_utf32_be;
1119                 if (d0 == 0x3c && d1 == 0 && d2 == 0 && d3 == 0) return encoding_utf32_le;
1120                 if (d0 == 0 && d1 == 0x3c && d2 == 0 && d3 == 0x3f) return encoding_utf16_be;
1121                 if (d0 == 0x3c && d1 == 0 && d2 == 0x3f && d3 == 0) return encoding_utf16_le;
1122                 if (d0 == 0x3c && d1 == 0x3f && d2 == 0x78 && d3 == 0x6d) return encoding_utf8;
1123
1124                 // look for utf16 < followed by node name (this may fail, but is better than utf8 since it's zero terminated so early)
1125                 if (d0 == 0 && d1 == 0x3c) return encoding_utf16_be;
1126                 if (d0 == 0x3c && d1 == 0) return encoding_utf16_le;
1127
1128                 // no known BOM detected, assume utf8
1129                 return encoding_utf8;
1130         }
1131
1132         PUGI__FN xml_encoding get_buffer_encoding(xml_encoding encoding, const void* contents, size_t size)
1133         {
1134                 // replace wchar encoding with utf implementation
1135                 if (encoding == encoding_wchar) return get_wchar_encoding();
1136
1137                 // replace utf16 encoding with utf16 with specific endianness
1138                 if (encoding == encoding_utf16) return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
1139
1140                 // replace utf32 encoding with utf32 with specific endianness
1141                 if (encoding == encoding_utf32) return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
1142
1143                 // only do autodetection if no explicit encoding is requested
1144                 if (encoding != encoding_auto) return encoding;
1145
1146                 // skip encoding autodetection if input buffer is too small
1147                 if (size < 4) return encoding_utf8;
1148
1149                 // try to guess encoding (based on XML specification, Appendix F.1)
1150                 const uint8_t* data = static_cast<const uint8_t*>(contents);
1151
1152                 PUGI__DMC_VOLATILE uint8_t d0 = data[0], d1 = data[1], d2 = data[2], d3 = data[3];
1153
1154                 return guess_buffer_encoding(d0, d1, d2, d3);
1155         }
1156
1157         PUGI__FN bool get_mutable_buffer(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable)
1158         {
1159                 if (is_mutable)
1160                 {
1161                         out_buffer = static_cast<char_t*>(const_cast<void*>(contents));
1162                 }
1163                 else
1164                 {
1165                         void* buffer = xml_memory::allocate(size > 0 ? size : 1);
1166                         if (!buffer) return false;
1167
1168                         memcpy(buffer, contents, size);
1169
1170                         out_buffer = static_cast<char_t*>(buffer);
1171                 }
1172
1173                 out_length = size / sizeof(char_t);
1174
1175                 return true;
1176         }
1177
1178 #ifdef PUGIXML_WCHAR_MODE
1179         PUGI__FN bool need_endian_swap_utf(xml_encoding le, xml_encoding re)
1180         {
1181                 return (le == encoding_utf16_be && re == encoding_utf16_le) || (le == encoding_utf16_le && re == encoding_utf16_be) ||
1182                            (le == encoding_utf32_be && re == encoding_utf32_le) || (le == encoding_utf32_le && re == encoding_utf32_be);
1183         }
1184
1185         PUGI__FN bool convert_buffer_endian_swap(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable)
1186         {
1187                 const char_t* data = static_cast<const char_t*>(contents);
1188         
1189                 if (is_mutable)
1190                 {
1191                         out_buffer = const_cast<char_t*>(data);
1192                 }
1193                 else
1194                 {
1195                         out_buffer = static_cast<char_t*>(xml_memory::allocate(size > 0 ? size : 1));
1196                         if (!out_buffer) return false;
1197                 }
1198
1199                 out_length = size / sizeof(char_t);
1200
1201                 convert_wchar_endian_swap(out_buffer, data, out_length);
1202
1203                 return true;
1204         }
1205
1206         PUGI__FN bool convert_buffer_utf8(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size)
1207         {
1208                 const uint8_t* data = static_cast<const uint8_t*>(contents);
1209
1210                 // first pass: get length in wchar_t units
1211                 out_length = utf_decoder<wchar_counter>::decode_utf8_block(data, size, 0);
1212
1213                 // allocate buffer of suitable length
1214                 out_buffer = static_cast<char_t*>(xml_memory::allocate((out_length > 0 ? out_length : 1) * sizeof(char_t)));
1215                 if (!out_buffer) return false;
1216
1217                 // second pass: convert utf8 input to wchar_t
1218                 wchar_writer::value_type out_begin = reinterpret_cast<wchar_writer::value_type>(out_buffer);
1219                 wchar_writer::value_type out_end = utf_decoder<wchar_writer>::decode_utf8_block(data, size, out_begin);
1220
1221                 assert(out_end == out_begin + out_length);
1222                 (void)!out_end;
1223
1224                 return true;
1225         }
1226
1227         template <typename opt_swap> PUGI__FN bool convert_buffer_utf16(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
1228         {
1229                 const uint16_t* data = static_cast<const uint16_t*>(contents);
1230                 size_t length = size / sizeof(uint16_t);
1231
1232                 // first pass: get length in wchar_t units
1233                 out_length = utf_decoder<wchar_counter, opt_swap>::decode_utf16_block(data, length, 0);
1234
1235                 // allocate buffer of suitable length
1236                 out_buffer = static_cast<char_t*>(xml_memory::allocate((out_length > 0 ? out_length : 1) * sizeof(char_t)));
1237                 if (!out_buffer) return false;
1238
1239                 // second pass: convert utf16 input to wchar_t
1240                 wchar_writer::value_type out_begin = reinterpret_cast<wchar_writer::value_type>(out_buffer);
1241                 wchar_writer::value_type out_end = utf_decoder<wchar_writer, opt_swap>::decode_utf16_block(data, length, out_begin);
1242
1243                 assert(out_end == out_begin + out_length);
1244                 (void)!out_end;
1245
1246                 return true;
1247         }
1248
1249         template <typename opt_swap> PUGI__FN bool convert_buffer_utf32(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
1250         {
1251                 const uint32_t* data = static_cast<const uint32_t*>(contents);
1252                 size_t length = size / sizeof(uint32_t);
1253
1254                 // first pass: get length in wchar_t units
1255                 out_length = utf_decoder<wchar_counter, opt_swap>::decode_utf32_block(data, length, 0);
1256
1257                 // allocate buffer of suitable length
1258                 out_buffer = static_cast<char_t*>(xml_memory::allocate((out_length > 0 ? out_length : 1) * sizeof(char_t)));
1259                 if (!out_buffer) return false;
1260
1261                 // second pass: convert utf32 input to wchar_t
1262                 wchar_writer::value_type out_begin = reinterpret_cast<wchar_writer::value_type>(out_buffer);
1263                 wchar_writer::value_type out_end = utf_decoder<wchar_writer, opt_swap>::decode_utf32_block(data, length, out_begin);
1264
1265                 assert(out_end == out_begin + out_length);
1266                 (void)!out_end;
1267
1268                 return true;
1269         }
1270
1271         PUGI__FN bool convert_buffer_latin1(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size)
1272         {
1273                 const uint8_t* data = static_cast<const uint8_t*>(contents);
1274
1275                 // get length in wchar_t units
1276                 out_length = size;
1277
1278                 // allocate buffer of suitable length
1279                 out_buffer = static_cast<char_t*>(xml_memory::allocate((out_length > 0 ? out_length : 1) * sizeof(char_t)));
1280                 if (!out_buffer) return false;
1281
1282                 // convert latin1 input to wchar_t
1283                 wchar_writer::value_type out_begin = reinterpret_cast<wchar_writer::value_type>(out_buffer);
1284                 wchar_writer::value_type out_end = utf_decoder<wchar_writer>::decode_latin1_block(data, size, out_begin);
1285
1286                 assert(out_end == out_begin + out_length);
1287                 (void)!out_end;
1288
1289                 return true;
1290         }
1291
1292         PUGI__FN bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable)
1293         {
1294                 // get native encoding
1295                 xml_encoding wchar_encoding = get_wchar_encoding();
1296
1297                 // fast path: no conversion required
1298                 if (encoding == wchar_encoding) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
1299
1300                 // only endian-swapping is required
1301                 if (need_endian_swap_utf(encoding, wchar_encoding)) return convert_buffer_endian_swap(out_buffer, out_length, contents, size, is_mutable);
1302
1303                 // source encoding is utf8
1304                 if (encoding == encoding_utf8) return convert_buffer_utf8(out_buffer, out_length, contents, size);
1305
1306                 // source encoding is utf16
1307                 if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
1308                 {
1309                         xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
1310
1311                         return (native_encoding == encoding) ?
1312                                 convert_buffer_utf16(out_buffer, out_length, contents, size, opt_false()) :
1313                                 convert_buffer_utf16(out_buffer, out_length, contents, size, opt_true());
1314                 }
1315
1316                 // source encoding is utf32
1317                 if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
1318                 {
1319                         xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
1320
1321                         return (native_encoding == encoding) ?
1322                                 convert_buffer_utf32(out_buffer, out_length, contents, size, opt_false()) :
1323                                 convert_buffer_utf32(out_buffer, out_length, contents, size, opt_true());
1324                 }
1325
1326                 // source encoding is latin1
1327                 if (encoding == encoding_latin1) return convert_buffer_latin1(out_buffer, out_length, contents, size);
1328
1329                 assert(!"Invalid encoding");
1330                 return false;
1331         }
1332 #else
1333         template <typename opt_swap> PUGI__FN bool convert_buffer_utf16(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
1334         {
1335                 const uint16_t* data = static_cast<const uint16_t*>(contents);
1336                 size_t length = size / sizeof(uint16_t);
1337
1338                 // first pass: get length in utf8 units
1339                 out_length = utf_decoder<utf8_counter, opt_swap>::decode_utf16_block(data, length, 0);
1340
1341                 // allocate buffer of suitable length
1342                 out_buffer = static_cast<char_t*>(xml_memory::allocate((out_length > 0 ? out_length : 1) * sizeof(char_t)));
1343                 if (!out_buffer) return false;
1344
1345                 // second pass: convert utf16 input to utf8
1346                 uint8_t* out_begin = reinterpret_cast<uint8_t*>(out_buffer);
1347                 uint8_t* out_end = utf_decoder<utf8_writer, opt_swap>::decode_utf16_block(data, length, out_begin);
1348
1349                 assert(out_end == out_begin + out_length);
1350                 (void)!out_end;
1351
1352                 return true;
1353         }
1354
1355         template <typename opt_swap> PUGI__FN bool convert_buffer_utf32(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap)
1356         {
1357                 const uint32_t* data = static_cast<const uint32_t*>(contents);
1358                 size_t length = size / sizeof(uint32_t);
1359
1360                 // first pass: get length in utf8 units
1361                 out_length = utf_decoder<utf8_counter, opt_swap>::decode_utf32_block(data, length, 0);
1362
1363                 // allocate buffer of suitable length
1364                 out_buffer = static_cast<char_t*>(xml_memory::allocate((out_length > 0 ? out_length : 1) * sizeof(char_t)));
1365                 if (!out_buffer) return false;
1366
1367                 // second pass: convert utf32 input to utf8
1368                 uint8_t* out_begin = reinterpret_cast<uint8_t*>(out_buffer);
1369                 uint8_t* out_end = utf_decoder<utf8_writer, opt_swap>::decode_utf32_block(data, length, out_begin);
1370
1371                 assert(out_end == out_begin + out_length);
1372                 (void)!out_end;
1373
1374                 return true;
1375         }
1376
1377         PUGI__FN size_t get_latin1_7bit_prefix_length(const uint8_t* data, size_t size)
1378         {
1379                 for (size_t i = 0; i < size; ++i)
1380                         if (data[i] > 127)
1381                                 return i;
1382
1383                 return size;
1384         }
1385
1386         PUGI__FN bool convert_buffer_latin1(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable)
1387         {
1388                 const uint8_t* data = static_cast<const uint8_t*>(contents);
1389
1390                 // get size of prefix that does not need utf8 conversion
1391                 size_t prefix_length = get_latin1_7bit_prefix_length(data, size);
1392                 assert(prefix_length <= size);
1393
1394                 const uint8_t* postfix = data + prefix_length;
1395                 size_t postfix_length = size - prefix_length;
1396
1397                 // if no conversion is needed, just return the original buffer
1398                 if (postfix_length == 0) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
1399
1400                 // first pass: get length in utf8 units
1401                 out_length = prefix_length + utf_decoder<utf8_counter>::decode_latin1_block(postfix, postfix_length, 0);
1402
1403                 // allocate buffer of suitable length
1404                 out_buffer = static_cast<char_t*>(xml_memory::allocate((out_length > 0 ? out_length : 1) * sizeof(char_t)));
1405                 if (!out_buffer) return false;
1406
1407                 // second pass: convert latin1 input to utf8
1408                 memcpy(out_buffer, data, prefix_length);
1409
1410                 uint8_t* out_begin = reinterpret_cast<uint8_t*>(out_buffer);
1411                 uint8_t* out_end = utf_decoder<utf8_writer>::decode_latin1_block(postfix, postfix_length, out_begin + prefix_length);
1412
1413                 assert(out_end == out_begin + out_length);
1414                 (void)!out_end;
1415
1416                 return true;
1417         }
1418
1419         PUGI__FN bool convert_buffer(char_t*& out_buffer, size_t& out_length, xml_encoding encoding, const void* contents, size_t size, bool is_mutable)
1420         {
1421                 // fast path: no conversion required
1422                 if (encoding == encoding_utf8) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable);
1423
1424                 // source encoding is utf16
1425                 if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
1426                 {
1427                         xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
1428
1429                         return (native_encoding == encoding) ?
1430                                 convert_buffer_utf16(out_buffer, out_length, contents, size, opt_false()) :
1431                                 convert_buffer_utf16(out_buffer, out_length, contents, size, opt_true());
1432                 }
1433
1434                 // source encoding is utf32
1435                 if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
1436                 {
1437                         xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
1438
1439                         return (native_encoding == encoding) ?
1440                                 convert_buffer_utf32(out_buffer, out_length, contents, size, opt_false()) :
1441                                 convert_buffer_utf32(out_buffer, out_length, contents, size, opt_true());
1442                 }
1443
1444                 // source encoding is latin1
1445                 if (encoding == encoding_latin1) return convert_buffer_latin1(out_buffer, out_length, contents, size, is_mutable);
1446
1447                 assert(!"Invalid encoding");
1448                 return false;
1449         }
1450 #endif
1451
1452         PUGI__FN size_t as_utf8_begin(const wchar_t* str, size_t length)
1453         {
1454                 // get length in utf8 characters
1455                 return utf_decoder<utf8_counter>::decode_wchar_block(str, length, 0);
1456         }
1457
1458         PUGI__FN void as_utf8_end(char* buffer, size_t size, const wchar_t* str, size_t length)
1459         {
1460                 // convert to utf8
1461                 uint8_t* begin = reinterpret_cast<uint8_t*>(buffer);
1462                 uint8_t* end = utf_decoder<utf8_writer>::decode_wchar_block(str, length, begin);
1463         
1464                 assert(begin + size == end);
1465                 (void)!end;
1466
1467                 // zero-terminate
1468                 buffer[size] = 0;
1469         }
1470         
1471 #ifndef PUGIXML_NO_STL
1472         PUGI__FN std::string as_utf8_impl(const wchar_t* str, size_t length)
1473         {
1474                 // first pass: get length in utf8 characters
1475                 size_t size = as_utf8_begin(str, length);
1476
1477                 // allocate resulting string
1478                 std::string result;
1479                 result.resize(size);
1480
1481                 // second pass: convert to utf8
1482                 if (size > 0) as_utf8_end(&result[0], size, str, length);
1483
1484                 return result;
1485         }
1486
1487         PUGI__FN std::basic_string<wchar_t> as_wide_impl(const char* str, size_t size)
1488         {
1489                 const uint8_t* data = reinterpret_cast<const uint8_t*>(str);
1490
1491                 // first pass: get length in wchar_t units
1492                 size_t length = utf_decoder<wchar_counter>::decode_utf8_block(data, size, 0);
1493
1494                 // allocate resulting string
1495                 std::basic_string<wchar_t> result;
1496                 result.resize(length);
1497
1498                 // second pass: convert to wchar_t
1499                 if (length > 0)
1500                 {
1501                         wchar_writer::value_type begin = reinterpret_cast<wchar_writer::value_type>(&result[0]);
1502                         wchar_writer::value_type end = utf_decoder<wchar_writer>::decode_utf8_block(data, size, begin);
1503
1504                         assert(begin + length == end);
1505                         (void)!end;
1506                 }
1507
1508                 return result;
1509         }
1510 #endif
1511
1512         inline bool strcpy_insitu_allow(size_t length, uintptr_t allocated, char_t* target)
1513         {
1514                 assert(target);
1515                 size_t target_length = strlength(target);
1516
1517                 // always reuse document buffer memory if possible
1518                 if (!allocated) return target_length >= length;
1519
1520                 // reuse heap memory if waste is not too great
1521                 const size_t reuse_threshold = 32;
1522
1523                 return target_length >= length && (target_length < reuse_threshold || target_length - length < target_length / 2);
1524         }
1525
1526         PUGI__FN bool strcpy_insitu(char_t*& dest, uintptr_t& header, uintptr_t header_mask, const char_t* source)
1527         {
1528                 size_t source_length = strlength(source);
1529
1530                 if (source_length == 0)
1531                 {
1532                         // empty string and null pointer are equivalent, so just deallocate old memory
1533                         xml_allocator* alloc = reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask)->allocator;
1534
1535                         if (header & header_mask) alloc->deallocate_string(dest);
1536                         
1537                         // mark the string as not allocated
1538                         dest = 0;
1539                         header &= ~header_mask;
1540
1541                         return true;
1542                 }
1543                 else if (dest && strcpy_insitu_allow(source_length, header & header_mask, dest))
1544                 {
1545                         // we can reuse old buffer, so just copy the new data (including zero terminator)
1546                         memcpy(dest, source, (source_length + 1) * sizeof(char_t));
1547                         
1548                         return true;
1549                 }
1550                 else
1551                 {
1552                         xml_allocator* alloc = reinterpret_cast<xml_memory_page*>(header & xml_memory_page_pointer_mask)->allocator;
1553
1554                         // allocate new buffer
1555                         char_t* buf = alloc->allocate_string(source_length + 1);
1556                         if (!buf) return false;
1557
1558                         // copy the string (including zero terminator)
1559                         memcpy(buf, source, (source_length + 1) * sizeof(char_t));
1560
1561                         // deallocate old buffer (*after* the above to protect against overlapping memory and/or allocation failures)
1562                         if (header & header_mask) alloc->deallocate_string(dest);
1563                         
1564                         // the string is now allocated, so set the flag
1565                         dest = buf;
1566                         header |= header_mask;
1567
1568                         return true;
1569                 }
1570         }
1571
1572         struct gap
1573         {
1574                 char_t* end;
1575                 size_t size;
1576                         
1577                 gap(): end(0), size(0)
1578                 {
1579                 }
1580                         
1581                 // Push new gap, move s count bytes further (skipping the gap).
1582                 // Collapse previous gap.
1583                 void push(char_t*& s, size_t count)
1584                 {
1585                         if (end) // there was a gap already; collapse it
1586                         {
1587                                 // Move [old_gap_end, new_gap_start) to [old_gap_start, ...)
1588                                 assert(s >= end);
1589                                 memmove(end - size, end, reinterpret_cast<char*>(s) - reinterpret_cast<char*>(end));
1590                         }
1591                                 
1592                         s += count; // end of current gap
1593                                 
1594                         // "merge" two gaps
1595                         end = s;
1596                         size += count;
1597                 }
1598                         
1599                 // Collapse all gaps, return past-the-end pointer
1600                 char_t* flush(char_t* s)
1601                 {
1602                         if (end)
1603                         {
1604                                 // Move [old_gap_end, current_pos) to [old_gap_start, ...)
1605                                 assert(s >= end);
1606                                 memmove(end - size, end, reinterpret_cast<char*>(s) - reinterpret_cast<char*>(end));
1607
1608                                 return s - size;
1609                         }
1610                         else return s;
1611                 }
1612         };
1613         
1614         PUGI__FN char_t* strconv_escape(char_t* s, gap& g)
1615         {
1616                 char_t* stre = s + 1;
1617
1618                 switch (*stre)
1619                 {
1620                         case '#':       // &#...
1621                         {
1622                                 unsigned int ucsc = 0;
1623
1624                                 if (stre[1] == 'x') // &#x... (hex code)
1625                                 {
1626                                         stre += 2;
1627
1628                                         char_t ch = *stre;
1629
1630                                         if (ch == ';') return stre;
1631
1632                                         for (;;)
1633                                         {
1634                                                 if (static_cast<unsigned int>(ch - '0') <= 9)
1635                                                         ucsc = 16 * ucsc + (ch - '0');
1636                                                 else if (static_cast<unsigned int>((ch | ' ') - 'a') <= 5)
1637                                                         ucsc = 16 * ucsc + ((ch | ' ') - 'a' + 10);
1638                                                 else if (ch == ';')
1639                                                         break;
1640                                                 else // cancel
1641                                                         return stre;
1642
1643                                                 ch = *++stre;
1644                                         }
1645                                         
1646                                         ++stre;
1647                                 }
1648                                 else    // &#... (dec code)
1649                                 {
1650                                         char_t ch = *++stre;
1651
1652                                         if (ch == ';') return stre;
1653
1654                                         for (;;)
1655                                         {
1656                                                 if (static_cast<unsigned int>(ch - '0') <= 9)
1657                                                         ucsc = 10 * ucsc + (ch - '0');
1658                                                 else if (ch == ';')
1659                                                         break;
1660                                                 else // cancel
1661                                                         return stre;
1662
1663                                                 ch = *++stre;
1664                                         }
1665                                         
1666                                         ++stre;
1667                                 }
1668
1669                         #ifdef PUGIXML_WCHAR_MODE
1670                                 s = reinterpret_cast<char_t*>(wchar_writer::any(reinterpret_cast<wchar_writer::value_type>(s), ucsc));
1671                         #else
1672                                 s = reinterpret_cast<char_t*>(utf8_writer::any(reinterpret_cast<uint8_t*>(s), ucsc));
1673                         #endif
1674                                         
1675                                 g.push(s, stre - s);
1676                                 return stre;
1677                         }
1678
1679                         case 'a':       // &a
1680                         {
1681                                 ++stre;
1682
1683                                 if (*stre == 'm') // &am
1684                                 {
1685                                         if (*++stre == 'p' && *++stre == ';') // &amp;
1686                                         {
1687                                                 *s++ = '&';
1688                                                 ++stre;
1689                                                         
1690                                                 g.push(s, stre - s);
1691                                                 return stre;
1692                                         }
1693                                 }
1694                                 else if (*stre == 'p') // &ap
1695                                 {
1696                                         if (*++stre == 'o' && *++stre == 's' && *++stre == ';') // &apos;
1697                                         {
1698                                                 *s++ = '\'';
1699                                                 ++stre;
1700
1701                                                 g.push(s, stre - s);
1702                                                 return stre;
1703                                         }
1704                                 }
1705                                 break;
1706                         }
1707
1708                         case 'g': // &g
1709                         {
1710                                 if (*++stre == 't' && *++stre == ';') // &gt;
1711                                 {
1712                                         *s++ = '>';
1713                                         ++stre;
1714                                         
1715                                         g.push(s, stre - s);
1716                                         return stre;
1717                                 }
1718                                 break;
1719                         }
1720
1721                         case 'l': // &l
1722                         {
1723                                 if (*++stre == 't' && *++stre == ';') // &lt;
1724                                 {
1725                                         *s++ = '<';
1726                                         ++stre;
1727                                                 
1728                                         g.push(s, stre - s);
1729                                         return stre;
1730                                 }
1731                                 break;
1732                         }
1733
1734                         case 'q': // &q
1735                         {
1736                                 if (*++stre == 'u' && *++stre == 'o' && *++stre == 't' && *++stre == ';') // &quot;
1737                                 {
1738                                         *s++ = '"';
1739                                         ++stre;
1740                                         
1741                                         g.push(s, stre - s);
1742                                         return stre;
1743                                 }
1744                                 break;
1745                         }
1746
1747                         default:
1748                                 break;
1749                 }
1750                 
1751                 return stre;
1752         }
1753
1754         // Utility macro for last character handling
1755         #define ENDSWITH(c, e) ((c) == (e) || ((c) == 0 && endch == (e)))
1756
1757         PUGI__FN char_t* strconv_comment(char_t* s, char_t endch)
1758         {
1759                 gap g;
1760                 
1761                 while (true)
1762                 {
1763                         while (!PUGI__IS_CHARTYPE(*s, ct_parse_comment)) ++s;
1764                 
1765                         if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
1766                         {
1767                                 *s++ = '\n'; // replace first one with 0x0a
1768                                 
1769                                 if (*s == '\n') g.push(s, 1);
1770                         }
1771                         else if (s[0] == '-' && s[1] == '-' && ENDSWITH(s[2], '>')) // comment ends here
1772                         {
1773                                 *g.flush(s) = 0;
1774                                 
1775                                 return s + (s[2] == '>' ? 3 : 2);
1776                         }
1777                         else if (*s == 0)
1778                         {
1779                                 return 0;
1780                         }
1781                         else ++s;
1782                 }
1783         }
1784
1785         PUGI__FN char_t* strconv_cdata(char_t* s, char_t endch)
1786         {
1787                 gap g;
1788                         
1789                 while (true)
1790                 {
1791                         while (!PUGI__IS_CHARTYPE(*s, ct_parse_cdata)) ++s;
1792                         
1793                         if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
1794                         {
1795                                 *s++ = '\n'; // replace first one with 0x0a
1796                                 
1797                                 if (*s == '\n') g.push(s, 1);
1798                         }
1799                         else if (s[0] == ']' && s[1] == ']' && ENDSWITH(s[2], '>')) // CDATA ends here
1800                         {
1801                                 *g.flush(s) = 0;
1802                                 
1803                                 return s + 1;
1804                         }
1805                         else if (*s == 0)
1806                         {
1807                                 return 0;
1808                         }
1809                         else ++s;
1810                 }
1811         }
1812         
1813         typedef char_t* (*strconv_pcdata_t)(char_t*);
1814                 
1815         template <typename opt_eol, typename opt_escape> struct strconv_pcdata_impl
1816         {
1817                 static char_t* parse(char_t* s)
1818                 {
1819                         gap g;
1820                         
1821                         while (true)
1822                         {
1823                                 while (!PUGI__IS_CHARTYPE(*s, ct_parse_pcdata)) ++s;
1824                                         
1825                                 if (*s == '<') // PCDATA ends here
1826                                 {
1827                                         *g.flush(s) = 0;
1828                                         
1829                                         return s + 1;
1830                                 }
1831                                 else if (opt_eol::value && *s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
1832                                 {
1833                                         *s++ = '\n'; // replace first one with 0x0a
1834                                         
1835                                         if (*s == '\n') g.push(s, 1);
1836                                 }
1837                                 else if (opt_escape::value && *s == '&')
1838                                 {
1839                                         s = strconv_escape(s, g);
1840                                 }
1841                                 else if (*s == 0)
1842                                 {
1843                                         return s;
1844                                 }
1845                                 else ++s;
1846                         }
1847                 }
1848         };
1849         
1850         PUGI__FN strconv_pcdata_t get_strconv_pcdata(unsigned int optmask)
1851         {
1852                 PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20);
1853
1854                 switch ((optmask >> 4) & 3) // get bitmask for flags (eol escapes)
1855                 {
1856                 case 0: return strconv_pcdata_impl<opt_false, opt_false>::parse;
1857                 case 1: return strconv_pcdata_impl<opt_false, opt_true>::parse;
1858                 case 2: return strconv_pcdata_impl<opt_true, opt_false>::parse;
1859                 case 3: return strconv_pcdata_impl<opt_true, opt_true>::parse;
1860                 default: return 0; // should not get here
1861                 }
1862         }
1863
1864         typedef char_t* (*strconv_attribute_t)(char_t*, char_t);
1865         
1866         template <typename opt_escape> struct strconv_attribute_impl
1867         {
1868                 static char_t* parse_wnorm(char_t* s, char_t end_quote)
1869                 {
1870                         gap g;
1871
1872                         // trim leading whitespaces
1873                         if (PUGI__IS_CHARTYPE(*s, ct_space))
1874                         {
1875                                 char_t* str = s;
1876                                 
1877                                 do ++str;
1878                                 while (PUGI__IS_CHARTYPE(*str, ct_space));
1879                                 
1880                                 g.push(s, str - s);
1881                         }
1882
1883                         while (true)
1884                         {
1885                                 while (!PUGI__IS_CHARTYPE(*s, ct_parse_attr_ws | ct_space)) ++s;
1886                                 
1887                                 if (*s == end_quote)
1888                                 {
1889                                         char_t* str = g.flush(s);
1890                                         
1891                                         do *str-- = 0;
1892                                         while (PUGI__IS_CHARTYPE(*str, ct_space));
1893                                 
1894                                         return s + 1;
1895                                 }
1896                                 else if (PUGI__IS_CHARTYPE(*s, ct_space))
1897                                 {
1898                                         *s++ = ' ';
1899                 
1900                                         if (PUGI__IS_CHARTYPE(*s, ct_space))
1901                                         {
1902                                                 char_t* str = s + 1;
1903                                                 while (PUGI__IS_CHARTYPE(*str, ct_space)) ++str;
1904                                                 
1905                                                 g.push(s, str - s);
1906                                         }
1907                                 }
1908                                 else if (opt_escape::value && *s == '&')
1909                                 {
1910                                         s = strconv_escape(s, g);
1911                                 }
1912                                 else if (!*s)
1913                                 {
1914                                         return 0;
1915                                 }
1916                                 else ++s;
1917                         }
1918                 }
1919
1920                 static char_t* parse_wconv(char_t* s, char_t end_quote)
1921                 {
1922                         gap g;
1923
1924                         while (true)
1925                         {
1926                                 while (!PUGI__IS_CHARTYPE(*s, ct_parse_attr_ws)) ++s;
1927                                 
1928                                 if (*s == end_quote)
1929                                 {
1930                                         *g.flush(s) = 0;
1931                                 
1932                                         return s + 1;
1933                                 }
1934                                 else if (PUGI__IS_CHARTYPE(*s, ct_space))
1935                                 {
1936                                         if (*s == '\r')
1937                                         {
1938                                                 *s++ = ' ';
1939                                 
1940                                                 if (*s == '\n') g.push(s, 1);
1941                                         }
1942                                         else *s++ = ' ';
1943                                 }
1944                                 else if (opt_escape::value && *s == '&')
1945                                 {
1946                                         s = strconv_escape(s, g);
1947                                 }
1948                                 else if (!*s)
1949                                 {
1950                                         return 0;
1951                                 }
1952                                 else ++s;
1953                         }
1954                 }
1955
1956                 static char_t* parse_eol(char_t* s, char_t end_quote)
1957                 {
1958                         gap g;
1959
1960                         while (true)
1961                         {
1962                                 while (!PUGI__IS_CHARTYPE(*s, ct_parse_attr)) ++s;
1963                                 
1964                                 if (*s == end_quote)
1965                                 {
1966                                         *g.flush(s) = 0;
1967                                 
1968                                         return s + 1;
1969                                 }
1970                                 else if (*s == '\r')
1971                                 {
1972                                         *s++ = '\n';
1973                                         
1974                                         if (*s == '\n') g.push(s, 1);
1975                                 }
1976                                 else if (opt_escape::value && *s == '&')
1977                                 {
1978                                         s = strconv_escape(s, g);
1979                                 }
1980                                 else if (!*s)
1981                                 {
1982                                         return 0;
1983                                 }
1984                                 else ++s;
1985                         }
1986                 }
1987
1988                 static char_t* parse_simple(char_t* s, char_t end_quote)
1989                 {
1990                         gap g;
1991
1992                         while (true)
1993                         {
1994                                 while (!PUGI__IS_CHARTYPE(*s, ct_parse_attr)) ++s;
1995                                 
1996                                 if (*s == end_quote)
1997                                 {
1998                                         *g.flush(s) = 0;
1999                                 
2000                                         return s + 1;
2001                                 }
2002                                 else if (opt_escape::value && *s == '&')
2003                                 {
2004                                         s = strconv_escape(s, g);
2005                                 }
2006                                 else if (!*s)
2007                                 {
2008                                         return 0;
2009                                 }
2010                                 else ++s;
2011                         }
2012                 }
2013         };
2014
2015         PUGI__FN strconv_attribute_t get_strconv_attribute(unsigned int optmask)
2016         {
2017                 PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_wconv_attribute == 0x40 && parse_wnorm_attribute == 0x80);
2018                 
2019                 switch ((optmask >> 4) & 15) // get bitmask for flags (wconv wnorm eol escapes)
2020                 {
2021                 case 0:  return strconv_attribute_impl<opt_false>::parse_simple;
2022                 case 1:  return strconv_attribute_impl<opt_true>::parse_simple;
2023                 case 2:  return strconv_attribute_impl<opt_false>::parse_eol;
2024                 case 3:  return strconv_attribute_impl<opt_true>::parse_eol;
2025                 case 4:  return strconv_attribute_impl<opt_false>::parse_wconv;
2026                 case 5:  return strconv_attribute_impl<opt_true>::parse_wconv;
2027                 case 6:  return strconv_attribute_impl<opt_false>::parse_wconv;
2028                 case 7:  return strconv_attribute_impl<opt_true>::parse_wconv;
2029                 case 8:  return strconv_attribute_impl<opt_false>::parse_wnorm;
2030                 case 9:  return strconv_attribute_impl<opt_true>::parse_wnorm;
2031                 case 10: return strconv_attribute_impl<opt_false>::parse_wnorm;
2032                 case 11: return strconv_attribute_impl<opt_true>::parse_wnorm;
2033                 case 12: return strconv_attribute_impl<opt_false>::parse_wnorm;
2034                 case 13: return strconv_attribute_impl<opt_true>::parse_wnorm;
2035                 case 14: return strconv_attribute_impl<opt_false>::parse_wnorm;
2036                 case 15: return strconv_attribute_impl<opt_true>::parse_wnorm;
2037                 default: return 0; // should not get here
2038                 }
2039         }
2040
2041         inline xml_parse_result make_parse_result(xml_parse_status status, ptrdiff_t offset = 0)
2042         {
2043                 xml_parse_result result;
2044                 result.status = status;
2045                 result.offset = offset;
2046
2047                 return result;
2048         }
2049
2050         struct xml_parser
2051         {
2052                 xml_allocator alloc;
2053                 char_t* error_offset;
2054                 xml_parse_status error_status;
2055                 
2056                 // Parser utilities.
2057                 #define PUGI__SKIPWS()                  { while (PUGI__IS_CHARTYPE(*s, ct_space)) ++s; }
2058                 #define PUGI__OPTSET(OPT)                       ( optmsk & (OPT) )
2059                 #define PUGI__PUSHNODE(TYPE)            { cursor = append_node(cursor, alloc, TYPE); if (!cursor) PUGI__THROW_ERROR(status_out_of_memory, s); }
2060                 #define PUGI__POPNODE()                 { cursor = cursor->parent; }
2061                 #define PUGI__SCANFOR(X)                        { while (*s != 0 && !(X)) ++s; }
2062                 #define PUGI__SCANWHILE(X)              { while ((X)) ++s; }
2063                 #define PUGI__ENDSEG()                  { ch = *s; *s = 0; ++s; }
2064                 #define PUGI__THROW_ERROR(err, m)       return error_offset = m, error_status = err, static_cast<char_t*>(0)
2065                 #define PUGI__CHECK_ERROR(err, m)       { if (*s == 0) PUGI__THROW_ERROR(err, m); }
2066                 
2067                 xml_parser(const xml_allocator& alloc_): alloc(alloc_), error_offset(0), error_status(status_ok)
2068                 {
2069                 }
2070
2071                 // DOCTYPE consists of nested sections of the following possible types:
2072                 // <!-- ... -->, <? ... ?>, "...", '...'
2073                 // <![...]]>
2074                 // <!...>
2075                 // First group can not contain nested groups
2076                 // Second group can contain nested groups of the same type
2077                 // Third group can contain all other groups
2078                 char_t* parse_doctype_primitive(char_t* s)
2079                 {
2080                         if (*s == '"' || *s == '\'')
2081                         {
2082                                 // quoted string
2083                                 char_t ch = *s++;
2084                                 PUGI__SCANFOR(*s == ch);
2085                                 if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);
2086
2087                                 s++;
2088                         }
2089                         else if (s[0] == '<' && s[1] == '?')
2090                         {
2091                                 // <? ... ?>
2092                                 s += 2;
2093                                 PUGI__SCANFOR(s[0] == '?' && s[1] == '>'); // no need for ENDSWITH because ?> can't terminate proper doctype
2094                                 if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);
2095
2096                                 s += 2;
2097                         }
2098                         else if (s[0] == '<' && s[1] == '!' && s[2] == '-' && s[3] == '-')
2099                         {
2100                                 s += 4;
2101                                 PUGI__SCANFOR(s[0] == '-' && s[1] == '-' && s[2] == '>'); // no need for ENDSWITH because --> can't terminate proper doctype
2102                                 if (!*s) PUGI__THROW_ERROR(status_bad_doctype, s);
2103
2104                                 s += 4;
2105                         }
2106                         else PUGI__THROW_ERROR(status_bad_doctype, s);
2107
2108                         return s;
2109                 }
2110
2111                 char_t* parse_doctype_ignore(char_t* s)
2112                 {
2113                         assert(s[0] == '<' && s[1] == '!' && s[2] == '[');
2114                         s++;
2115
2116                         while (*s)
2117                         {
2118                                 if (s[0] == '<' && s[1] == '!' && s[2] == '[')
2119                                 {
2120                                         // nested ignore section
2121                                         s = parse_doctype_ignore(s);
2122                                         if (!s) return s;
2123                                 }
2124                                 else if (s[0] == ']' && s[1] == ']' && s[2] == '>')
2125                                 {
2126                                         // ignore section end
2127                                         s += 3;
2128
2129                                         return s;
2130                                 }
2131                                 else s++;
2132                         }
2133
2134                         PUGI__THROW_ERROR(status_bad_doctype, s);
2135                 }
2136
2137                 char_t* parse_doctype_group(char_t* s, char_t endch, bool toplevel)
2138                 {
2139                         assert(s[0] == '<' && s[1] == '!');
2140                         s++;
2141
2142                         while (*s)
2143                         {
2144                                 if (s[0] == '<' && s[1] == '!' && s[2] != '-')
2145                                 {
2146                                         if (s[2] == '[')
2147                                         {
2148                                                 // ignore
2149                                                 s = parse_doctype_ignore(s);
2150                                                 if (!s) return s;
2151                                         }
2152                                         else
2153                                         {
2154                                                 // some control group
2155                                                 s = parse_doctype_group(s, endch, false);
2156                                                 if (!s) return s;
2157                                         }
2158                                 }
2159                                 else if (s[0] == '<' || s[0] == '"' || s[0] == '\'')
2160                                 {
2161                                         // unknown tag (forbidden), or some primitive group
2162                                         s = parse_doctype_primitive(s);
2163                                         if (!s) return s;
2164                                 }
2165                                 else if (*s == '>')
2166                                 {
2167                                         s++;
2168
2169                                         return s;
2170                                 }
2171                                 else s++;
2172                         }
2173
2174                         if (!toplevel || endch != '>') PUGI__THROW_ERROR(status_bad_doctype, s);
2175
2176                         return s;
2177                 }
2178
2179                 char_t* parse_exclamation(char_t* s, xml_node_struct* cursor, unsigned int optmsk, char_t endch)
2180                 {
2181                         // parse node contents, starting with exclamation mark
2182                         ++s;
2183
2184                         if (*s == '-') // '<!-...'
2185                         {
2186                                 ++s;
2187
2188                                 if (*s == '-') // '<!--...'
2189                                 {
2190                                         ++s;
2191
2192                                         if (PUGI__OPTSET(parse_comments))
2193                                         {
2194                                                 PUGI__PUSHNODE(node_comment); // Append a new node on the tree.
2195                                                 cursor->value = s; // Save the offset.
2196                                         }
2197
2198                                         if (PUGI__OPTSET(parse_eol) && PUGI__OPTSET(parse_comments))
2199                                         {
2200                                                 s = strconv_comment(s, endch);
2201
2202                                                 if (!s) PUGI__THROW_ERROR(status_bad_comment, cursor->value);
2203                                         }
2204                                         else
2205                                         {
2206                                                 // Scan for terminating '-->'.
2207                                                 PUGI__SCANFOR(s[0] == '-' && s[1] == '-' && ENDSWITH(s[2], '>'));
2208                                                 PUGI__CHECK_ERROR(status_bad_comment, s);
2209
2210                                                 if (PUGI__OPTSET(parse_comments))
2211                                                         *s = 0; // Zero-terminate this segment at the first terminating '-'.
2212
2213                                                 s += (s[2] == '>' ? 3 : 2); // Step over the '\0->'.
2214                                         }
2215                                 }
2216                                 else PUGI__THROW_ERROR(status_bad_comment, s);
2217                         }
2218                         else if (*s == '[')
2219                         {
2220                                 // '<![CDATA[...'
2221                                 if (*++s=='C' && *++s=='D' && *++s=='A' && *++s=='T' && *++s=='A' && *++s == '[')
2222                                 {
2223                                         ++s;
2224
2225                                         if (PUGI__OPTSET(parse_cdata))
2226                                         {
2227                                                 PUGI__PUSHNODE(node_cdata); // Append a new node on the tree.
2228                                                 cursor->value = s; // Save the offset.
2229
2230                                                 if (PUGI__OPTSET(parse_eol))
2231                                                 {
2232                                                         s = strconv_cdata(s, endch);
2233
2234                                                         if (!s) PUGI__THROW_ERROR(status_bad_cdata, cursor->value);
2235                                                 }
2236                                                 else
2237                                                 {
2238                                                         // Scan for terminating ']]>'.
2239                                                         PUGI__SCANFOR(s[0] == ']' && s[1] == ']' && ENDSWITH(s[2], '>'));
2240                                                         PUGI__CHECK_ERROR(status_bad_cdata, s);
2241
2242                                                         *s++ = 0; // Zero-terminate this segment.
2243                                                 }
2244                                         }
2245                                         else // Flagged for discard, but we still have to scan for the terminator.
2246                                         {
2247                                                 // Scan for terminating ']]>'.
2248                                                 PUGI__SCANFOR(s[0] == ']' && s[1] == ']' && ENDSWITH(s[2], '>'));
2249                                                 PUGI__CHECK_ERROR(status_bad_cdata, s);
2250
2251                                                 ++s;
2252                                         }
2253
2254                                         s += (s[1] == '>' ? 2 : 1); // Step over the last ']>'.
2255                                 }
2256                                 else PUGI__THROW_ERROR(status_bad_cdata, s);
2257                         }
2258                         else if (s[0] == 'D' && s[1] == 'O' && s[2] == 'C' && s[3] == 'T' && s[4] == 'Y' && s[5] == 'P' && ENDSWITH(s[6], 'E'))
2259                         {
2260                                 s -= 2;
2261
2262                                 if (cursor->parent) PUGI__THROW_ERROR(status_bad_doctype, s);
2263
2264                                 char_t* mark = s + 9;
2265
2266                                 s = parse_doctype_group(s, endch, true);
2267                                 if (!s) return s;
2268
2269                                 if (PUGI__OPTSET(parse_doctype))
2270                                 {
2271                                         while (PUGI__IS_CHARTYPE(*mark, ct_space)) ++mark;
2272
2273                                         PUGI__PUSHNODE(node_doctype);
2274
2275                                         cursor->value = mark;
2276
2277                                         assert((s[0] == 0 && endch == '>') || s[-1] == '>');
2278                                         s[*s == 0 ? 0 : -1] = 0;
2279
2280                                         PUGI__POPNODE();
2281                                 }
2282                         }
2283                         else if (*s == 0 && endch == '-') PUGI__THROW_ERROR(status_bad_comment, s);
2284                         else if (*s == 0 && endch == '[') PUGI__THROW_ERROR(status_bad_cdata, s);
2285                         else PUGI__THROW_ERROR(status_unrecognized_tag, s);
2286
2287                         return s;
2288                 }
2289
2290                 char_t* parse_question(char_t* s, xml_node_struct*& ref_cursor, unsigned int optmsk, char_t endch)
2291                 {
2292                         // load into registers
2293                         xml_node_struct* cursor = ref_cursor;
2294                         char_t ch = 0;
2295
2296                         // parse node contents, starting with question mark
2297                         ++s;
2298
2299                         // read PI target
2300                         char_t* target = s;
2301
2302                         if (!PUGI__IS_CHARTYPE(*s, ct_start_symbol)) PUGI__THROW_ERROR(status_bad_pi, s);
2303
2304                         PUGI__SCANWHILE(PUGI__IS_CHARTYPE(*s, ct_symbol));
2305                         PUGI__CHECK_ERROR(status_bad_pi, s);
2306
2307                         // determine node type; stricmp / strcasecmp is not portable
2308                         bool declaration = (target[0] | ' ') == 'x' && (target[1] | ' ') == 'm' && (target[2] | ' ') == 'l' && target + 3 == s;
2309
2310                         if (declaration ? PUGI__OPTSET(parse_declaration) : PUGI__OPTSET(parse_pi))
2311                         {
2312                                 if (declaration)
2313                                 {
2314                                         // disallow non top-level declarations
2315                                         if (cursor->parent) PUGI__THROW_ERROR(status_bad_pi, s);
2316
2317                                         PUGI__PUSHNODE(node_declaration);
2318                                 }
2319                                 else
2320                                 {
2321                                         PUGI__PUSHNODE(node_pi);
2322                                 }
2323
2324                                 cursor->name = target;
2325
2326                                 PUGI__ENDSEG();
2327
2328                                 // parse value/attributes
2329                                 if (ch == '?')
2330                                 {
2331                                         // empty node
2332                                         if (!ENDSWITH(*s, '>')) PUGI__THROW_ERROR(status_bad_pi, s);
2333                                         s += (*s == '>');
2334
2335                                         PUGI__POPNODE();
2336                                 }
2337                                 else if (PUGI__IS_CHARTYPE(ch, ct_space))
2338                                 {
2339                                         PUGI__SKIPWS();
2340
2341                                         // scan for tag end
2342                                         char_t* value = s;
2343
2344                                         PUGI__SCANFOR(s[0] == '?' && ENDSWITH(s[1], '>'));
2345                                         PUGI__CHECK_ERROR(status_bad_pi, s);
2346
2347                                         if (declaration)
2348                                         {
2349                                                 // replace ending ? with / so that 'element' terminates properly
2350                                                 *s = '/';
2351
2352                                                 // we exit from this function with cursor at node_declaration, which is a signal to parse() to go to LOC_ATTRIBUTES
2353                                                 s = value;
2354                                         }
2355                                         else
2356                                         {
2357                                                 // store value and step over >
2358                                                 cursor->value = value;
2359                                                 PUGI__POPNODE();
2360
2361                                                 PUGI__ENDSEG();
2362
2363                                                 s += (*s == '>');
2364                                         }
2365                                 }
2366                                 else PUGI__THROW_ERROR(status_bad_pi, s);
2367                         }
2368                         else
2369                         {
2370                                 // scan for tag end
2371                                 PUGI__SCANFOR(s[0] == '?' && ENDSWITH(s[1], '>'));
2372                                 PUGI__CHECK_ERROR(status_bad_pi, s);
2373
2374                                 s += (s[1] == '>' ? 2 : 1);
2375                         }
2376
2377                         // store from registers
2378                         ref_cursor = cursor;
2379
2380                         return s;
2381                 }
2382
2383                 char_t* parse(char_t* s, xml_node_struct* xmldoc, unsigned int optmsk, char_t endch)
2384                 {
2385                         strconv_attribute_t strconv_attribute = get_strconv_attribute(optmsk);
2386                         strconv_pcdata_t strconv_pcdata = get_strconv_pcdata(optmsk);
2387                         
2388                         char_t ch = 0;
2389                         xml_node_struct* cursor = xmldoc;
2390                         char_t* mark = s;
2391
2392                         while (*s != 0)
2393                         {
2394                                 if (*s == '<')
2395                                 {
2396                                         ++s;
2397
2398                                 LOC_TAG:
2399                                         if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) // '<#...'
2400                                         {
2401                                                 PUGI__PUSHNODE(node_element); // Append a new node to the tree.
2402
2403                                                 cursor->name = s;
2404
2405                                                 PUGI__SCANWHILE(PUGI__IS_CHARTYPE(*s, ct_symbol)); // Scan for a terminator.
2406                                                 PUGI__ENDSEG(); // Save char in 'ch', terminate & step over.
2407
2408                                                 if (ch == '>')
2409                                                 {
2410                                                         // end of tag
2411                                                 }
2412                                                 else if (PUGI__IS_CHARTYPE(ch, ct_space))
2413                                                 {
2414                                                 LOC_ATTRIBUTES:
2415                                                         while (true)
2416                                                         {
2417                                                                 PUGI__SKIPWS(); // Eat any whitespace.
2418                                                 
2419                                                                 if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) // <... #...
2420                                                                 {
2421                                                                         xml_attribute_struct* a = append_attribute_ll(cursor, alloc); // Make space for this attribute.
2422                                                                         if (!a) PUGI__THROW_ERROR(status_out_of_memory, s);
2423
2424                                                                         a->name = s; // Save the offset.
2425
2426                                                                         PUGI__SCANWHILE(PUGI__IS_CHARTYPE(*s, ct_symbol)); // Scan for a terminator.
2427                                                                         PUGI__CHECK_ERROR(status_bad_attribute, s); //$ redundant, left for performance
2428
2429                                                                         PUGI__ENDSEG(); // Save char in 'ch', terminate & step over.
2430                                                                         PUGI__CHECK_ERROR(status_bad_attribute, s); //$ redundant, left for performance
2431
2432                                                                         if (PUGI__IS_CHARTYPE(ch, ct_space))
2433                                                                         {
2434                                                                                 PUGI__SKIPWS(); // Eat any whitespace.
2435                                                                                 PUGI__CHECK_ERROR(status_bad_attribute, s); //$ redundant, left for performance
2436
2437                                                                                 ch = *s;
2438                                                                                 ++s;
2439                                                                         }
2440                                                                         
2441                                                                         if (ch == '=') // '<... #=...'
2442                                                                         {
2443                                                                                 PUGI__SKIPWS(); // Eat any whitespace.
2444
2445                                                                                 if (*s == '"' || *s == '\'') // '<... #="...'
2446                                                                                 {
2447                                                                                         ch = *s; // Save quote char to avoid breaking on "''" -or- '""'.
2448                                                                                         ++s; // Step over the quote.
2449                                                                                         a->value = s; // Save the offset.
2450
2451                                                                                         s = strconv_attribute(s, ch);
2452                                                                                 
2453                                                                                         if (!s) PUGI__THROW_ERROR(status_bad_attribute, a->value);
2454
2455                                                                                         // After this line the loop continues from the start;
2456                                                                                         // Whitespaces, / and > are ok, symbols and EOF are wrong,
2457                                                                                         // everything else will be detected
2458                                                                                         if (PUGI__IS_CHARTYPE(*s, ct_start_symbol)) PUGI__THROW_ERROR(status_bad_attribute, s);
2459                                                                                 }
2460                                                                                 else PUGI__THROW_ERROR(status_bad_attribute, s);
2461                                                                         }
2462                                                                         else PUGI__THROW_ERROR(status_bad_attribute, s);
2463                                                                 }
2464                                                                 else if (*s == '/')
2465                                                                 {
2466                                                                         ++s;
2467                                                                         
2468                                                                         if (*s == '>')
2469                                                                         {
2470                                                                                 PUGI__POPNODE();
2471                                                                                 s++;
2472                                                                                 break;
2473                                                                         }
2474                                                                         else if (*s == 0 && endch == '>')
2475                                                                         {
2476                                                                                 PUGI__POPNODE();
2477                                                                                 break;
2478                                                                         }
2479                                                                         else PUGI__THROW_ERROR(status_bad_start_element, s);
2480                                                                 }
2481                                                                 else if (*s == '>')
2482                                                                 {
2483                                                                         ++s;
2484
2485                                                                         break;
2486                                                                 }
2487                                                                 else if (*s == 0 && endch == '>')
2488                                                                 {
2489                                                                         break;
2490                                                                 }
2491                                                                 else PUGI__THROW_ERROR(status_bad_start_element, s);
2492                                                         }
2493
2494                                                         // !!!
2495                                                 }
2496                                                 else if (ch == '/') // '<#.../'
2497                                                 {
2498                                                         if (!ENDSWITH(*s, '>')) PUGI__THROW_ERROR(status_bad_start_element, s);
2499
2500                                                         PUGI__POPNODE(); // Pop.
2501
2502                                                         s += (*s == '>');
2503                                                 }
2504                                                 else if (ch == 0)
2505                                                 {
2506                                                         // we stepped over null terminator, backtrack & handle closing tag
2507                                                         --s;
2508                                                         
2509                                                         if (endch != '>') PUGI__THROW_ERROR(status_bad_start_element, s);
2510                                                 }
2511                                                 else PUGI__THROW_ERROR(status_bad_start_element, s);
2512                                         }
2513                                         else if (*s == '/')
2514                                         {
2515                                                 ++s;
2516
2517                                                 char_t* name = cursor->name;
2518                                                 if (!name) PUGI__THROW_ERROR(status_end_element_mismatch, s);
2519                                                 
2520                                                 while (PUGI__IS_CHARTYPE(*s, ct_symbol))
2521                                                 {
2522                                                         if (*s++ != *name++) PUGI__THROW_ERROR(status_end_element_mismatch, s);
2523                                                 }
2524
2525                                                 if (*name)
2526                                                 {
2527                                                         if (*s == 0 && name[0] == endch && name[1] == 0) PUGI__THROW_ERROR(status_bad_end_element, s);
2528                                                         else PUGI__THROW_ERROR(status_end_element_mismatch, s);
2529                                                 }
2530                                                         
2531                                                 PUGI__POPNODE(); // Pop.
2532
2533                                                 PUGI__SKIPWS();
2534
2535                                                 if (*s == 0)
2536                                                 {
2537                                                         if (endch != '>') PUGI__THROW_ERROR(status_bad_end_element, s);
2538                                                 }
2539                                                 else
2540                                                 {
2541                                                         if (*s != '>') PUGI__THROW_ERROR(status_bad_end_element, s);
2542                                                         ++s;
2543                                                 }
2544                                         }
2545                                         else if (*s == '?') // '<?...'
2546                                         {
2547                                                 s = parse_question(s, cursor, optmsk, endch);
2548                                                 if (!s) return s;
2549
2550                                                 assert(cursor);
2551                                                 if ((cursor->header & xml_memory_page_type_mask) + 1 == node_declaration) goto LOC_ATTRIBUTES;
2552                                         }
2553                                         else if (*s == '!') // '<!...'
2554                                         {
2555                                                 s = parse_exclamation(s, cursor, optmsk, endch);
2556                                                 if (!s) return s;
2557                                         }
2558                                         else if (*s == 0 && endch == '?') PUGI__THROW_ERROR(status_bad_pi, s);
2559                                         else PUGI__THROW_ERROR(status_unrecognized_tag, s);
2560                                 }
2561                                 else
2562                                 {
2563                                         mark = s; // Save this offset while searching for a terminator.
2564
2565                                         PUGI__SKIPWS(); // Eat whitespace if no genuine PCDATA here.
2566
2567                                         if (*s == '<')
2568                                         {
2569                                                 // We skipped some whitespace characters because otherwise we would take the tag branch instead of PCDATA one
2570                                                 assert(mark != s);
2571
2572                                                 if (!PUGI__OPTSET(parse_ws_pcdata | parse_ws_pcdata_single))
2573                                                 {
2574                                                         continue;
2575                                                 }
2576                                                 else if (PUGI__OPTSET(parse_ws_pcdata_single))
2577                                                 {
2578                                                         if (s[1] != '/' || cursor->first_child) continue;
2579                                                 }
2580                                         }
2581
2582                                         s = mark;
2583                                                         
2584                                         if (cursor->parent)
2585                                         {
2586                                                 PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree.
2587                                                 cursor->value = s; // Save the offset.
2588
2589                                                 s = strconv_pcdata(s);
2590                                                                 
2591                                                 PUGI__POPNODE(); // Pop since this is a standalone.
2592                                                 
2593                                                 if (!*s) break;
2594                                         }
2595                                         else
2596                                         {
2597                                                 PUGI__SCANFOR(*s == '<'); // '...<'
2598                                                 if (!*s) break;
2599                                                 
2600                                                 ++s;
2601                                         }
2602
2603                                         // We're after '<'
2604                                         goto LOC_TAG;
2605                                 }
2606                         }
2607
2608                         // check that last tag is closed
2609                         if (cursor != xmldoc) PUGI__THROW_ERROR(status_end_element_mismatch, s);
2610
2611                         return s;
2612                 }
2613
2614                 static xml_parse_result parse(char_t* buffer, size_t length, xml_node_struct* root, unsigned int optmsk)
2615                 {
2616                         xml_document_struct* xmldoc = static_cast<xml_document_struct*>(root);
2617
2618                         // store buffer for offset_debug
2619                         xmldoc->buffer = buffer;
2620
2621                         // early-out for empty documents
2622                         if (length == 0) return make_parse_result(status_ok);
2623
2624                         // create parser on stack
2625                         xml_parser parser(*xmldoc);
2626
2627                         // save last character and make buffer zero-terminated (speeds up parsing)
2628                         char_t endch = buffer[length - 1];
2629                         buffer[length - 1] = 0;
2630                         
2631                         // perform actual parsing
2632                         parser.parse(buffer, xmldoc, optmsk, endch);
2633
2634                         xml_parse_result result = make_parse_result(parser.error_status, parser.error_offset ? parser.error_offset - buffer : 0);
2635                         assert(result.offset >= 0 && static_cast<size_t>(result.offset) <= length);
2636
2637                         // update allocator state
2638                         *static_cast<xml_allocator*>(xmldoc) = parser.alloc;
2639
2640                         // since we removed last character, we have to handle the only possible false positive
2641                         if (result && endch == '<')
2642                         {
2643                                 // there's no possible well-formed document with < at the end
2644                                 return make_parse_result(status_unrecognized_tag, length);
2645                         }
2646
2647                         return result;
2648                 }
2649         };
2650
2651         // Output facilities
2652         PUGI__FN xml_encoding get_write_native_encoding()
2653         {
2654         #ifdef PUGIXML_WCHAR_MODE
2655                 return get_wchar_encoding();
2656         #else
2657                 return encoding_utf8;
2658         #endif
2659         }
2660
2661         PUGI__FN xml_encoding get_write_encoding(xml_encoding encoding)
2662         {
2663                 // replace wchar encoding with utf implementation
2664                 if (encoding == encoding_wchar) return get_wchar_encoding();
2665
2666                 // replace utf16 encoding with utf16 with specific endianness
2667                 if (encoding == encoding_utf16) return is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
2668
2669                 // replace utf32 encoding with utf32 with specific endianness
2670                 if (encoding == encoding_utf32) return is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
2671
2672                 // only do autodetection if no explicit encoding is requested
2673                 if (encoding != encoding_auto) return encoding;
2674
2675                 // assume utf8 encoding
2676                 return encoding_utf8;
2677         }
2678
2679 #ifdef PUGIXML_WCHAR_MODE
2680         PUGI__FN size_t get_valid_length(const char_t* data, size_t length)
2681         {
2682                 assert(length > 0);
2683
2684                 // discard last character if it's the lead of a surrogate pair 
2685                 return (sizeof(wchar_t) == 2 && static_cast<unsigned int>(static_cast<uint16_t>(data[length - 1]) - 0xD800) < 0x400) ? length - 1 : length;
2686         }
2687
2688         PUGI__FN size_t convert_buffer(char_t* r_char, uint8_t* r_u8, uint16_t* r_u16, uint32_t* r_u32, const char_t* data, size_t length, xml_encoding encoding)
2689         {
2690                 // only endian-swapping is required
2691                 if (need_endian_swap_utf(encoding, get_wchar_encoding()))
2692                 {
2693                         convert_wchar_endian_swap(r_char, data, length);
2694
2695                         return length * sizeof(char_t);
2696                 }
2697         
2698                 // convert to utf8
2699                 if (encoding == encoding_utf8)
2700                 {
2701                         uint8_t* dest = r_u8;
2702                         uint8_t* end = utf_decoder<utf8_writer>::decode_wchar_block(data, length, dest);
2703
2704                         return static_cast<size_t>(end - dest);
2705                 }
2706
2707                 // convert to utf16
2708                 if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
2709                 {
2710                         uint16_t* dest = r_u16;
2711
2712                         // convert to native utf16
2713                         uint16_t* end = utf_decoder<utf16_writer>::decode_wchar_block(data, length, dest);
2714
2715                         // swap if necessary
2716                         xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
2717
2718                         if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
2719
2720                         return static_cast<size_t>(end - dest) * sizeof(uint16_t);
2721                 }
2722
2723                 // convert to utf32
2724                 if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
2725                 {
2726                         uint32_t* dest = r_u32;
2727
2728                         // convert to native utf32
2729                         uint32_t* end = utf_decoder<utf32_writer>::decode_wchar_block(data, length, dest);
2730
2731                         // swap if necessary
2732                         xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
2733
2734                         if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
2735
2736                         return static_cast<size_t>(end - dest) * sizeof(uint32_t);
2737                 }
2738
2739                 // convert to latin1
2740                 if (encoding == encoding_latin1)
2741                 {
2742                         uint8_t* dest = r_u8;
2743                         uint8_t* end = utf_decoder<latin1_writer>::decode_wchar_block(data, length, dest);
2744
2745                         return static_cast<size_t>(end - dest);
2746                 }
2747
2748                 assert(!"Invalid encoding");
2749                 return 0;
2750         }
2751 #else
2752         PUGI__FN size_t get_valid_length(const char_t* data, size_t length)
2753         {
2754                 assert(length > 4);
2755
2756                 for (size_t i = 1; i <= 4; ++i)
2757                 {
2758                         uint8_t ch = static_cast<uint8_t>(data[length - i]);
2759
2760                         // either a standalone character or a leading one
2761                         if ((ch & 0xc0) != 0x80) return length - i;
2762                 }
2763
2764                 // there are four non-leading characters at the end, sequence tail is broken so might as well process the whole chunk
2765                 return length;
2766         }
2767
2768         PUGI__FN size_t convert_buffer(char_t* /* r_char */, uint8_t* r_u8, uint16_t* r_u16, uint32_t* r_u32, const char_t* data, size_t length, xml_encoding encoding)
2769         {
2770                 if (encoding == encoding_utf16_be || encoding == encoding_utf16_le)
2771                 {
2772                         uint16_t* dest = r_u16;
2773
2774                         // convert to native utf16
2775                         uint16_t* end = utf_decoder<utf16_writer>::decode_utf8_block(reinterpret_cast<const uint8_t*>(data), length, dest);
2776
2777                         // swap if necessary
2778                         xml_encoding native_encoding = is_little_endian() ? encoding_utf16_le : encoding_utf16_be;
2779
2780                         if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
2781
2782                         return static_cast<size_t>(end - dest) * sizeof(uint16_t);
2783                 }
2784
2785                 if (encoding == encoding_utf32_be || encoding == encoding_utf32_le)
2786                 {
2787                         uint32_t* dest = r_u32;
2788
2789                         // convert to native utf32
2790                         uint32_t* end = utf_decoder<utf32_writer>::decode_utf8_block(reinterpret_cast<const uint8_t*>(data), length, dest);
2791
2792                         // swap if necessary
2793                         xml_encoding native_encoding = is_little_endian() ? encoding_utf32_le : encoding_utf32_be;
2794
2795                         if (native_encoding != encoding) convert_utf_endian_swap(dest, dest, static_cast<size_t>(end - dest));
2796
2797                         return static_cast<size_t>(end - dest) * sizeof(uint32_t);
2798                 }
2799
2800                 if (encoding == encoding_latin1)
2801                 {
2802                         uint8_t* dest = r_u8;
2803                         uint8_t* end = utf_decoder<latin1_writer>::decode_utf8_block(reinterpret_cast<const uint8_t*>(data), length, dest);
2804
2805                         return static_cast<size_t>(end - dest);
2806                 }
2807
2808                 assert(!"Invalid encoding");
2809                 return 0;
2810         }
2811 #endif
2812
2813         class xml_buffered_writer
2814         {
2815                 xml_buffered_writer(const xml_buffered_writer&);
2816                 xml_buffered_writer& operator=(const xml_buffered_writer&);
2817
2818         public: