Planet

navi

home

PPS

about

screenshots

download

development

forum

Context Navigation

source: orxonox.OLD/orxonox/trunk/src/lib/xmlparser/tinyxmlparser.cc @ 4081

Last change on this file since 4081 was 4010, checked in by bensch, 20 years ago
orxonox/trunk: merged the levelloader from lltrunktemp to the trunk. Big thanks to fuzzy to make this so easy for us, and for implementing it in the first place.
File size: 32.7 KB

Line
1	/*
2	www.sourceforge.net/projects/tinyxml
3	Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)
4
5	This software is provided 'as-is', without any express or implied
6	warranty. In no event will the authors be held liable for any
7	damages arising from the use of this software.
8
9	Permission is granted to anyone to use this software for any
10	purpose, including commercial applications, and to alter it and
11	redistribute it freely, subject to the following restrictions:
12
13	1. The origin of this software must not be misrepresented; you must
14	not claim that you wrote the original software. If you use this
15	software in a product, an acknowledgment in the product documentation
16	would be appreciated but is not required.
17
18	2. Altered source versions must be plainly marked as such, and
19	must not be misrepresented as being the original software.
20
21	3. This notice may not be removed or altered from any source
22	distribution.
23	*/
24
25	#include "tinyxml.h"
26	#include <ctype.h>
27
28	//#define DEBUG_PARSER
29
30	// Note tha "PutString" hardcodes the same list. This
31	// is less flexible than it appears. Changing the entries
32	// or order will break putstring.
33	TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] =
34	{
35	{ "&", 5, '&' },
36	{ "<", 4, '<' },
37	{ ">", 4, '>' },
38	{ """, 6, '\"' },
39	{ "'", 6, '\'' }
40	};
41
42	// Bunch of unicode info at:
43	// http://www.unicode.org/faq/utf_bom.html
44	// Including the basic of this table, which determines the #bytes in the
45	// sequence from the lead byte. 1 placed for invalid sequences --
46	// although the result will be junk, pass it through as much as possible.
47	// Beware of the non-characters in UTF-8:
48	// ef bb bf (Microsoft "lead bytes")
49	// ef bf be
50	// ef bf bf
51
// The three bytes EF BB BF (the Microsoft "lead bytes"), stored as plain
// chars so they can be compared directly against bytes read from a char*.
const char TIXML_UTF_LEAD_0 = static_cast<char>( 0xef );
const char TIXML_UTF_LEAD_1 = static_cast<char>( 0xbb );
const char TIXML_UTF_LEAD_2 = static_cast<char>( 0xbf );
55
56	const int TiXmlBase::utf8ByteTable[256] =
57	{
58	// 0 1 2 3 4 5 6 7 8 9 a b c d e f
59	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00
60	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10
61	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20
62	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30
63	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40
64	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50
65	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60
66	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70 End of ASCII range
67	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80 0x80 to 0xc1 invalid
68	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90
69	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xa0
70	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xb0
71	1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xc0 0xc2 to 0xdf 2 byte
72	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xd0
73	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xe0 0xe0 to 0xef 3 byte
74	4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
75	};
76
77
78	void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length )
79	{
80	const unsigned long BYTE_MASK = 0xBF;
81	const unsigned long BYTE_MARK = 0x80;
82	const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
83
84	if (input < 0x80)
85	*length = 1;
86	else if ( input < 0x800 )
87	*length = 2;
88	else if ( input < 0x10000 )
89	*length = 3;
90	else if ( input < 0x200000 )
91	*length = 4;
92	else
93	{ *length = 0; return; } // This code won't covert this correctly anyway.
94
95	output += *length;
96
97	// Scary scary fall throughs.
98	switch (*length)
99	{
100	case 4:
101	--output;
102	*output = (char)((input \| BYTE_MARK) & BYTE_MASK);
103	input >>= 6;
104	case 3:
105	--output;
106	*output = (char)((input \| BYTE_MARK) & BYTE_MASK);
107	input >>= 6;
108	case 2:
109	--output;
110	*output = (char)((input \| BYTE_MARK) & BYTE_MASK);
111	input >>= 6;
112	case 1:
113	--output;
114	output = (char)(input \| FIRST_BYTE_MARK[length]);
115	}
116	}
117
118
119	/static/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding )
120	{
121	// This will only work for low-ascii, everything else is assumed to be a valid
122	// letter. I'm not sure this is the best approach, but it is quite tricky trying
123	// to figure out alhabetical vs. not across encoding. So take a very
124	// conservative approach.
125
126	// if ( encoding == TIXML_ENCODING_UTF8 )
127	// {
128	if ( anyByte < 127 )
129	return isalpha( anyByte );
130	else
131	return 1; // What else to do? The unicode set is huge...get the english ones right.
132	// }
133	// else
134	// {
135	// return isalpha( anyByte );
136	// }
137	}
138
139
140	/static/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding )
141	{
142	// This will only work for low-ascii, everything else is assumed to be a valid
143	// letter. I'm not sure this is the best approach, but it is quite tricky trying
144	// to figure out alhabetical vs. not across encoding. So take a very
145	// conservative approach.
146
147	// if ( encoding == TIXML_ENCODING_UTF8 )
148	// {
149	if ( anyByte < 127 )
150	return isalnum( anyByte );
151	else
152	return 1; // What else to do? The unicode set is huge...get the english ones right.
153	// }
154	// else
155	// {
156	// return isalnum( anyByte );
157	// }
158	}
159
160
161	class TiXmlParsingData
162	{
163	friend class TiXmlDocument;
164	public:
165	void Stamp( const char* now, TiXmlEncoding encoding );
166
167	const TiXmlCursor& Cursor() { return cursor; }
168
169	private:
170	// Only used by the document!
171	TiXmlParsingData( const char* start, int _tabsize, int row, int col )
172	{
173	assert( start );
174	stamp = start;
175	tabsize = _tabsize;
176	cursor.row = row;
177	cursor.col = col;
178	}
179
180	TiXmlCursor cursor;
181	const char* stamp;
182	int tabsize;
183	};
184
185
186	void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding )
187	{
188	assert( now );
189
190	// Do nothing if the tabsize is 0.
191	if ( tabsize < 1 )
192	{
193	return;
194	}
195
196	// Get the current row, column.
197	int row = cursor.row;
198	int col = cursor.col;
199	const char* p = stamp;
200	assert( p );
201
202	while ( p < now )
203	{
204	// Code contributed by Fletcher Dunn: (modified by lee)
205	switch (*p) {
206	case 0:
207	// We should never get here, but in case we do, don't
208	// advance past the terminating null character, ever
209	return;
210
211	case '\r':
212	// bump down to the next line
213	++row;
214	col = 0;
215	// Eat the character
216	++p;
217
218	// Check for \r\n sequence, and treat this as a single character
219	if (*p == '\n') {
220	++p;
221	}
222	break;
223
224	case '\n':
225	// bump down to the next line
226	++row;
227	col = 0;
228
229	// Eat the character
230	++p;
231
232	// Check for \n\r sequence, and treat this as a single
233	// character. (Yes, this bizarre thing does occur still
234	// on some arcane platforms...)
235	if (*p == '\r') {
236	++p;
237	}
238	break;
239
240	case '\t':
241	// Eat the character
242	++p;
243
244	// Skip to next tab stop
245	col = (col / tabsize + 1) * tabsize;
246	break;
247
248	case TIXML_UTF_LEAD_0:
249	if ( encoding == TIXML_ENCODING_UTF8 )
250	{
251	if ( (p+1) && (p+2) )
252	{
253	// In these cases, don't advance the column. These are
254	// 0-width spaces.
255	if ( (p+1)==TIXML_UTF_LEAD_1 && (p+2)==TIXML_UTF_LEAD_2 )
256	p += 3;
257	else if ( (p+1)==(char)(0xbf) && (p+2)==(char)(0xbe) )
258	p += 3;
259	else if ( (p+1)==(char)(0xbf) && (p+2)==(char)(0xbf) )
260	p += 3;
261	else
262	{ p +=3; ++col; } // A normal character.
263	}
264	}
265	else
266	{
267	++p;
268	++col;
269	}
270	break;
271
272	default:
273	if ( encoding == TIXML_ENCODING_UTF8 )
274	{
275	// Eat the 1 to 4 byte utf8 character.
276	int step = TiXmlBase::utf8ByteTable[((unsigned char)p)];
277	if ( step == 0 )
278	step = 1; // Error case from bad encoding, but handle gracefully.
279	p += step;
280
281	// Just advance one column, of course.
282	++col;
283	}
284	else
285	{
286	++p;
287	++col;
288	}
289	break;
290	}
291	}
292	cursor.row = row;
293	cursor.col = col;
294	assert( cursor.row >= -1 );
295	assert( cursor.col >= -1 );
296	stamp = p;
297	assert( stamp );
298	}
299
300
301	const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
302	{
303	if ( !p \|\| !*p )
304	{
305	return 0;
306	}
307	if ( encoding == TIXML_ENCODING_UTF8 )
308	{
309	while ( *p )
310	{
311	// Skip the stupid Microsoft UTF-8 Byte order marks
312	if ( *(p+0)==TIXML_UTF_LEAD_0
313	&& *(p+1)==TIXML_UTF_LEAD_1
314	&& *(p+2)==TIXML_UTF_LEAD_2 )
315	{
316	p += 3;
317	continue;
318	}
319	else if(*(p+0)==TIXML_UTF_LEAD_0
320	&& *(p+1)==(const char) 0xbf
321	&& *(p+2)==(const char) 0xbe )
322	{
323	p += 3;
324	continue;
325	}
326	else if(*(p+0)==TIXML_UTF_LEAD_0
327	&& *(p+1)==(const char) 0xbf
328	&& *(p+2)==(const char) 0xbf )
329	{
330	p += 3;
331	continue;
332	}
333
334	if ( IsWhiteSpace( p ) \|\| p == '\n' \|\| *p =='\r' ) // Still using old rules for white space.
335	++p;
336	else
337	break;
338	}
339	}
340	else
341	{
342	while ( p && IsWhiteSpace( p ) \|\| p == '\n' \|\| p =='\r' )
343	++p;
344	}
345
346	return p;
347	}
348
349	#ifdef TIXML_USE_STL
350	/static/ bool TiXmlBase::StreamWhiteSpace( TIXML_ISTREAM * in, TIXML_STRING * tag )
351	{
352	for( ;; )
353	{
354	if ( !in->good() ) return false;
355
356	int c = in->peek();
357	// At this scope, we can't get to a document. So fail silently.
358	if ( !IsWhiteSpace( c ) \|\| c <= 0 )
359	return true;
360
361	*tag += (char) in->get();
362	}
363	}
364
365	/static/ bool TiXmlBase::StreamTo( TIXML_ISTREAM * in, int character, TIXML_STRING * tag )
366	{
367	//assert( character > 0 && character < 128 ); // else it won't work in utf-8
368	while ( in->good() )
369	{
370	int c = in->peek();
371	if ( c == character )
372	return true;
373	if ( c <= 0 ) // Silent failure: can't get document at this scope
374	return false;
375
376	in->get();
377	*tag += (char) c;
378	}
379	return false;
380	}
381	#endif
382
383	const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding )
384	{
385	*name = "";
386	assert( p );
387
388	// Names start with letters or underscores.
389	// Of course, in unicode, tinyxml has no idea what a letter is. The
390	// algorithm is generous.
391	//
392	// After that, they can be letters, underscores, numbers,
393	// hyphens, or colons. (Colons are valid ony for namespaces,
394	// but tinyxml can't tell namespaces from names.)
395	if ( p && *p
396	&& ( IsAlpha( (unsigned char) p, encoding ) \|\| p == '_' ) )
397	{
398	while( p && *p
399	&& ( IsAlphaNum( (unsigned char ) *p, encoding )
400	\|\| *p == '_'
401	\|\| *p == '-'
402	\|\| *p == '.'
403	\|\| *p == ':' ) )
404	{
405	(name) += p;
406	++p;
407	}
408	return p;
409	}
410	return 0;
411	}
412
413	const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding )
414	{
415	// Presume an entity, and pull it out.
416	TIXML_STRING ent;
417	int i;
418	*length = 0;
419
420	if ( (p+1) && (p+1) == '#' && *(p+2) )
421	{
422	unsigned long ucs = 0;
423	//*ME: warning C4244: convert '__w64 int' to 'unsigned'
424	//*ME: Use size_t instead of unsigned (pointer-arithmetic)
425	size_t delta = 0;
426	unsigned mult = 1;
427
428	if ( *(p+2) == 'x' )
429	{
430	// Hexadecimal.
431	if ( !*(p+3) ) return 0;
432
433	const char* q = p+3;
434	q = strchr( q, ';' );
435
436	if ( !q \|\| !*q ) return 0;
437
438	delta = q-p;
439	--q;
440
441	while ( *q != 'x' )
442	{
443	if ( q >= '0' && q <= '9' )
444	ucs += mult * (*q - '0');
445	else if ( q >= 'a' && q <= 'f' )
446	ucs += mult * (*q - 'a' + 10);
447	else if ( q >= 'A' && q <= 'F' )
448	ucs += mult * (*q - 'A' + 10 );
449	else
450	return 0;
451	mult *= 16;
452	--q;
453	}
454	}
455	else
456	{
457	// Decimal.
458	if ( !*(p+2) ) return 0;
459
460	const char* q = p+2;
461	q = strchr( q, ';' );
462
463	if ( !q \|\| !*q ) return 0;
464
465	delta = q-p;
466	--q;
467
468	while ( *q != '#' )
469	{
470	if ( q >= '0' && q <= '9' )
471	ucs += mult * (*q - '0');
472	else
473	return 0;
474	mult *= 10;
475	--q;
476	}
477	}
478	if ( encoding == TIXML_ENCODING_UTF8 )
479	{
480	// convert the UCS to UTF-8
481	ConvertUTF32ToUTF8( ucs, value, length );
482	}
483	else
484	{
485	*value = (char)ucs;
486	*length = 1;
487	}
488	return p + delta + 1;
489	}
490
491	// Now try to match it.
492	for( i=0; i<NUM_ENTITY; ++i )
493	{
494	if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
495	{
496	assert( strlen( entity[i].str ) == entity[i].strLength );
497	*value = entity[i].chr;
498	*length = 1;
499	return ( p + entity[i].strLength );
500	}
501	}
502
503	// So it wasn't an entity, its unrecognized, or something like that.
504	value = p; // Don't put back the last one, since we return it!
505	return p+1;
506	}
507
508
509	bool TiXmlBase::StringEqual( const char* p,
510	const char* tag,
511	bool ignoreCase,
512	TiXmlEncoding encoding )
513	{
514	assert( p );
515	assert( tag );
516	if ( !p \|\| !*p )
517	{
518	assert( 0 );
519	return false;
520	}
521
522	const char* q = p;
523
524	if ( ignoreCase )
525	{
526	while ( q && tag && ToLower( q, encoding ) == ToLower( tag, encoding ) )
527	{
528	++q;
529	++tag;
530	}
531
532	if ( *tag == 0 )
533	return true;
534	}
535	else
536	{
537	while ( q && tag && q == tag )
538	{
539	++q;
540	++tag;
541	}
542
543	if ( *tag == 0 ) // Have we found the end of the tag, and everything equal?
544	return true;
545	}
546	return false;
547	}
548
549	const char* TiXmlBase::ReadText( const char* p,
550	TIXML_STRING * text,
551	bool trimWhiteSpace,
552	const char* endTag,
553	bool caseInsensitive,
554	TiXmlEncoding encoding )
555	{
556	*text = "";
557	if ( !trimWhiteSpace // certain tags always keep whitespace
558	\|\| !condenseWhiteSpace ) // if true, whitespace is always kept
559	{
560	// Keep all the white space.
561	while ( p && *p
562	&& !StringEqual( p, endTag, caseInsensitive, encoding )
563	)
564	{
565	int len;
566	char cArr[4] = { 0, 0, 0, 0 };
567	p = GetChar( p, cArr, &len, encoding );
568	text->append( cArr, len );
569	}
570	}
571	else
572	{
573	bool whitespace = false;
574
575	// Remove leading white space:
576	p = SkipWhiteSpace( p, encoding );
577	while ( p && *p
578	&& !StringEqual( p, endTag, caseInsensitive, encoding ) )
579	{
580	if ( p == '\r' \|\| p == '\n' )
581	{
582	whitespace = true;
583	++p;
584	}
585	else if ( IsWhiteSpace( *p ) )
586	{
587	whitespace = true;
588	++p;
589	}
590	else
591	{
592	// If we've found whitespace, add it before the
593	// new character. Any whitespace just becomes a space.
594	if ( whitespace )
595	{
596	(*text) += ' ';
597	whitespace = false;
598	}
599	int len;
600	char cArr[4] = { 0, 0, 0, 0 };
601	p = GetChar( p, cArr, &len, encoding );
602	if ( len == 1 )
603	(*text) += cArr[0]; // more efficient
604	else
605	text->append( cArr, len );
606	}
607	}
608	}
609	return p + strlen( endTag );
610	}
611
612	#ifdef TIXML_USE_STL
613
614	void TiXmlDocument::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
615	{
616	// The basic issue with a document is that we don't know what we're
617	// streaming. Read something presumed to be a tag (and hope), then
618	// identify it, and call the appropriate stream method on the tag.
619	//
620	// This "pre-streaming" will never read the closing ">" so the
621	// sub-tag can orient itself.
622
623	if ( !StreamTo( in, '<', tag ) )
624	{
625	SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
626	return;
627	}
628
629	while ( in->good() )
630	{
631	int tagIndex = (int) tag->length();
632	while ( in->good() && in->peek() != '>' )
633	{
634	int c = in->get();
635	if ( c <= 0 )
636	{
637	SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
638	break;
639	}
640	(*tag) += (char) c;
641	}
642
643	if ( in->good() )
644	{
645	// We now have something we presume to be a node of
646	// some sort. Identify it, and call the node to
647	// continue streaming.
648	TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );
649
650	if ( node )
651	{
652	node->StreamIn( in, tag );
653	bool isElement = node->ToElement() != 0;
654	delete node;
655	node = 0;
656
657	// If this is the root element, we're done. Parsing will be
658	// done by the >> operator.
659	if ( isElement )
660	{
661	return;
662	}
663	}
664	else
665	{
666	SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
667	return;
668	}
669	}
670	}
671	// We should have returned sooner.
672	SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
673	}
674
675	#endif
676
677	const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding )
678	{
679	ClearError();
680
681	// Parse away, at the document level. Since a document
682	// contains nothing but other tags, most of what happens
683	// here is skipping white space.
684	if ( !p \|\| !*p )
685	{
686	SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
687	return 0;
688	}
689
690	// Note that, for a document, this needs to come
691	// before the while space skip, so that parsing
692	// starts from the pointer we are given.
693	location.Clear();
694	if ( prevData )
695	{
696	location.row = prevData->cursor.row;
697	location.col = prevData->cursor.col;
698	}
699	else
700	{
701	location.row = 0;
702	location.col = 0;
703	}
704	TiXmlParsingData data( p, TabSize(), location.row, location.col );
705	location = data.Cursor();
706
707	if ( encoding == TIXML_ENCODING_UNKNOWN )
708	{
709	// Check for the Microsoft UTF-8 lead bytes.
710	if ( (p+0) && (p+0) == TIXML_UTF_LEAD_0
711	&& (p+1) && (p+1) == TIXML_UTF_LEAD_1
712	&& (p+2) && (p+2) == TIXML_UTF_LEAD_2 )
713	{
714	encoding = TIXML_ENCODING_UTF8;
715	}
716	}
717
718	p = SkipWhiteSpace( p, encoding );
719	if ( !p )
720	{
721	SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
722	return 0;
723	}
724
725	while ( p && *p )
726	{
727	TiXmlNode* node = Identify( p, encoding );
728	if ( node )
729	{
730	p = node->Parse( p, &data, encoding );
731	LinkEndChild( node );
732	}
733	else
734	{
735	break;
736	}
737
738	// Did we get encoding info?
739	if ( encoding == TIXML_ENCODING_UNKNOWN
740	&& node->ToDeclaration() )
741	{
742	TiXmlDeclaration* dec = node->ToDeclaration();
743	const char* enc = dec->Encoding();
744	assert( enc );
745
746	if ( *enc == 0 )
747	encoding = TIXML_ENCODING_UTF8;
748	else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
749	encoding = TIXML_ENCODING_UTF8;
750	else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
751	encoding = TIXML_ENCODING_UTF8; // incorrect, but be nice
752	else
753	encoding = TIXML_ENCODING_LEGACY;
754	}
755
756	p = SkipWhiteSpace( p, encoding );
757	}
758
759	// Was this empty?
760	if ( !firstChild ) {
761	SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding );
762	return 0;
763	}
764
765	// All is well.
766	return p;
767	}
768
769	void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding )
770	{
771	// The first error in a chain is more accurate - don't set again!
772	if ( error )
773	return;
774
775	assert( err > 0 && err < TIXML_ERROR_STRING_COUNT );
776	error = true;
777	errorId = err;
778	errorDesc = errorString[ errorId ];
779
780	errorLocation.Clear();
781	if ( pError && data )
782	{
783	//TiXmlParsingData data( pError, prevData );
784	data->Stamp( pError, encoding );
785	errorLocation = data->Cursor();
786	}
787	}
788
789
790	TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding )
791	{
792	TiXmlNode* returnNode = 0;
793
794	p = SkipWhiteSpace( p, encoding );
795	if( !p \|\| !p \|\| p != '<' )
796	{
797	return 0;
798	}
799
800	TiXmlDocument* doc = GetDocument();
801	p = SkipWhiteSpace( p, encoding );
802
803	if ( !p \|\| !*p )
804	{
805	return 0;
806	}
807
808	// What is this thing?
809	// - Elements start with a letter or underscore, but xml is reserved.
810	// - Comments: <!--
811	// - Decleration: <?xml
812	// - Everthing else is unknown to tinyxml.
813	//
814
815	const char* xmlHeader = { "<?xml" };
816	const char* commentHeader = { "<!--" };
817	const char* dtdHeader = { "<!" };
818
819	if ( StringEqual( p, xmlHeader, true, encoding ) )
820	{
821	#ifdef DEBUG_PARSER
822	TIXML_LOG( "XML parsing Declaration\n" );
823	#endif
824	returnNode = new TiXmlDeclaration();
825	}
826	else if ( StringEqual( p, commentHeader, false, encoding ) )
827	{
828	#ifdef DEBUG_PARSER
829	TIXML_LOG( "XML parsing Comment\n" );
830	#endif
831	returnNode = new TiXmlComment();
832	}
833	else if ( StringEqual( p, dtdHeader, false, encoding ) )
834	{
835	#ifdef DEBUG_PARSER
836	TIXML_LOG( "XML parsing Unknown(1)\n" );
837	#endif
838	returnNode = new TiXmlUnknown();
839	}
840	else if ( IsAlpha( *(p+1), encoding )
841	\|\| *(p+1) == '_' )
842	{
843	#ifdef DEBUG_PARSER
844	TIXML_LOG( "XML parsing Element\n" );
845	#endif
846	returnNode = new TiXmlElement( "" );
847	}
848	else
849	{
850	#ifdef DEBUG_PARSER
851	TIXML_LOG( "XML parsing Unknown(2)\n" );
852	#endif
853	returnNode = new TiXmlUnknown();
854	}
855
856	if ( returnNode )
857	{
858	// Set the parent, so it can report errors
859	returnNode->parent = this;
860	}
861	else
862	{
863	if ( doc )
864	doc->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, TIXML_ENCODING_UNKNOWN );
865	}
866	return returnNode;
867	}
868
869	#ifdef TIXML_USE_STL
870
871	void TiXmlElement::StreamIn (TIXML_ISTREAM * in, TIXML_STRING * tag)
872	{
873	// We're called with some amount of pre-parsing. That is, some of "this"
874	// element is in "tag". Go ahead and stream to the closing ">"
875	while( in->good() )
876	{
877	int c = in->get();
878	if ( c <= 0 )
879	{
880	TiXmlDocument* document = GetDocument();
881	if ( document )
882	document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
883	return;
884	}
885	(*tag) += (char) c ;
886
887	if ( c == '>' )
888	break;
889	}
890
891	if ( tag->length() < 3 ) return;
892
893	// Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
894	// If not, identify and stream.
895
896	if ( tag->at( tag->length() - 1 ) == '>'
897	&& tag->at( tag->length() - 2 ) == '/' )
898	{
899	// All good!
900	return;
901	}
902	else if ( tag->at( tag->length() - 1 ) == '>' )
903	{
904	// There is more. Could be:
905	// text
906	// closing tag
907	// another node.
908	for ( ;; )
909	{
910	StreamWhiteSpace( in, tag );
911
912	// Do we have text?
913	if ( in->good() && in->peek() != '<' )
914	{
915	// Yep, text.
916	TiXmlText text( "" );
917	text.StreamIn( in, tag );
918
919	// What follows text is a closing tag or another node.
920	// Go around again and figure it out.
921	continue;
922	}
923
924	// We now have either a closing tag...or another node.
925	// We should be at a "<", regardless.
926	if ( !in->good() ) return;
927	assert( in->peek() == '<' );
928	int tagIndex = tag->length();
929
930	bool closingTag = false;
931	bool firstCharFound = false;
932
933	for( ;; )
934	{
935	if ( !in->good() )
936	return;
937
938	int c = in->peek();
939	if ( c <= 0 )
940	{
941	TiXmlDocument* document = GetDocument();
942	if ( document )
943	document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
944	return;
945	}
946
947	if ( c == '>' )
948	break;
949
950	*tag += (char) c;
951	in->get();
952
953	if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
954	{
955	firstCharFound = true;
956	if ( c == '/' )
957	closingTag = true;
958	}
959	}
960	// If it was a closing tag, then read in the closing '>' to clean up the input stream.
961	// If it was not, the streaming will be done by the tag.
962	if ( closingTag )
963	{
964	if ( !in->good() )
965	return;
966
967	int c = in->get();
968	if ( c <= 0 )
969	{
970	TiXmlDocument* document = GetDocument();
971	if ( document )
972	document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
973	return;
974	}
975	assert( c == '>' );
976	*tag += (char) c;
977
978	// We are done, once we've found our closing tag.
979	return;
980	}
981	else
982	{
983	// If not a closing tag, id it, and stream.
984	const char* tagloc = tag->c_str() + tagIndex;
985	TiXmlNode* node = Identify( tagloc, TIXML_DEFAULT_ENCODING );
986	if ( !node )
987	return;
988	node->StreamIn( in, tag );
989	delete node;
990	node = 0;
991
992	// No return: go around from the beginning: text, closing tag, or node.
993	}
994	}
995	}
996	}
997	#endif
998
999	const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1000	{
1001	p = SkipWhiteSpace( p, encoding );
1002	TiXmlDocument* document = GetDocument();
1003
1004	if ( !p \|\| !*p )
1005	{
1006	if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding );
1007	return 0;
1008	}
1009
1010	// TiXmlParsingData data( p, prevData );
1011	if ( data )
1012	{
1013	data->Stamp( p, encoding );
1014	location = data->Cursor();
1015	}
1016
1017	if ( *p != '<' )
1018	{
1019	if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding );
1020	return 0;
1021	}
1022
1023	p = SkipWhiteSpace( p+1, encoding );
1024
1025	// Read the name.
1026	const char* pErr = p;
1027
1028	p = ReadName( p, &value, encoding );
1029	if ( !p \|\| !*p )
1030	{
1031	if ( document ) document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding );
1032	return 0;
1033	}
1034
1035	TIXML_STRING endTag ("</");
1036	endTag += value;
1037	endTag += ">";
1038
1039	// Check for and read attributes. Also look for an empty
1040	// tag or an end tag.
1041	while ( p && *p )
1042	{
1043	pErr = p;
1044	p = SkipWhiteSpace( p, encoding );
1045	if ( !p \|\| !*p )
1046	{
1047	if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1048	return 0;
1049	}
1050	if ( *p == '/' )
1051	{
1052	++p;
1053	// Empty tag.
1054	if ( *p != '>' )
1055	{
1056	if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding );
1057	return 0;
1058	}
1059	return (p+1);
1060	}
1061	else if ( *p == '>' )
1062	{
1063	// Done with attributes (if there were any.)
1064	// Read the value -- which can include other
1065	// elements -- read the end tag, and return.
1066	++p;
1067	p = ReadValue( p, data, encoding ); // Note this is an Element method, and will set the error if one happens.
1068	if ( !p \|\| !*p )
1069	return 0;
1070
1071	// We should find the end tag now
1072	if ( StringEqual( p, endTag.c_str(), false, encoding ) )
1073	{
1074	p += endTag.length();
1075	return p;
1076	}
1077	else
1078	{
1079	if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
1080	return 0;
1081	}
1082	}
1083	else
1084	{
1085	// Try to read an attribute:
1086	TiXmlAttribute* attrib = new TiXmlAttribute();
1087	if ( !attrib )
1088	{
1089	if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, pErr, data, encoding );
1090	return 0;
1091	}
1092
1093	attrib->SetDocument( document );
1094	const char* pErr = p;
1095	p = attrib->Parse( p, data, encoding );
1096
1097	if ( !p \|\| !*p )
1098	{
1099	if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
1100	delete attrib;
1101	return 0;
1102	}
1103
1104	// Handle the strange case of double attributes:
1105	TiXmlAttribute* node = attributeSet.Find( attrib->Name() );
1106	if ( node )
1107	{
1108	node->SetValue( attrib->Value() );
1109	delete attrib;
1110	return 0;
1111	}
1112
1113	attributeSet.Add( attrib );
1114	}
1115	}
1116	return p;
1117	}
1118
1119
1120	const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1121	{
1122	TiXmlDocument* document = GetDocument();
1123
1124	const char* pWithWhiteSpace = p;
1125	// Read in text and elements in any order.
1126	p = SkipWhiteSpace( p, encoding );
1127	while ( p && *p )
1128	{
1129	if ( *p != '<' )
1130	{
1131	// Take what we have, make a text element.
1132	TiXmlText* textNode = new TiXmlText( "" );
1133
1134	if ( !textNode )
1135	{
1136	if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, encoding );
1137	return 0;
1138	}
1139
1140	if ( TiXmlBase::IsWhiteSpaceCondensed() )
1141	{
1142	p = textNode->Parse( p, data, encoding );
1143	}
1144	else
1145	{
1146	// Special case: we want to keep the white space
1147	// so that leading spaces aren't removed.
1148	p = textNode->Parse( pWithWhiteSpace, data, encoding );
1149	}
1150
1151	if ( !textNode->Blank() )
1152	LinkEndChild( textNode );
1153	else
1154	delete textNode;
1155	}
1156	else
1157	{
1158	// We hit a '<'
1159	// Have we hit a new element or an end tag?
1160	if ( StringEqual( p, "</", false, encoding ) )
1161	{
1162	return p;
1163	}
1164	else
1165	{
1166	TiXmlNode* node = Identify( p, encoding );
1167	if ( node )
1168	{
1169	p = node->Parse( p, data, encoding );
1170	LinkEndChild( node );
1171	}
1172	else
1173	{
1174	return 0;
1175	}
1176	}
1177	}
1178	p = SkipWhiteSpace( p, encoding );
1179	}
1180
1181	if ( !p )
1182	{
1183	if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding );
1184	}
1185	return p;
1186	}
1187
1188
1189	#ifdef TIXML_USE_STL
1190	void TiXmlUnknown::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
1191	{
1192	while ( in->good() )
1193	{
1194	int c = in->get();
1195	if ( c <= 0 )
1196	{
1197	TiXmlDocument* document = GetDocument();
1198	if ( document )
1199	document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1200	return;
1201	}
1202	(*tag) += (char) c;
1203
1204	if ( c == '>' )
1205	{
1206	// All is well.
1207	return;
1208	}
1209	}
1210	}
1211	#endif
1212
1213
1214	const char* TiXmlUnknown::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1215	{
1216	TiXmlDocument* document = GetDocument();
1217	p = SkipWhiteSpace( p, encoding );
1218
1219	// TiXmlParsingData data( p, prevData );
1220	if ( data )
1221	{
1222	data->Stamp( p, encoding );
1223	location = data->Cursor();
1224	}
1225	if ( !p \|\| !p \|\| p != '<' )
1226	{
1227	if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding );
1228	return 0;
1229	}
1230	++p;
1231	value = "";
1232
1233	while ( p && p && p != '>' )
1234	{
1235	value += *p;
1236	++p;
1237	}
1238
1239	if ( !p )
1240	{
1241	if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding );
1242	}
1243	if ( *p == '>' )
1244	return p+1;
1245	return p;
1246	}
1247
1248	#ifdef TIXML_USE_STL
1249	void TiXmlComment::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
1250	{
1251	while ( in->good() )
1252	{
1253	int c = in->get();
1254	if ( c <= 0 )
1255	{
1256	TiXmlDocument* document = GetDocument();
1257	if ( document )
1258	document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1259	return;
1260	}
1261
1262	(*tag) += (char) c;
1263
1264	if ( c == '>'
1265	&& tag->at( tag->length() - 2 ) == '-'
1266	&& tag->at( tag->length() - 3 ) == '-' )
1267	{
1268	// All is well.
1269	return;
1270	}
1271	}
1272	}
1273	#endif
1274
1275
1276	const char* TiXmlComment::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1277	{
1278	TiXmlDocument* document = GetDocument();
1279	value = "";
1280
1281	p = SkipWhiteSpace( p, encoding );
1282
1283	// TiXmlParsingData data( p, prevData );
1284	if ( data )
1285	{
1286	data->Stamp( p, encoding );
1287	location = data->Cursor();
1288	}
1289	const char* startTag = "<!--";
1290	const char* endTag = "-->";
1291
1292	if ( !StringEqual( p, startTag, false, encoding ) )
1293	{
1294	document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
1295	return 0;
1296	}
1297	p += strlen( startTag );
1298	p = ReadText( p, &value, false, endTag, false, encoding );
1299	return p;
1300	}
1301
1302
1303	const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1304	{
1305	p = SkipWhiteSpace( p, encoding );
1306	if ( !p \|\| !*p ) return 0;
1307
1308	int tabsize = 4;
1309	if ( document )
1310	tabsize = document->TabSize();
1311
1312	// TiXmlParsingData data( p, prevData );
1313	if ( data )
1314	{
1315	data->Stamp( p, encoding );
1316	location = data->Cursor();
1317	}
1318	// Read the name, the '=' and the value.
1319	const char* pErr = p;
1320	p = ReadName( p, &name, encoding );
1321	if ( !p \|\| !*p )
1322	{
1323	if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1324	return 0;
1325	}
1326	p = SkipWhiteSpace( p, encoding );
1327	if ( !p \|\| !p \|\| p != '=' )
1328	{
1329	if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1330	return 0;
1331	}
1332
1333	++p; // skip '='
1334	p = SkipWhiteSpace( p, encoding );
1335	if ( !p \|\| !*p )
1336	{
1337	if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1338	return 0;
1339	}
1340
1341	const char* end;
1342
1343	if ( *p == '\'' )
1344	{
1345	++p;
1346	end = "\'";
1347	p = ReadText( p, &value, false, end, false, encoding );
1348	}
1349	else if ( *p == '"' )
1350	{
1351	++p;
1352	end = "\"";
1353	p = ReadText( p, &value, false, end, false, encoding );
1354	}
1355	else
1356	{
1357	// All attribute values should be in single or double quotes.
1358	// But this is such a common error that the parser will try
1359	// its best, even without them.
1360	value = "";
1361	while ( p && *p // existence
1362	&& !IsWhiteSpace( p ) && p != '\n' && *p != '\r' // whitespace
1363	&& p != '/' && p != '>' ) // tag end
1364	{
1365	value += *p;
1366	++p;
1367	}
1368	}
1369	return p;
1370	}
1371
1372	#ifdef TIXML_USE_STL
1373	void TiXmlText::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
1374	{
1375	while ( in->good() )
1376	{
1377	int c = in->peek();
1378	if ( c == '<' )
1379	return;
1380	if ( c <= 0 )
1381	{
1382	TiXmlDocument* document = GetDocument();
1383	if ( document )
1384	document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1385	return;
1386	}
1387
1388	(*tag) += (char) c;
1389	in->get();
1390	}
1391	}
1392	#endif
1393
1394	const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1395	{
1396	value = "";
1397	// TiXmlParsingData data( p, prevData );
1398	if ( data )
1399	{
1400	data->Stamp( p, encoding );
1401	location = data->Cursor();
1402	}
1403	bool ignoreWhite = true;
1404
1405	const char* end = "<";
1406	p = ReadText( p, &value, ignoreWhite, end, false, encoding );
1407	if ( p )
1408	return p-1; // don't truncate the '<'
1409	return 0;
1410	}
1411
1412	#ifdef TIXML_USE_STL
1413	void TiXmlDeclaration::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
1414	{
1415	while ( in->good() )
1416	{
1417	int c = in->get();
1418	if ( c <= 0 )
1419	{
1420	TiXmlDocument* document = GetDocument();
1421	if ( document )
1422	document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1423	return;
1424	}
1425	(*tag) += (char) c;
1426
1427	if ( c == '>' )
1428	{
1429	// All is well.
1430	return;
1431	}
1432	}
1433	}
1434	#endif
1435
1436	const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding )
1437	{
1438	p = SkipWhiteSpace( p, _encoding );
1439	// Find the beginning, find the end, and look for
1440	// the stuff in-between.
1441	TiXmlDocument* document = GetDocument();
1442	if ( !p \|\| !*p \|\| !StringEqual( p, "<?xml", true, _encoding ) )
1443	{
1444	if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding );
1445	return 0;
1446	}
1447	// TiXmlParsingData data( p, prevData );
1448	if ( data )
1449	{
1450	data->Stamp( p, _encoding );
1451	location = data->Cursor();
1452	}
1453	p += 5;
1454
1455	version = "";
1456	encoding = "";
1457	standalone = "";
1458
1459	while ( p && *p )
1460	{
1461	if ( *p == '>' )
1462	{
1463	++p;
1464	return p;
1465	}
1466
1467	p = SkipWhiteSpace( p, _encoding );
1468	if ( StringEqual( p, "version", true, _encoding ) )
1469	{
1470	TiXmlAttribute attrib;
1471	p = attrib.Parse( p, data, _encoding );
1472	version = attrib.Value();
1473	}
1474	else if ( StringEqual( p, "encoding", true, _encoding ) )
1475	{
1476	TiXmlAttribute attrib;
1477	p = attrib.Parse( p, data, _encoding );
1478	encoding = attrib.Value();
1479	}
1480	else if ( StringEqual( p, "standalone", true, _encoding ) )
1481	{
1482	TiXmlAttribute attrib;
1483	p = attrib.Parse( p, data, _encoding );
1484	standalone = attrib.Value();
1485	}
1486	else
1487	{
1488	// Read over whatever it is.
1489	while( p && p && p != '>' && !IsWhiteSpace( *p ) )
1490	++p;
1491	}
1492	}
1493	return 0;
1494	}
1495
1496	bool TiXmlText::Blank() const
1497	{
1498	for ( unsigned i=0; i<value.length(); i++ )
1499	if ( !IsWhiteSpace( value[i] ) )
1500	return false;
1501	return true;
1502	}
1503

Note: See TracBrowser for help on using the repository browser.

Download in other formats: