Context Navigation

tinyxmlparser.cc @ 9929

Last change on this file since 9929 was 5819, checked in by bensch, 19 years ago
orxonox/trunk: merged branches world_entities to trunk again merged with command svn merge -r5795:HEAD branches/world_entities/ trunk/ no conflicts (what a wonder)
File size: 33.9 KB

Line
1	/*
2	www.sourceforge.net/projects/tinyxml
3	Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)
4
5	This software is provided 'as-is', without any express or implied
6	warranty. In no event will the authors be held liable for any
7	damages arising from the use of this software.
8
9	Permission is granted to anyone to use this software for any
10	purpose, including commercial applications, and to alter it and
11	redistribute it freely, subject to the following restrictions:
12
13	1. The origin of this software must not be misrepresented; you must
14	not claim that you wrote the original software. If you use this
15	software in a product, an acknowledgment in the product documentation
16	would be appreciated but is not required.
17
18	2. Altered source versions must be plainly marked as such, and
19	must not be misrepresented as being the original software.
20
21	3. This notice may not be removed or altered from any source
22	distribution.
23	*/
24
25	#include "tinyxml.h"
26	#include <ctype.h>
27	#include <stddef.h>
28
29	//#define DEBUG_PARSER
30
31	// Note tha "PutString" hardcodes the same list. This
32	// is less flexible than it appears. Changing the entries
33	// or order will break putstring.
34	TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] =
35	{
36	{ "&", 5, '&' },
37	{ "<", 4, '<' },
38	{ ">", 4, '>' },
39	{ """, 6, '\"' },
40	{ "'", 6, '\'' }
41	};
42
// General Unicode / UTF-8 background:
//		http://www.unicode.org/faq/utf_bom.html
// That page is also the basis of the lead-byte table defined below, which
// gives the number of bytes in a sequence from its lead byte. Invalid lead
// bytes are given a length of 1 -- the result will be junk, but the data is
// passed through as much as possible.
//
// Three-byte sequences that start with 0xef and need special care:
//		ef bb bf	(Microsoft "lead bytes")
//		ef bf be
//		ef bf bf

const unsigned char TIXML_UTF_LEAD_0 = 0xEF;
const unsigned char TIXML_UTF_LEAD_1 = 0xBB;
const unsigned char TIXML_UTF_LEAD_2 = 0xBF;
56
57	const int TiXmlBase::utf8ByteTable[256] =
58	{
59	// 0 1 2 3 4 5 6 7 8 9 a b c d e f
60	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00
61	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10
62	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20
63	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30
64	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40
65	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50
66	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60
67	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70 End of ASCII range
68	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x80 0x80 to 0xc1 invalid
69	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x90
70	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xa0
71	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xb0
72	1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xc0 0xc2 to 0xdf 2 byte
73	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xd0
74	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xe0 0xe0 to 0xef 3 byte
75	4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 // 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
76	};
77
78
79	void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length )
80	{
81	const unsigned long BYTE_MASK = 0xBF;
82	const unsigned long BYTE_MARK = 0x80;
83	const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
84
85	if (input < 0x80)
86	*length = 1;
87	else if ( input < 0x800 )
88	*length = 2;
89	else if ( input < 0x10000 )
90	*length = 3;
91	else if ( input < 0x200000 )
92	*length = 4;
93	else
94	{ *length = 0; return; } // This code won't covert this correctly anyway.
95
96	output += *length;
97
98	// Scary scary fall throughs.
99	switch (*length)
100	{
101	case 4:
102	--output;
103	*output = (char)((input \| BYTE_MARK) & BYTE_MASK);
104	input >>= 6;
105	case 3:
106	--output;
107	*output = (char)((input \| BYTE_MARK) & BYTE_MASK);
108	input >>= 6;
109	case 2:
110	--output;
111	*output = (char)((input \| BYTE_MARK) & BYTE_MASK);
112	input >>= 6;
113	case 1:
114	--output;
115	output = (char)(input \| FIRST_BYTE_MARK[length]);
116	}
117	}
118
119
120	/static/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /encoding/ )
121	{
122	// This will only work for low-ascii, everything else is assumed to be a valid
123	// letter. I'm not sure this is the best approach, but it is quite tricky trying
124	// to figure out alhabetical vs. not across encoding. So take a very
125	// conservative approach.
126
127	// if ( encoding == TIXML_ENCODING_UTF8 )
128	// {
129	if ( anyByte < 127 )
130	return isalpha( anyByte );
131	else
132	return 1; // What else to do? The unicode set is huge...get the english ones right.
133	// }
134	// else
135	// {
136	// return isalpha( anyByte );
137	// }
138	}
139
140
141	/static/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /encoding/ )
142	{
143	// This will only work for low-ascii, everything else is assumed to be a valid
144	// letter. I'm not sure this is the best approach, but it is quite tricky trying
145	// to figure out alhabetical vs. not across encoding. So take a very
146	// conservative approach.
147
148	// if ( encoding == TIXML_ENCODING_UTF8 )
149	// {
150	if ( anyByte < 127 )
151	return isalnum( anyByte );
152	else
153	return 1; // What else to do? The unicode set is huge...get the english ones right.
154	// }
155	// else
156	// {
157	// return isalnum( anyByte );
158	// }
159	}
160
161
162	class TiXmlParsingData
163	{
164	friend class TiXmlDocument;
165	public:
166	void Stamp( const char* now, TiXmlEncoding encoding );
167
168	const TiXmlCursor& Cursor() { return cursor; }
169
170	private:
171	// Only used by the document!
172	TiXmlParsingData( const char* start, int _tabsize, int row, int col )
173	{
174	assert( start );
175	stamp = start;
176	tabsize = _tabsize;
177	cursor.row = row;
178	cursor.col = col;
179	}
180
181	TiXmlCursor cursor;
182	const char* stamp;
183	int tabsize;
184	};
185
186
187	void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding )
188	{
189	assert( now );
190
191	// Do nothing if the tabsize is 0.
192	if ( tabsize < 1 )
193	{
194	return;
195	}
196
197	// Get the current row, column.
198	int row = cursor.row;
199	int col = cursor.col;
200	const char* p = stamp;
201	assert( p );
202
203	while ( p < now )
204	{
205	// Treat p as unsigned, so we have a happy compiler.
206	const unsigned char* pU = (const unsigned char*)p;
207
208	// Code contributed by Fletcher Dunn: (modified by lee)
209	switch (*pU) {
210	case 0:
211	// We should never get here, but in case we do, don't
212	// advance past the terminating null character, ever
213	return;
214
215	case '\r':
216	// bump down to the next line
217	++row;
218	col = 0;
219	// Eat the character
220	++p;
221
222	// Check for \r\n sequence, and treat this as a single character
223	if (*p == '\n') {
224	++p;
225	}
226	break;
227
228	case '\n':
229	// bump down to the next line
230	++row;
231	col = 0;
232
233	// Eat the character
234	++p;
235
236	// Check for \n\r sequence, and treat this as a single
237	// character. (Yes, this bizarre thing does occur still
238	// on some arcane platforms...)
239	if (*p == '\r') {
240	++p;
241	}
242	break;
243
244	case '\t':
245	// Eat the character
246	++p;
247
248	// Skip to next tab stop
249	col = (col / tabsize + 1) * tabsize;
250	break;
251
252	case TIXML_UTF_LEAD_0:
253	if ( encoding == TIXML_ENCODING_UTF8 )
254	{
255	if ( (p+1) && (p+2) )
256	{
257	// In these cases, don't advance the column. These are
258	// 0-width spaces.
259	if ( (pU+1)==TIXML_UTF_LEAD_1 && (pU+2)==TIXML_UTF_LEAD_2 )
260	p += 3;
261	else if ( (pU+1)==0xbfU && (pU+2)==0xbeU )
262	p += 3;
263	else if ( (pU+1)==0xbfU && (pU+2)==0xbfU )
264	p += 3;
265	else
266	{ p +=3; ++col; } // A normal character.
267	}
268	}
269	else
270	{
271	++p;
272	++col;
273	}
274	break;
275
276	default:
277	if ( encoding == TIXML_ENCODING_UTF8 )
278	{
279	// Eat the 1 to 4 byte utf8 character.
280	int step = TiXmlBase::utf8ByteTable[((unsigned char)p)];
281	if ( step == 0 )
282	step = 1; // Error case from bad encoding, but handle gracefully.
283	p += step;
284
285	// Just advance one column, of course.
286	++col;
287	}
288	else
289	{
290	++p;
291	++col;
292	}
293	break;
294	}
295	}
296	cursor.row = row;
297	cursor.col = col;
298	assert( cursor.row >= -1 );
299	assert( cursor.col >= -1 );
300	stamp = p;
301	assert( stamp );
302	}
303
304
305	const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
306	{
307	if ( !p \|\| !*p )
308	{
309	return 0;
310	}
311	if ( encoding == TIXML_ENCODING_UTF8 )
312	{
313	while ( *p )
314	{
315	const unsigned char* pU = (const unsigned char*)p;
316
317	// Skip the stupid Microsoft UTF-8 Byte order marks
318	if ( *(pU+0)==TIXML_UTF_LEAD_0
319	&& *(pU+1)==TIXML_UTF_LEAD_1
320	&& *(pU+2)==TIXML_UTF_LEAD_2 )
321	{
322	p += 3;
323	continue;
324	}
325	else if(*(pU+0)==TIXML_UTF_LEAD_0
326	&& *(pU+1)==0xbfU
327	&& *(pU+2)==0xbeU )
328	{
329	p += 3;
330	continue;
331	}
332	else if(*(pU+0)==TIXML_UTF_LEAD_0
333	&& *(pU+1)==0xbfU
334	&& *(pU+2)==0xbfU )
335	{
336	p += 3;
337	continue;
338	}
339
340	if ( IsWhiteSpace( p ) \|\| p == '\n' \|\| *p =='\r' ) // Still using old rules for white space.
341	++p;
342	else
343	break;
344	}
345	}
346	else
347	{
348	while ( p && IsWhiteSpace( p ) \|\| p == '\n' \|\| p =='\r' )
349	++p;
350	}
351
352	return p;
353	}
354
355	#ifdef TIXML_USE_STL
356	/static/ bool TiXmlBase::StreamWhiteSpace( TIXML_ISTREAM * in, TIXML_STRING * tag )
357	{
358	for( ;; )
359	{
360	if ( !in->good() ) return false;
361
362	int c = in->peek();
363	// At this scope, we can't get to a document. So fail silently.
364	if ( !IsWhiteSpace( c ) \|\| c <= 0 )
365	return true;
366
367	*tag += (char) in->get();
368	}
369	}
370
371	/static/ bool TiXmlBase::StreamTo( TIXML_ISTREAM * in, int character, TIXML_STRING * tag )
372	{
373	//assert( character > 0 && character < 128 ); // else it won't work in utf-8
374	while ( in->good() )
375	{
376	int c = in->peek();
377	if ( c == character )
378	return true;
379	if ( c <= 0 ) // Silent failure: can't get document at this scope
380	return false;
381
382	in->get();
383	*tag += (char) c;
384	}
385	return false;
386	}
387	#endif
388
389	const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding )
390	{
391	*name = "";
392	assert( p );
393
394	// Names start with letters or underscores.
395	// Of course, in unicode, tinyxml has no idea what a letter is. The
396	// algorithm is generous.
397	//
398	// After that, they can be letters, underscores, numbers,
399	// hyphens, or colons. (Colons are valid ony for namespaces,
400	// but tinyxml can't tell namespaces from names.)
401	if ( p && *p
402	&& ( IsAlpha( (unsigned char) p, encoding ) \|\| p == '_' ) )
403	{
404	while( p && *p
405	&& ( IsAlphaNum( (unsigned char ) *p, encoding )
406	\|\| *p == '_'
407	\|\| *p == '-'
408	\|\| *p == '.'
409	\|\| *p == ':' ) )
410	{
411	(name) += p;
412	++p;
413	}
414	return p;
415	}
416	return 0;
417	}
418
419	const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding )
420	{
421	// Presume an entity, and pull it out.
422	TIXML_STRING ent;
423	int i;
424	*length = 0;
425
426	if ( (p+1) && (p+1) == '#' && *(p+2) )
427	{
428	unsigned long ucs = 0;
429	ptrdiff_t delta = 0;
430	unsigned mult = 1;
431
432	if ( *(p+2) == 'x' )
433	{
434	// Hexadecimal.
435	if ( !*(p+3) ) return 0;
436
437	const char* q = p+3;
438	q = strchr( q, ';' );
439
440	if ( !q \|\| !*q ) return 0;
441
442	delta = q-p;
443	--q;
444
445	while ( *q != 'x' )
446	{
447	if ( q >= '0' && q <= '9' )
448	ucs += mult * (*q - '0');
449	else if ( q >= 'a' && q <= 'f' )
450	ucs += mult * (*q - 'a' + 10);
451	else if ( q >= 'A' && q <= 'F' )
452	ucs += mult * (*q - 'A' + 10 );
453	else
454	return 0;
455	mult *= 16;
456	--q;
457	}
458	}
459	else
460	{
461	// Decimal.
462	if ( !*(p+2) ) return 0;
463
464	const char* q = p+2;
465	q = strchr( q, ';' );
466
467	if ( !q \|\| !*q ) return 0;
468
469	delta = q-p;
470	--q;
471
472	while ( *q != '#' )
473	{
474	if ( q >= '0' && q <= '9' )
475	ucs += mult * (*q - '0');
476	else
477	return 0;
478	mult *= 10;
479	--q;
480	}
481	}
482	if ( encoding == TIXML_ENCODING_UTF8 )
483	{
484	// convert the UCS to UTF-8
485	ConvertUTF32ToUTF8( ucs, value, length );
486	}
487	else
488	{
489	*value = (char)ucs;
490	*length = 1;
491	}
492	return p + delta + 1;
493	}
494
495	// Now try to match it.
496	for( i=0; i<NUM_ENTITY; ++i )
497	{
498	if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
499	{
500	assert( strlen( entity[i].str ) == entity[i].strLength );
501	*value = entity[i].chr;
502	*length = 1;
503	return ( p + entity[i].strLength );
504	}
505	}
506
507	// So it wasn't an entity, its unrecognized, or something like that.
508	value = p; // Don't put back the last one, since we return it!
509	return p+1;
510	}
511
512
513	bool TiXmlBase::StringEqual( const char* p,
514	const char* tag,
515	bool ignoreCase,
516	TiXmlEncoding encoding )
517	{
518	assert( p );
519	assert( tag );
520	if ( !p \|\| !*p )
521	{
522	assert( 0 );
523	return false;
524	}
525
526	const char* q = p;
527
528	if ( ignoreCase )
529	{
530	while ( q && tag && ToLower( q, encoding ) == ToLower( tag, encoding ) )
531	{
532	++q;
533	++tag;
534	}
535
536	if ( *tag == 0 )
537	return true;
538	}
539	else
540	{
541	while ( q && tag && q == tag )
542	{
543	++q;
544	++tag;
545	}
546
547	if ( *tag == 0 ) // Have we found the end of the tag, and everything equal?
548	return true;
549	}
550	return false;
551	}
552
553	const char* TiXmlBase::ReadText( const char* p,
554	TIXML_STRING * text,
555	bool trimWhiteSpace,
556	const char* endTag,
557	bool caseInsensitive,
558	TiXmlEncoding encoding )
559	{
560	*text = "";
561	if ( !trimWhiteSpace // certain tags always keep whitespace
562	\|\| !condenseWhiteSpace ) // if true, whitespace is always kept
563	{
564	// Keep all the white space.
565	while ( p && *p
566	&& !StringEqual( p, endTag, caseInsensitive, encoding )
567	)
568	{
569	int len;
570	char cArr[4] = { 0, 0, 0, 0 };
571	p = GetChar( p, cArr, &len, encoding );
572	text->append( cArr, len );
573	}
574	}
575	else
576	{
577	bool whitespace = false;
578
579	// Remove leading white space:
580	p = SkipWhiteSpace( p, encoding );
581	while ( p && *p
582	&& !StringEqual( p, endTag, caseInsensitive, encoding ) )
583	{
584	if ( p == '\r' \|\| p == '\n' )
585	{
586	whitespace = true;
587	++p;
588	}
589	else if ( IsWhiteSpace( *p ) )
590	{
591	whitespace = true;
592	++p;
593	}
594	else
595	{
596	// If we've found whitespace, add it before the
597	// new character. Any whitespace just becomes a space.
598	if ( whitespace )
599	{
600	(*text) += ' ';
601	whitespace = false;
602	}
603	int len;
604	char cArr[4] = { 0, 0, 0, 0 };
605	p = GetChar( p, cArr, &len, encoding );
606	if ( len == 1 )
607	(*text) += cArr[0]; // more efficient
608	else
609	text->append( cArr, len );
610	}
611	}
612	}
613	return p + strlen( endTag );
614	}
615
616	#ifdef TIXML_USE_STL
617
618	void TiXmlDocument::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
619	{
620	// The basic issue with a document is that we don't know what we're
621	// streaming. Read something presumed to be a tag (and hope), then
622	// identify it, and call the appropriate stream method on the tag.
623	//
624	// This "pre-streaming" will never read the closing ">" so the
625	// sub-tag can orient itself.
626
627	if ( !StreamTo( in, '<', tag ) )
628	{
629	SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
630	return;
631	}
632
633	while ( in->good() )
634	{
635	int tagIndex = (int) tag->length();
636	while ( in->good() && in->peek() != '>' )
637	{
638	int c = in->get();
639	if ( c <= 0 )
640	{
641	SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
642	break;
643	}
644	(*tag) += (char) c;
645	}
646
647	if ( in->good() )
648	{
649	// We now have something we presume to be a node of
650	// some sort. Identify it, and call the node to
651	// continue streaming.
652	TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );
653
654	if ( node )
655	{
656	node->StreamIn( in, tag );
657	bool isElement = node->ToElement() != 0;
658	delete node;
659	node = 0;
660
661	// If this is the root element, we're done. Parsing will be
662	// done by the >> operator.
663	if ( isElement )
664	{
665	return;
666	}
667	}
668	else
669	{
670	SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
671	return;
672	}
673	}
674	}
675	// We should have returned sooner.
676	SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
677	}
678
679	#endif
680
681	const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding )
682	{
683	ClearError();
684
685	// Parse away, at the document level. Since a document
686	// contains nothing but other tags, most of what happens
687	// here is skipping white space.
688	if ( !p \|\| !*p )
689	{
690	SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
691	return 0;
692	}
693
694	// Note that, for a document, this needs to come
695	// before the while space skip, so that parsing
696	// starts from the pointer we are given.
697	location.Clear();
698	if ( prevData )
699	{
700	location.row = prevData->cursor.row;
701	location.col = prevData->cursor.col;
702	}
703	else
704	{
705	location.row = 0;
706	location.col = 0;
707	}
708	TiXmlParsingData data( p, TabSize(), location.row, location.col );
709	location = data.Cursor();
710
711	if ( encoding == TIXML_ENCODING_UNKNOWN )
712	{
713	// Check for the Microsoft UTF-8 lead bytes.
714	const unsigned char* pU = (const unsigned char*)p;
715	if ( (pU+0) && (pU+0) == TIXML_UTF_LEAD_0
716	&& (pU+1) && (pU+1) == TIXML_UTF_LEAD_1
717	&& (pU+2) && (pU+2) == TIXML_UTF_LEAD_2 )
718	{
719	encoding = TIXML_ENCODING_UTF8;
720	useMicrosoftBOM = true;
721	}
722	}
723
724	p = SkipWhiteSpace( p, encoding );
725	if ( !p )
726	{
727	SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
728	return 0;
729	}
730
731	while ( p && *p )
732	{
733	TiXmlNode* node = Identify( p, encoding );
734	if ( node )
735	{
736	p = node->Parse( p, &data, encoding );
737	LinkEndChild( node );
738	}
739	else
740	{
741	break;
742	}
743
744	// Did we get encoding info?
745	if ( encoding == TIXML_ENCODING_UNKNOWN
746	&& node->ToDeclaration() )
747	{
748	TiXmlDeclaration* dec = node->ToDeclaration();
749	const char* enc = dec->Encoding();
750	assert( enc );
751
752	if ( *enc == 0 )
753	encoding = TIXML_ENCODING_UTF8;
754	else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
755	encoding = TIXML_ENCODING_UTF8;
756	else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
757	encoding = TIXML_ENCODING_UTF8; // incorrect, but be nice
758	else
759	encoding = TIXML_ENCODING_LEGACY;
760	}
761
762	p = SkipWhiteSpace( p, encoding );
763	}
764
765	// Was this empty?
766	if ( !firstChild ) {
767	SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding );
768	return 0;
769	}
770
771	// All is well.
772	return p;
773	}
774
775	void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding )
776	{
777	// The first error in a chain is more accurate - don't set again!
778	if ( error )
779	return;
780
781	assert( err > 0 && err < TIXML_ERROR_STRING_COUNT );
782	error = true;
783	errorId = err;
784	errorDesc = errorString[ errorId ];
785
786	errorLocation.Clear();
787	if ( pError && data )
788	{
789	data->Stamp( pError, encoding );
790	errorLocation = data->Cursor();
791	}
792	}
793
794
795	TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding )
796	{
797	TiXmlNode* returnNode = 0;
798
799	p = SkipWhiteSpace( p, encoding );
800	if( !p \|\| !p \|\| p != '<' )
801	{
802	return 0;
803	}
804
805	TiXmlDocument* doc = GetDocument();
806	p = SkipWhiteSpace( p, encoding );
807
808	if ( !p \|\| !*p )
809	{
810	return 0;
811	}
812
813	// What is this thing?
814	// - Elements start with a letter or underscore, but xml is reserved.
815	// - Comments: <!--
816	// - Decleration: <?xml
817	// - Everthing else is unknown to tinyxml.
818	//
819
820	const char* xmlHeader = { "<?xml" };
821	const char* commentHeader = { "<!--" };
822	const char* dtdHeader = { "<!" };
823	const char* cdataHeader = { "<![CDATA[" };
824
825	if ( StringEqual( p, xmlHeader, true, encoding ) )
826	{
827	#ifdef DEBUG_PARSER
828	TIXML_LOG( "XML parsing Declaration\n" );
829	#endif
830	returnNode = new TiXmlDeclaration();
831	}
832	else if ( StringEqual( p, commentHeader, false, encoding ) )
833	{
834	#ifdef DEBUG_PARSER
835	TIXML_LOG( "XML parsing Comment\n" );
836	#endif
837	returnNode = new TiXmlComment();
838	}
839	else if ( StringEqual( p, cdataHeader, false, encoding ) )
840	{
841	#ifdef DEBUG_PARSER
842	TIXML_LOG( "XML parsing CDATA\n" );
843	#endif
844	TiXmlText* text = new TiXmlText( "" );
845	text->SetCDATA( true );
846	returnNode = text;
847	}
848	else if ( StringEqual( p, dtdHeader, false, encoding ) )
849	{
850	#ifdef DEBUG_PARSER
851	TIXML_LOG( "XML parsing Unknown(1)\n" );
852	#endif
853	returnNode = new TiXmlUnknown();
854	}
855	else if ( IsAlpha( *(p+1), encoding )
856	\|\| *(p+1) == '_' )
857	{
858	#ifdef DEBUG_PARSER
859	TIXML_LOG( "XML parsing Element\n" );
860	#endif
861	returnNode = new TiXmlElement( "" );
862	}
863	else
864	{
865	#ifdef DEBUG_PARSER
866	TIXML_LOG( "XML parsing Unknown(2)\n" );
867	#endif
868	returnNode = new TiXmlUnknown();
869	}
870
871	if ( returnNode )
872	{
873	// Set the parent, so it can report errors
874	returnNode->parent = this;
875	}
876	else
877	{
878	if ( doc )
879	doc->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, TIXML_ENCODING_UNKNOWN );
880	}
881	return returnNode;
882	}
883
884	#ifdef TIXML_USE_STL
885
886	void TiXmlElement::StreamIn (TIXML_ISTREAM * in, TIXML_STRING * tag)
887	{
888	// We're called with some amount of pre-parsing. That is, some of "this"
889	// element is in "tag". Go ahead and stream to the closing ">"
890	while( in->good() )
891	{
892	int c = in->get();
893	if ( c <= 0 )
894	{
895	TiXmlDocument* document = GetDocument();
896	if ( document )
897	document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
898	return;
899	}
900	(*tag) += (char) c ;
901
902	if ( c == '>' )
903	break;
904	}
905
906	if ( tag->length() < 3 ) return;
907
908	// Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
909	// If not, identify and stream.
910
911	if ( tag->at( tag->length() - 1 ) == '>'
912	&& tag->at( tag->length() - 2 ) == '/' )
913	{
914	// All good!
915	return;
916	}
917	else if ( tag->at( tag->length() - 1 ) == '>' )
918	{
919	// There is more. Could be:
920	// text
921	// closing tag
922	// another node.
923	for ( ;; )
924	{
925	StreamWhiteSpace( in, tag );
926
927	// Do we have text?
928	if ( in->good() && in->peek() != '<' )
929	{
930	// Yep, text.
931	TiXmlText text( "" );
932	text.StreamIn( in, tag );
933
934	// What follows text is a closing tag or another node.
935	// Go around again and figure it out.
936	continue;
937	}
938
939	// We now have either a closing tag...or another node.
940	// We should be at a "<", regardless.
941	if ( !in->good() ) return;
942	assert( in->peek() == '<' );
943	int tagIndex = (int) tag->length();
944
945	bool closingTag = false;
946	bool firstCharFound = false;
947
948	for( ;; )
949	{
950	if ( !in->good() )
951	return;
952
953	int c = in->peek();
954	if ( c <= 0 )
955	{
956	TiXmlDocument* document = GetDocument();
957	if ( document )
958	document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
959	return;
960	}
961
962	if ( c == '>' )
963	break;
964
965	*tag += (char) c;
966	in->get();
967
968	if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
969	{
970	firstCharFound = true;
971	if ( c == '/' )
972	closingTag = true;
973	}
974	}
975	// If it was a closing tag, then read in the closing '>' to clean up the input stream.
976	// If it was not, the streaming will be done by the tag.
977	if ( closingTag )
978	{
979	if ( !in->good() )
980	return;
981
982	int c = in->get();
983	if ( c <= 0 )
984	{
985	TiXmlDocument* document = GetDocument();
986	if ( document )
987	document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
988	return;
989	}
990	assert( c == '>' );
991	*tag += (char) c;
992
993	// We are done, once we've found our closing tag.
994	return;
995	}
996	else
997	{
998	// If not a closing tag, id it, and stream.
999	const char* tagloc = tag->c_str() + tagIndex;
1000	TiXmlNode* node = Identify( tagloc, TIXML_DEFAULT_ENCODING );
1001	if ( !node )
1002	return;
1003	node->StreamIn( in, tag );
1004	delete node;
1005	node = 0;
1006
1007	// No return: go around from the beginning: text, closing tag, or node.
1008	}
1009	}
1010	}
1011	}
1012	#endif
1013
1014	const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1015	{
1016	p = SkipWhiteSpace( p, encoding );
1017	TiXmlDocument* document = GetDocument();
1018
1019	if ( !p \|\| !*p )
1020	{
1021	if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding );
1022	return 0;
1023	}
1024
1025	if ( data )
1026	{
1027	data->Stamp( p, encoding );
1028	location = data->Cursor();
1029	}
1030
1031	if ( *p != '<' )
1032	{
1033	if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding );
1034	return 0;
1035	}
1036
1037	p = SkipWhiteSpace( p+1, encoding );
1038
1039	// Read the name.
1040	const char* pErr = p;
1041
1042	p = ReadName( p, &value, encoding );
1043	if ( !p \|\| !*p )
1044	{
1045	if ( document ) document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding );
1046	return 0;
1047	}
1048
1049	TIXML_STRING endTag ("</");
1050	endTag += value;
1051	endTag += ">";
1052
1053	// Check for and read attributes. Also look for an empty
1054	// tag or an end tag.
1055	while ( p && *p )
1056	{
1057	pErr = p;
1058	p = SkipWhiteSpace( p, encoding );
1059	if ( !p \|\| !*p )
1060	{
1061	if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1062	return 0;
1063	}
1064	if ( *p == '/' )
1065	{
1066	++p;
1067	// Empty tag.
1068	if ( *p != '>' )
1069	{
1070	if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding );
1071	return 0;
1072	}
1073	return (p+1);
1074	}
1075	else if ( *p == '>' )
1076	{
1077	// Done with attributes (if there were any.)
1078	// Read the value -- which can include other
1079	// elements -- read the end tag, and return.
1080	++p;
1081	p = ReadValue( p, data, encoding ); // Note this is an Element method, and will set the error if one happens.
1082	if ( !p \|\| !*p )
1083	return 0;
1084
1085	// We should find the end tag now
1086	if ( StringEqual( p, endTag.c_str(), false, encoding ) )
1087	{
1088	p += endTag.length();
1089	return p;
1090	}
1091	else
1092	{
1093	if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
1094	return 0;
1095	}
1096	}
1097	else
1098	{
1099	// Try to read an attribute:
1100	TiXmlAttribute* attrib = new TiXmlAttribute();
1101	if ( !attrib )
1102	{
1103	if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, pErr, data, encoding );
1104	return 0;
1105	}
1106
1107	attrib->SetDocument( document );
1108	const char* pErr = p;
1109	p = attrib->Parse( p, data, encoding );
1110
1111	if ( !p \|\| !*p )
1112	{
1113	if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
1114	delete attrib;
1115	return 0;
1116	}
1117
1118	// Handle the strange case of double attributes:
1119	TiXmlAttribute* node = attributeSet.Find( attrib->Name() );
1120	if ( node )
1121	{
1122	node->SetValue( attrib->Value() );
1123	delete attrib;
1124	return 0;
1125	}
1126
1127	attributeSet.Add( attrib );
1128	}
1129	}
1130	return p;
1131	}
1132
1133
1134	const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1135	{
1136	TiXmlDocument* document = GetDocument();
1137
1138	// Read in text and elements in any order.
1139	const char* pWithWhiteSpace = p;
1140	p = SkipWhiteSpace( p, encoding );
1141
1142	while ( p && *p )
1143	{
1144	if ( *p != '<' )
1145	{
1146	// Take what we have, make a text element.
1147	TiXmlText* textNode = new TiXmlText( "" );
1148
1149	if ( !textNode )
1150	{
1151	if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, encoding );
1152	return 0;
1153	}
1154
1155	if ( TiXmlBase::IsWhiteSpaceCondensed() )
1156	{
1157	p = textNode->Parse( p, data, encoding );
1158	}
1159	else
1160	{
1161	// Special case: we want to keep the white space
1162	// so that leading spaces aren't removed.
1163	p = textNode->Parse( pWithWhiteSpace, data, encoding );
1164	}
1165
1166	if ( !textNode->Blank() )
1167	LinkEndChild( textNode );
1168	else
1169	delete textNode;
1170	}
1171	else
1172	{
1173	// We hit a '<'
1174	// Have we hit a new element or an end tag? This could also be
1175	// a TiXmlText in the "CDATA" style.
1176	if ( StringEqual( p, "</", false, encoding ) )
1177	{
1178	return p;
1179	}
1180	else
1181	{
1182	TiXmlNode* node = Identify( p, encoding );
1183	if ( node )
1184	{
1185	p = node->Parse( p, data, encoding );
1186	LinkEndChild( node );
1187	}
1188	else
1189	{
1190	return 0;
1191	}
1192	}
1193	}
1194	pWithWhiteSpace = p;
1195	p = SkipWhiteSpace( p, encoding );
1196	}
1197
1198	if ( !p )
1199	{
1200	if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding );
1201	}
1202	return p;
1203	}
1204
1205
1206	#ifdef TIXML_USE_STL
1207	void TiXmlUnknown::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
1208	{
1209	while ( in->good() )
1210	{
1211	int c = in->get();
1212	if ( c <= 0 )
1213	{
1214	TiXmlDocument* document = GetDocument();
1215	if ( document )
1216	document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1217	return;
1218	}
1219	(*tag) += (char) c;
1220
1221	if ( c == '>' )
1222	{
1223	// All is well.
1224	return;
1225	}
1226	}
1227	}
1228	#endif
1229
1230
1231	const char* TiXmlUnknown::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1232	{
1233	TiXmlDocument* document = GetDocument();
1234	p = SkipWhiteSpace( p, encoding );
1235
1236	if ( data )
1237	{
1238	data->Stamp( p, encoding );
1239	location = data->Cursor();
1240	}
1241	if ( !p \|\| !p \|\| p != '<' )
1242	{
1243	if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding );
1244	return 0;
1245	}
1246	++p;
1247	value = "";
1248
1249	while ( p && p && p != '>' )
1250	{
1251	value += *p;
1252	++p;
1253	}
1254
1255	if ( !p )
1256	{
1257	if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding );
1258	}
1259	if ( *p == '>' )
1260	return p+1;
1261	return p;
1262	}
1263
1264	#ifdef TIXML_USE_STL
1265	void TiXmlComment::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
1266	{
1267	while ( in->good() )
1268	{
1269	int c = in->get();
1270	if ( c <= 0 )
1271	{
1272	TiXmlDocument* document = GetDocument();
1273	if ( document )
1274	document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1275	return;
1276	}
1277
1278	(*tag) += (char) c;
1279
1280	if ( c == '>'
1281	&& tag->at( tag->length() - 2 ) == '-'
1282	&& tag->at( tag->length() - 3 ) == '-' )
1283	{
1284	// All is well.
1285	return;
1286	}
1287	}
1288	}
1289	#endif
1290
1291
1292	const char* TiXmlComment::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1293	{
1294	TiXmlDocument* document = GetDocument();
1295	value = "";
1296
1297	p = SkipWhiteSpace( p, encoding );
1298
1299	if ( data )
1300	{
1301	data->Stamp( p, encoding );
1302	location = data->Cursor();
1303	}
1304	const char* startTag = "<!--";
1305	const char* endTag = "-->";
1306
1307	if ( !StringEqual( p, startTag, false, encoding ) )
1308	{
1309	document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
1310	return 0;
1311	}
1312	p += strlen( startTag );
1313	p = ReadText( p, &value, false, endTag, false, encoding );
1314	return p;
1315	}
1316
1317
1318	const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1319	{
1320	p = SkipWhiteSpace( p, encoding );
1321	if ( !p \|\| !*p ) return 0;
1322
1323	int tabsize = 4;
1324	if ( document )
1325	tabsize = document->TabSize();
1326
1327	if ( data )
1328	{
1329	data->Stamp( p, encoding );
1330	location = data->Cursor();
1331	}
1332	// Read the name, the '=' and the value.
1333	const char* pErr = p;
1334	p = ReadName( p, &name, encoding );
1335	if ( !p \|\| !*p )
1336	{
1337	if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1338	return 0;
1339	}
1340	p = SkipWhiteSpace( p, encoding );
1341	if ( !p \|\| !p \|\| p != '=' )
1342	{
1343	if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1344	return 0;
1345	}
1346
1347	++p; // skip '='
1348	p = SkipWhiteSpace( p, encoding );
1349	if ( !p \|\| !*p )
1350	{
1351	if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1352	return 0;
1353	}
1354
1355	const char* end;
1356
1357	if ( *p == '\'' )
1358	{
1359	++p;
1360	end = "\'";
1361	p = ReadText( p, &value, false, end, false, encoding );
1362	}
1363	else if ( *p == '"' )
1364	{
1365	++p;
1366	end = "\"";
1367	p = ReadText( p, &value, false, end, false, encoding );
1368	}
1369	else
1370	{
1371	// All attribute values should be in single or double quotes.
1372	// But this is such a common error that the parser will try
1373	// its best, even without them.
1374	value = "";
1375	while ( p && *p // existence
1376	&& !IsWhiteSpace( p ) && p != '\n' && *p != '\r' // whitespace
1377	&& p != '/' && p != '>' ) // tag end
1378	{
1379	value += *p;
1380	++p;
1381	}
1382	}
1383	return p;
1384	}
1385
1386	#ifdef TIXML_USE_STL
1387	void TiXmlText::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
1388	{
1389	if ( cdata )
1390	{
1391	int c = in->get();
1392	if ( c <= 0 )
1393	{
1394	TiXmlDocument* document = GetDocument();
1395	if ( document )
1396	document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1397	return;
1398	}
1399
1400	(*tag) += (char) c;
1401
1402	if ( c == '>'
1403	&& tag->at( tag->length() - 2 ) == ']'
1404	&& tag->at( tag->length() - 3 ) == ']' )
1405	{
1406	// All is well.
1407	return;
1408	}
1409	}
1410	else
1411	{
1412	while ( in->good() )
1413	{
1414	int c = in->peek();
1415	if ( c == '<' )
1416	return;
1417	if ( c <= 0 )
1418	{
1419	TiXmlDocument* document = GetDocument();
1420	if ( document )
1421	document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1422	return;
1423	}
1424
1425	(*tag) += (char) c;
1426	in->get();
1427	}
1428	}
1429	}
1430	#endif
1431
1432	const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1433	{
1434	value = "";
1435	TiXmlDocument* document = GetDocument();
1436
1437	if ( data )
1438	{
1439	data->Stamp( p, encoding );
1440	location = data->Cursor();
1441	}
1442
1443	const char* const startTag = "<![CDATA[";
1444	const char* const endTag = "]]>";
1445
1446	if ( cdata \|\| StringEqual( p, startTag, false, encoding ) )
1447	{
1448	cdata = true;
1449
1450	if ( !StringEqual( p, startTag, false, encoding ) )
1451	{
1452	document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding );
1453	return 0;
1454	}
1455	p += strlen( startTag );
1456
1457	// Keep all the white space, ignore the encoding, etc.
1458	while ( p && *p
1459	&& !StringEqual( p, endTag, false, encoding )
1460	)
1461	{
1462	value += *p;
1463	++p;
1464	}
1465
1466	TIXML_STRING dummy;
1467	p = ReadText( p, &dummy, false, endTag, false, encoding );
1468	return p;
1469	}
1470	else
1471	{
1472	bool ignoreWhite = true;
1473
1474	const char* end = "<";
1475	p = ReadText( p, &value, ignoreWhite, end, false, encoding );
1476	if ( p )
1477	return p-1; // don't truncate the '<'
1478	return 0;
1479	}
1480	}
1481
1482	#ifdef TIXML_USE_STL
1483	void TiXmlDeclaration::StreamIn( TIXML_ISTREAM * in, TIXML_STRING * tag )
1484	{
1485	while ( in->good() )
1486	{
1487	int c = in->get();
1488	if ( c <= 0 )
1489	{
1490	TiXmlDocument* document = GetDocument();
1491	if ( document )
1492	document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1493	return;
1494	}
1495	(*tag) += (char) c;
1496
1497	if ( c == '>' )
1498	{
1499	// All is well.
1500	return;
1501	}
1502	}
1503	}
1504	#endif
1505
1506	const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding )
1507	{
1508	p = SkipWhiteSpace( p, _encoding );
1509	// Find the beginning, find the end, and look for
1510	// the stuff in-between.
1511	TiXmlDocument* document = GetDocument();
1512	if ( !p \|\| !*p \|\| !StringEqual( p, "<?xml", true, _encoding ) )
1513	{
1514	if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding );
1515	return 0;
1516	}
1517	if ( data )
1518	{
1519	data->Stamp( p, _encoding );
1520	location = data->Cursor();
1521	}
1522	p += 5;
1523
1524	version = "";
1525	encoding = "";
1526	standalone = "";
1527
1528	while ( p && *p )
1529	{
1530	if ( *p == '>' )
1531	{
1532	++p;
1533	return p;
1534	}
1535
1536	p = SkipWhiteSpace( p, _encoding );
1537	if ( StringEqual( p, "version", true, _encoding ) )
1538	{
1539	TiXmlAttribute attrib;
1540	p = attrib.Parse( p, data, _encoding );
1541	version = attrib.Value();
1542	}
1543	else if ( StringEqual( p, "encoding", true, _encoding ) )
1544	{
1545	TiXmlAttribute attrib;
1546	p = attrib.Parse( p, data, _encoding );
1547	encoding = attrib.Value();
1548	}
1549	else if ( StringEqual( p, "standalone", true, _encoding ) )
1550	{
1551	TiXmlAttribute attrib;
1552	p = attrib.Parse( p, data, _encoding );
1553	standalone = attrib.Value();
1554	}
1555	else
1556	{
1557	// Read over whatever it is.
1558	while( p && p && p != '>' && !IsWhiteSpace( *p ) )
1559	++p;
1560	}
1561	}
1562	return 0;
1563	}
1564
1565	bool TiXmlText::Blank() const
1566	{
1567	for ( unsigned i=0; i<value.length(); i++ )
1568	if ( !IsWhiteSpace( value[i] ) )
1569	return false;
1570	return true;
1571	}
1572

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: orxonox.OLD/branches/test/src/lib/parser/tinyxml/tinyxmlparser.cc @ 9929

Download in other formats: