Teuchos - Trilinos Tools Package Version of the Day
Loading...
Searching...
No Matches
Teuchos_XMLParser.cpp
1// @HEADER
2// *****************************************************************************
3// Teuchos: Common Tools Package
4//
5// Copyright 2004 NTESS and the Teuchos contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
// BUGS: There is a bug in Teuchos_XMLObjectImplem.cpp, line 82:
// when printing attribute values, one must check whether the value contains
// a quote (") or an apostrophe (');
//   a quote-delimited attribute value cannot contain a literal quote
//   an apostrophe-delimited attribute value cannot contain a literal apostrophe
// Either the delimiters have to be chosen to match the content or (easier) all
// quotes and apostrophes in the value must be replaced by &quot; and &apos;
17
18#include "Teuchos_XMLParser.hpp"
20#include "Teuchos_Assert.hpp"
21#include <stack>
22
23using namespace Teuchos;
24
25// this parser currently does not support:
26// * processing instructions
27// * XML schemas
28// * CDATA sections...see http://www.w3.org/TR/2004/REC-xml-20040204/#dt-cdsection
29// * full Unicode support (we read unsigned bytes, so we get only 0x00 through 0xFF)
30//
31// it tolerates (read: ignores) xml declarations, at any point in the file where a tag would be valid
32//
33// it currently does support:
34// * comments
35// * empty element tags, e.g. <hello />
36// * entity references: &amp; &lt; &gt; &apos; &quot;
37// * numeric character references: &#32;
// * error handling: parse errors are reported by throwing std::runtime_error
39
40
41/* From the W3C XML 1.0 Third Edition
42 http://www.w3.org/TR/2004/REC-xml-20040204/
43
 The following productions specify well-formed XML documents.
 They have been reduced to the subset that this parser is expected to support.
46
47 element ::= EmptyElemTag
48 | STag content ETag
49 STag ::= '<' Name (S Attribute)* S? '>'
50 Attribute ::= Name Eq AttValue
51 ETag ::= '</' Name S? '>'
52 content ::= CharData? ((element | Reference | CDSect | Comment) CharData?)*
53 EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
54
55 AttValue ::= '"' ([^<&"] | Reference)* '"'
56 | "'" ([^<&'] | Reference)* "'"
57
58 CharRef ::= '&#' [0-9]+ ';'
59 EntityRef ::= '&' Name ';'
60 Reference ::= EntityRef | CharRef
61
62 #x20 (space)
63 #x9 (horizontal tab)
64 #xD (carriage return)
65 #xA (new line, new line line feed)
66
67 S ::= (#x20 | #x9 | #xD | #xA)+
68 Eq ::= S? '=' S?
69 NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | #x00B7
70 Name ::= (Letter | '_' | ':') (NameChar)*
71
72 Letter ::= [#x0041-#x005A] | [#x0061-#x007A]
73 | [#x00C0-#x00D6] | [#x00D8-#x00F6]
74 | [#x00F8-#x00FF]
75 Digit ::= [#x0030-#x0039]
76
77 Char ::= #x9 | #xA | #xD | [#x20-#xFF]
78 CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
 that is, some string of characters not containing '<' or '&' or ']]>'
80 Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
81 that is, '<!--' txt '-->', where txt does not contain '--'
82
83 CDSect ::= CDStart CData CDEnd
84 CDStart ::= '<![CDATA['
85 CData ::= (Char* - (Char* ']]>' Char*))
86 CDEnd ::= ']]>'
87
88 document ::= prolog element Misc*
89 prolog ::= XMLDecl? Misc*
90 XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
91 Misc ::= Comment | S
92
93 VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
94 Eq ::= S? '=' S?
95 VersionNum ::= '1.' [0-9]+
96 Misc ::= Comment | S
97
98
99
100*/
101
// Parse-error helper: forwards the condition T to TEUCHOS_TEST_FOR_EXCEPTION
// with std::runtime_error as the exception type. The message is the text
// "XML parse error at line <_lineNo>: " followed by the streamed expression S,
// so the macro can only be used where _lineNo is in scope. S is a chain of
// operator<< operands and therefore must not be wrapped in parentheses.
#define XMLPARSER_TFE( T , S ) \
  TEUCHOS_TEST_FOR_EXCEPTION( T, std::runtime_error, \
    "XML parse error at line " << _lineNo << ": " << S )
104
106{
107
109
110 _entities.clear();
111 _entities["apos"] = "'";
112 _entities["quot"] = "\"";
113 _entities["lt"] = "<";
114 _entities["gt"] = ">";
115 _entities["amp"] = "&";
116
117 bool done = false;
118 int curopen = 0; // number of currently open tags, or "do we process character data?"
119 bool gotRoot = false;
120 std::stack<long> tagLineStarts;
121 std::stack<string> tags;
122
123 while (!done) {
124
125 std::string tag, cdata;
126 unsigned char c1, c2;
128
129 // Consume any whitespace
130 if (curopen == 0) {
131 // this will leave a lookahead in c1
132 c1 = '\0';
133 if ( getSpace(c1) ) {
134 done = true;
135 break;
136 }
137 }
138 else {
139 // need to manually lookahead
140 if (_is->readBytes(&c1,1) < 1) {
141 done = true;
142 break;
143 }
144 if (c1 == '\n') ++_lineNo; // a newline while processing character data; not an error
145 }
146
147 if (c1 == '<') {
148 // determine if it is a STag/EmptyElemTag or ETag or Comment
149 // get lookahead
150 XMLPARSER_TFE( _is->readBytes(&c2,1) < 1 , "stream ended in tag begin/end");
151
152 if (c2 == '/') {
153 // we have: </
154 // try to get an ETag
155 getETag(tag);
156 // have to check whether we have an enclosing, otherwise tags and tagLineStarts have no top()
157 XMLPARSER_TFE( curopen == 0, "document not well-formed: encountered end element '" << tag << "' while not enclosed." );
158 XMLPARSER_TFE( handler->endElement(tag)!=0, "document not well-formed: end element tag = '" << tag << "'"
159 << " did not match start element '" << tags.top()
160 << "' from line " << tagLineStarts.top() );
161 curopen--;
162 tagLineStarts.pop();
163 tags.pop();
164 }
165 else if (isLetter(c2) || c2==':' || c2=='_') {
166 // it looks like a STag or an EmptyElemTag
167 bool emptytag;
168 tagLineStarts.push(_lineNo);
169 getSTag(c2, tag, attrs, emptytag);
170 tags.push(tag);
171 handler->startElement(tag,attrs);
172 if (curopen == 0) {
173 XMLPARSER_TFE(gotRoot == true, "document not well-formed: more than one root element specified" );
174 gotRoot = true;
175 }
176 curopen++;
177 if (emptytag) {
178 // we just open this tag, so we should have any trouble closing it
179 XMLPARSER_TFE( handler->endElement(tag)!=0, "unknown failure from handler while processing tag '" << tag << "'" );
180 curopen--;
181 tagLineStarts.pop();
182 tags.pop();
183 }
184 }
185 else if (c2 == '?') {
186 // it is starting to look like an xml declaration
187 XMLPARSER_TFE( assertChar('x') != 0 , "was expecting an XML declaration; element not well-formed or exploits unsupported feature" );
188 XMLPARSER_TFE( assertChar('m') != 0 , "was expecting an XML declaration; element not well-formed or exploits unsupported feature" );
189 XMLPARSER_TFE( assertChar('l') != 0 , "was expecting an XML declaration; element not well-formed or exploits unsupported feature" );
190 ignoreXMLDeclaration();
191 }
192 else if (c2 == '!') {
193 // it is starting to look like a comment; we need '--'
194 // if we don't get this, it means
195 // * the document is not well-formed
196 // * the document employs a feature not supported by this parser,
197 // e.g. <!ELEMENT... <!ATTLIST... <!DOCTYPE... <![CDATA[...
198 XMLPARSER_TFE( assertChar('-') != 0 , "element not well-formed or exploits unsupported feature" );
199 XMLPARSER_TFE( assertChar('-') != 0 , "element not well-formed or exploits unsupported feature" );
200 getComment(_lineNo);
201 }
202 else {
203 XMLPARSER_TFE(true, "element not well-formed or exploits unsupported feature" );
204 }
205 }
206 else if ( (curopen > 0) && (c1 == '&') ) {
207 std::string chars = "";
208 getReference(chars);
209 handler->characters(chars);
210 }
211 else if ( (curopen > 0) ) {
212 std::string chars = "";
213 chars.push_back(c1);
214 handler->characters(chars);
215 }
216 else {
217 XMLPARSER_TFE(1 , "document not well-formed: character data outside of an enclosing tag");
218 }
219 }
220
221 XMLPARSER_TFE( curopen != 0 , "file ended before closing element '" << tags.top() << "' from line " << tagLineStarts.top() );
222
223 return handler->getObject();
224
225}
226
227
228void XMLParser::getETag(std::string &tag)
229{
230 /* Recall from the specification:
231 ETag ::= '</' Name S? '>'
232 Name ::= (Letter | '_' | ':') (NameChar)*
233
234 We have already consumed: </
235 */
236
237 bool tagover = false;
238 unsigned char c;
239 // clear tag
240 tag = "";
241 XMLPARSER_TFE( _is->readBytes(&c,1) < 1 , "EOF before end element was terminated");
242 XMLPARSER_TFE( !isLetter(c) && c!='_' && c!=':' , "tag not well-formed");
243 tag.push_back(c);
244 while (1) {
245 XMLPARSER_TFE( _is->readBytes(&c,1) < 1 , "EOF before end element was terminated");
246 if ( isNameChar(c) ) {
247 if (tagover) {
248 XMLPARSER_TFE(1, "end element not well-formed: expected '>'");
249 }
250 tag.push_back(c);
251 }
252 else if (isSpace(c)) {
253 // mark the end of the tag and consume the whitespace
254 // if it is ia newline, it isn't an error
255 if (c == '\n') ++_lineNo;
256 tagover = true;
257 }
258 else if (c == '>') {
259 break;
260 }
261 else {
262 XMLPARSER_TFE(1, "end element not well-formed");
263 }
264 }
265}
266
267
268void XMLParser::getSTag(unsigned char lookahead, std::string &tag, Teuchos::map<std::string,string> &attrs, bool &emptytag)
269{
270
271 /* Recall from the specification:
272
273 STag ::= '<' Name (S Attribute)* S? '>'
274 EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
275 Name ::= (Letter | '_' | ':') (NameChar)*
276 NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | #x00B7
277
278 S ::= (#x20 | #x9 | #xD | #xA)+
279 Attribute ::= Name Eq AttValue
280 Eq ::= S? '=' S?
281 AttValue ::= '"' ([^<&"] | Reference)* '"'
282 | "'" ([^<&'] | Reference)* "'"
283 Reference ::= EntityRef | CharRef
284 CharRef ::= '&#' [0-9]+ ';'
285 EntityRef ::= '&' Name ';'
286
287 We have already consumed: <lookahead
288 */
289
290 unsigned char c;
291 attrs.clear();
292
293 tag = lookahead;
294 // get the rest of the tag: (NameChar)*
295 while (1) {
296 XMLPARSER_TFE( _is->readBytes(&c,1) < 1 , "EOF before start element was terminated");
297 if (isNameChar(c)) {
298 tag.push_back(c);
299 }
300 else {
301 break;
302 }
303 }
304
305 // after the name: should be one of the following
306 // (S Attribute) | S? '>' | S? '/>'
307 do {
308
309 bool hadspace = false;
310
311 // if space, consume the whitespace
312 if ( isSpace(c) ) {
313 hadspace = true;
314 XMLPARSER_TFE( getSpace(c)!=0, "EOF before start element was terminated");
315 }
316
317 // now, either Attribute | '>' | '/>'
318 if ( (isLetter(c) || c=='_' || c==':') && hadspace ) {
319
320 // Attribute
321 // get attribute name, starting with contents of c
322 std::string attname, attval;
323 attname = c;
324 do {
325 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before start element was terminated");
326 if ( isNameChar(c) ) {
327 attname.push_back(c);
328 }
329 else if ( isSpace(c) || c=='=' ) {
330 break;
331 }
332 else {
333 XMLPARSER_TFE(1, "attribute not well-formed: expected whitespace or '='");
334 }
335 } while (1);
336
337 // if whitespace, consume it
338 if (isSpace(c)) {
339 getSpace(c);
340 }
341 // should be on '='
342 if (c != '=') {
343 XMLPARSER_TFE(1, "attribute not well-formed: expected '='");
344 }
345
346 // get any whitespace following the '='
347 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before start element was terminated");
348 if (isSpace(c)) {
349 getSpace(c);
350 }
351
352 // now get the quoted attribute value
353 bool apost;
354 attval = "";
355 if (c == '\'') {
356 apost = true;
357 }
358 else if (c == '\"') {
359 apost = false;
360 }
361 else {
362 XMLPARSER_TFE(1, "attribute value must be quoted with either ''' or '\"'");
363 }
364 do {
365 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before start element was terminated");
366 if (apost && c=='\'') {
367 // end of attval
368 break;
369 }
370 else if (!apost && c=='\"') {
371 // end of attval
372 break;
373 }
374 else if ( c == '&' ) {
375 // finish: need to add support for Reference
376 std::string refstr;
377 getReference(refstr);
378 attval += refstr;
379 }
380 else if ( c!='<' ) {
381 // valid character for attval
382 attval.push_back(c);
383 }
384 else {
385 XMLPARSER_TFE(1, "invalid character in attribute value");
386 }
387 } while(1);
388
389 // add attribute to list
390 XMLPARSER_TFE( attrs.find(attname) != attrs.end() , "cannot have two attributes with the same name");
392 }
393 else if (c == '>') {
394 emptytag = false;
395 break;
396 }
397 else if (c == '/') {
398 XMLPARSER_TFE(assertChar('>')!=0, "empty element tag not well-formed: expected '>'");
399 emptytag = true;
400 break;
401 }
402 else {
403 XMLPARSER_TFE(1, "start element not well-formed: invalid character");
404 }
405
406 // get next char
407 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before start element was terminated");
408
409 } while(1);
410}
411
412
413void XMLParser::getComment(long /* startLine */)
414{
415 /* Recall from the specification:
416 Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
417 that is, '<!--' txt '-->', where txt does not contain '--'
418 We have already consumed: <!--
419
420 Be wary here of the fact that c=='-' implies isChar(c)
421 */
422 unsigned char c;
423 while (1) {
424 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before terminating comment begun at line " << _lineNo );
425 if (c == '\n') ++_lineNo;
426 // if we have a -
427 if (c=='-') {
428 // then it must be the end of the comment or be a Char
429 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before terminating comment begun at line " << _lineNo );
430 if (c == '\n') ++_lineNo;
431 if (c=='-') {
432 // this had better be leading to the end of the comment
433 XMLPARSER_TFE( assertChar('>')!=0, "comment not well-formed: missing expected '>' at line " << _lineNo );
434 break;
435 }
436 else if (!isChar(c)) {
437 XMLPARSER_TFE(1, "comment not well-formed: invalid character at line " << _lineNo );
438 }
439 }
440 else if (!isChar(c)) {
441 XMLPARSER_TFE(1, "comment not well-formed: invalid character at line " << _lineNo );
442 }
443 }
444}
445
446
447void XMLParser::getReference(std::string &refstr) {
448 // finish: does CharRef support only dec, or hex as well?
449 unsigned char c;
450 unsigned int num, base;
451 refstr = "";
452 // none of these bytes read are allowed to be a newline, so don't do any incrementing of _lineNo
453 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before reference was terminated");
454 if (c == '#') {
455 // get a CharRef
456 // CharRef ::= '&#' [0-9]+ ';'
457 // | '&#x' [0-9]+ ';'
458 // get first number
459 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before reference was terminated");
460 if (c == 'x') {
461 base = 16;
462 num = 0;
463 }
464 else if ('0' <= c && c <= '9') {
465 base = 10;
466 num = c - '0';
467 }
468 else {
469 XMLPARSER_TFE(1, "invalid character in character reference: expected 'x' or [0-9]");
470 }
471
472 do {
473 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before reference was terminated");
474 XMLPARSER_TFE( c != ';' && !('0' <= c && c <= '9') , "invalid character in character reference: expected [0-9] or ';'");
475 if (c == ';') {
476 break;
477 }
478 num = num*base + (c-'0');
479 } while (1);
480 XMLPARSER_TFE(num > 0xFF, "character reference value out of range");
481 refstr.push_back( (unsigned char)num );
482 }
483 else if (isLetter(c) || c=='_' || c==':') {
484 // get an EntityRef
485 // EntityRef ::= '&' Name ';'
486 std::string entname = "";
487 entname.push_back(c);
488 do {
489 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before reference was terminated");
490 if (c==';') {
491 break;
492 }
493 else if ( isLetter(c) || ('0' <= c && c <= '9')
494 || c=='.' || c=='-' || c=='_' || c==':'
495 || c==0xB7 ) {
496 entname.push_back(c);
497 }
498 else {
499 XMLPARSER_TFE(1, "entity reference not well-formed: invalid character");
500 }
501 } while (1);
502 XMLPARSER_TFE( _entities.find(entname) == _entities.end(), "entity reference not well-formed: undefined entity");
503 refstr = _entities[entname];
504 }
505 else {
506 XMLPARSER_TFE(1, "reference not well-formed: expected name or '#'");
507 }
508}
509
510
511int XMLParser::getSpace(unsigned char &lookahead) {
512 // if space, consume the whitespace
513 do {
514 if (lookahead == '\n') ++_lineNo;
515 if (_is->readBytes(&lookahead,1) < 1) {
516 return 1; // inform caller that we reached the end
517 }
518 }
519 while (isSpace(lookahead));
520 return 0;
521}
522
523
524bool XMLParser::isLetter(unsigned char c) {
525 if ( (0x41 <= c && c <= 0x5A) || (0x61 <= c && c <= 0x7A) ||
526 (0xC0 <= c && c <= 0xD6) || (0xD8 <= c && c <= 0xF6) ||
527 (0xF8 <= c) /* unsigned char must be <= 0xFF */ )
528 {
529 return true;
530 }
531 return false;
532}
533
534
535bool XMLParser::isNameChar(unsigned char c) {
536 if ( isLetter(c) || ('0' <= c && c <= '9') ||
537 c=='.' || c=='-' || c=='_' || c==':' || c==0xB7 )
538 {
539 return true;
540 }
541 return false;
542}
543
544
545bool XMLParser::isSpace(unsigned char c) {
546 if ( c==0x20 || c==0x9 || c==0xD || c==0xA )
547 {
548 return true;
549 }
550 return false;
551}
552
553
554bool XMLParser::isChar(unsigned char c) {
555 if ( c==0x9 || c==0xA || c==0xD || 0x20 <= c) { // unsigned char must be <= 0xFF
556 return true;
557 }
558 return false;
559}
560
561
562int XMLParser::assertChar(unsigned char cexp)
563{
564 // pull the next character off the stream and verify that it is what is expected
565 // if not, return an error to the caller
566 unsigned char c;
567 // don't worry about newlines; assertChar is always wrapped in TEST_FOR_EXCEPTION, so we don't want to advance the line counter
568 if (_is->readBytes(&c,1) < 1) {
569 return 1;
570 }
571 if (c != cexp) {
572 return 2;
573 }
574 return 0;
575}
576
577void XMLParser::ignoreXMLDeclaration()
578{
579 /* Be a little lax on the spec here; read until we get to '?', then assert '>'
580 We have already consumed: <xml
581 */
582 unsigned char c;
583 while (1) {
584 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before terminating XML declaration begun at line " << _lineNo );
585 if (c == '\n') ++_lineNo;
586 // if we have a -
587 if (c=='?') {
588 // this had better be leading to the end of the declaration
589 XMLPARSER_TFE( assertChar('>')!=0, "XML declaration not well-formed: missing expected '>' at line " << _lineNo );
590 break;
591 }
592 }
593}
Defines a class for assembling an XMLObject from XML input.
A class providing a simple XML parser. Methods can be overloaded to exploit external XML parsing libr...
Smart reference counting pointer class for automatic garbage collection.
TreeBuildingXMLHandler assembles a XMLObject from your XML input.
Representation of an XML data tree. XMLObject is a ref-counted handle to a XMLObjectImplem object,...
XMLObject parse()
Consume the XMLInputStream to build an XMLObject.
The Teuchos namespace contains all of the classes, structs and enums used by Teuchos,...
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
Deprecated.