Teuchos - Trilinos Tools Package Version of the Day
Loading...
Searching...
No Matches
Teuchos_XMLParser.cpp
1// @HEADER
2// *****************************************************************************
3// Teuchos: Common Tools Package
4//
5// Copyright 2004 NTESS and the Teuchos contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10// BUGS: There is a bug in Teuchos_XMLObjectImplem.cpp, line 82
11// when printing attribute values, one must check if the value contains quote
12// or apost;
13// a quot'd attval cannot contain literal quot
14// a apos'd attval cannot contain literal apos
15// either they have to be matched appropriately or (easier) all quot and apos must
16// be replaced by &quot; and &apos;
17
18#include "Teuchos_XMLParser.hpp"
20#include "Teuchos_Assert.hpp"
21#include <stack>
22
23using namespace Teuchos;
24
25// this parser currently does not support:
26// * processing instructions
27// * XML schemas
28// * CDATA sections...see http://www.w3.org/TR/2004/REC-xml-20040204/#dt-cdsection
29// * full Unicode support (we read unsigned bytes, so we get only 0x00 through 0xFF)
30//
31// it tolerates (read: ignores) xml declarations, at any point in the file where a tag would be valid
32//
33// it currently does support:
34// * comments
35// * empty element tags, e.g. <hello />
36// * entity references: &amp; &lt; &gt; &apos; &quot;
37// * numeric character references: &#32;
38// * std::exception/error handling on parse errors
39
40
41/* From the W3C XML 1.0 Third Edition
42 http://www.w3.org/TR/2004/REC-xml-20040204/
43
44 The following productions specify well-formed XML documents.
45 These have been reduced to the subset supported by this parser.
46
47 element ::= EmptyElemTag
48 | STag content ETag
49 STag ::= '<' Name (S Attribute)* S? '>'
50 Attribute ::= Name Eq AttValue
51 ETag ::= '</' Name S? '>'
52 content ::= CharData? ((element | Reference | CDSect | Comment) CharData?)*
53 EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
54
55 AttValue ::= '"' ([^<&"] | Reference)* '"'
56 | "'" ([^<&'] | Reference)* "'"
57
58 CharRef ::= '&#' [0-9]+ ';'
59 EntityRef ::= '&' Name ';'
60 Reference ::= EntityRef | CharRef
61
62 #x20 (space)
63 #x9 (horizontal tab)
64 #xD (carriage return)
65 #xA (new line, new line line feed)
66
67 S ::= (#x20 | #x9 | #xD | #xA)+
68 Eq ::= S? '=' S?
69 NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | #x00B7
70 Name ::= (Letter | '_' | ':') (NameChar)*
71
72 Letter ::= [#x0041-#x005A] | [#x0061-#x007A]
73 | [#x00C0-#x00D6] | [#x00D8-#x00F6]
74 | [#x00F8-#x00FF]
75 Digit ::= [#x0030-#x0039]
76
77 Char ::= #x9 | #xA | #xD | [#x20-#xFF]
78 CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
79 that is, some std::string of characters not containing '<' or '&' or ']]>'
80 Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
81 that is, '<!--' txt '-->', where txt does not contain '--'
82
83 CDSect ::= CDStart CData CDEnd
84 CDStart ::= '<![CDATA['
85 CData ::= (Char* - (Char* ']]>' Char*))
86 CDEnd ::= ']]>'
87
88 document ::= prolog element Misc*
89 prolog ::= XMLDecl? Misc*
90 XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
91 Misc ::= Comment | S
92
93 VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
94 Eq ::= S? '=' S?
95 VersionNum ::= '1.' [0-9]+
96 Misc ::= Comment | S
97
98
99
100*/
101
// Parse-error helper: forwards CONDITION to TEUCHOS_TEST_FOR_EXCEPTION with
// std::runtime_error as the exception type and prefixes MESSAGE with the
// parser's current line number. It can only be used where _lineNo is in scope.
// MESSAGE may itself be a chain of stream insertions (a << b << c).
// NOTE(review): TEUCHOS_TEST_FOR_EXCEPTION is defined outside this file; it is
// used here on the assumption that it throws when CONDITION is true.
#define XMLPARSER_TFE( CONDITION , MESSAGE ) \
  TEUCHOS_TEST_FOR_EXCEPTION( CONDITION, std::runtime_error, "XML parse error at line " << _lineNo << ": " << MESSAGE )
104
106{
107
109
110 _entities.clear();
111 _entities["apos"] = "'";
112 _entities["quot"] = "\"";
113 _entities["lt"] = "<";
114 _entities["gt"] = ">";
115 _entities["amp"] = "&";
116
117 bool done = false;
118 int curopen = 0; // number of currently open tags, or "do we process character data?"
119 bool gotRoot = false;
120 std::stack<long> tagLineStarts;
121 std::stack<string> tags;
122
123 while (!done) {
124
125 std::string tag, cdata;
126 unsigned char c1, c2;
128
129 // Consume any whitespace
130 if (curopen == 0) {
131 // this will leave a lookahead in c1
132 c1 = '\0';
133 if ( getSpace(c1) ) {
134 done = true;
135 break;
136 }
137 }
138 else {
139 // need to manually lookahead
140 if (_is->readBytes(&c1,1) < 1) {
141 done = true;
142 break;
143 }
144 if (c1 == '\n') ++_lineNo; // a newline while processing character data; not an error
145 }
146
147 if (c1 == '<') {
148 // determine if it is a STag/EmptyElemTag or ETag or Comment
149 // get lookahead
150 XMLPARSER_TFE( _is->readBytes(&c2,1) < 1 , "stream ended in tag begin/end");
151
152 if (c2 == '/') {
153 // we have: </
154 // try to get an ETag
155 getETag(tag);
156 // have to check whether we have an enclosing, otherwise tags and tagLineStarts have no top()
157 XMLPARSER_TFE( curopen == 0, "document not well-formed: encountered end element '" << tag << "' while not enclosed." );
158 XMLPARSER_TFE( handler->endElement(tag)!=0, "document not well-formed: end element tag = '" << tag << "'"
159 << " did not match start element '" << tags.top()
160 << "' from line " << tagLineStarts.top() );
161 curopen--;
162 tagLineStarts.pop();
163 tags.pop();
164 }
165 else if (isLetter(c2) || c2==':' || c2=='_') {
166 // it looks like a STag or an EmptyElemTag
167 bool emptytag;
168 tagLineStarts.push(_lineNo);
169 getSTag(c2, tag, attrs, emptytag);
170 tags.push(tag);
171 handler->startElement(tag,attrs);
172 if (curopen == 0) {
173 XMLPARSER_TFE(gotRoot == true, "document not well-formed: more than one root element specified" );
174 gotRoot = true;
175 }
176 curopen++;
177 if (emptytag) {
178 // we just opened this tag, so we should not have any trouble closing it
179 XMLPARSER_TFE( handler->endElement(tag)!=0, "unknown failure from handler while processing tag '" << tag << "'" );
180 curopen--;
181 tagLineStarts.pop();
182 tags.pop();
183 }
184 }
185 else if (c2 == '?') {
186 // it is starting to look like an xml declaration
187 XMLPARSER_TFE( assertChar('x') != 0 , "was expecting an XML declaration; element not well-formed or exploits unsupported feature" );
188 XMLPARSER_TFE( assertChar('m') != 0 , "was expecting an XML declaration; element not well-formed or exploits unsupported feature" );
189 XMLPARSER_TFE( assertChar('l') != 0 , "was expecting an XML declaration; element not well-formed or exploits unsupported feature" );
190 ignoreXMLDeclaration();
191 }
192 else if (c2 == '!') {
193 // it is starting to look like a comment; we need '--'
194 // if we don't get this, it means
195 // * the document is not well-formed
196 // * the document employs a feature not supported by this parser,
197 // e.g. <!ELEMENT... <!ATTLIST... <!DOCTYPE... <![CDATA[...
198 XMLPARSER_TFE( assertChar('-') != 0 , "element not well-formed or exploits unsupported feature" );
199 XMLPARSER_TFE( assertChar('-') != 0 , "element not well-formed or exploits unsupported feature" );
200 getComment(_lineNo);
201 }
202 else {
203 XMLPARSER_TFE(true, "element not well-formed or exploits unsupported feature" );
204 }
205 }
206 else if ( (curopen > 0) && (c1 == '&') ) {
207 std::string chars = "";
208 getReference(chars);
209 handler->characters(chars);
210 }
211 else if ( (curopen > 0) ) {
212 std::string chars = "";
213 chars.push_back(c1);
214 handler->characters(chars);
215 }
216 else {
217 XMLPARSER_TFE(1 , "document not well-formed: character data outside of an enclosing tag");
218 }
219 }
220
221 XMLPARSER_TFE( curopen != 0 , "file ended before closing element '" << tags.top() << "' from line " << tagLineStarts.top() );
222
223 return handler->getObject();
224
225}
226
227
228void XMLParser::getETag(std::string &tag)
229{
230 /* Recall from the specification:
231 ETag ::= '</' Name S? '>'
232 Name ::= (Letter | '_' | ':') (NameChar)*
233
234 We have already consumed: </
235 */
236
237 bool tagover = false;
238 unsigned char c;
239 // clear tag
240 tag = "";
241 XMLPARSER_TFE( _is->readBytes(&c,1) < 1 , "EOF before end element was terminated");
242 XMLPARSER_TFE( !isLetter(c) && c!='_' && c!=':' , "tag not well-formed");
243 tag.push_back(c);
244 while (1) {
245 XMLPARSER_TFE( _is->readBytes(&c,1) < 1 , "EOF before end element was terminated");
246 if ( isNameChar(c) ) {
247 if (tagover) {
248 XMLPARSER_TFE(1, "end element not well-formed: expected '>'");
249 }
250 tag.push_back(c);
251 }
252 else if (isSpace(c)) {
253 // mark the end of the tag and consume the whitespace
254 // if it is ia newline, it isn't an error
255 if (c == '\n') ++_lineNo;
256 tagover = true;
257 }
258 else if (c == '>') {
259 break;
260 }
261 else {
262 XMLPARSER_TFE(1, "end element not well-formed");
263 }
264 }
265}
266
267
268void XMLParser::getSTag(unsigned char lookahead, std::string &tag, Teuchos::map<std::string,string> &attrs, bool &emptytag)
269{
270
271 /* Recall from the specification:
272
273 STag ::= '<' Name (S Attribute)* S? '>'
274 EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
275 Name ::= (Letter | '_' | ':') (NameChar)*
276 NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | #x00B7
277
278 S ::= (#x20 | #x9 | #xD | #xA)+
279 Attribute ::= Name Eq AttValue
280 Eq ::= S? '=' S?
281 AttValue ::= '"' ([^<&"] | Reference)* '"'
282 | "'" ([^<&'] | Reference)* "'"
283 Reference ::= EntityRef | CharRef
284 CharRef ::= '&#' [0-9]+ ';'
285 EntityRef ::= '&' Name ';'
286
287 We have already consumed: <lookahead
288 */
289
290 unsigned char c;
291 attrs.clear();
292
293 tag = lookahead;
294 // get the rest of the tag: (NameChar)*
295 while (1) {
296 XMLPARSER_TFE( _is->readBytes(&c,1) < 1 , "EOF before start element was terminated");
297 if (isNameChar(c)) {
298 tag.push_back(c);
299 }
300 else {
301 break;
302 }
303 }
304
305 // after the name: should be one of the following
306 // (S Attribute) | S? '>' | S? '/>'
307 do {
308
309 bool hadspace = false;
310
311 // if space, consume the whitespace
312 if ( isSpace(c) ) {
313 hadspace = true;
314 XMLPARSER_TFE( getSpace(c)!=0, "EOF before start element was terminated");
315 }
316
317 // now, either Attribute | '>' | '/>'
318 if ( (isLetter(c) || c=='_' || c==':') && hadspace ) {
319
320 // Attribute
321 // get attribute name, starting with contents of c
322 std::string attname, attval;
323 attname = c;
324 do {
325 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before start element was terminated");
326 if ( isNameChar(c) ) {
327 attname.push_back(c);
328 }
329 else if ( isSpace(c) || c=='=' ) {
330 break;
331 }
332 else {
333 XMLPARSER_TFE(1, "attribute not well-formed: expected whitespace or '='");
334 }
335 } while (1);
336
337 // if whitespace, consume it
338 if (isSpace(c)) {
339 getSpace(c);
340 }
341 // should be on '='
342 if (c != '=') {
343 XMLPARSER_TFE(1, "attribute not well-formed: expected '='");
344 }
345
346 // get any whitespace following the '='
347 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before start element was terminated");
348 if (isSpace(c)) {
349 getSpace(c);
350 }
351
352 // now get the quoted attribute value
353 bool apost;
354 attval = "";
355 if (c == '\'') {
356 apost = true;
357 }
358 else if (c == '\"') {
359 apost = false;
360 }
361 else {
362 XMLPARSER_TFE(1, "attribute value must be quoted with either ''' or '\"'");
363 }
364 do {
365 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before start element was terminated");
366 if (apost && c=='\'') {
367 // end of attval
368 break;
369 }
370 else if (!apost && c=='\"') {
371 // end of attval
372 break;
373 }
374 else if ( c == '&' ) {
375 // finish: need to add support for Reference
376 std::string refstr;
377 getReference(refstr);
378 attval += refstr;
379 }
380 else if ( c == '\r' ) {
381 // \link https://www.w3.org/TR/xml/#sec-line-ends
382 // XML spec p2.11: normalize \r\n and standalone \r to \n.
383 // Skip \r here; the \n that follows (in \r\n) will be appended normally,
384 // and a standalone \r at end-of-line is not encountered in well-formed XML.
385 }
386 else if ( c!='<' ) {
387 // valid character for attval
388 attval.push_back(c);
389 }
390 else {
391 XMLPARSER_TFE(1, "invalid character in attribute value");
392 }
393 } while(1);
394
395 // add attribute to list
396 XMLPARSER_TFE( attrs.find(attname) != attrs.end() , "cannot have two attributes with the same name");
398 }
399 else if (c == '>') {
400 emptytag = false;
401 break;
402 }
403 else if (c == '/') {
404 XMLPARSER_TFE(assertChar('>')!=0, "empty element tag not well-formed: expected '>'");
405 emptytag = true;
406 break;
407 }
408 else {
409 XMLPARSER_TFE(1, "start element not well-formed: invalid character");
410 }
411
412 // get next char
413 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before start element was terminated");
414
415 } while(1);
416}
417
418
419void XMLParser::getComment(long /* startLine */)
420{
421 /* Recall from the specification:
422 Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
423 that is, '<!--' txt '-->', where txt does not contain '--'
424 We have already consumed: <!--
425
426 Be wary here of the fact that c=='-' implies isChar(c)
427 */
428 unsigned char c;
429 while (1) {
430 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before terminating comment begun at line " << _lineNo );
431 if (c == '\n') ++_lineNo;
432 // if we have a -
433 if (c=='-') {
434 // then it must be the end of the comment or be a Char
435 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before terminating comment begun at line " << _lineNo );
436 if (c == '\n') ++_lineNo;
437 if (c=='-') {
438 // this had better be leading to the end of the comment
439 XMLPARSER_TFE( assertChar('>')!=0, "comment not well-formed: missing expected '>' at line " << _lineNo );
440 break;
441 }
442 else if (!isChar(c)) {
443 XMLPARSER_TFE(1, "comment not well-formed: invalid character at line " << _lineNo );
444 }
445 }
446 else if (!isChar(c)) {
447 XMLPARSER_TFE(1, "comment not well-formed: invalid character at line " << _lineNo );
448 }
449 }
450}
451
452
453void XMLParser::getReference(std::string &refstr) {
454 // finish: does CharRef support only dec, or hex as well?
455 unsigned char c;
456 unsigned int num, base;
457 refstr = "";
458 // none of these bytes read are allowed to be a newline, so don't do any incrementing of _lineNo
459 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before reference was terminated");
460 if (c == '#') {
461 // get a CharRef
462 // CharRef ::= '&#' [0-9]+ ';'
463 // | '&#x' [0-9]+ ';'
464 // get first number
465 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before reference was terminated");
466 if (c == 'x') {
467 base = 16;
468 num = 0;
469 }
470 else if ('0' <= c && c <= '9') {
471 base = 10;
472 num = c - '0';
473 }
474 else {
475 XMLPARSER_TFE(1, "invalid character in character reference: expected 'x' or [0-9]");
476 }
477
478 do {
479 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before reference was terminated");
480 XMLPARSER_TFE( c != ';' && !('0' <= c && c <= '9') , "invalid character in character reference: expected [0-9] or ';'");
481 if (c == ';') {
482 break;
483 }
484 num = num*base + (c-'0');
485 } while (1);
486 XMLPARSER_TFE(num > 0xFF, "character reference value out of range");
487 refstr.push_back( (unsigned char)num );
488 }
489 else if (isLetter(c) || c=='_' || c==':') {
490 // get an EntityRef
491 // EntityRef ::= '&' Name ';'
492 std::string entname = "";
493 entname.push_back(c);
494 do {
495 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before reference was terminated");
496 if (c==';') {
497 break;
498 }
499 else if ( isLetter(c) || ('0' <= c && c <= '9')
500 || c=='.' || c=='-' || c=='_' || c==':'
501 || c==0xB7 ) {
502 entname.push_back(c);
503 }
504 else {
505 XMLPARSER_TFE(1, "entity reference not well-formed: invalid character");
506 }
507 } while (1);
508 XMLPARSER_TFE( _entities.find(entname) == _entities.end(), "entity reference not well-formed: undefined entity");
509 refstr = _entities[entname];
510 }
511 else {
512 XMLPARSER_TFE(1, "reference not well-formed: expected name or '#'");
513 }
514}
515
516
517int XMLParser::getSpace(unsigned char &lookahead) {
518 // if space, consume the whitespace
519 do {
520 if (lookahead == '\n') ++_lineNo;
521 if (_is->readBytes(&lookahead,1) < 1) {
522 return 1; // inform caller that we reached the end
523 }
524 }
525 while (isSpace(lookahead));
526 return 0;
527}
528
529
530bool XMLParser::isLetter(unsigned char c) {
531 if ( (0x41 <= c && c <= 0x5A) || (0x61 <= c && c <= 0x7A) ||
532 (0xC0 <= c && c <= 0xD6) || (0xD8 <= c && c <= 0xF6) ||
533 (0xF8 <= c) /* unsigned char must be <= 0xFF */ )
534 {
535 return true;
536 }
537 return false;
538}
539
540
541bool XMLParser::isNameChar(unsigned char c) {
542 if ( isLetter(c) || ('0' <= c && c <= '9') ||
543 c=='.' || c=='-' || c=='_' || c==':' || c==0xB7 )
544 {
545 return true;
546 }
547 return false;
548}
549
550
551bool XMLParser::isSpace(unsigned char c) {
552 if ( c==0x20 || c==0x9 || c==0xD || c==0xA )
553 {
554 return true;
555 }
556 return false;
557}
558
559
560bool XMLParser::isChar(unsigned char c) {
561 if ( c==0x9 || c==0xA || c==0xD || 0x20 <= c) { // unsigned char must be <= 0xFF
562 return true;
563 }
564 return false;
565}
566
567
568int XMLParser::assertChar(unsigned char cexp)
569{
570 // pull the next character off the stream and verify that it is what is expected
571 // if not, return an error to the caller
572 unsigned char c;
573 // don't worry about newlines; assertChar is always wrapped in TEST_FOR_EXCEPTION, so we don't want to advance the line counter
574 if (_is->readBytes(&c,1) < 1) {
575 return 1;
576 }
577 if (c != cexp) {
578 return 2;
579 }
580 return 0;
581}
582
583void XMLParser::ignoreXMLDeclaration()
584{
585 /* Be a little lax on the spec here; read until we get to '?', then assert '>'
586 We have already consumed: <xml
587 */
588 unsigned char c;
589 while (1) {
590 XMLPARSER_TFE(_is->readBytes(&c,1) < 1, "EOF before terminating XML declaration begun at line " << _lineNo );
591 if (c == '\n') ++_lineNo;
592 // if we have a -
593 if (c=='?') {
594 // this had better be leading to the end of the declaration
595 XMLPARSER_TFE( assertChar('>')!=0, "XML declaration not well-formed: missing expected '>' at line " << _lineNo );
596 break;
597 }
598 }
599}
Defines a class for assembling an XMLObject from XML input.
A class providing a simple XML parser. Methods can be overloaded to exploit external XML parsing libr...
Smart reference counting pointer class for automatic garbage collection.
TreeBuildingXMLHandler assembles a XMLObject from your XML input.
Representation of an XML data tree. XMLObject is a ref-counted handle to a XMLObjectImplem object,...
XMLObject parse()
Consume the XMLInputStream to build an XMLObject.
The Teuchos namespace contains all of the classes, structs and enums used by Teuchos,...
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
Deprecated.