Edinburgh Speech Tools 2.4-release
 
Loading...
Searching...
No Matches
xmlparser.c
1/*************************************************************************/
2/* */
3/* Copyright (c) 1997-98 Richard Tobin, Language Technology Group, HCRC, */
4/* University of Edinburgh. */
5/* */
6/* THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, */
7/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
8/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
9/* IN NO EVENT SHALL THE AUTHOR OR THE UNIVERSITY OF EDINBURGH BE LIABLE */
10/* FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF */
11/* CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION */
12/* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
13/* */
14/*************************************************************************/
15/* $Id: xmlparser.c,v 1.3 2004/05/04 00:00:17 awb Exp $ */
16
17#ifndef lint
18static char vcid[] = "$Id: xmlparser.c,v 1.3 2004/05/04 00:00:17 awb Exp $";
19#endif /* lint */
20
21/*
22 * XML (and nSGML) parser.
23 * Author: Richard Tobin.
24 */
25
26#include <stdarg.h>
27#include <stdlib.h>
28
29#ifdef FOR_LT
30
31#include "lt-memory.h"
32#include "nsllib.h"
33
34#define Malloc salloc
35#define Realloc srealloc
36#define Free sfree
37
38#else
39
40#include "system.h"
41
42#endif
43
44#include "charset.h"
45#include "string16.h"
46#include "ctype16.h"
47#include "dtd.h"
48#include "input.h"
49#include "stdio16.h"
50#include "xmlparser.h"
51
/*
 * Forward declarations for the parsing helpers used in this file.
 * Everything here is file-local (static) except FreeContentParticle.
 */
static int transcribe(Parser p, int back, int count);
static void pop_while_at_eoe(Parser p);
static void maybe_uppercase(Parser p, Char *s);
static void maybe_uppercase_name(Parser p);
static int str_maybecase_cmp8(Parser p, const char8 *a, const char8 *b);
static int is_ascii_alpha(int c);
static int is_ascii_digit(int c);
static int parse_external_id(Parser p, int required,
                             char8 **publicid, char8 **systemid,
                             int preq, int sreq);
static int parse_conditional(Parser p);
static int parse_notation_decl(Parser p);
static int parse_entity_decl(Parser p, Entity ent, int line, int chpos);
static int parse_attlist_decl(Parser p);
static int parse_element_decl(Parser p);
static ContentParticle parse_cp(Parser p);
static ContentParticle parse_choice_or_seq(Parser p);
static ContentParticle parse_choice_or_seq_1(Parser p, int nchildren,char sep);
static int check_content_decl(Parser p, ContentParticle cp);
static int check_content_decl_1(Parser p, ContentParticle cp);
static Char *stringify_cp(ContentParticle cp);
static void print_cp(ContentParticle cp, FILE16 *f);
static int size_cp(ContentParticle cp);
void FreeContentParticle(ContentParticle cp);
static int parse_reference(Parser p, int pe, int expand, int allow_external);
static int parse_character_reference(Parser p, int expand);
static const char8 *escape(int c);
static int parse_name(Parser p, const char8 *where);
static int parse_nmtoken(Parser p, const char8 *where);
static int looking_at(Parser p, const char8 *string);
static void clear_xbit(XBit xbit);
static int expect(Parser p, int expected, const char8 *where);
static int expect_dtd_whitespace(Parser p, const char8 *where);
static void skip_whitespace(InputSource s);
static int skip_dtd_whitespace(Parser p, int allow_pe);
static int parse_cdata(Parser p);
static int process_nsl_decl(Parser p);
static int process_xml_decl(Parser p);
static int parse_dtd(Parser p);
static int read_markupdecls(Parser p);
static int error(Parser p, const char8 *format, ...);
static void warn(Parser p, const char8 *format, ...);
static void verror(XBit bit, const char8 *format, va_list args);
/* Kind of literal being read; passed as the last argument of parse_string. */
enum literal_type {LT_cdata_attr, LT_tok_attr, LT_plain, LT_entity};
static int parse_string(Parser p, const char8 *where, enum literal_type type);
static int parse_pi(Parser p);
static int parse_comment(Parser p, int skip);
static int parse_pcdata(Parser p);
static int parse_starttag(Parser p);
static int parse_attribute(Parser p);
static int parse_endtag(Parser p);
static int parse_markup(Parser p);
static int parse(Parser p);
static int parse_markupdecl(Parser p);
106
/*
 * Error-propagation helpers.
 *
 * require(x)  - evaluate x; if the result is negative, return -1 from the
 *               enclosing function.
 * require0(x) - likewise, but return 0 (for functions whose failure value
 *               is a null pointer).
 *
 * Both expand to a complete if/else statement, so "require(x);" can be used
 * as the body of an outer if that has its own else.  The argument is
 * parenthesized so that expressions containing low-precedence operators
 * (e.g. "a ? b : c") are compared as a whole.
 */
#define require(x) if((x) >= 0) {} else return -1
#define require0(x) if((x) >= 0) {} else return 0

/*
 * Consume(buf): give up ownership of a buffer by zeroing the pointer and its
 * companion "<buf>size" field (formed by token pasting).
 *
 * ExpandBuf(buf, sz): make sure buf has room for sz characters plus a
 * terminator, growing it with Realloc if needed.  Relies on a Parser named
 * "p" being in scope; on allocation failure it returns error(p, ...) from
 * the enclosing function.
 */
#define Consume(buf) (buf = 0, buf##size = 0)
#define ExpandBuf(buf, sz) \
    if(buf##size >= (sz)+1) {} else if((buf = Realloc(buf, (buf##size = (sz) + 1) * sizeof(Char)))) {} else return error(p, "System error")

/*
 * CopyName(n): store in n a freshly allocated, zero-terminated copy of the
 * name most recently recorded in p->name / p->namelen.  On allocation
 * failure, returns error(p, ...) from the enclosing function.
 */
#define CopyName(n) if((n = Malloc((p->namelen + 1)*sizeof(Char)))) {memcpy(n, p->name, p->namelen * sizeof(Char)); n[p->namelen] = 0;} else return error(p, "System error");

/* As CopyName, but the enclosing function returns 0 on failure. */
#define CopyName0(n) if((n = Malloc((p->namelen + 1)*sizeof(Char)))) {memcpy(n, p->name, p->namelen * sizeof(Char)); n[p->namelen] = 0;} else {error(p, "System error"); return 0;}
117
/*
 * Printable names for the XBit types.  The array has XBIT_enum_count
 * entries; presumably it is indexed by the XBIT_* type codes in their
 * declaration order (the enum itself is declared elsewhere - confirm).
 */
const char8 *XBitTypeName[XBIT_enum_count] = {
    "dtd",
    "start",
    "empty",
    "end",
    "eof",
    "pcdata",
    "pi",
    "comment",
    "cdsect",
    "xml",
    "error",
    "warning",
    "none"
};

/* Passed to NewInternalEntity by ParserInit; not assigned in the visible
   code, so it keeps its zero initial value there. */
static Entity xml_builtin_entity;
/* Head of the list of predefined entities (lt, gt, amp, apos, quot) built
   by ParserInit and installed into a Dtd by ParserSetFlag. */
static Entity xml_predefined_entities;
136
137int ParserInit(void)
138{
139 static int initialised = 0;
140 Entity e, f;
141 int i;
142 static const Char lt[] = {'l','t',0}, ltval[] = {'&','#','6','0',';',0};
143 static const Char gt[] = {'g','t',0}, gtval[] = {'&','#','6','2',';',0};
144 static const Char amp[] = {'a','m','p',0},
145 ampval[] = {'&','#','3','8',';',0};
146 static const Char apos[] = {'a','p','o','s',0}, aposval[] = {'\'',0};
147 static const Char quot[] = {'q','u','o','t',0}, quotval[] = {'"',0};
148 static const Char *builtins[5][2] = {
149 {lt, ltval}, {gt, gtval}, {amp, ampval},
150 {apos, aposval}, {quot, quotval}
151 };
152 (void)vcid;
153
154 if(initialised)
155 return 0;
156 initialised = 1;
157
158 init_charset();
159 init_ctype16();
160 init_stdio16();
161
162 for(i=0, f=0; i<5; i++, f=e)
163 {
164 e = NewInternalEntity(builtins[i][0], builtins[i][1],
165 xml_builtin_entity, 0, 0, 0);
166 if(!e)
167 return -1;
168 e->next = f;
169 }
170
171 xml_predefined_entities = e;
172
173 return 0;
174}
175
176static void skip_whitespace(InputSource s)
177{
178 int c;
179
180 while((c = get(s)) != XEOE && is_xml_whitespace(c))
181 ;
182 unget(s);
183}
184
185/*
186 * Skip whitespace and (optionally) the start and end of PEs. Return 1 if
187 * there actually *was* some whitespace or a PE start/end, -1 if
188 * an error occurred, 0 otherwise.
189 */
190
191static int skip_dtd_whitespace(Parser p, int allow_pe)
192{
193 int c;
194 int got_some = 0;
195 InputSource s = p->source;
196
197 while(1)
198 {
199 c = get(s);
200
201 if(c == XEOE)
202 {
203 got_some = 1;
204 if(s->parent)
205 {
206 if(!allow_pe)
207 return error(p,
208 "PE end not allowed here in internal subset");
209 if(s->entity->type == ET_external)
210 p->external_pe_depth--;
211 ParserPop(p);
212 s = p->source;
213 }
214 else
215 {
216 unget(s); /* leave the final EOE waiting to be read */
217 return got_some;
218 }
219 }
220 else if(is_xml_whitespace(c))
221 {
222 got_some = 1;
223 }
224 else if(c == '%')
225 {
226 /* this complication is needed for <!ENTITY % ...
227 otherwise we could just assume it was a PE reference. */
228
229 c = get(s); unget(s);
230 if(c != XEOE && is_xml_namestart(c))
231 {
232 if(!allow_pe)
233 {
234 unget(s); /* For error position */
235 return error(p,
236 "PE ref not allowed here in internal subset");
237 }
238 require(parse_reference(p, 1, 1, 1));
239 s = p->source;
240 if(s->entity->type == ET_external)
241 p->external_pe_depth++;
242 got_some = 1;
243 }
244 else
245 {
246 unget(s);
247 return got_some;
248 }
249 }
250 else
251 {
252 unget(s);
253 return got_some;
254 }
255 }
256}
257
258static int expect(Parser p, int expected, const char8 *where)
259{
260 int c;
261 InputSource s = p->source;
262
263 c = get(s);
264 if(c != expected)
265 {
266 unget(s); /* For error position */
267 return error(p, "Expected %s %s, but got %s",
268 escape(expected), where, escape(c));
269 }
270
271 return 0;
272}
273
274/*
275 * Expects whitespace or the start or end of a PE.
276 */
277
278static int expect_dtd_whitespace(Parser p, const char8 *where)
279{
280 int r = skip_dtd_whitespace(p, p->external_pe_depth > 0);
281
282 if(r < 0)
283 return -1;
284
285 if(r == 0)
286 return error(p, "Expected whitespace %s", where);
287
288 return 0;
289}
290
291static void clear_xbit(XBit xbit)
292{
293 xbit->type = XBIT_none;
294 xbit->s1 = xbit->s2 = 0;
295 xbit->S1 = xbit->S2 = 0;
296 xbit->attributes = 0;
297 xbit->element_definition = 0;
298}
299
300void FreeXBit(XBit xbit)
301{
302 Attribute a, b;
303
304 if(xbit->S1) Free(xbit->S1);
305 if(xbit->S2) Free(xbit->S2);
306 if(xbit->type != XBIT_error && xbit->type != XBIT_warning && xbit->s1)
307 Free(xbit->s1);
308 if(xbit->s2) Free(xbit->s2);
309 for(a = xbit->attributes; a; a = b)
310 {
311 b = a->next;
312 if(a->value) Free(a->value);
313 Free(a);
314 }
315 clear_xbit(xbit);
316}
317
318/*
319 * Returns 1 if the input matches string (and consume the input).
320 * Otherwise returns 0 and leaves the input stream where it was.
321 * Case-sensitivity depends on the CaseInsensitive flag.
322 * A space character at end of string matches any (non-zero) amount of
323 * whitespace; space are treated literally elsewhere.
324 * Never reads beyond an end-of-line, except to consume
325 * extra whitespace when the last character of string is a space.
326 * Never reads beyond end-of-entity.
327 */
328
329static int looking_at(Parser p, const char8 *string)
330{
331 InputSource s = p->source;
332 int c, d;
333 int save = s->next;
334
335 for(c = *string++; c; c = *string++)
336 {
337 if(at_eol(s))
338 goto fail; /* We would go over a line end */
339
340 d = get(s);
341
342 if(c == ' ' && *string == 0)
343 {
344 if(d == XEOE || !is_xml_whitespace(d))
345 goto fail;
346 skip_whitespace(s);
347 }
348 else
349 if((ParserGetFlag(p, CaseInsensitive) &&
350 Toupper(d) != Toupper(c)) ||
351 (!ParserGetFlag(p, CaseInsensitive) && d != c))
352 goto fail;
353 }
354
355 return 1;
356
357fail:
358 s->next = save;
359 return 0;
360}
361
362static int parse_name(Parser p, const char8 *where)
363{
364 InputSource s = p->source;
365 int c, i;
366
367 c = get(s);
368 if(c == XEOE || !is_xml_namestart(c))
369 {
370 unget(s); /* For error position */
371 error(p, "Expected name, but got %s %s", escape(c), where);
372 return -1;
373 }
374 i = 1;
375
376 while(c = get(s), (c != XEOE && is_xml_namechar(c)))
377 i++;
378 unget(s);
379
380 p->name = s->line + s->next - i;
381 p->namelen = i;
382
383 return 0;
384}
385
386static int parse_nmtoken(Parser p, const char8 *where)
387{
388 InputSource s = p->source;
389 int c, i=0;
390
391 while(c = get(s), (c !=XEOE && is_xml_namechar(c)))
392 i++;
393 unget(s);
394
395 if(i == 0)
396 return error(p, "Expected nmtoken value, but got %s %s",
397 escape(c), where);
398
399 p->name = s->line + s->next - i;
400 p->namelen = i;
401
402 return 0;
403}
404
405/* Escape a character for printing n an error message.
406 NB returns 5 static storage buffers in rotation. */
407
408static const char8 *escape(int c)
409{
410 static char8 buf[5][15];
411 static int bufnum=-1;
412
413#if CHAR_SIZE == 8
414 if(c != XEOE)
415 c &= 0xff;
416#endif
417
418 bufnum = (bufnum + 1) % 5;
419
420 if(c == XEOE)
421 return "<EOE>";
422 else if(c >= 33 && c <= 126)
423 sprintf(buf[bufnum], "%c", c);
424 else if(c == ' ')
425 sprintf(buf[bufnum], "<space>");
426 else
427 sprintf(buf[bufnum], "<0x%x>", c);
428
429 return buf[bufnum];
430}
431
432Parser NewParser(void)
433{
434 Parser p;
435
436 if(ParserInit() == -1)
437 return 0;
438
439 p = Malloc(sizeof(*p));
440 if(!p)
441 return 0;
442 p->state = PS_prolog1;
443 p->document_entity = 0; /* Set at first ParserPush */
444 p->have_dtd = 0;
445 p->standalone = SDD_unspecified;
446 p->flags = 0;
447 p->source = 0;
448 clear_xbit(&p->xbit);
449#ifndef FOR_LT
450 p->xbit.nchildren = 0; /* These three should never be changed */
451 p->xbit.children = 0;
452 p->xbit.parent = 0;
453#endif
454 p->pbufsize = p->pbufnext = 0;
455 p->pbuf = 0;
456 p->peeked = 0;
457 p->dtd = NewDtd();
458 p->dtd_callback = p->warning_callback = 0;
459 p->entity_opener = 0;
460 p->callback_arg = 0;
461 p->external_pe_depth = 0;
462
463 p->element_stack = 0;
464 p->element_stack_alloc = 0;
465 p->element_depth = 0;
466
467 ParserSetFlag(p, XMLPiEnd, 1);
468 ParserSetFlag(p, XMLEmptyTagEnd, 1);
469 ParserSetFlag(p, XMLPredefinedEntities, 1);
470 ParserSetFlag(p, XMLExternalIDs, 1);
471 ParserSetFlag(p, XMLMiscWFErrors, 1);
472 ParserSetFlag(p, ErrorOnUnquotedAttributeValues, 1);
473 ParserSetFlag(p, XMLLessThan, 1);
474 ParserSetFlag(p, IgnoreEntities, 0);
475 ParserSetFlag(p, ExpandGeneralEntities, 1);
476 ParserSetFlag(p, ExpandCharacterEntities, 1);
477 ParserSetFlag(p, NormaliseAttributeValues, 1);
478 ParserSetFlag(p, WarnOnUndefinedElements, 1);
479 ParserSetFlag(p, WarnOnUndefinedAttributes, 1);
480 ParserSetFlag(p, WarnOnRedefinitions, 1);
481 ParserSetFlag(p, TrustSDD, 1);
482 ParserSetFlag(p, ReturnComments, 1);
483 ParserSetFlag(p, CheckEndTagsMatch, 1);
484
485 return p;
486}
487
488void FreeParser(Parser p)
489{
490 while (p->source)
491 ParserPop(p); /* Will close file */
492
493 Free(p->pbuf);
494 Free(p->element_stack);
495 Free(p);
496}
497
498InputSource ParserRootSource(Parser p)
499{
500 InputSource s;
501
502 for(s=p->source; s && s->parent; s = s->parent)
503 ;
504
505 return s;
506}
507
508Entity ParserRootEntity(Parser p)
509{
510 return ParserRootSource(p)->entity;
511}
512
513void ParserSetCallbackArg(Parser p, void *arg)
514{
515 p->callback_arg = arg;
516}
517
518void ParserSetDtdCallback(Parser p, CallbackProc cb)
519{
520 p->dtd_callback = cb;
521}
522
523void ParserSetWarningCallback(Parser p, CallbackProc cb)
524{
525 p->warning_callback = cb;
526}
527
528void ParserSetEntityOpener(Parser p, EntityOpenerProc opener)
529{
530 p->entity_opener = opener;
531}
532
533#ifndef FOR_LT
534
535XBit ReadXTree(Parser p)
536{
537 XBit bit, tree, child;
538 XBit *children;
539
540 bit = ReadXBit(p);
541
542 switch(bit->type)
543 {
544 case XBIT_error:
545 return bit;
546
547 case XBIT_start:
548 if(!(tree = Malloc(sizeof(*tree))))
549 {
550 error(p, "System error");
551 return &p->xbit;
552 }
553 *tree = *bit;
554 while(1)
555 {
556 child = ReadXTree(p);
557 switch(child->type)
558 {
559 case XBIT_error:
560 FreeXTree(tree);
561 return child;
562
563 case XBIT_eof:
564 FreeXTree(tree);
565 {
566 error(p, "EOF in element");
567 return &p->xbit;
568 }
569
570 case XBIT_end:
571 if(child->element_definition != tree->element_definition)
572 {
573 const Char *name1 = tree->element_definition->name,
574 *name2 = child->element_definition->name;
575 FreeXTree(tree);
576 FreeXTree(child);
577 error(p, "Mismatched end tag: expected </%S>, got </%S>",
578 name1, name2);
579 return &p->xbit;
580 }
581 FreeXTree(child);
582 return tree;
583
584 default:
585 children = Realloc(tree->children,
586 (tree->nchildren + 1) * sizeof(XBit));
587 if(!children)
588 {
589 FreeXTree(tree);
590 FreeXTree(child);
591 error(p, "System error");
592 return &p->xbit;
593 }
594 child->parent = tree;
595 children[tree->nchildren] = child;
596 tree->nchildren++;
597 tree->children = children;
598 break;
599 }
600 }
601
602 default:
603 if(!(tree = Malloc(sizeof(*tree))))
604 {
605 error(p, "System error");
606 return &p->xbit;
607 }
608 *tree = *bit;
609 return tree;
610 }
611}
612
613void FreeXTree(XBit tree)
614{
615 int i;
616
617 for(i=0; i<tree->nchildren; i++)
618 FreeXTree(tree->children[i]);
619
620 Free(tree->children);
621
622 FreeXBit(tree);
623
624 if(tree->type == XBIT_error)
625 /* error "trees" are always in the Parser structure, not malloced */
626 return;
627
628 Free(tree);
629}
630
631#endif /* (not) FOR_LT */
632
633XBit ReadXBit(Parser p)
634{
635 if(p->peeked)
636 p->peeked = 0;
637 else
638 parse(p);
639
640 return &p->xbit;
641}
642
643XBit PeekXBit(Parser p)
644{
645 if(p->peeked)
646 error(p, "Attempt to peek twice");
647 else
648 {
649 parse(p);
650 p->peeked = 1;
651 }
652
653 return &p->xbit;
654}
655
656int ParserPush(Parser p, InputSource source)
657{
658 if(!p->source && !p->document_entity)
659 p->document_entity = source->entity;
660
661 source->parent = p->source;
662 p->source = source;
663
664 if(source->entity->type == ET_internal)
665 return 0;
666
667 /* Look at first few bytes of external entities to guess encoding,
668 then look for an XMLDecl or TextDecl. */
669
670 if(source->entity->encoding == CE_unknown) /* we might already know */
671 determine_character_encoding(source);
672
673#if CHAR_SIZE == 8
674 if(!EncodingIsAsciiSuperset(source->entity->encoding))
675 return error(p, "Unsupported character encoding %s",
676 CharacterEncodingName[source->entity->encoding]);
677#else
678 if(source->entity->encoding == CE_unknown)
679 return error(p, "Unknown character encoding");
680#endif
681
682 get(source); unget(source); /* To get the first line read */
683
684 source->entity->ml_decl = ML_unspecified;
685 if(looking_at(p, "<?NSL "))
686 return process_nsl_decl(p);
687 if(looking_at(p, "<?xml "))
688 {
689 require(process_xml_decl(p));
690 if(source->entity == p->document_entity &&
691 !source->entity->version_decl)
692 return error(p, "XML declaration in document entity lacked "
693 "version number");
694 if(source->entity != p->document_entity &&
695 source->entity->standalone_decl != SDD_unspecified)
696 return error(p, "Standalone attribute not allowed except in "
697 "document entity");
698 return 0;
699 }
700 else if(!ParserGetFlag(p, XMLStrictWFErrors) && looking_at(p, "<?XML "))
701 {
702 warn(p, "Found <?XML instead of <?xml; switching to case-"
703 "insensitive mode");
704 ParserSetFlag(p, CaseInsensitive, 1);
705 return process_xml_decl(p);
706 }
707 else
708 return 0;
709}
710
711void ParserPop(Parser p)
712{
713 InputSource source;
714
715 source = p->source;
716 Fclose(source->file16);
717 p->source = source->parent;
718
719 if(source->entity->type == ET_external)
720 Free(source->line);
721 Free(source);
722}
723
724/* Returns true if the source is at EOE. If so, the EOE will have been read. */
725
726static int at_eoe(InputSource s)
727{
728 if(!at_eol(s))
729 return 0;
730 if(s->seen_eoe || get_with_fill(s) == XEOE)
731 return 1;
732 unget(s);
733 return 0;
734}
735
736/* Pops any sources that are at EOE. Leaves source buffer with at least
737 one character in it (except at EOF, where it leaves the EOE unread). */
738
739static void pop_while_at_eoe(Parser p)
740{
741 while(1)
742 {
743 InputSource s = p->source;
744
745 if(!at_eoe(s))
746 return;
747 if(!s->parent)
748 {
749 unget(s);
750 return;
751 }
752 ParserPop(p);
753 }
754}
755
756void ParserSetFlag(Parser p, ParserFlag flag, int value)
757{
758 if(value)
759 p->flags |= (1 << flag);
760 else
761 p->flags &= ~(1 << flag);
762
763 if(flag == XMLPredefinedEntities)
764 {
765 if(value)
766 p->dtd->predefined_entities = xml_predefined_entities;
767 else
768 p->dtd->predefined_entities = 0;
769 }
770}
771
772void ParserPerror(Parser p, XBit bit)
773{
774 int linenum, charnum;
775 InputSource s;
776
777 Fprintf(Stderr, "%s: %s\n",
778 bit->type == XBIT_error ? "Error" : "Warning",
779 bit->error_message);
780
781
782 for(s=p->source; s; s=s->parent)
783 {
784 if(s->entity->name)
785 Fprintf(Stderr, " in entity \"%S\"", s->entity->name);
786 else
787 Fprintf(Stderr, " in unnamed entity");
788
789 switch(SourceLineAndChar(s, &linenum, &charnum))
790 {
791 case 1:
792 Fprintf(Stderr, " at line %d char %d of", linenum+1, charnum+1);
793 break;
794 case 0:
795 Fprintf(Stderr, " defined at line %d char %d of",
796 linenum+1, charnum+1);
797 break;
798 case -1:
799 Fprintf(Stderr, " defined in");
800 break;
801 }
802
803 Fprintf(Stderr, " %s\n", EntityDescription(s->entity));
804 }
805}
806
807
808static int parse(Parser p)
809{
810 int c;
811 InputSource s;
812
813 if(p->state == PS_end || p->state == PS_error)
814 {
815 /* After an error or EOF, just keep returning EOF */
816 p->xbit.type = XBIT_eof;
817 return 0;
818 }
819
820 clear_xbit(&p->xbit);
821
822 if(p->state <= PS_prolog2 || p->state == PS_epilog)
823 skip_whitespace(p->source);
824
825restart:
826 pop_while_at_eoe(p);
827 s = p->source;
828 SourcePosition(s, &p->xbit.entity, &p->xbit.byte_offset);
829
830 switch(c = get(s))
831 {
832 case XEOE:
833 if(p->state != PS_epilog)
834 return error(p, "Document ends too soon");
835 p->state = PS_end;
836 p->xbit.type = XBIT_eof;
837 return 0;
838 case '<':
839 return parse_markup(p);
840 case '&':
841 if(ParserGetFlag(p, IgnoreEntities))
842 goto pcdata;
843 if(p->state <= PS_prolog2)
844 return error(p, "Entity reference not allowed in prolog");
845 if(looking_at(p, "#"))
846 {
847 /* a character reference - go back and parse as pcdata */
848 unget(s);
849 goto pcdata;
850 }
851 if(ParserGetFlag(p, ExpandGeneralEntities))
852 {
853 /* an entity reference - push it and start again */
854 require(parse_reference(p, 0, 1, 1));
855 goto restart;
856 }
857 /* not expanding general entities, so treat as pcdata */
858 goto pcdata;
859 default:
860 pcdata:
861 unget(s);
862 return parse_pcdata(p);
863 }
864}
865
866/* Called after reading '<' */
867
868static int parse_markup(Parser p)
869{
870 InputSource s = p->source;
871 int c = get(s);
872
873 switch(c)
874 {
875 case '!':
876 if(looking_at(p, "--"))
877 {
878 if(ParserGetFlag(p, ReturnComments))
879 return parse_comment(p, 0);
880 else
881 {
882 require(parse_comment(p, 1));
883 return parse(p);
884 }
885 }
886 else if(looking_at(p, "DOCTYPE "))
887 return parse_dtd(p);
888 else if(looking_at(p, "[CDATA["))
889 return parse_cdata(p);
890 else
891 return error(p, "Syntax error after <!");
892
893 case '/':
894 return parse_endtag(p);
895
896 case '?':
897 return parse_pi(p);
898
899 default:
900 unget(s);
901 if(!ParserGetFlag(p, XMLLessThan) &&
902 (c == XEOE || !is_xml_namestart(c)))
903 {
904 /* In nSGML, recognise < as stago only if followed by namestart */
905
906 unget(s); /* put back the < */
907 return parse_pcdata(p);
908 }
909 return parse_starttag(p);
910 }
911}
912
913static int parse_endtag(Parser p)
914{
915 ElementDefinition def;
916 Entity ent;
917
918 p->xbit.type = XBIT_end;
919 require(parse_name(p, "after </"));
920 maybe_uppercase_name(p);
921
922 if(ParserGetFlag(p, CheckEndTagsMatch))
923 {
924 if(p->element_depth <= 0)
925 return error(p, "End tag </%.*S> outside of any element",
926 p->namelen, p->name);
927
928 ent = p->element_stack[--p->element_depth].entity;
929 def = p->element_stack[p->element_depth].definition;
930
931 if(p->namelen == def->namelen &&
932 memcmp(p->name, def->name, p->namelen * sizeof(Char)) == 0)
933 p->xbit.element_definition = def;
934 else
935 return error(p, "Mismatched end tag: expected </%S>, got </%.*S>",
936 def->name, p->namelen, p->name);
937
938 if(ent != p->source->entity)
939 return error(p, "Element ends in different entity from that "
940 "in which it starts");
941
942 if(p->element_depth == 0)
943 p->state = PS_epilog;
944 }
945 else
946 {
947 p->xbit.element_definition = FindElementN(p->dtd, p->name, p->namelen);
948 if(!p->xbit.element_definition)
949 return error(p, "End tag for unknown element %.*S",
950 p->namelen, p->name);
951 }
952
953 skip_whitespace(p->source);
954 return expect(p, '>', "after name in end tag");
955}
956
957static int parse_starttag(Parser p)
958{
959 int c;
960
961 if(p->state == PS_epilog && !ParserGetFlag(p, AllowMultipleElements))
962 return error(p, "Document contains multiple elements");
963
964 p->state = PS_body;
965
966 require(parse_name(p, "after <"));
967 maybe_uppercase_name(p);
968
969 p->xbit.element_definition = FindElementN(p->dtd, p->name, p->namelen);
970 if(!p->xbit.element_definition || p->xbit.element_definition->tentative)
971 {
972 if(p->have_dtd && ParserGetFlag(p, ErrorOnUndefinedElements))
973 return error(p, "Start tag for undeclared element %.*S",
974 p->namelen, p->name);
975 if(p->have_dtd && ParserGetFlag(p, WarnOnUndefinedElements))
976 warn(p, "Start tag for undeclared element %.*S; "
977 "declaring it to have content ANY",
978 p->namelen, p->name);
979 if(p->xbit.element_definition)
980 RedefineElement(p->xbit.element_definition, CT_any, 0);
981 else
982 {
983 if(!(p->xbit.element_definition =
984 DefineElementN(p->dtd, p->name, p->namelen, CT_any, 0)))
985 return error(p, "System error");
986 }
987 }
988
989 while(1)
990 {
991 InputSource s = p->source;
992
993 /* We could just do skip_whitespace here, but we will get a
994 better error message if we look a bit closer. */
995
996 c = get(s);
997 if(c !=XEOE && is_xml_whitespace(c))
998 {
999 skip_whitespace(s);
1000 c = get(s);
1001 }
1002 else if(c != '>' &&
1003 !(ParserGetFlag(p, XMLEmptyTagEnd) && c == '/'))
1004 {
1005 unget(s); /* For error position */
1006 return error(p, "Expected whitespace or tag end in start tag");
1007 }
1008
1009 if(c == '>')
1010 {
1011 p->xbit.type = XBIT_start;
1012 break;
1013 }
1014
1015 if((ParserGetFlag(p, XMLEmptyTagEnd)) && c == '/')
1016 {
1017 require(expect(p, '>', "after / in start tag"));
1018 p->xbit.type = XBIT_empty;
1019 break;
1020 }
1021
1022 unget(s);
1023
1024 require(parse_attribute(p));
1025 }
1026
1027 if(ParserGetFlag(p, CheckEndTagsMatch))
1028 {
1029 if(p->xbit.type == XBIT_start)
1030 {
1031 if(p->element_depth == p->element_stack_alloc)
1032 {
1033 p->element_stack_alloc =
1034 p->element_stack_alloc == 0 ? 20 :
1035 p->element_stack_alloc * 2;
1036 if(!(p->element_stack =
1037 Realloc(p->element_stack,
1038 (p->element_stack_alloc * sizeof(*p->element_stack)))))
1039 return error(p, "System error");
1040 }
1041 p->element_stack[p->element_depth].definition =
1042 p->xbit.element_definition;
1043 p->element_stack[p->element_depth++].entity = p->source->entity;
1044 }
1045 else
1046 if(p->element_depth == 0)
1047 p->state = PS_epilog;
1048 }
1049
1050 if(ParserGetFlag(p, ReturnDefaultedAttributes))
1051 {
1052 AttributeDefinition d;
1053 Attribute a;
1054
1055 for(d=NextAttributeDefinition(p->xbit.element_definition, 0);
1056 d;
1057 d=NextAttributeDefinition(p->xbit.element_definition, d))
1058 {
1059 if(!d->default_value)
1060 continue;
1061 for(a=p->xbit.attributes; a; a=a->next)
1062 if(a->definition == d)
1063 break;
1064 if(!a)
1065 {
1066 if(!(a = Malloc(sizeof(*a))))
1067 return error(p, "System error");
1068 a->definition = d;
1069 if(!(a->value = Strdup(d->default_value)))
1070 return error(p, "System error");
1071 a->quoted = 1;
1072 a->next = p->xbit.attributes;
1073 p->xbit.attributes = a;
1074 }
1075 }
1076 }
1077
1078 return 0;
1079}
1080
1081static int parse_attribute(Parser p)
1082{
1083 InputSource s = p->source;
1084 AttributeDefinition def;
1085 struct attribute *a;
1086 int c;
1087
1088 require(parse_name(p, "for attribute"));
1089 maybe_uppercase_name(p);
1090
1091 def = FindAttributeN(p->xbit.element_definition, p->name, p->namelen);
1092 if(!def)
1093 {
1094 if(p->have_dtd && ParserGetFlag(p, ErrorOnUndefinedAttributes))
1095 return error(p, "Undeclared attribute %.*S for element %S",
1096 p->namelen, p->name, p->xbit.element_definition->name);
1097 if(p->have_dtd && ParserGetFlag(p, WarnOnUndefinedAttributes))
1098 warn(p, "Undeclared attribute %.*S for element %S; "
1099 "declaring it as CDATA #IMPLIED",
1100 p->namelen, p->name, p->xbit.element_definition->name);
1101 if(!(def = DefineAttributeN(p->xbit.element_definition,
1102 p->name, p->namelen,
1103 AT_cdata, 0, DT_implied, 0)))
1104 return error(p, "System error");
1105 }
1106
1107 for(a = p->xbit.attributes; a; a = a->next)
1108 if(a->definition == def)
1109 return error(p, "Repeated attribute %.*S", p->namelen, p->name);
1110
1111 if(!(a = Malloc(sizeof(*a))))
1112 return error(p, "System error");
1113
1114 a->value = 0; /* in case of error */
1115 a->next = p->xbit.attributes;
1116 p->xbit.attributes = a;
1117 a->definition = def;
1118
1119 skip_whitespace(s);
1120 require(expect(p, '=', "after attribute name"));
1121
1122 skip_whitespace(s);
1123 c = get(s);
1124 unget(s);
1125 switch(c)
1126 {
1127 case '"':
1128 case '\'':
1129 a->quoted = 1;
1130 require(parse_string(p, "in attribute value",
1131 a->definition->type == AT_cdata ? LT_cdata_attr :
1132 LT_tok_attr));
1133 a->value = p->pbuf;
1134 Consume(p->pbuf);
1135 break;
1136 default:
1137 if(ParserGetFlag(p, ErrorOnUnquotedAttributeValues))
1138 return error(p, "Value of attribute is unquoted");
1139 a->quoted = 0;
1140 require(parse_nmtoken(p, "in unquoted attribute value"));
1141 CopyName(a->value);
1142 break;
1143 }
1144
1145 return 0;
1146}
1147
1148static int transcribe(Parser p, int back, int count)
1149{
1150 ExpandBuf(p->pbuf, p->pbufnext + count);
1151 memcpy(p->pbuf + p->pbufnext,
1152 p->source->line + p->source->next - back,
1153 count * sizeof(Char));
1154 p->pbufnext += count;
1155 return 0;
1156}
1157
1158/* Called after pushing back the first character of the pcdata */
1159
1160static int parse_pcdata(Parser p)
1161{
1162 int count = 0;
1163 InputSource s;
1164 Char *buf;
1165 int next, buflen;
1166
1167 if(p->state <= PS_prolog2)
1168 return error(p, "Character data not allowed in prolog");
1169 if(p->state == PS_epilog)
1170 return error(p, "Character data not allowed after body");
1171
1172 s = p->source;
1173 buf = s->line;
1174 next = s->next;
1175 buflen = s->line_length;
1176
1177 p->pbufnext = 0;
1178
1179 while(1)
1180 {
1181 if(next == buflen)
1182 {
1183 s->next = next;
1184 if(count > 0)
1185 {
1186 require(transcribe(p, count, count));
1187 }
1188 count = 0;
1189 if(at_eoe(s))
1190 {
1191 if(!ParserGetFlag(p, MergePCData))
1192 goto done;
1193 else
1194 pop_while_at_eoe(p);
1195 }
1196 s = p->source;
1197 buf = s->line;
1198 next = s->next;
1199 buflen = s->line_length;
1200 if(next == buflen)
1201 goto done; /* must be EOF */
1202 }
1203
1204 switch(buf[next++])
1205 {
1206 case '<':
1207 if(!ParserGetFlag(p, XMLLessThan))
1208 {
1209 /* In nSGML, don't recognise < as markup unless it looks ok */
1210 if(next == buflen)
1211 goto deflt;
1212 if(buf[next] != '!' && buf[next] != '/' && buf[next] != '?' &&
1213 !is_xml_namestart(buf[next]))
1214 goto deflt;
1215 }
1216 s->next = next;
1217 if(count > 0)
1218 {
1219 require(transcribe(p, count+1, count));
1220 }
1221 count = 0;
1222 if(!ParserGetFlag(p, ReturnComments) &&
1223 buflen >= next + 3 &&
1224 buf[next] == '!' && buf[next+1] == '-' && buf[next+2] == '-')
1225 {
1226 s->next = next + 3;
1227 require(parse_comment(p, 1));
1228 buflen = s->line_length;
1229 next = s->next;
1230 }
1231 else
1232 {
1233 s->next = next-1;
1234 goto done;
1235 }
1236 break;
1237 case '&':
1238 if(ParserGetFlag(p, IgnoreEntities))
1239 goto deflt;
1240 if(!ParserGetFlag(p, MergePCData) &&
1241 (p->pbufnext > 0 || count > 0))
1242 {
1243 /* We're returning references as separate bits, and we've
1244 come to one, and we've already got some data to return,
1245 so return what we've got and get the reference next time. */
1246
1247 s->next = next-1;
1248 if(count > 0)
1249 {
1250 require(transcribe(p, count, count));
1251 }
1252 goto done;
1253 }
1254 if(buflen >= next+1 && buf[next] == '#')
1255 {
1256 /* It's a character reference */
1257
1258 s->next = next+1;
1259 if(count > 0)
1260 {
1261 require(transcribe(p, count+2, count));
1262 }
1263 count = 0;
1264 require(parse_character_reference(p,
1265 ParserGetFlag(p, ExpandCharacterEntities)));
1266 next = s->next;
1267
1268 if(!ParserGetFlag(p, MergePCData))
1269 goto done;
1270 }
1271 else
1272 {
1273 /* It's a general entity reference */
1274
1275 s->next = next;
1276 if(count > 0)
1277 {
1278 require(transcribe(p, count+1, count));
1279 }
1280 count = 0;
1281 require(parse_reference(p, 0,
1282 ParserGetFlag(p, ExpandGeneralEntities),
1283 1));
1284 s = p->source;
1285 buf = s->line;
1286 buflen = s->line_length;
1287 next = s->next;
1288
1289 if(!ParserGetFlag(p, MergePCData))
1290 goto done;
1291 }
1292 break;
1293 case ']':
1294 if(ParserGetFlag(p, XMLMiscWFErrors) &&
1295 buflen >= next + 2 &&
1296 buf[next] == ']' && buf[next+1] == '>')
1297 return error(p, "Illegal character sequence ']]>' in pcdata");
1298 /* fall through */
1299 default:
1300 deflt:
1301 count++;
1302 break;
1303 }
1304 }
1305
1306 done:
1307 p->pbuf[p->pbufnext++] = 0;
1308 p->xbit.type = XBIT_pcdata;
1309 p->xbit.pcdata_chars = p->pbuf;
1310 Consume(p->pbuf);
1311
1312 return 0;
1313}
1314
1315/* Called after reading '<!--'. Won't go over an entity end. */
1316
1317static int parse_comment(Parser p, int skip)
1318{
1319 InputSource s = p->source;
1320 int c, c1=0, c2=0;
1321 int count = 0;
1322
1323 if(!skip)
1324 p->pbufnext = 0;
1325
1326 while((c = get(s)) != XEOE)
1327 {
1328 count++;
1329 if(c1 == '-' && c2 == '-')
1330 {
1331 if(c == '>')
1332 break;
1333 unget(s); /* For error position */
1334 return error(p, "-- in comment");
1335 }
1336
1337 if(at_eol(s))
1338 {
1339 if(!skip)
1340 {
1341 require(transcribe(p, count, count));
1342 }
1343 count = 0;
1344 }
1345 c2 = c1; c1 = c;
1346 }
1347
1348 if(c == XEOE)
1349 return error(p, "EOE in comment");
1350
1351 if(skip)
1352 return 0;
1353
1354 require(transcribe(p, count, count-3));
1355 p->pbuf[p->pbufnext++] = 0;
1356 p->xbit.type = XBIT_comment;
1357 p->xbit.comment_chars = p->pbuf;
1358 Consume(p->pbuf);
1359
1360 return 0;
1361}
1362
1363static int parse_pi(Parser p)
1364{
1365 InputSource s = p->source;
1366 int c, c1=0;
1367 int count = 0;
1368 Char xml[] = {'x', 'm', 'l', 0};
1369
1370 require(parse_name(p, "after <?"));
1371 CopyName(p->xbit.pi_name);
1372
1373 p->pbufnext = 0;
1374
1375 if(Strcasecmp(p->xbit.pi_name, xml) == 0)
1376 {
1377 if(ParserGetFlag(p, XMLStrictWFErrors))
1378 return error(p, "Misplaced or wrong-case xml declaration");
1379 else
1380 warn(p, "Misplaced or wrong-case xml declaration; treating as PI");
1381 }
1382
1383 /* Empty PI? */
1384
1385 if(looking_at(p, ParserGetFlag(p, XMLPiEnd) ? "?>" : ">"))
1386 {
1387 ExpandBuf(p->pbuf, 0);
1388 goto done;
1389 }
1390
1391 /* If non-empty, must be white space after name */
1392
1393 c = get(s);
1394 if(c == XEOE || !is_xml_whitespace(c))
1395 return error(p, "Expected whitespace after PI name");
1396 skip_whitespace(s);
1397
1398 while((c = get(s)) != XEOE)
1399 {
1400 count++;
1401 if(c == '>' &&
1402 (!ParserGetFlag(p, XMLPiEnd) || c1 == '?'))
1403 break;
1404 if(at_eol(s))
1405 {
1406 require(transcribe(p, count, count));
1407 count = 0;
1408 }
1409 c1 = c;
1410 }
1411
1412 if(c == XEOE)
1413 return error(p, "EOE in PI");
1414
1415 require(transcribe(p, count, count-(ParserGetFlag(p, XMLPiEnd) ? 2 : 1)));
1416done:
1417 p->pbuf[p->pbufnext++] = 0;
1418 p->xbit.type = XBIT_pi;
1419 p->xbit.pi_chars = p->pbuf;
1420 Consume(p->pbuf);
1421
1422 return 0;
1423}
1424
1425static int parse_string(Parser p, const char8 *where, enum literal_type type)
1426{
1427 int c, quote;
1428 int count = 0;
1429 InputSource start_source, s;
1430
1431 s = start_source = p->source;
1432
1433 quote = get(s);
1434 if(quote != '\'' && quote != '"')
1435 {
1436 unget(s); /* For error position */
1437 return error(p, "Expected quoted string %s, but got %s",
1438 where, escape(quote));
1439 }
1440
1441 p->pbufnext = 0;
1442
1443 while(1)
1444 {
1445 switch(c = get(s))
1446 {
1447 case '\r':
1448 case '\n':
1449 case '\t':
1450 if(type == LT_plain || type == LT_entity ||
1451 !ParserGetFlag(p, NormaliseAttributeValues))
1452 {
1453 count++;
1454 break;
1455 }
1456 if(count > 0)
1457 {
1458 require(transcribe(p, count+1, count));
1459 }
1460 count = 0;
1461 ExpandBuf(p->pbuf, p->pbufnext+1);
1462 p->pbuf[p->pbufnext++] = ' ';
1463 break;
1464
1465 case '<':
1466 if((type == LT_tok_attr || type == LT_cdata_attr) &&
1467 ParserGetFlag(p, XMLMiscWFErrors))
1468 return error(p, "Illegal character '<' %s", where);
1469 count++;
1470 break;
1471
1472 case XEOE:
1473 if(s == start_source)
1474 {
1475 return error(p, "Quoted string goes past entity end");
1476 }
1477 if(count > 0)
1478 {
1479 require(transcribe(p, count, count));
1480 }
1481 count = 0;
1482 ParserPop(p);
1483 s = p->source;
1484 break;
1485
1486 case '%':
1487 if(type != LT_entity)
1488 {
1489 count++;
1490 break;
1491 }
1492 if(count > 0)
1493 {
1494 require(transcribe(p, count+1, count));
1495 }
1496 count = 0;
1497 if(p->external_pe_depth == 0)
1498 {
1499 unget(s); /* For error position */
1500 return error(p, "PE ref not allowed here in internal subset");
1501 }
1502 require(parse_reference(p, 1, 1, 1));
1503 s = p->source;
1504 break;
1505
1506 case '&':
1507 if(ParserGetFlag(p, IgnoreEntities))
1508 goto deflt;
1509 if(type == LT_plain)
1510 {
1511 count++;
1512 break;
1513 }
1514
1515 if(count > 0)
1516 {
1517 require(transcribe(p, count+1, count));
1518 }
1519 count = 0;
1520 if(looking_at(p, "#"))
1521 require(parse_character_reference(p,
1522 ParserGetFlag(p, ExpandCharacterEntities)));
1523 else
1524 {
1525 require(parse_reference(p, 0,
1526 type != LT_entity &&
1527 ParserGetFlag(p, ExpandGeneralEntities),
1528 !ParserGetFlag(p, XMLMiscWFErrors)));
1529 s = p->source;
1530 }
1531 break;
1532
1533 default:
1534 deflt:
1535 if(c == quote && p->source == start_source)
1536 goto done;
1537 count++;
1538 }
1539
1540 if(at_eol(s) && count > 0)
1541 {
1542 require(transcribe(p, count, count));
1543 count = 0;
1544 }
1545 }
1546
1547done:
1548 if(count > 0)
1549 require(transcribe(p, count+1, count));
1550 else
1551 ExpandBuf(p->pbuf, p->pbufnext+1);
1552 p->pbuf[p->pbufnext++] = 0;
1553
1554 if(ParserGetFlag(p, NormaliseAttributeValues) && type == LT_tok_attr)
1555 {
1556 Char *old, *new;
1557
1558 new = old = p->pbuf;
1559
1560 /* Maybe skip leading whitespace */
1561
1562 while(*old == ' ')
1563 old++;
1564
1565 /* Translate whitespace to spaces, maybe compressing */
1566
1567 for( ; *old; old++)
1568 {
1569 if(*old == ' ')
1570 {
1571 /* NB can't be at start because we skipped whitespace */
1572 if(type == LT_tok_attr && new[-1] == ' ')
1573 ;
1574 else
1575 *new++ = ' ';
1576 }
1577 else
1578 *new++ = *old;
1579 }
1580
1581 /* Maybe trim trailing space (only one possible) */
1582
1583 if(new > p->pbuf && new[-1] == ' ')
1584 new--;
1585
1586 *new = 0;
1587 }
1588
1589 return 0;
1590}
1591
1592static int parse_dtd(Parser p)
1593{
1594 InputSource s = p->source;
1595 Entity parent = s->entity;
1596 Entity internal_part = 0, external_part = 0;
1597 Char *name;
1598 char8 *publicid = 0, *systemid = 0;
1599 struct xbit xbit;
1600
1601 xbit = p->xbit; /* copy start position */
1602 xbit.type = XBIT_dtd;
1603
1604 require(parse_name(p, "for name in dtd"));
1605 CopyName(name);
1606 maybe_uppercase(p, name);
1607
1608 skip_whitespace(s);
1609
1610 require(parse_external_id(p, 0, &publicid, &systemid,
1611 ParserGetFlag(p, XMLExternalIDs),
1612 ParserGetFlag(p, XMLExternalIDs)));
1613
1614 if(systemid || publicid)
1615 {
1616 external_part = NewExternalEntity(0, publicid, systemid, 0, parent);
1617 if(!external_part)
1618 {
1619 Free(name);
1620 return error(p, "System error");
1621 }
1622 skip_whitespace(s);
1623 }
1624
1625 if(looking_at(p, "["))
1626 {
1627 int line = s->line_number, cpos = s->next;
1628
1629 require(read_markupdecls(p));
1630 skip_whitespace(s);
1631 internal_part = NewInternalEntity(0, p->pbuf, parent, line, cpos, 1);
1632 Consume(p->pbuf);
1633 if(!internal_part)
1634 {
1635 Free(name);
1636 FreeEntity(external_part);
1637 return error(p, "System error");
1638 }
1639 }
1640
1641 require(expect(p, '>', "at end of dtd"));
1642
1643 if(p->state == PS_prolog1)
1644 p->state = PS_prolog2;
1645 else
1646 {
1647 Free(name);
1648 FreeEntity(external_part);
1649 FreeEntity(internal_part);
1650
1651 if(ParserGetFlag(p, XMLStrictWFErrors))
1652 return error(p, "Misplaced or repeated DOCTYPE declaration");
1653
1654 warn(p, "Misplaced or repeated DOCTYPE declaration");
1655 /* Ignore it and return the next bit */
1656 return parse(p);
1657 }
1658
1659 if(p->dtd->name)
1660 {
1661 Free(name);
1662 FreeEntity(external_part);
1663 FreeEntity(internal_part);
1664
1665 /* This happens if we manually set the dtd */
1666 return parse(p);
1667 }
1668
1669 p->dtd->name = name;
1670 p->dtd->internal_part = internal_part;
1671 p->dtd->external_part = external_part;
1672
1673 if(ParserGetFlag(p, TrustSDD))
1674 {
1675 if(internal_part)
1676 {
1677 ParseDtd(p, internal_part);
1678 if(p->xbit.type == XBIT_error)
1679 return -1;
1680 }
1681 if(external_part && p->standalone != SDD_yes)
1682 {
1683 ParseDtd(p, external_part);
1684 if(p->xbit.type == XBIT_error)
1685 return -1;
1686 }
1687 }
1688
1689 p->xbit = xbit;
1690 return 0;
1691}
1692
1693static int read_markupdecls(Parser p)
1694{
1695 InputSource s = p->source;
1696 int depth=1;
1697 int c, d, hyphens=0;
1698 int count = 0;
1699
1700 p->pbufnext = 0;
1701
1702 while(1)
1703 {
1704 c = get(s);
1705 if(c == XEOE)
1706 return error(p, "EOE in DTD");
1707 if(c == '-')
1708 hyphens++;
1709 else
1710 hyphens = 0;
1711
1712 count++;
1713
1714 switch(c)
1715 {
1716 case ']':
1717 if(--depth == 0)
1718 {
1719 count--; /* We don't want the final ']' */
1720 require(transcribe(p, count+1, count));
1721 p->pbuf[p->pbufnext++] = 0;
1722 return 0;
1723 }
1724 break;
1725
1726 case '[':
1727 depth++;
1728 break;
1729
1730 case '"':
1731 case '\'':
1732 while((d = get(s)) != XEOE)
1733 {
1734 count++;
1735 if(at_eol(s))
1736 {
1737 require(transcribe(p, count, count));
1738 count = 0;
1739 }
1740 if(d == c)
1741 break;
1742 }
1743 if(d == XEOE)
1744 return error(p, "EOE in DTD");
1745 break;
1746
1747 case '-':
1748 if(hyphens < 2)
1749 break;
1750 hyphens = 0;
1751 while((d = get(s)) != XEOE)
1752 {
1753 count++;
1754 if(at_eol(s))
1755 {
1756 require(transcribe(p, count, count));
1757 count = 0;
1758 }
1759 if(d == '-')
1760 hyphens++;
1761 else
1762 hyphens = 0;
1763 if(hyphens == 2)
1764 break;
1765 }
1766 if(d == XEOE)
1767 return error(p, "EOE in DTD");
1768 hyphens = 0;
1769 break;
1770
1771 default:
1772 break;
1773 }
1774
1775 if(at_eol(s) && count > 0)
1776 {
1777 require(transcribe(p, count, count));
1778 count = 0;
1779 }
1780 }
1781}
1782
1783static int process_nsl_decl(Parser p)
1784{
1785 InputSource s = p->source;
1786 int c, count = 0;
1787
1788 s->entity->ml_decl = ML_nsl;
1789
1790 /* The default character encoding for nSGML files is ascii-ash */
1791 if(s->entity->encoding == CE_UTF_8)
1792 s->entity->encoding = CE_unspecified_ascii_superset;
1793
1794 /* Syntax is <?NSL DDB unquoted-filename 0> */
1795
1796 if(!looking_at(p, "DDB "))
1797 return error(p, "Expected \"DDB\" in NSL declaration");
1798
1799 while(c = get(s), !is_xml_whitespace(c))
1800 switch(c)
1801 {
1802 case XEOE:
1803 return error(p, "EOE in NSL declaration");
1804
1805 case '>':
1806 return error(p, "Syntax error in NSL declaration");
1807
1808 default:
1809 count++;
1810 }
1811
1812 p->pbufnext = 0;
1813 require(transcribe(p, count+1, count));
1814 p->pbuf[p->pbufnext++] = 0;
1815
1816 skip_whitespace(s);
1817 if(!looking_at(p, "0>"))
1818 return error(p, "Expected \"0>\" at end of NSL declaration");
1819
1820 if(!(s->entity->ddb_filename = strdup8(Chartochar8(p->pbuf))))
1821 return error(p, "System error");
1822
1823 return 0;
1824}
1825
1826static int process_xml_decl(Parser p)
1827{
1828 InputSource s = p->source;
1829 enum {None, V, E, S} which, last = None;
1830 Char *Value, *cp;
1831 char8 *value;
1832 CharacterEncoding enc = CE_unknown;
1833 Char c;
1834
1835 s->entity->ml_decl = ML_xml;
1836
1837 /* XXX Should save the string buffer because it may already be in use */
1838
1839 while(!looking_at(p, "?>"))
1840 {
1841 if(looking_at(p, "version"))
1842 which = V;
1843 else if(looking_at(p, "encoding"))
1844 which = E;
1845 else if(looking_at(p, "standalone"))
1846 which = S;
1847 else
1848 return error(p, "Expected \"version\", \"encoding\" or "
1849 "\"standalone\" in XML declaration");
1850
1851 if(which <= last)
1852 {
1853 if(ParserGetFlag(p, XMLStrictWFErrors))
1854 return error(p, "Repeated or misordered attributes "
1855 "in XML declaration");
1856 warn(p, "Repeated or misordered attributes in XML declaration");
1857 }
1858 last = which;
1859
1860 skip_whitespace(s);
1861 require(expect(p, '=', "after attribute name in XML declaration"));
1862 skip_whitespace(s);
1863
1864 require(parse_string(p, "for attribute value in XML declaration",
1865 LT_plain));
1866
1867 maybe_uppercase(p, p->pbuf);
1868 Value = p->pbuf;
1869
1870 if(which == E)
1871 {
1872 if(!is_ascii_alpha(Value[0]))
1873 return error(p, "Encoding name does not begin with letter");
1874 for(cp=Value+1; *cp; cp++)
1875 if(!is_ascii_alpha(*cp) && !is_ascii_digit(*cp) &&
1876 *cp != '.' && *cp != '_' && *cp != '-')
1877 return error(p, "Illegal character %s in encoding name",
1878 escape(*cp));
1879
1880 value = Chartochar8(Value);
1881
1882 enc = FindEncoding(value);
1883 if(enc == CE_unknown)
1884 return error(p, "Unknown declared encoding %s", value);
1885
1886 if(EncodingsCompatible(p->source->entity->encoding, enc, &enc))
1887 {
1888#if CHAR_SIZE == 8
1889 /* We ignore the declared encoding in 8-bit mode,
1890 and treat it as a random ascii superset. */
1891#else
1892 p->source->entity->encoding = enc;
1893#endif
1894 }
1895 else
1896 return error(p, "Declared encoding %s is incompatible with %s "
1897 "which was used to read it",
1898 CharacterEncodingName[enc],
1899 CharacterEncodingName[p->source->entity->encoding]);
1900
1901 s->entity->encoding_decl = enc;
1902 }
1903
1904 if(which == S)
1905 {
1906 value = Chartochar8(Value);
1907
1908 if(str_maybecase_cmp8(p, value, "no") == 0)
1909 p->standalone = SDD_no;
1910 else if(str_maybecase_cmp8(p, value, "yes") == 0)
1911 p->standalone = SDD_yes;
1912 else
1913 return error(p, "Expected \"yes\" or \"no\" "
1914 "for standalone in XML declaration");
1915
1916 s->entity->standalone_decl = p->standalone;
1917 }
1918
1919 if(which == V)
1920 {
1921 for(cp=Value; *cp; cp++)
1922 if(!is_ascii_alpha(*cp) && !is_ascii_digit(*cp) &&
1923 *cp != '.' && *cp != '_' && *cp != '-' && *cp != ':')
1924 return error(p, "Illegal character %s in version number",
1925 escape(*cp));
1926
1927 if(!s->entity->version_decl)
1928 if(!(s->entity->version_decl = strdup8(Chartochar8(Value))))
1929 return error(p, "System error");
1930 }
1931
1932 c = get(s);
1933 if(c == '?')
1934 unget(s);
1935 else if(!is_xml_whitespace(c))
1936 return error(p, "Expected whitespace or \"?>\" after attribute "
1937 "in XML declaration");
1938 skip_whitespace(s);
1939 }
1940 return 0;
1941}
1942
1943static int parse_cdata(Parser p)
1944{
1945 InputSource s = p->source;
1946 int c, c1=0, c2=0;
1947 int count = 0;
1948
1949 if(p->state <= PS_prolog2)
1950 return error(p, "Cdata section not allowed in prolog");
1951 if(p->state == PS_epilog)
1952 return error(p, "Cdata section not allowed after body");
1953
1954 p->pbufnext = 0;
1955
1956 while((c = get(s)) != XEOE)
1957 {
1958 count++;
1959 if(c == '>' && c1 == ']' && c2 == ']')
1960 break;
1961 if(at_eol(s))
1962 {
1963 require(transcribe(p, count, count));
1964 count = 0;
1965 }
1966 c2 = c1; c1 = c;
1967 }
1968
1969 if(c == XEOE)
1970 return error(p, "EOE in CData section");
1971
1972 require(transcribe(p, count, count-3));
1973 p->pbuf[p->pbufnext++] = 0;
1974 p->xbit.type = XBIT_cdsect;
1975 p->xbit.cdsect_chars = p->pbuf;
1976 Consume(p->pbuf);
1977
1978 return 0;
1979}
1980
1981XBit ParseDtd(Parser p, Entity e)
1982{
1983 InputSource source, save;
1984
1985 if(e->type == ET_external && p->entity_opener)
1986 source = p->entity_opener(e, p->callback_arg);
1987 else
1988 source = EntityOpen(e);
1989 if(!source)
1990 {
1991 error(p, "Couldn't open dtd entity %s", EntityDescription(e));
1992 return &p->xbit;
1993 }
1994
1995 save = p->source;
1996 p->source = 0;
1997 if(ParserPush(p, source) == -1)
1998 return &p->xbit;
1999
2000 p->have_dtd = 1;
2001
2002 p->external_pe_depth = (source->entity->type == ET_external);
2003
2004 while(parse_markupdecl(p) == 0)
2005 ;
2006
2007 p->external_pe_depth = 0;
2008
2009 /* don't restore after error, so user can call ParserPerror */
2010 if(p->xbit.type != XBIT_error)
2011 {
2012 ParserPop(p); /* to free the input source */
2013 p->source = save;
2014 }
2015
2016 return &p->xbit;
2017}
2018
2019/*
2020 * Returns 0 normally, -1 if error, 1 at EOF.
2021 */
2022static int parse_markupdecl(Parser p)
2023{
2024 InputSource s;
2025 int c;
2026 int cur_line, cur_char;
2027 Entity cur_ent;
2028
2029 if(p->state == PS_error)
2030 return error(p, "Attempt to continue reading DTD after error");
2031
2032 clear_xbit(&p->xbit);
2033
2034 require(skip_dtd_whitespace(p, 1)); /* allow PE even in internal subset */
2035 s = p->source;
2036 SourcePosition(s, &p->xbit.entity, &p->xbit.byte_offset);
2037
2038 cur_ent = s->entity;
2039 cur_line = s->line_number;
2040 cur_char = s->next;
2041
2042 c = get(s);
2043 switch(c)
2044 {
2045 case XEOE:
2046 p->xbit.type = XBIT_none;
2047 return 1;
2048 case '<':
2049 if(looking_at(p, "!ELEMENT"))
2050 {
2051 require(expect_dtd_whitespace(p, "after ELEMENT"));
2052 return parse_element_decl(p);
2053 }
2054 else if(looking_at(p, "!ATTLIST"))
2055 {
2056 require(expect_dtd_whitespace(p, "after ATTLIST"));
2057 return parse_attlist_decl(p);
2058 }
2059 else if(looking_at(p, "!ENTITY"))
2060 {
2061 require(expect_dtd_whitespace(p, "after ENTITY"));
2062 return parse_entity_decl(p, cur_ent, cur_line, cur_char);
2063 }
2064 else if(looking_at(p, "!NOTATION"))
2065 {
2066 require(expect_dtd_whitespace(p, "after NOTATION"));
2067 return parse_notation_decl(p);
2068 }
2069 else if(looking_at(p, "!["))
2070 return parse_conditional(p);
2071 else if(looking_at(p, "?"))
2072 {
2073 require(parse_pi(p));
2074 if(p->dtd_callback)
2075 p->dtd_callback(&p->xbit, p->callback_arg);
2076 else
2077 FreeXBit(&p->xbit);
2078 return 0;
2079 }
2080 else if(looking_at(p, "!--"))
2081 {
2082 if(ParserGetFlag(p, ReturnComments))
2083 {
2084 require(parse_comment(p, 0));
2085 if(p->dtd_callback)
2086 p->dtd_callback(&p->xbit, p->callback_arg);
2087 else
2088 FreeXBit(&p->xbit);
2089 return 0;
2090 }
2091 else
2092 return parse_comment(p, 1);
2093 }
2094 else
2095 return error(p, "Syntax error after < in dtd");
2096 default:
2097 unget(s); /* For error position */
2098 return error(p, "Expected \"<\" in dtd, but got %s", escape(c));
2099 }
2100}
2101
2102static int parse_reference(Parser p, int pe, int expand, int allow_external)
2103{
2104 Entity e;
2105 InputSource s;
2106
2107 require(parse_name(p, pe ? "for parameter entity" : "for entity"));
2108 require(expect(p, ';', "after entity name"));
2109
2110 if(!expand)
2111 return transcribe(p, 1 + p->namelen + 1, 1 + p->namelen + 1);
2112
2113 e = FindEntityN(p->dtd, p->name, p->namelen, pe);
2114 if(!e)
2115 {
2116 Char *buf;
2117 Char *q;
2118 int i;
2119
2120 if(pe || ParserGetFlag(p, ErrorOnUndefinedEntities))
2121 return error(p, "Undefined%s entity %.*S",
2122 pe ? " parameter" : "" ,
2123 p->namelen > 50 ? 50 : p->namelen, p->name);
2124
2125 warn(p, "Undefined%s entity %.*S",
2126 pe ? " parameter" : "",
2127 p->namelen > 50 ? 50 : p->namelen, p->name);
2128
2129 /* Fake a definition for it */
2130
2131 buf = Malloc((5 + p->namelen + 1 + 1) * sizeof(Char));
2132 if(!buf)
2133 return error(p, "System error");
2134 q = buf;
2135 *q++ = '&'; *q++ = '#'; *q++ = '3'; *q++ = '8'; *q++ = ';';
2136 for(i=0; i<p->namelen; i++)
2137 *q++ = p->name[i];
2138 *q++ = ';';
2139 *q++ = 0;
2140
2141 if(!(e = NewInternalEntityN(p->name, p->namelen, buf, 0, 0, 0, 0)))
2142 return error(p, "System error");
2143 if(!DefineEntity(p->dtd, e, 0))
2144 return error(p, "System error");
2145 }
2146
2147 if(!allow_external && e->type == ET_external)
2148 return error(p, "Illegal reference to external entity");
2149
2150 for(s = p->source; s; s = s->parent)
2151 if(s->entity == e)
2152 return error(p, "Recursive reference to entity \"%S\"", e->name);
2153
2154 if(e->type == ET_external && p->entity_opener)
2155 s = p->entity_opener(e, p->callback_arg);
2156 else
2157 s = EntityOpen(e);
2158 if(!s)
2159 return error(p, "Couldn't open entity %S, %s",
2160 e->name, EntityDescription(e));
2161
2162 require(ParserPush(p, s));
2163
2164 return 0;
2165}
2166
2167static int parse_character_reference(Parser p, int expand)
2168{
2169 InputSource s = p->source;
2170 int c, base = 10;
2171 int count = 0;
2172 unsigned int code = 0;
2173 Char *ch = s->line + s->next;
2174
2175 if(looking_at(p, "x"))
2176 {
2177 ch++;
2178 base = 16;
2179 }
2180
2181 while((c = get(s)) != ';')
2182 {
2183 if((c >= '0' && c <= '9') ||
2184 (base == 16 && ((c >= 'A' && c <= 'F') ||
2185 (c >= 'a' && c <= 'f'))))
2186 count++;
2187 else
2188 {
2189 unget(s); /* For error position */
2190 return error(p,
2191 "Illegal character %s in base-%d character reference",
2192 escape(c), base);
2193 }
2194 }
2195
2196 if(!expand)
2197 return transcribe(p, 2 + (base == 16) + count + 1,
2198 2 + (base == 16) + count + 1);
2199
2200 while(count-- > 0)
2201 {
2202 c = *ch++;
2203 if(c >= '0' && c <= '9')
2204 code = code * base + (c - '0');
2205 else if(c >= 'A' && c <= 'F')
2206 code = code * base + 10 + (c - 'A');
2207 else
2208 code = code * base + 10 + (c - 'a');
2209 }
2210
2211#if CHAR_SIZE == 8
2212 if(code > 255 || !is_xml_legal(code))
2213 {
2214 if(ParserGetFlag(p, ErrorOnBadCharacterEntities))
2215 return error(p, "0x%x is not a valid 8-bit XML character", code);
2216 else
2217 warn(p, "0x%x is not a valid 8-bit XML character; ignored", code);
2218 return 0;
2219 }
2220#else
2221 if(!is_xml_legal(code))
2222 {
2223 if(ParserGetFlag(p, ErrorOnBadCharacterEntities))
2224 return error(p, "0x%x is not a valid UTF-16 XML character", code);
2225 else
2226 warn(p, "0x%x is not a valid UTF-16 XML character; ignored", code);
2227 return 0;
2228 }
2229
2230 if(code >= 0x10000)
2231 {
2232 /* Use surrogates */
2233
2234 ExpandBuf(p->pbuf, p->pbufnext+2);
2235 code -= 0x10000;
2236
2237 p->pbuf[p->pbufnext++] = (code >> 10) + 0xd800;
2238 p->pbuf[p->pbufnext++] = (code & 0x3ff) + 0xdc00;
2239
2240 return 0;
2241 }
2242#endif
2243
2244 ExpandBuf(p->pbuf, p->pbufnext+1);
2245 p->pbuf[p->pbufnext++] = code;
2246
2247 return 0;
2248}
2249
2250/* Called after reading '<!ELEMENT ' */
2251
2252static int parse_element_decl(Parser p)
2253{
2254 Char *name;
2255 ContentType type;
2256 ElementDefinition def;
2257#if 1
2258 ContentParticle cp;
2259#else
2260 int c;
2261 Char pcdata[] = {'#','P','C','D','A','T','A',0};
2262#endif
2263 Char *content = 0;
2264
2265 require(parse_name(p, "for name in element declaration"));
2266 CopyName(name);
2267 maybe_uppercase(p, name);
2268
2269 require(expect_dtd_whitespace(p, "after name in element declaration"));
2270
2271 if(looking_at(p, "EMPTY"))
2272 {
2273 type = CT_empty;
2274 content = 0;
2275 }
2276 else if(looking_at(p, "ANY"))
2277 {
2278 type = CT_any;
2279 content = 0;
2280 }
2281 else
2282#if 1
2283 if(looking_at(p, "("))
2284 {
2285 unget(p->source);
2286 if(!(cp = parse_cp(p)) ||
2287 check_content_decl(p, cp) < 0 ||
2288 !(content = stringify_cp(cp)))
2289 {
2290 FreeContentParticle(cp);
2291 Free(content);
2292 Free(name);
2293 return -1;
2294 }
2295
2296 if(cp->type == CP_choice && cp->children[0]->type == CP_pcdata)
2297 type = CT_mixed;
2298 else
2299 type = CT_element;
2300 {
2301 }
2302 FreeContentParticle(cp); /* XXX */
2303 }
2304 else
2305 {
2306 Free(name);
2307 return error(p, "Expected \"EMPTY\", \"ANY\", or \"(\" after name in "
2308 "element declaration");
2309 }
2310#else
2311 {
2312 /* Don't really parse here... maybe improve sometime */
2313
2314 int count = 0;
2315
2316 p->pbufnext = 0;
2317
2318 while((c = get(p->source)) != '>')
2319 {
2320 switch(c)
2321 {
2322 case XEOE:
2323 if(count > 0)
2324 require(transcribe(p, count, count));
2325 if(!p->source->parent)
2326 return error(p, "EOE in element declaration");
2327 ParserPop(p);
2328 count = 0;
2329 break;
2330 case '%':
2331 if(count > 0)
2332 require(transcribe(p, count+1, count));
2333 if(p->external_pe_depth == 0)
2334 {
2335 unget(p->source); /* For error position */
2336 return error(p,
2337 "PE ref not allowed here in internal subset");
2338 }
2339 require(parse_reference(p, 1, 1, 1));
2340 count = 0;
2341 break;
2342 default:
2343 count++;
2344 if(at_eol(p->source))
2345 {
2346 require(transcribe(p, count, count));
2347 count = 0;
2348 }
2349 }
2350 }
2351
2352 unget(p->source);
2353 require(transcribe(p, count, count));
2354 p->pbuf[p->pbufnext++] = 0;
2355
2356 if(Strstr(p->pbuf, pcdata))
2357 type = CT_mixed;
2358 else
2359 type = CT_element;
2360
2361 content = p->pbuf;
2362 Consume(p->pbuf);
2363 }
2364#endif
2365 require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
2366 require(expect(p, '>', "at end of element declaration"));
2367
2368 if((def = FindElement(p->dtd, name)))
2369 {
2370 if(def->tentative)
2371 RedefineElement(def, type, content);
2372 else
2373 {
2374 Free(content);
2375 if(ParserGetFlag(p, WarnOnRedefinitions))
2376 warn(p, "Ignoring redeclaration of element %S", name);
2377 }
2378 }
2379 else
2380 if (!DefineElement(p->dtd, name, type, content)) {
2381 return error(p, "System error");
2382 };
2383
2384 Free(name);
2385
2386 return 0;
2387}
2388
2389/* Content model parsing */
2390
2391static ContentParticle parse_cp(Parser p)
2392{
2393 ContentParticle cp;
2394
2395 if(looking_at(p, "("))
2396 {
2397 if(!(cp = parse_choice_or_seq(p)))
2398 return 0;
2399 }
2400 else if(looking_at(p, "#PCDATA"))
2401 {
2402 if(!(cp = Malloc(sizeof(*cp))))
2403 {
2404 error(p, "System error");
2405 return 0;
2406 }
2407
2408 cp->type = CP_pcdata;
2409 }
2410 else
2411 {
2412 if(parse_name(p, "in content declaration") < 0)
2413 return 0;
2414
2415 if(!(cp = Malloc(sizeof(*cp))))
2416 {
2417 error(p, "System error");
2418 return 0;
2419 }
2420
2421 cp->type = CP_name;
2422 CopyName0(cp->name);
2423 }
2424
2425 if(looking_at(p, "*"))
2426 cp->repetition = '*';
2427 else if(looking_at(p, "+"))
2428 cp->repetition = '+';
2429 else if(looking_at(p, "?"))
2430 cp->repetition = '?';
2431 else
2432 cp->repetition = 0;
2433
2434 return cp;
2435}
2436
2437/* Called after '(' */
2438
2439static ContentParticle parse_choice_or_seq(Parser p)
2440{
2441 ContentParticle cp, cp1;
2442
2443
2444 require0(skip_dtd_whitespace(p, p->external_pe_depth > 0));
2445
2446 if(!(cp1 = parse_cp(p)))
2447 return 0;
2448
2449 require0(skip_dtd_whitespace(p, p->external_pe_depth > 0));
2450
2451 if(!(cp = parse_choice_or_seq_1(p, 1, 0)))
2452 FreeContentParticle(cp1);
2453 else
2454 cp->children[0] = cp1;
2455
2456 return cp;
2457}
2458
2459/* Called before '|', ',', or ')' */
2460
2461static ContentParticle parse_choice_or_seq_1(Parser p, int nchildren, char sep)
2462{
2463 ContentParticle cp = 0, cp1;
2464 int nsep = get(p->source);
2465
2466 if(nsep == ')')
2467 {
2468 /* We've reached the end */
2469
2470 if(!(cp = Malloc(sizeof(*cp))) ||
2471 !(cp->children = Malloc(nchildren * sizeof(cp))))
2472 {
2473 Free(cp);
2474 error(p, "System error");
2475 return 0;
2476 }
2477
2478 /* The standard does not specify whether '(foo)' is a choice or a
2479 sequence. We make it a choice so that (#PCDATA) comes out as
2480 a choice, like other mixed models. */
2481
2482 cp->type = sep == ',' ? CP_seq : CP_choice;
2483 cp->nchildren = nchildren;
2484
2485 return cp;
2486 }
2487
2488 if(nsep != '|' && nsep != ',')
2489 {
2490 error(p, "Expected | or , or ) in content declaration, got %s",
2491 escape(nsep));
2492 return 0;
2493 }
2494
2495 if(sep && nsep != sep)
2496 {
2497 error(p, "Content particle contains both | and ,");
2498 return 0;
2499 }
2500
2501 require0(skip_dtd_whitespace(p, p->external_pe_depth > 0));
2502
2503 if(!(cp1 = parse_cp(p)))
2504 return 0;
2505
2506 require0(skip_dtd_whitespace(p, p->external_pe_depth > 0));
2507
2508 if(!(cp = parse_choice_or_seq_1(p, nchildren+1, (char)nsep)))
2509 FreeContentParticle(cp1);
2510 else
2511 cp->children[nchildren] = cp1;
2512
2513 return cp;
2514}
2515
2516/* Check content particle matches Mixed or children */
2517
2518static int check_content_decl(Parser p, ContentParticle cp)
2519{
2520 int i;
2521
2522 if(cp->type == CP_choice && cp->children[0]->type == CP_pcdata)
2523 {
2524 for(i=1; i<cp->nchildren; i++)
2525 if(cp->children[i]->type != CP_name)
2526 return error(p, "Invalid mixed content declaration");
2527
2528 if(cp->repetition != '*' &&
2529 !(cp->nchildren == 1 && cp->repetition == 0))
2530 return error(p, "Invalid mixed content declaration");
2531
2532 return 0;
2533 }
2534 else
2535 return check_content_decl_1(p, cp);
2536}
2537
2538static int check_content_decl_1(Parser p, ContentParticle cp)
2539{
2540 int i;
2541
2542 switch(cp->type)
2543 {
2544 case CP_pcdata:
2545 return error(p, "Misplaced #PCDATA in content declaration");
2546 case CP_seq:
2547 case CP_choice:
2548 for(i=0; i<cp->nchildren; i++)
2549 if(check_content_decl_1(p, cp->children[i]) < 0)
2550 return -1;
2551 return 0;
2552 default:
2553 return 0;
2554 }
2555}
2556
2557/* Reconstruct the content model as a string */
2558
2559static Char *stringify_cp(ContentParticle cp)
2560{
2561 int size = size_cp(cp);
2562 Char *s;
2563 FILE16 *f;
2564
2565 if(!(s = Malloc((size+1) * sizeof(Char))) ||
2566 !(f = MakeFILE16FromString(s, (size + 1) * sizeof(Char), "w")))
2567 {
2568 Free(s);
2569 return 0;
2570 }
2571
2572 print_cp(cp, f);
2573 s[size] = 0;
2574
2575 Fclose(f);
2576
2577 return s;
2578}
2579
2580static void print_cp(ContentParticle cp, FILE16 *f)
2581{
2582 int i;
2583
2584 switch(cp->type)
2585 {
2586 case CP_pcdata:
2587 Fprintf(f, "#PCDATA");
2588 break;
2589 case CP_name:
2590 Fprintf(f, "%S", cp->name);
2591 break;
2592 case CP_seq:
2593 case CP_choice:
2594 Fprintf(f, "(");
2595 for(i=0; i<cp->nchildren; i++)
2596 {
2597 if(i != 0)
2598 Fprintf(f, cp->type == CP_seq ? "," : "|");
2599 print_cp(cp->children[i], f);
2600 }
2601 Fprintf(f, ")");
2602 break;
2603 }
2604
2605 if(cp->repetition)
2606 Fprintf(f, "%c", cp->repetition);
2607}
2608
2609static int size_cp(ContentParticle cp)
2610{
2611 int i, s;
2612
2613 switch(cp->type)
2614 {
2615 case CP_pcdata:
2616 s = 7;
2617 break;
2618 case CP_name:
2619 s = Strlen(cp->name);
2620 break;
2621 default:
2622 s = 2;
2623 for(i=0; i<cp->nchildren; i++)
2624 {
2625 if(i != 0)
2626 s++;
2627 s += size_cp(cp->children[i]);
2628 }
2629 break;
2630 }
2631
2632 if(cp->repetition)
2633 s++;
2634
2635 return s;
2636}
2637
2638void FreeContentParticle(ContentParticle cp)
2639{
2640 int i;
2641
2642 if(!cp)
2643 return;
2644
2645 switch(cp->type)
2646 {
2647 case CP_pcdata:
2648 break;
2649 case CP_name:
2650 Free(cp->name);
2651 break;
2652 case CP_seq:
2653 case CP_choice:
2654 for(i=0; i<cp->nchildren; i++)
2655 FreeContentParticle(cp->children[i]);
2656 Free(cp->children);
2657 break;
2658 }
2659
2660 Free(cp);
2661}
2662
2663/* Called after reading '<!ATTLIST ' */
2664
2665static int parse_attlist_decl(Parser p)
2666{
2667 Char *name;
2668 ElementDefinition element;
2669 AttributeType type;
2670 DefaultType default_type;
2671 Char **allowed_values, *t;
2672 Char *default_value;
2673 int nvalues, i;
2674
2675 require(parse_name(p, "for name in attlist declaration"));
2676 CopyName(name);
2677 maybe_uppercase(p, name);
2678
2679 if(!(element = FindElement(p->dtd, name)))
2680 {
2681 if(!(element = TentativelyDefineElement(p->dtd, name)))
2682 return error(p, "System error");
2683 }
2684 Free(name);
2685
2686 require(expect_dtd_whitespace(p,
2687 "after element name in attlist declaration"));
2688
2689 while(!looking_at(p, ">"))
2690 {
2691 require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
2692 require(parse_name(p, "for attribute in attlist declaration"));
2693 CopyName(name);
2694 maybe_uppercase(p, name);
2695
2696 require(expect_dtd_whitespace(p, "after name in attlist declaration"));
2697
2698 if(looking_at(p, "CDATA"))
2699 type = AT_cdata;
2700 else if(looking_at(p, "IDREFS"))
2701 type = AT_idrefs;
2702 else if(looking_at(p, "IDREF"))
2703 type = AT_idref;
2704 else if(looking_at(p, "ID"))
2705 type = AT_id;
2706 else if(looking_at(p, "ENTITIES"))
2707 type = AT_entities;
2708 else if(looking_at(p, "ENTITY"))
2709 type = AT_entity;
2710 else if(looking_at(p, "NMTOKENS"))
2711 type = AT_nmtokens;
2712 else if(looking_at(p, "NMTOKEN"))
2713 type = AT_nmtoken;
2714 else if(looking_at(p, "NOTATION"))
2715 type = AT_notation;
2716 else
2717 type = AT_enumeration;
2718
2719 if(type != AT_enumeration)
2720 {
2721 require(expect_dtd_whitespace(p, "after attribute type"));
2722 }
2723
2724 if(type == AT_notation || type == AT_enumeration)
2725 {
2726 require(expect(p, '(',
2727 "or keyword for type in attlist declaration"));
2728
2729 nvalues = 0;
2730 p->pbufnext = 0;
2731 do
2732 {
2733 require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
2734 if(type == AT_notation)
2735 require(parse_name(p,
2736 "for notation value in attlist declaration"));
2737 else
2738 require(parse_nmtoken(p,
2739 "for enumerated value in attlist declaration"));
2740 maybe_uppercase_name(p);
2741 ExpandBuf(p->pbuf, p->pbufnext + p->namelen + 1);
2742 memcpy(p->pbuf+p->pbufnext,
2743 p->name,
2744 p->namelen * sizeof(Char));
2745 p->pbuf[p->pbufnext + p->namelen] = 0;
2746 p->pbufnext += (p->namelen + 1);
2747 nvalues++;
2748 require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
2749 }
2750 while(looking_at(p, "|"));
2751
2752 require(expect(p, ')',
2753 "at end of enumerated value list in attlist declaration"));
2754 require(expect_dtd_whitespace(p, "after enumerated value list "
2755 "in attlist declaration"));
2756
2757 allowed_values = Malloc((nvalues+1)*sizeof(Char *));
2758 if(!allowed_values)
2759 return error(p, "System error");
2760 for(i=0, t=p->pbuf; i<nvalues; i++)
2761 {
2762 allowed_values[i] = t;
2763 while(*t++)
2764 ;
2765 }
2766 allowed_values[nvalues] = 0;
2767
2768 Consume(p->pbuf);
2769 }
2770 else
2771 allowed_values = 0;
2772
2773 if(looking_at(p, "#REQUIRED"))
2774 default_type = DT_required;
2775 else if(looking_at(p, "#IMPLIED"))
2776 default_type = DT_implied;
2777 else if(looking_at(p, "#FIXED"))
2778 {
2779 default_type = DT_fixed;
2780 require(expect_dtd_whitespace(p, "after #FIXED"));
2781 }
2782 else
2783 default_type = DT_none;
2784
2785 if(default_type == DT_fixed || default_type == DT_none)
2786 {
2787 require(parse_string(p,
2788 "for default value in attlist declaration",
2789 type == AT_cdata ? LT_cdata_attr :
2790 LT_tok_attr));
2791 default_value = p->pbuf;
2792 Consume(p->pbuf);
2793 if(type != AT_cdata && type != AT_entity && type != AT_entities)
2794 maybe_uppercase(p, default_value);
2795 }
2796 else
2797 default_value = 0;
2798
2799 require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
2800
2801 if(FindAttribute(element, name))
2802 {
2803 if(ParserGetFlag(p, WarnOnRedefinitions))
2804 warn(p, "Ignoring redeclaration of attribute %S", name);
2805 if(allowed_values)
2806 {
2807 Free(allowed_values[0]);
2808 Free(allowed_values);
2809 }
2810 if(default_value)
2811 Free(default_value);
2812 }
2813 else
2814 if(!DefineAttribute(element, name, type, allowed_values,
2815 default_type, default_value))
2816 return error(p, "System error");
2817
2818 Free(name);
2819 }
2820
2821 return 0;
2822}
2823
2824/* Used for external dtd part, entity definitions and notation definitions. */
2825/* NB PE references are not allowed here (why not?) */
2826
2827static int parse_external_id(Parser p, int required,
2828 char8 **publicid, char8 **systemid,
2829 int preq, int sreq)
2830{
2831 InputSource s = p->source;
2832 int c;
2833 Char *cp;
2834
2835 *publicid = 0;
2836 *systemid = 0;
2837
2838 if(looking_at(p, "SYSTEM"))
2839 {
2840 if(!sreq)
2841 {
2842 skip_whitespace(s);
2843 c = get(s); unget(s);
2844 if(c != '"' && c != '\'')
2845 return 0;
2846 }
2847 else
2848 require(expect_dtd_whitespace(p, "after SYSTEM"));
2849
2850 require(parse_string(p, "for system ID", LT_plain));
2851 if(!(*systemid = strdup8(Chartochar8(p->pbuf))))
2852 return error(p, "System error");
2853 }
2854 else if(looking_at(p, "PUBLIC"))
2855 {
2856 if(!preq && !sreq)
2857 {
2858 skip_whitespace(s);
2859 c = get(s); unget(s);
2860 if(c != '"' && c != '\'')
2861 return 0;
2862 }
2863 else
2864 require(expect_dtd_whitespace(p, "after PUBLIC"));
2865
2866 require(parse_string(p, "for public ID", LT_plain));
2867
2868 for(cp=p->pbuf; *cp; cp++)
2869 if(!is_ascii_alpha(*cp) && !is_ascii_digit(*cp) &&
2870 strchr8("-'()+,./:=?;!*#@$_% \r\n", *cp) == 0)
2871 return error(p, "Illegal character %s in public id",
2872 escape(*cp));
2873
2874 if(!(*publicid = strdup8(Chartochar8(p->pbuf))))
2875 return error(p, "System error");
2876
2877 if(!sreq)
2878 {
2879 skip_whitespace(s);
2880 c = get(s); unget(s);
2881 if(c != '"' && c != '\'')
2882 return 0;
2883 }
2884 else
2885 require(expect_dtd_whitespace(p, "after public id"));
2886
2887 require(parse_string(p, "for system ID", LT_plain));
2888 if(!(*systemid = strdup8(Chartochar8(p->pbuf))))
2889 return error(p, "System error");
2890 }
2891 else if(required)
2892 return error(p, "Missing or invalid external ID");
2893
2894 return 0;
2895}
2896
2897/* Called after reading '<!ENTITY ' */
2898
2899static int parse_entity_decl(Parser p, Entity ent, int line, int chpos)
2900{
2901 Entity e, old;
2902 int pe, t;
2903 Char *name;
2904
2905 pe = looking_at(p, "%"); /* If it were a PE ref, we would
2906 already have pushed it */
2907
2908 require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
2909 require(parse_name(p, "for name in entity declaration"));
2910 CopyName(name);
2911
2912 require(expect_dtd_whitespace(p, "after name in entity declaration"));
2913
2914 if(looking_at(p, "'") || looking_at(p, "\""))
2915 {
2916 Char *value;
2917
2918 unget(p->source);
2919 require(parse_string(p, "for value in entity declaration", LT_entity));
2920 value = p->pbuf;
2921 Consume(p->pbuf);
2922
2923 if(!(e = NewInternalEntity(name, value, ent, line, chpos, 0)))
2924 return error(p, "System error");
2925 }
2926 else
2927 {
2928 char8 *publicid, *systemid;
2929 NotationDefinition notation = 0;
2930
2931 require(parse_external_id(p, 1, &publicid, &systemid, 1, 1));
2932
2933 require((t = skip_dtd_whitespace(p, p->external_pe_depth > 0)));
2934 if(looking_at(p, "NDATA"))
2935 {
2936 if(t == 0)
2937 return error(p, "Whitespace missing before NDATA");
2938 if(pe)
2939 return error(p, "NDATA not allowed for parameter entity");
2940 require(expect_dtd_whitespace(p, "after NDATA"));
2941 require(parse_name(p, "for notation name in entity declaration"));
2942 maybe_uppercase_name(p);
2943 notation = FindNotationN(p->dtd, p->name, p->namelen);
2944 if(!notation)
2945 {
2946 notation =
2947 TentativelyDefineNotationN(p->dtd, p->name, p->namelen);
2948 if(!notation)
2949 return error(p, "System error");
2950 }
2951 }
2952
2953 if(!(e = NewExternalEntity(name, publicid, systemid, notation, ent)))
2954 return error(p, "System error");
2955 }
2956
2957 Free(name);
2958
2959 require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
2960 require(expect(p, '>', "at end of entity declaration"));
2961
2962 if((old = FindEntity(p->dtd, e->name, pe)) &&
2963 old->parent != xml_builtin_entity)
2964 {
2965 if(ParserGetFlag(p, WarnOnRedefinitions))
2966 warn(p, "Ignoring redefinition of%s entity %S",
2967 pe ? " parameter" : "", e->name);
2968 }
2969 else
2970 if(!DefineEntity(p->dtd, e, pe))
2971 return error(p, "System error");
2972
2973 return 0;
2974}
2975
2976/* Called after reading '<!NOTATION ' */
2977
2978static int parse_notation_decl(Parser p)
2979{
2980 Char *name;
2981 char8 *publicid, *systemid;
2982 NotationDefinition def;
2983
2984 require(parse_name(p, "for name in notation declaration"));
2985 CopyName(name);
2986 maybe_uppercase(p, name);
2987
2988 require(expect_dtd_whitespace(p, "after name in notation declaration"));
2989
2990 require(parse_external_id(p, 1, &publicid, &systemid, 1, 0));
2991
2992 require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
2993 require(expect(p, '>', "at end of notation declaration"));
2994
2995 if((def = FindNotation(p->dtd, name)))
2996 {
2997 if(def->tentative)
2998 RedefineNotation(def, publicid, systemid);
2999 else
3000 if(ParserGetFlag(p, WarnOnRedefinitions))
3001 {
3002 warn(p, "Ignoring redefinition of notation %S", name);
3003 if(publicid) Free(publicid);
3004 if(systemid) Free(systemid);
3005 }
3006 }
3007 else
3008 {
3009 if(!DefineNotation(p->dtd, name, publicid, systemid))
3010 return error(p, "System error");
3011 }
3012
3013 Free(name);
3014
3015 return 0;
3016}
3017
3018static int parse_conditional(Parser p)
3019{
3020 int depth=1;
3021
3022 if(p->external_pe_depth == 0)
3023 return error(p, "Conditional section not allowed in internal subset");
3024
3025 require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
3026 if(looking_at(p, "INCLUDE"))
3027 {
3028 require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
3029 require(expect(p, '[', "at start of conditional section"));
3030 require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
3031 while(!looking_at(p, "]"))
3032 {
3033 switch(parse_markupdecl(p))
3034 {
3035 case 1:
3036 return error(p, "EOF in conditional section");
3037 case -1:
3038 return -1;
3039 }
3040 require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
3041 }
3042
3043 if(!looking_at(p, "]>"))
3044 return error(p, "]> required after ] in conditional section");
3045 }
3046 else if(looking_at(p, "IGNORE"))
3047 {
3048 /* Easy, because ]]> not even allowed in strings! */
3049
3050 require(skip_dtd_whitespace(p, p->external_pe_depth > 0));
3051 require(expect(p, '[', "at start of conditional section"));
3052
3053 while(depth > 0)
3054 {
3055 switch(get(p->source))
3056 {
3057 case XEOE:
3058 if(p->source->parent)
3059 ParserPop(p);
3060 else
3061 return error(p, "EOE in ignored conditional section");
3062 break;
3063 case '<':
3064 if(looking_at(p, "!["))
3065 depth++;
3066 break;
3067 case ']':
3068 if(looking_at(p, "]>"))
3069 depth--;
3070 }
3071 }
3072 }
3073 else
3074 return error(p, "INCLUDE or IGNORE required in conditional section");
3075
3076 return 0;
3077}
3078
3079static void maybe_uppercase(Parser p, Char *s)
3080{
3081 if(ParserGetFlag(p, CaseInsensitive))
3082 while(*s)
3083 {
3084 *s = Toupper(*s);
3085 s++;
3086 }
3087}
3088
3089static void maybe_uppercase_name(Parser p)
3090{
3091 int i;
3092
3093 if(ParserGetFlag(p, CaseInsensitive))
3094 for(i=0; i<p->namelen; i++)
3095 p->name[i] = Toupper(p->name[i]);
3096}
3097
3098static int str_maybecase_cmp8(Parser p, const char8 *a, const char8 *b)
3099{
3100 return
3101 ParserGetFlag(p, CaseInsensitive) ? strcasecmp8(a, b) : strcmp8(a, b);
3102}
3103
/*
 * Return 1 if c is one of the letters 'A'..'Z' or 'a'..'z', 0 otherwise.
 */
static int is_ascii_alpha(int c)
{
    if(c >= 'A' && c <= 'Z')
        return 1;
    if(c >= 'a' && c <= 'z')
        return 1;

    return 0;
}
3108
/*
 * Return 1 if c is one of the digits '0'..'9', 0 otherwise.
 */
static int is_ascii_digit(int c)
{
    if(c < '0' || c > '9')
        return 0;

    return 1;
}
3113
3114/* Error handling */
3115
3116static void verror(XBit bit, const char8 *format, va_list args)
3117{
3118 /* yuk, but we don't want to fail if we can't allocate */
3119 static char8 message[400];
3120
3121 /* Print message before freeing xbit, so we can print data from it */
3122 Vsprintf(message, CE_ISO_8859_1, format, args);
3123
3124 FreeXBit(bit);
3125 bit->type = XBIT_error;
3126 bit->error_message = message;
3127}
3128
3129static int error(Parser p, const char8 *format, ...)
3130{
3131 va_list args;
3132
3133 va_start(args, format);
3134 verror(&p->xbit, format, args);
3135
3136 p->state = PS_error;
3137
3138 return -1;
3139}
3140
3141static void warn(Parser p, const char8 *format, ...)
3142{
3143 va_list args;
3144 static struct xbit bit;
3145
3146 va_start(args, format);
3147 verror(&bit, format, args);
3148
3149 bit.type = XBIT_warning;
3150
3151 if(p->warning_callback)
3152 p->warning_callback(&bit, p->callback_arg);
3153 else
3154 ParserPerror(p, &bit);
3155}
3156