Edinburgh Speech Tools 2.4-release
 
Loading...
Searching...
No Matches
genxml.cc
1 /************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996,1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* */
34 /* Author: Richard Caley (rjc@cstr.ed.ac.uk) */
35 /* -------------------------------------------------------------------- */
36 /* Code to read utterances marked up in XML according to a DTD with */
37 /* certain conventions indicating the mapping from XML to Utterance. */
38 /* */
39 /*************************************************************************/
40
41#include <cstdlib>
42#include <cstdio>
43#include <cctype>
44#include "EST_TDeque.h"
45#include "EST_THash.h"
46#include "EST_error.h"
47#include "genxml.h"
48#include "rxp/XML_Parser.h"
49
50#include "ling_class_init.h"
51
// ESTLIBDIRC is expected to be supplied by the build as a bare
// (unquoted) path.  Turning it into a string literal needs two macro
// levels: `#X` stringizes its argument *without* expanding it, so a
// single-level helper would yield the text "ESTLIBDIRC" instead of the
// configured directory.
#if defined(ESTLIBDIRC)
#    define EST_GENXML_STRINGIZE_RAW(X) #X
#    define __STRINGIZE(X) EST_GENXML_STRINGIZE_RAW(X)
#    define ESTLIBDIR __STRINGIZE(ESTLIBDIRC)
#endif
56
57
58static EST_Regex simpleIDRegex("[^#]*#id(\\([-a-z0-9]+\\))");
59static EST_Regex rangeIDRegex("[^#]*#id(\\([a-z]*\\)\\([0-9]*\\)\\(-\\([0-9]+\\)\\)*).*id(\\([a-z]*\\)\\([0-9]*\\)\\(-\\([0-9]+\\)\\)*)");
60static EST_Regex featureDefRegex("\\([^:]*\\):\\(.*\\)");
61
62// Separator between feature names in attributes.
63
64static EST_String feat_sep(",");
65
66// I'd like to get rid of this. It is a maximum for the number of features
67// which can be named in an attribute, say for copying to the utterance.
68
69#define MAX_FEATS (50)
70
71// Parse state.
72
73class GenXML_Parse_State
74 {
75public:
76 int depth;
77 int open_depth;
78 int rel_start_depth;
79 EST_TDeque<int> depth_stack;
80 EST_String relName;
81 bool linear;
82 EST_Utterance *utt;
83 EST_Relation *rel;
84 EST_Item *parent;
85 EST_Item *current;
86 EST_String contentAttr;
87
88 // used to force a given ID on a node.
89 EST_String id;
90
92
93
94 GenXML_Parse_State() : contents(100) {}
95 };
96
97class GenXML_Parser_Class : public XML_Parser_Class
98{
99protected:
100 virtual void document_open(XML_Parser_Class &c,
101 XML_Parser &p,
102 void *data);
103 virtual void document_close(XML_Parser_Class &c,
104 XML_Parser &p,
105 void *data);
106
107 virtual void element_open(XML_Parser_Class &c,
108 XML_Parser &p,
109 void *data,
110 const char *name,
111 XML_Attribute_List &attributes);
112 virtual void element(XML_Parser_Class &c,
113 XML_Parser &p,
114 void *data,
115 const char *name,
116 XML_Attribute_List &attributes);
117 virtual void element_close(XML_Parser_Class &c,
118 XML_Parser &p,
119 void *data,
120 const char *name);
121
122 virtual void pcdata(XML_Parser_Class &c,
123 XML_Parser &p,
124 void *data,
125 const char *chars);
126 virtual void cdata(XML_Parser_Class &c,
127 XML_Parser &p,
128 void *data,
129 const char *chars);
130
131 virtual void processing(XML_Parser_Class &c,
132 XML_Parser &p,
133 void *data,
134 const char *instruction);
135 virtual void error(XML_Parser_Class &c,
136 XML_Parser &p,
137 void *data);
138};
139
140static void print_attributes(XML_Attribute_List &attributes);
141
142XML_Parser_Class *EST_GenXML::pclass;
143
144
145void EST_GenXML::class_init(void)
146{
147 ling_class_init::use();
148
149 pclass = new GenXML_Parser_Class();
150#ifdef DEBUGGING
151 printf("Register estlib in genxml %s\n", ESTLIBDIR "/\\1.dtd");
152#endif
153
154 pclass->register_id("//CSTR EST//DTD \\(.*\\)//[A-Z]*",
155 ESTLIBDIR "/\\1.dtd");
156 pclass->register_id("//CSTR EST//ENTITIES \\(.*\\)//[A-Z]*",
157 ESTLIBDIR "/\\1.ent");
158}
159
160void EST_GenXML::register_id(const EST_String pattern,
161 const EST_String result)
162{
163 EST_GenXML::pclass->register_id(pattern, result);
164}
165
166void EST_GenXML::registered_ids(EST_StrList &list)
167{
168 EST_GenXML::pclass->registered_ids(list);
169}
170
171InputSource EST_GenXML::try_and_open(Entity ent)
172{
173 return EST_GenXML::pclass->try_and_open(ent);
174}
175
176
177EST_read_status EST_GenXML::read_xml(FILE *file,
178 const EST_String &name,
179 EST_Utterance &u,
180 int &max_id)
181{
182 (void)max_id;
183 (void)print_attributes; // just to shut -Wall up.
184 GenXML_Parse_State state;
185
186 u.clear();
187
188 state.utt=&u;
189
190 XML_Parser *parser = EST_GenXML::pclass->make_parser(file, name, &state);
191 parser->track_context(TRUE);
192
193 CATCH_ERRORS()
194 return read_format_error;
195
196 parser->go();
197
198 END_CATCH_ERRORS();
199
200 return read_ok;
201}
202
203static void ensure_relation(GenXML_Parse_State *state, EST_String name)
204{
205 if (state->rel!=NULL && name == state->relName)
206 return;
207
208 state->rel = state->utt->create_relation(state->relName=name);
209}
210
211static EST_Item_Content *get_contents(GenXML_Parse_State *state, EST_String id)
212{
213 EST_Item_Content *c = state->contents.val(id);
214
215 if (c==NULL)
216 {
217 c = new EST_Item_Content();
218 state->contents.add_item(id, c);
219 c->f.set("id", id);
220 }
221 else
222 {
223 if (c->relations.present(state->relName))
224 return NULL;
225 }
226
227 return c;
228}
229
230static EST_String make_new_id(const char *root)
231{
232 char buf[100];
233 static int count=0;
234
235 sprintf(buf, "%s%d", root, ++count);
236 return buf;
237}
238
239
240static void extract_ids(XML_Attribute_List &attributes,
242{
243 EST_String val;
244 if (attributes.present("id"))
245 {
246 val = attributes.val("id");
247#if defined(EST_DEBUGGING)
248 fprintf(stderr, "ID %s\n", (const char *)val);
249#endif
250 ids.append(val);
251 }
252 else if (attributes.present("href"))
253 {
254 val = attributes.val("href");
255 int starts[EST_Regex_max_subexpressions];
256 int ends[EST_Regex_max_subexpressions];
257
258 if (val.matches(simpleIDRegex, 0, starts, ends))
259 {
260 EST_String n = val.at(starts[1], ends[1]-starts[1]);
261#if defined(EST_DEBUGGING)
262 fprintf(stderr, "SIMPLE %s\n", (const char *)n);
263#endif
264 ids.append(n);
265 }
266 else if (val.matches(rangeIDRegex, 0, starts, ends))
267 {
268 EST_String prefix1 = val.at(starts[1], ends[1]-starts[1]);
269 int n1 = atoi(val.at(starts[2], ends[2]-starts[2]));
270 EST_String postfix1 = val.at(starts[4], ends[4]-starts[4]);
271 EST_String prefix2 = val.at(starts[5], ends[5]-starts[5]);
272 int n2 = atoi(val.at(starts[6], ends[6]-starts[6]));
273 EST_String postfix2 = val.at(starts[8], ends[8]-starts[8]);
274
275#if defined(EST_DEBUGGING)
276 fprintf(stderr, "RANGE '%s' %d - '%s' // '%s' %d - '%s'\n",
277 (const char *)prefix1,
278 n1,
279 (const char *)postfix1,
280 (const char *)prefix2,
281 n2,
282 (const char *)postfix2
283 );
284#endif
285
286 if (prefix1==prefix2)
287 prefix2="";
288
289 char buf[100];
290 if (n1==n2)
291 {
292 int c;
293 if (postfix1.length()==0)
294 {
295 sprintf(buf, "%s%s%d",
296 (const char *)prefix1,
297 (const char *)prefix2,
298 n1
299 );
300 ids.append(buf);
301 c=1;
302 }
303 else
304 c=atoi(postfix1);
305
306 if (postfix2.length()>0)
307 for (; c<=atoi(postfix2); c++)
308 {
309 sprintf(buf, "%s%s%d-%d",
310 (const char *)prefix1,
311 (const char *)prefix2,
312 n1,
313 c
314 );
315 ids.append(buf);
316 }
317 }
318 else
319 {
320 for(int i=n1; i<=n2; i++)
321 {
322 if (i==n2
323 && postfix2.length()>0)
324 {
325 sprintf(buf, "%s%s%d",
326 (const char *)prefix1,
327 (const char *)prefix2,
328 i
329 );
330 ids.append(buf);
331 for (int c=1; c<=atoi(postfix2); c++)
332 {
333 sprintf(buf, "%s%s%d-%d",
334 (const char *)prefix1,
335 (const char *)prefix2,
336 i,
337 c
338 );
339 ids.append(buf);
340 }
341 }
342 else
343 {
344 if ( postfix1.length()>0)
345 sprintf(buf, "%s%s%d-%s",
346 (const char *)prefix1,
347 (const char *)prefix2,
348 i,
349 (const char *)postfix1
350 );
351 else
352 sprintf(buf, "%s%s%d",
353 (const char *)prefix1,
354 (const char *)prefix2,
355 i
356 );
357
358 ids.append(buf);
359 }
360 postfix1="";
361 }
362
363 }
364 }
365 else
366 EST_warning("element with bad ID or HREF '%s'", (const char *)val);
367 }
368 else
369 ids.append(make_new_id("n"));
370
371 // cout << ids << "\n";
372}
373
374/* For debugging.
375 */
376static void print_attributes(XML_Attribute_List &attributes)
377{
379
380 for(them.begin(attributes); them ; them++)
381 printf(" %s='%s'",
382 (const char *)them->k,
383 (const char *)them->v);
384}
385
386/** Now we define the callbacks.
387 */
388
389void GenXML_Parser_Class::document_open(XML_Parser_Class &c,
390 XML_Parser &p,
391 void *data)
392{
393 (void)c; (void)p;
394 GenXML_Parse_State *state = (GenXML_Parse_State *)data;
395
396 state->depth=1;
397 state->open_depth=-1;
398 state->rel_start_depth=-1;
399 state->depth_stack.clear();
400 state->rel=NULL;
401 state->parent=NULL;
402 state->current=NULL;
403 state->id="";
404}
405
406void GenXML_Parser_Class::document_close(XML_Parser_Class &c,
407 XML_Parser &p,
408 void *data)
409{
410 (void)c; (void)p; (void)data;
411}
412
413static void proccess_features(EST_String name,
414 EST_String defs,
415 XML_Attribute_List &attributes,
416 EST_Features &f)
417{
418 EST_String names[MAX_FEATS];
419 int starts[EST_Regex_max_subexpressions];
420 int ends[EST_Regex_max_subexpressions];
421
422 int n = split(defs, names, MAX_FEATS, feat_sep);
423 for(int i=0; i<n; i++)
424 {
425 EST_String def = names[i];
426 EST_String feat;
427 EST_String attr;
428
429 if (def.matches(featureDefRegex, 0, starts, ends))
430 {
431 feat = def.at(starts[1], ends[1]-starts[1]);
432 attr = def.at(starts[2], ends[2]-starts[2]);
433 }
434 else
435 {
436 attr=def;
437 feat=EST_String::cat(name, "_", attr);
438 }
439
440 EST_String fval = attributes.val(attr);
441
442#ifdef DEBUGGING
443 printf("on %s got %s(%s)=%s\n", name,
444 (const char *)feat,
445 (const char *)attr,
446 (const char *)fval);
447#endif
448 if (fval != EST_String::Empty)
449 f.set(feat, fval);
450 }
451}
452
453void GenXML_Parser_Class::element_open(XML_Parser_Class &c,
454 XML_Parser &p,
455 void *data,
456 const char *name,
457 XML_Attribute_List &attributes)
458{
459 (void)c; (void)p; (void)attributes; (void)name;
460 GenXML_Parse_State *state = (GenXML_Parse_State *)data;
461
462 state->depth++;
463
464 EST_String val, ig;
465
466 // Features to copy to utterance
467 if (state->utt != NULL
468 && (val=attributes.val("estUttFeats")) != EST_String::Empty)
469 proccess_features(name, val, attributes, state->utt->f);
470
471 // Features to copy to relation
472 if (state->rel != NULL
473 && (val=attributes.val("estRelFeats")) != EST_String::Empty)
474 proccess_features(name, val, attributes, state->rel->f);
475
476
477 if ((val=attributes.val("estRelationElementAttr")) != EST_String::Empty)
478 {
479 // All nodes inside this element are in the given relation
480 EST_String relName = attributes.val(val);
481
482 if (relName == EST_String::Empty)
483 {
484 relName = "UNNAMED";
485 EST_warning("%s\nNo feature '%s' to name relation\n", get_error(p), (const char *)val);
486 }
487
488 EST_String relationType = attributes.val("estRelationTypeAttr");
489
490 ensure_relation(state, relName);
491 state->rel_start_depth=state->depth;
492 state->linear=(attributes.val(relationType) == "linear"||
493 attributes.val(relationType) == "list");
494#ifdef DEBUGGING
495 printf("start of relation depth=%d name=%s type=%s\n", state->depth, (const char *)relName, state->linear?"linear":"tree");
496#endif
497 }
498 else if ((state->rel_start_depth >= 0 &&
499 (ig=attributes.val("estRelationIgnore")) == EST_String::Empty)
500 || (val=attributes.val("estRelationNode")) != EST_String::Empty)
501 {
502 // This node defines an Item in a relation.
503#ifdef DEBUGGING
504 printf("push depth=%d name=%s ig=%s\n", state->depth, name, (const char *)ig);
505#endif
506 if (val != EST_String::Empty)
507 ensure_relation(state, val);
508
509 state->depth_stack.push(state->open_depth);
510 state->open_depth=state->depth;
511
513
514 if (state->id == EST_String::Empty)
515 {
516 extract_ids(attributes, ids);
517 }
518 else
519 ids.append(state->id);
520
521 switch (ids.length())
522 {
523 case 0:
524 XML_Parser_Class::error(c, p, data, EST_String("Element With No Id"));
525 break;
526 case 1:
527 {
528 EST_String id = ids.first();
529
530 if (id==EST_String::Empty)
531 XML_Parser_Class::error(c, p, data, EST_String("Element With No Id"));
532
533 EST_Item_Content *cont = get_contents(state, id);
534
535 if (!cont)
536 XML_Parser_Class::error(c, p, data, EST_String("Repeated Id ") + id);
537
539 for(them.begin(attributes); them ; them++)
540 {
541 EST_String k = them->k;
542 EST_String v = them->v;
543 cont->f.set(k,v);
544 }
545
546 cont->f.set("id", id);
547
548 EST_Item *item;
549
550 if (state->linear)
551 if (state->current == NULL)
552 item = state->rel->append();
553 else
554 item = state->current->insert_after();
555 else if (state->current == NULL)
556 if (state->parent == NULL)
557 item = state->rel->append();
558 else
559 item = state->parent->append_daughter();
560 else
561 if (state->parent == NULL)
562 item = state->current->insert_after();
563 else
564 item = state->parent->append_daughter();
565
566 item->set_contents(cont);
567
568 state->current=NULL;
569 state->parent=item;
570 }
571 break;
572
573 default:
574 {
575 bool embed = (attributes.val("estExpansion") == "embed");
576 if (embed)
577 {
578 state->id=make_new_id("e");
579 element_open(c, p, data, name, attributes);
580 state->id="";
581 }
582 EST_Litem *idp = ids.head();
583 bool first=TRUE;
584 for(; idp!= NULL; idp = idp->next())
585 {
586 EST_String id = ids(idp);
587 if (id==EST_String::Empty)
588 XML_Parser_Class::error(c, p, data, EST_String("Element With No Id"));
589
590 if (!first)
591 element_close(c, p, data, name);
592 else
593 first=FALSE;
594
595 state->id=id;
596 element_open(c, p, data, name, attributes);
597 state->id=EST_String::Empty;
598 }
599 if (embed)
600 {
601 element_close(c, p, data, name);
602 }
603 }
604 }
605
606
607 if (state->parent!=NULL)
608 state->contentAttr = attributes.val("estContentFeature");
609
610#ifdef DEBUGGING
611 printf("\t current=%s parent=%s contA=%s\n",
612 (const char *)state->current->name(),
613 (const char *)state->parent->name(),
614 (const char *)state->contentAttr);
615#endif
616
617 }
618 else
619 ; // Skip
620
621}
622
623
624void GenXML_Parser_Class::element(XML_Parser_Class &c,
625 XML_Parser &p,
626 void *data,
627 const char *name,
628 XML_Attribute_List &attributes)
629{
630 (void)c; (void)p; (void)attributes;
631 GenXML_Parse_State *state = (GenXML_Parse_State *)data;
632 (void)state;
633
634 element_open(c, p, data, name, attributes);
635 element_close(c, p, data, name);
636}
637
638
639void GenXML_Parser_Class::element_close(XML_Parser_Class &c,
640 XML_Parser &p,
641 void *data,
642 const char *name)
643{
644 (void)c; (void)p; (void)name;
645 GenXML_Parse_State *state = (GenXML_Parse_State *)data;
646
647 EST_String val;
648
649
650 if (state->depth == state->rel_start_depth )
651 {
652#ifdef DEBUGGING
653 printf("end of relation depth=%d name=%s\n", state->depth, name);
654#endif
655 state->rel_start_depth=-1;
656 }
657
658 if (
659 state->depth == state->open_depth)
660 {
661#ifdef DEBUGGING
662 printf("pop depth=%d name=%s\n", state->depth, name);
663#endif
664 state->current = state->parent;
665 state->parent=parent(state->parent);
666 state->open_depth = state->depth_stack.pop();
667#ifdef DEBUGGING
668 printf("\t current=%s parent=%s\n",
669 (const char *)state->current->name(),
670 (const char *)state->parent->name());
671#endif
672 }
673
674
675 state->depth--;
676}
677
678
679void GenXML_Parser_Class::pcdata(XML_Parser_Class &c,
680 XML_Parser &p,
681 void *data,
682 const char *chars)
683{
684 (void)c;
685 (void)p;
686 GenXML_Parse_State *state = (GenXML_Parse_State *)data;
687
688
689 if ( state->parent != NULL && state->contentAttr != EST_String::Empty)
690 state->parent->set(state->contentAttr, chars);
691
692#ifdef DEBUGGING
693 printf("GEN XML Parser [pcdata[%s]] %d\n", chars, state->depth);
694#endif
695}
696
697
698void GenXML_Parser_Class::cdata(XML_Parser_Class &c,
699 XML_Parser &p,
700 void *data,
701 const char *chars)
702{
703 (void)c; (void)p; (void)data; (void)chars;
704 // GenXML_Parse_State *state = (GenXML_Parse_State *)data;
705
706#ifdef DEBUGGING
707 printf("GEN XML Parser [cdata[%s]] %d\n", chars, state->depth);
708#endif
709}
710
711
712void GenXML_Parser_Class::processing(XML_Parser_Class &c,
713 XML_Parser &p,
714 void *data,
715 const char *instruction)
716{
717 (void)c; (void)p; (void)instruction;
718 GenXML_Parse_State *state = (GenXML_Parse_State *)data;
719 (void)state;
720
721#ifdef DEBUGGING
722 printf("GEN XML Parser [proc[%s]] %d\n", instruction, state->depth);
723#endif
724}
725
726
727void GenXML_Parser_Class::error(XML_Parser_Class &c,
728 XML_Parser &p,
729 void *data)
730{
731 (void)c; (void)p; (void)data;
732 // GenXML_Parse_State *state = (GenXML_Parse_State *)data;
733
734 EST_error("GEN XML Parser %s", get_error(p));
735
736 est_error_throw();
737}
738
741
// Explicit instantiation of the string-keyed hash of item contents,
// for builds which define INSTANTIATE_TEMPLATES.
#ifdef INSTANTIATE_TEMPLATES

#include "../base_class/EST_THash.cc"

Instantiate_TStringHash_T(EST_Item_Content *, THash_String_ItemC_P)

#endif
void set(const EST_String &name, int ival)
EST_Features f
General features for this item.
void set(const EST_String &name, int ival)
Definition EST_Item.h:179
EST_Features f
static const EST_String Empty
Constant empty string.
Definition EST_String.h:111
static EST_String cat(const EST_String s1, const EST_String s2=Empty, const EST_String s3=Empty, const EST_String s4=Empty, const EST_String s5=Empty, const EST_String s6=Empty, const EST_String s7=Empty, const EST_String s8=Empty, const EST_String s9=Empty)
int length(void) const
Length of string ({not} length of underlying chunk)
Definition EST_String.h:241
int matches(const char *e, int pos=0) const
Exactly match this string?
EST_String at(int from, int len=0) const
Return part at position.
Definition EST_String.h:302
void clear(void)
Empty it out.
V & val(const K &key, int &found) const
Definition EST_THash.cc:114
int add_item(const K &key, const V &value, int no_search=0)
Add an entry to the table.
Definition EST_THash.cc:167
int present(const K &key) const
Does the key have an entry?
Definition EST_THash.cc:96
void begin(const Container &over)
Set the iterator ready to run over this container.
const int present(const K &rkey) const
Returns true if key is present.
Definition EST_TKVL.cc:222
void append(const T &item)
add item onto end of list
Definition EST_TList.h:191
const T & first() const
return const reference to first item in list
Definition EST_TList.h:146
EST_Features f
Utterance level features.
void clear()
remove everything in utterance
XML_Parser * make_parser(InputSource source, void *data)
Create a parser for the RXP InputSource.
Definition XML_Parser.cc:72
void register_id(EST_Regex id_pattern, EST_String directory)
Definition XML_Parser.cc:48
virtual void error(XML_Parser_Class &c, XML_Parser &p, void *data)
InputSource try_and_open(Entity ent)
void registered_ids(EST_TList< EST_String > &list)
Definition XML_Parser.cc:53
void track_context(bool flag)