Edinburgh Speech Tools 2.4-release
 
Loading...
Searching...
No Matches
url.c
1/*************************************************************************/
2/* */
3/* Copyright (c) 1997-98 Richard Tobin, Language Technology Group, HCRC, */
4/* University of Edinburgh. */
5/* */
6/* THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, */
7/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
8/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/
9/* IN NO EVENT SHALL THE AUTHOR OR THE UNIVERSITY OF EDINBURGH BE LIABLE */
10/* FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF */
11/* CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION */
12/* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
13/* */
14/*************************************************************************/
15#ifdef FOR_LT
16
17#include "lt-defs.h"
18#include "lt-memory.h"
19#include "lt-errmsg.h"
20#include "lt-comment.h"
21#include "lt-safe.h"
22#include "nsl-err.h"
23
24#define Strerror() strErr()
25#define Malloc salloc
26#define Realloc srealloc
27#define Free sfree
28#define fopen stdsfopen
29
30#else
31
32#include "system.h"
33
34#define LT_ERROR(err, format) fprintf(stderr, format)
35#define LT_ERROR1(err, format, arg) fprintf(stderr, format, arg)
36#define LT_ERROR2(err, format, arg1, arg2) fprintf(stderr, format, arg1, arg2)
37#define LT_ERROR3(err, format, arg1, arg2, arg3) fprintf(stderr, format, arg1, arg2, arg3)
38#define WARN(err, format) fprintf(stderr, format)
39#define WARN1(err, format, arg) fprintf(stderr, format, arg)
40
41#define Strerror() strerror(errno)
42
43#ifdef MAXPATHLEN
44#define CWDBS MAXPATHLEN+1
45#else
46#define CWDBS 1025
47#endif
48
49#define GETWD(buf) getcwd(buf,CWDBS)
50
51#endif /* FOR_LT */
52
53#include <stdio.h>
54#include <ctype.h>
55#include <stdlib.h>
56#include <assert.h>
57#include <errno.h>
58#include <string.h> /* that's where strerror is. really. */
59#include <sys/types.h>
60
61#ifdef WIN32
62#include <direct.h>
63#endif
64
65#ifdef SOCKETS_IMPLEMENTED
66
67#ifdef WIN32
68#undef boolean
69#include <winsock.h>
70#include <fcntl.h>
71#else
72#include <unistd.h>
73#include <netdb.h>
74#include <sys/socket.h>
75#include <netinet/in.h>
76#endif
77
78#endif
79
80#include "string16.h"
81#include "stdio16.h"
82#include "url.h"
83
84#ifdef HAVE_LIBZ
85#include "zlib.h"
86#ifdef macintosh
87#include <fcntl.h>
88#include <unix.h>
89#endif
90#endif
91
92static FILE16 *http_open(const char *url,
93 const char *host, int port, const char *path,
94 const char *type);
95static FILE16 *file_open(const char *url,
96 const char *host, int port, const char *path,
97 const char *type);
98
99static void parse_url(const char *url,
100 char **scheme, char **host, int *port, char **path);
101
102/* Mapping of scheme names to opening functions */
103
104struct {
105 char *scheme;
106 FILE16 *(*open)(const char *, const char *, int, const char *, const char *);
107} schemes[] = {
108 {(char *)"http", http_open},
109 {(char *)"file", file_open},
110};
111#define NSCHEME (sizeof(schemes) / sizeof(schemes[0]))
112
113/* Construct a default base URL, essentially file:`pwd`/ */
114
115char *default_base_url(void)
116{
117 char buf[CWDBS];
118 char *url;
119
120 if(!GETWD(buf))
121 {
122 WARN(LEFILE, "Warning: can't get current directory for default base url\n");
123 return strdup8("file:/");
124 }
125
126
127#ifdef WIN32
128
129 /* DOS: translate C:\a\b to file:/C:/a/b/ */
130 /* XXX should we escape anything? */
131 {
132 char *p;
133 for(p=buf; *p; p++)
134 if(*p == '\\')
135 *p = '/';
136 }
137 url = Malloc(6 + strlen(buf) + 2);
138 sprintf(url, "file:/%s/", buf);
139
140#else
141#ifdef mac_filenames
142
143 /* Mac: translate a:b to file:/a/b/ */
144 /* XXX should escape spaces and slashes, at least */
145 {
146 char *p;
147 for(p=buf; *p; p++)
148 if(*p == ':')
149 *p = '/';
150 /* Mac getcwd (always?) has a trailing separator, which we here bash */
151 if(*--p == '/')
152 *p = 0;
153 }
154 url = Malloc(6 + strlen(buf) + 2);
155 sprintf(url, "file:/%s/", buf);
156
157#else
158
159 /* Unix: translate /a/b to file:/a/b/ */
160
161 url = Malloc(5 + strlen(buf) + 2);
162 sprintf(url, "file:%s/", buf);
163
164#endif
165#endif
166
167 return url;
168}
169
170/*
171 * Merge a URL with a base URL if necessary.
172 * The merged URL is returned.
173 * The parts of the URL are returned in scheme, host, port and path
174 * if these are non-null.
175 * Caller should free the results.
176 */
177
178char *url_merge(const char *url, const char *base,
179 char **_scheme, char **_host, int *_port, char **_path)
180{
181 char *merged_scheme, *merged_host, *merged_path, *merged_url;
182 char *scheme=0, *host=0, *path=0;
183 char *base_scheme=0, *base_host=0, *base_path=0;
184 char *default_base=0;
185 int port, base_port, merged_port, i, j;
186 char *p;
187
188 /* First see if we have an absolute URL */
189
190 parse_url(url, &scheme, &host, &port, &path);
191 if(scheme && (host || *path == '/'))
192 {
193 merged_scheme = scheme;
194 merged_host = host;
195 merged_port = port;
196 merged_path = path;
197 merged_url = strdup8(url);
198 goto ok;
199 }
200
201 /* Relative URL, so we need the base URL */
202
203 if(!base)
204 base = default_base = default_base_url();
205
206 parse_url(base, &base_scheme, &base_host, &base_port, &base_path);
207 if(base_scheme && (base_host || *base_path == '/'))
208 ;
209 else
210 {
211 LT_ERROR1(LEFILE, "Error: bad base URL <%s>\n", base);
212 goto bad;
213 }
214
215 /* Determine merged path */
216
217 if(path[0] == '/')
218 {
219 /* not relative, use as-is */
220 merged_path = path;
221 path = 0;
222 }
223 else
224 {
225 /* relative, append to base path */
226
227 merged_path = Malloc(strlen(base_path) + strlen(path) + 1);
228 strcpy(merged_path, base_path);
229
230 /* strip last component of base */
231
232 for(i=strlen(merged_path)-1; i>=0 && merged_path[i] != '/'; i--)
233 merged_path[i] = '\0';
234
235 /* append relative path */
236
237 strcat(merged_path, path);
238
239 /* Remove . and .. components from path */
240
241 p = merged_path;
242 for(i=0; p[i]; )
243 {
244 assert(p[i] == '/');
245
246 /* find next segment */
247
248 for(j=i+1; p[j] && p[j] != '/'; j++)
249 ;
250
251 /* Do we have "." ? */
252
253 if(j - i == 2 && p[i+1] == '.')
254 {
255 strcpy(&p[i+1], p[j] ? &p[j+1] : &p[j]);
256 continue;
257 }
258
259 /* Do we have "<segment>/.." with <segment> != ".." ? */
260
261 /* (We know we're not looking at "./" so we don't have to
262 * worry about "./..")
263 */
264
265 if(p[j] == '/' && p[j+1] == '.' && p[j+2] == '.' &&
266 (p[j+3] == '/' || p[j+3] == '\0') &&
267 (j - i != 3 || p[i+1] != '.' || p[i+2] != '.'))
268 {
269 strcpy(&p[i+1], p[j+3] ? &p[j+4] : &p[j+3]);
270 i = 0; /* start again from beginning */
271 continue;
272 }
273
274 /* move to next segment */
275
276 i = j;
277 }
278 }
279
280 /* Check for deviant relative URLs like file:foo */
281
282 if(scheme && !host && *path != '/')
283 {
284 if(strcmp(scheme, base_scheme) == 0)
285 {
286 WARN1(LEFILE,
287 "Warning: relative URL <%s> contains scheme, contrary to RFC 1808\n",
288 url);
289 }
290 else
291 {
292 LT_ERROR2(LEFILE,
293 "Error: relative URL <%s> has scheme different from base <%s>\n",
294 url, base);
295 goto bad;
296 }
297 }
298
299 /* Return the parts and the whole thing */
300
301 merged_scheme = base_scheme; if(scheme) Free(scheme);
302
303 if(host)
304 {
305 merged_host = host; Free(base_host);
306 merged_port = port;
307 }
308 else
309 {
310 merged_host = base_host;
311 merged_port = base_port;
312 }
313
314 Free(path); Free(base_path);
315
316 merged_url = Malloc(strlen(merged_scheme) + 1 +
317 (merged_host ? 2 + strlen(merged_host) + 10 : 0) +
318 strlen(merged_path) + 1);
319 if(merged_host)
320 {
321 if(merged_port == -1)
322 sprintf(merged_url, "%s://%s%s",
323 merged_scheme, merged_host, merged_path);
324 else
325 sprintf(merged_url, "%s://%s:%d%s",
326 merged_scheme, merged_host, merged_port, merged_path);
327 }
328 else
329 sprintf(merged_url, "%s:%s", merged_scheme, merged_path);
330
331ok:
332 Free(default_base);
333 if(_scheme) *_scheme = merged_scheme; else Free(merged_scheme);
334 if(_host) *_host = merged_host; else Free(merged_host);
335 if(_port) *_port = merged_port;
336 if(_path) *_path = merged_path; else Free(merged_path);
337
338 return merged_url;
339
340bad:
341 Free(default_base);
342 Free(scheme);
343 Free(host);
344 Free(path);
345 Free(base_scheme);
346 Free(base_host);
347 Free(base_path);
348
349 return NULL;
350}
351
352/*
353 * Open a stream to a URL.
354 * url may be a relative URL, in which case it is merged with base,
355 * which is typically the URL of the containing document. If base
356 * is null, file:`pwd`/ is used, which is the right thing to do for
357 * filenames. If base is "", there is no base URL and relative
358 * URLs will fail.
359 * If merged_url is non-null the resulting URL is stored in it.
360 * If type begins "r", the URL is opened for reading, if "w" for
361 * writing. Writing is only supported for file URLs.
362 * If the type begins "rl", the data will be copied to a temporary
363 * file so that seeking is possible (NOT YET IMPLEMENTED).
364 * Returns a FILE16 for success, NULL for failure.
365 */
366
367FILE16 *url_open(const char *url, const char *base, const char *type,
368 char **merged_url)
369{
370 char *scheme, *host, *path, *m_url;
371 int port, i;
372 FILE16 *f;
373#ifdef HAVE_LIBZ
374 int len, gzipped = 0;
375#endif
376
377 /* Determine the merged URL */
378
379 if(!(m_url = url_merge(url, base, &scheme, &host, &port, &path)))
380 return 0;
381
382#ifdef HAVE_LIBZ
383 len = strlen(m_url);
384 if(len > 3 && strcmp8(m_url+len-3, ".gz") == 0)
385 gzipped = 1;
386#endif
387
388 /*
389 printf("<%s> <%s> <%d> <%s>\n", scheme, host ? host : "", port, path);
390 printf("%s\n", m_url);
391 */
392
393 /* Pass to the appropriate opening function */
394
395 for(i=0; i<NSCHEME; i++)
396 if(strcmp(scheme, schemes[i].scheme) == 0)
397 {
398 f = schemes[i].open(m_url, host, port, path, type);
399
400 Free(scheme);
401 if(host)
402 Free(host);
403 Free(path);
404
405 if(!f)
406 return f;
407
408#ifdef HAVE_LIBZ
409 if(gzipped)
410 {
411 /* We have a gzip-compressed file which we hand to gzopen
412 * for further processing.
413 */
414 gzFile gfile;
415 FILE *file = GetFILE(f);
416
417 if(!f)
418 {
419 LT_ERROR1(LEFILE,
420 "Can't attach gzip processor to URL \"%s\"\n",
421 m_url);
422 Free(m_url);
423 return 0;
424 }
425#ifdef macintosh
426 gfile =gzdopen(dup(fileno(file)), *type == 'r' ? "rb" : "wb");
427#else
428 gfile = gzdopen(dup(fileno(file)), type);
429#endif
430 Fclose(f);
431 f = MakeFILE16FromGzip(gfile, type);
432 }
433#endif
434 if(f && merged_url)
435 *merged_url = m_url;
436 else
437 Free(m_url);
438
439 return f;
440 }
441
442 /* Not implemented */
443
444 LT_ERROR1(LEFILE, "Error: scheme \"%s\" not implemented\n", scheme);
445
446 Free(scheme);
447 if(host)
448 Free(host);
449 Free(path);
450 Free(m_url);
451
452 return 0;
453}
454
455/* Open an http URL */
456
457static FILE16 *http_open(const char *url,
458 const char *host, int port, const char *path,
459 const char *type)
460{
461#ifndef SOCKETS_IMPLEMENTED
462 LT_ERROR(NEUNSUP,
463 "http: URLs are not yet implemented on this platform\n");
464 return 0;
465#else
466 FILE16 *f16;
467 struct sockaddr_in addr;
468 struct hostent *hostent;
469 int s, server_major, server_minor, status, count, c;
470 char reason[81];
471#ifndef WIN32
472 FILE *fin,*fout;
473#else
474 static int inited=0;
475 int i;
476 static char buf[1024];
477 if (!inited)
478 {
479 WORD version = MAKEWORD(1, 1);
480 WSADATA wsaData;
481 int err = WSAStartup(version, &wsaData);
482 if (err)
483 {
484 LT_ERROR(LEFILE, "Error: can't init HTTP interface\n");
485 return 0;
486 }
487 else if(LOBYTE(wsaData.wVersion) != 1 || HIBYTE(wsaData.wVersion) != 1)
488 {
489 LT_ERROR(LEFILE, "Error: wrong version of WINSOCK\n");
490 WSACleanup();
491 return 0;
492 }
493 inited = 1;
494 }
495#endif
496
497 if(*type != 'r')
498 {
499 LT_ERROR1(LEFILE, "Error: can't open http URL \"%s\" for writing\n",
500 url);
501 return 0;
502 }
503
504 if(!host)
505 {
506 LT_ERROR1(LEFILE, "Error: no host part in http URL \"%s\"\n", url);
507 return 0;
508 }
509
510 /* Create the socket */
511
512 s = socket(PF_INET, SOCK_STREAM, 0);
513#ifdef WIN32
514 if (s == INVALID_SOCKET) {
515 LT_ERROR1(LEFILE, "Error: system call socket failed: %d\n",
516 WSAGetLastError());
517 };
518#else
519 if(s == -1) {
520 LT_ERROR1(LEFILE, "Error: system call socket failed: %s\n",
521 Strerror());
522 return 0;
523 };
524#endif
525
526 /* Find the server address */
527
528 hostent = gethostbyname(host);
529 if(!hostent)
530 {
531 LT_ERROR1(LEFILE,
532 "Error: can't find address for host in http URL \"%s\"\n",
533 url);
534 return 0;
535 }
536
537 memset(&addr, 0, sizeof(addr));
538 addr.sin_family = AF_INET;
539 /* If we were really enthusiastic, we would try all the host's addresses */
540 memcpy(&addr.sin_addr, hostent->h_addr, hostent->h_length);
541 addr.sin_port = htons((u_short)(port == -1 ? 80 : port));
542
543 /* Connect */
544
545 if(connect(s, (struct sockaddr *)&addr, sizeof(addr)) == -1)
546 {
547 LT_ERROR1(LEFILE, "Error: system call connect failed: %s\n",
548 Strerror());
549 return 0;
550 }
551
552#ifndef WIN32
553#ifdef macintosh
554 fin = fdopen(s, "rb");
555 setvbuf(fin, 0, _IONBF, 0);
556 fout = fdopen(dup(s), "wb");
557#else
558 fin = fdopen(s, "r");
559 setvbuf(fin, 0, _IONBF, 0);
560 fout = fdopen(dup(s), "w");
561#endif
562#endif
563
564 /* Send the request */
565
566 /*
567 * Apparently on the Macintosh, \n might not be ASCII LF, so we'll
568 * use numerics to be sure.
569 */
570
571#ifdef WIN32
572 sprintf(buf, "GET %s HTTP/1.0\012\015Connection: close\012\015\012\015",
573 path);
574 if (send(s,buf,strlen8(buf),0)==SOCKET_ERROR) {
575 LT_ERROR1(LEFILE, "Error: system call socket failed: %d\n",
576 WSAGetLastError());
577 /* XXX close the socket? */
578 return 0;
579 };
580#else
581 fprintf(fout, "GET %s HTTP/1.0\012\015Connection: close\012\015\012\015",
582 path);
583
584 /* We used to test for errors after doing fclose, but this seemed
585 to produce spurious errors under Linux (RedHat 4.2), so now we
586 do fflush and test after that. */
587
588 fflush(fout);
589 if(ferror(fout))
590 {
591 LT_ERROR1(LEWRTF, "Error: write to socket failed: %s\n",Strerror());
592 fclose(fout);
593 fclose(fin);
594 return 0;
595 }
596 fclose(fout);
597#endif
598
599 /* Read the status line */
600#ifdef WIN32
601 for(i=0; i<sizeof(buf)-1; i++)
602 {
603 if(recv(s, &buf[i], 1, 0) != 1)
604 LT_ERROR1(LEFILE,
605 "Error: recv error from server for URL \"%s\"\n",
606 url);
607 if(buf[i] == '\n')
608 break;
609 }
610 count=sscanf(buf, "HTTP/%d.%d %d %80[^\012]",
611 &server_major, &server_minor, &status, reason);
612#else
613 count=fscanf(fin, "HTTP/%d.%d %d %80[^\012]",
614 &server_major, &server_minor, &status, reason);
615#endif
616
617 if(count != 4)
618 {
619 LT_ERROR3(LEFILE,
620 "Error: bad header from server for URL \"%s\"\n%d %s\n",
621 url, count, Strerror());
622#ifndef WIN32
623 fclose(fin);
624#endif
625 return 0;
626 }
627
628 if(status != 200)
629 {
630 /* We should handle 301 (redirection) but we don't */
631 LT_ERROR3(LEFILE, "Error: can't retrieve \"%s\": %d %s\n",
632 url, status, reason);
633#ifndef WIN32
634 fclose(fin);
635#endif
636 return 0;
637 }
638
639 /* Skip other headers */
640
641 count = 0;
642#ifdef WIN32
643 while(recv(s, buf, 1, 0) == 1 && (c = buf[0], 1) || (c = EOF, 0))
644#else
645 while((c = getc(fin)) != EOF)
646#endif
647 {
648 if(c == '\012')
649 count++;
650 else if(c != '\015')
651 count = 0;
652 if(count == 2)
653 break;
654 }
655
656 if(c == EOF)
657 {
658 LT_ERROR1(LEFILE, "Error: EOF in headers retrieving \"%s\"\n", url);
659#ifndef WIN32
660 fclose(fin);
661#endif
662 return 0;
663 }
664
665#ifdef WIN32
666 f16 = MakeFILE16FromWinsock(s, type);
667#else
668 f16 = MakeFILE16FromFILE(fin, type);
669#endif
670
671 SetCloseUnderlying(f16, 1);
672 return f16;
673#endif /* SOCKETS_IMPLEMENTED */
674}
675
676/* Open a file URL (easy, at least on unix) */
677
678static FILE16 *file_open(const char *url,
679 const char *host, int port, const char *path,
680 const char *type)
681{
682 FILE *f;
683 FILE16 *f16;
684 char *file;
685
686 if(host && host[0])
687 WARN1(LEFILE, "Warning: ignoring host part in file URL \"%s\"\n", url);
688
689#ifdef WIN32
690
691 /* DOS: translate /C:/a/b.c to C:\a\b.c */
692
693 if(path[0] == '/' && path[1] && path[2] == ':')
694 path++;
695
696 file = strdup8(path);
697 {
698 char *p;
699 for(p=file; *p; p++)
700 if(*p == '/')
701 *p = '\\';
702 }
703
704#else
705#ifdef mac_filenames
706
707 /* Mac: translate /a/b.c to a:b.c */
708
709 if(*path == '/')
710 path++;
711
712 file = strdup8(path);
713 {
714 char *p;
715 for(p=file; *p; p++)
716 if(*p == '/')
717 *p = ':';
718 }
719#else
720
721 /* Unix: a path is a path is a path! */
722
723 file = strdup8(path);
724
725#endif
726#endif
727
728 /* XXX should undo any escapes */
729
730 f = fopen(file, type);
731 if(!f)
732 {
733 perror(file);
734 Free(file);
735 return 0;
736 }
737
738 Free(file);
739
740 f16 = MakeFILE16FromFILE(f, type);
741 SetCloseUnderlying(f16, 1);
742
743 return f16;
744}
745
746static void parse_url(const char *url,
747 char **scheme, char **host, int *port, char **path)
748{
749 char *p, *q;
750 int warned = 0;
751
752 *scheme = *host = *path = 0;
753 *port = -1;
754
755 /* Does it start with a scheme? */
756
757 for(p = (char *)url; *p; p++)
758 if(*p == ':' || *p == '/')
759 break;
760
761 if(p > url && *p == ':')
762 {
763 *scheme = Malloc(p - url + 1);
764 strncpy(*scheme, url, p - url);
765 (*scheme)[p - url] = '\0';
766 url = p+1;
767 }
768
769 /* Does it have a net_loc? */
770
771 if(url[0] == '/' && url[1] == '/')
772 {
773 url += 2;
774
775 for(p = (char *)url; *p; p++)
776 if(*p == '/')
777 break;
778
779 /* Does it have a port number? */
780
781 for(q = p-1; q >= url; q--)
782 if(!isdigit((int)*q))
783 break;
784
785 if(q < p-1 && *q == ':')
786 *port = atoi(q+1);
787 else
788 q = p;
789
790 *host = Malloc(q - url + 1);
791 strncpy(*host, url, q - url);
792 (*host)[q - url] = '\0';
793 url = p;
794 }
795
796 /* The rest is the path */
797
798 if(*url)
799 *path = strdup8(url);
800 else
801 *path = strdup8("/");
802
803 /* Windoze users have a tendency to use backslashes instead of slashes */
804
805 for(p=*path; *p; p++)
806 if(*p == '\\')
807 {
808 if(!warned)
809 {
810 WARN1(LEFILE, "Warning: illegal backslashes in URL path \"%s\""
811 "replaced by slashes\n", url);
812 warned = 1;
813 }
814
815 *p = '/';
816 }
817}
818