/* Copyright 1997 Acorn Computers Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /***************************************************/ /* File : FetchHTML.c */ /* */ /* Purpose: Fetch functions that deal with HTMLLib */ /* data (getting more of it, freeing it, */ /* and so forth). Compare with higher */ /* higher level Fetch.c and FetchPage.c. */ /* */ /* Author : A.D.Hodgkinson */ /* */ /* History: 17-Aug-97: Created from Fetch.c. */ /***************************************************/ #include <stdlib.h> #include <stdio.h> #include <string.h> #include "swis.h" #include "flex.h" #include "HTMLLib.h" /* HTML library API, Which will include html2_ext.h, tags.h and struct.h */ #include "wimp.h" #include "event.h" #include "svcprint.h" #include "Global.h" #include "MiscDefs.h" #include "Utils.h" #include "About.h" #include "Authorise.h" #include "Browser.h" #include "Cookies.h" #include "Encoding.h" #include "Fetch.h" /* (Which itself includes URLstat.h) */ #include "Filetypes.h" #include "History.h" #include "Save.h" /* (For Save_ScrapFile only)*/ #include "URLutils.h" #include "URLveneer.h" #include "FetchHTML.h" /* Statics */ static char * fetch_buffer = NULL; /* Address of buffer for getting data from the URL module in html_get_next_token. */ /* Local definitions */ #define FetchBufferSize 8192 /* Size of buffer for getting data from the URL module in html_get_next_token. */ /* Local compilation options */ #undef DUMP_HEADERS /*************************************************/ /* html_get() */ /* */ /* Fetches and optionally starts parsing HTML. */ /* */ /* Parameters: Pointer to URL to fetch; */ /* */ /* Pointer to a pointer for extra */ /* data for POST etc. (allows this */ /* to be in a flex block); */ /* */ /* Pointer to an int into which a */ /* handle for this fetch will be */ /* placed; */ /* */ /* The fetch method, e.g. POST or */ /* GET; */ /* */ /* Pointer to the user name for */ /* MailServ (if in a multiuser */ /* environment); */ /* */ /* 1 to allow parsing, else 0; */ /* */ /* 1 to allow proxying, else 0 (e.g. */ /* to force a refetch, rather than */ /* going via. a cache). */ /*************************************************/ _kernel_oserror * html_get(char * url, char ** extradata, int * handle, int method, char * user, int allowparse, int proxy) { _kernel_oserror * e; int ok; unsigned int h; #ifdef TRACE if (tl & (1u<<6)) Printf("html_get: Called\n"); #endif *handle = 0; /* Register the session with the URL module */ e = url_register(0, &h); // Sort out the proxying code properly!... /* Deal with proxying if necessary */ if (!e && choices.use_proxy) { char method[64]; char * method_ptr; int method_len; /* Extract the fetch method from the proxy address */ method_ptr = strstr(choices.proxy_address, ":"); if (method_ptr) { method_len = (int) method_ptr - (int) choices.proxy_address + 1; if (method_len > sizeof(method) - 1) method_len = sizeof(method) - 1; strncpy(method, choices.proxy_address, method_len); method[method_len] = 0; } else strncpy(method, "http:", sizeof(method)); e = url_set_proxy(0, h, choices.proxy_address, method, 0); } if (!e) { urlstat * up; #ifdef TRACE if (tl & (1u<<6)) Printf("html_get: Session registered, ID is %d\n",h); #endif e = urlstat_add_entry(1, &up); if (e) { url_deregister(0,h); return e; } /* Initialise the new structure */ up->session = *handle = (int) h; /* The fetch's session handle */ up->type = TYPE_HTMLFILE; /* Type of file - state it is an HTML file for now */ up->fetching = 1; /* We are still fetching */ up->method = method; /* Current fetch method */ up->extradata = NULL; /* Filled in later, if there is extra data */ up->allowparse = allowparse; /* Do we parse the data? */ /* If there is any extra data for POST or whatever, deal with it. */ /* The POST request entries must come first in the extra header */ /* info, so that the browser can make the assumption that */ /* everything from Content-Type forwards may be stripped in the */ /* event of a redirection when the current fetch method is POST. */ if (extradata && *extradata) { int len; len = strlen(*extradata); /* Allocate space for the extra data, the anchor stored in up->extradata */ #ifdef TRACE if (tl & (1u<<12)) Printf("html_get: flex_alloc %d for 'extradata' store\n",len + 3); #endif if (!flex_alloc((flex_ptr) &up->extradata, len + 3)) { url_deregister(0,h); return make_no_fetch_memory_error(2); } else { char head[80]; #ifdef TRACE flexcount += (len + 3); if (tl & (1u<<14)) Printf("** flexcount: %d\n",flexcount); #endif /* CR+LF into the top of the new block of memory */ up->extradata[0] = '\r'; up->extradata[1] = '\n'; /* Copy the extra data under the CR+LF */ strcpy(up->extradata + 2, *extradata); /* Header entry for the extra data - again, the removal routines in the */ /* fetcher's redirection code assume that this comes after Content-Type */ /* and the body content comes after this, to make life easy there. */ sprintf(head, "Content-Length: %d", len); /* Insert the header entries above the extra data already in the block. */ ok = html_insert_header(head, (flex_ptr) &up->extradata); /* (html_insert_header returns 1 for success, 0 for memory claim failure) */ if (!ok) { url_deregister(0, h); return make_no_fetch_memory_error(3); } StrNCpy0(head, "Content-Type: application/x-www-form-urlencoded"); ok = html_insert_header(head, (flex_ptr) &up->extradata); if (!ok) { url_deregister(0, h); return make_no_fetch_memory_error(4); } } } #ifndef SINGLE_USER /* If user details are given, insert the appropriate header entry */ if (user) { char head[Limits_Multi_UserName + 80]; sprintf(head, "Mailserv-User: %s", user); ok = html_insert_header(head, (flex_ptr) &up->extradata); if (!ok) { url_deregister(0, h); return make_no_fetch_memory_error(5); } } #endif /* If we aren't to use a proxy - actually, this is Customer-speak */ /* for no cache - say so in the header, in various ways. */ if (!proxy) { /* For Customer compatability first, most servers will get the */ /* second, and newer servers prefer the last. */ ok = html_insert_header("X-NoProxy:", (flex_ptr) &up->extradata); if (ok) ok = html_insert_header("Pragma:no-cache", (flex_ptr) &up->extradata); if (ok) ok = html_insert_header("Cache-Control:no-cache", (flex_ptr) &up->extradata); if (!ok) { url_deregister(0, h); return make_no_fetch_memory_error(6); } } /* Last but not least, do, er, something... */ { char c = 0; char * p = NULL; /* If non-zero on exit, p will point to the position of a hash */ /* in the URL (i.e., this finds out if an anchor is specified) */ p = fetch_find_name_tag(url); /* If there is a hash, turn it into a zero for now so the string */ /* contains just the URL and not the anchor. */ if (p) c = *p, *p = 0; e = url_get_url(URL_GetURL_AgentGiven, /* Should use a custom User Agent */ h, /* Session handle */ method, /* Fetch method */ url, /* URL to get */ &up->extradata, /* Any extra data for POST etc. */ NULL, /* (Would be a status word) */ 2); /* Mode; 2 = header and data */ /* Put the hash back if was removed earlier. */ if (p) *p = c; } } #ifdef TRACE if (tl & (1u<<6)) { if (!e) Printf("html_get: Successful\n"); else Printf("html_get: Exitting with an error\n"); } #endif return e; } /*************************************************/ /* html_insert_header() */ /* */ /* Inserts a string into the header for an HTML */ /* fetch (for POST). Puts it at the top. */ /* */ /* Parameters: Pointer to the null terminated */ /* string to insert (this ends up */ /* CR+LF terminated in the header); */ /* */ /* Pointer to a flex anchor, which */ /* points to existing header data or */ /* is NULL if there is no header at */ /* the time of the function call. */ /* */ /* Returns: 1 if successful, or 0; you must */ /* externally generate an error */ /* appropriate to the memory claim */ /* having failed. */ /*************************************************/ int html_insert_header(char * header, flex_ptr data) { if (header) { int ok, s, len; len = strlen(header) + 2; /* 'data' points to an anchor; if this isn't null, find the */ /* size of the block the anchor points to */ if (*data) s = flex_size(data); else s = 0; /* If the block is > 0 bytes, extend it to a block big */ /* enough to hold the extra header data, else allocate a */ /* new block to hold it. Note that s will be zero if a new */ /* block was allocated, else it holds the old block size. */ #ifdef TRACE if (tl & (1u<<12)) { if (s) Printf("html_insert_header: flex_extend to %d for header store\n",len + s); else Printf("html_insert_header: flex_alloc %d for header store\n",len + 1); } #endif if (s) ok = flex_extend(data, len + s); else ok = flex_alloc(data, len + 1); /* Note len *plus 1*. */ if (!ok) return 0; #ifdef TRACE if (s) flexcount += len; else flexcount += (len + 1); if (tl & (1u<<14)) Printf("** flexcount: %d\n",flexcount); #endif /* Shuffle the header data down to make room for the new */ /* stuff at the top, if there was any data there to move. */ #ifdef TRACE if (tl & (1u<<18)) Printf("\0213html_insert_header: memove from %p to %p for %d bytes\0217\n",((int) (*data)) + len, *data, s); #endif if (s) memmove((void *) (((int) (*data)) + len), *data, s); /* Copy the new data into the top of the header. Don't want */ /* to overflow so use strncpy for extra caution... */ strncpy(*data, header, len - 2); /* Terminate the string with CR+LF */ ((char *) (*data))[len - 2] = '\r'; ((char *) (*data))[len - 1] = '\n'; /* If s is zero, i.e. a new block was created here, make */ /* sure it ends in zero (so C will think the string has */ /* ended properly if a string is read from the buffer). We */ /* can reference (array)[len] as the block allocation was */ /* done to len plus 1 bytes (see above). */ if (!s) ((char *) (*data))[len] = 0; } return 1; } /*************************************************/ /* html_close() */ /* */ /* Closes the specified handle, aborting any */ /* fetch and freeing up memory relating to it. */ /* */ /* Parameters: A fetch handle (usually from the */ /* browser_data->fetch_handle */ /* field). */ /*************************************************/ _kernel_oserror * html_close(int handle) { urlstat * up; #ifdef TRACE if (tl & (1u<<6)) Printf("html_close: Called\n"); #endif url_deregister(0, handle); /* It's another linked list traversal... As long as we aren't at the */ /* end of the list, and we haven't reached the item relating to this */ /* fetch, keep looking. */ up = urlstat_find_entry(handle); /* After the above loop, 'up' points to the structure for this */ /* fetch or is null; in the latter case, give an error. */ if (!up) { erb.errnum = Utils_Error_Custom_Fatal; StrNCpy0(erb.errmess, lookup_token("StrNotFd:Internal error: Can't find structure in %0.", 0, "html_close")); #ifdef TRACE if (tl & (1u<<6)) Printf("html_close: Exiting with error\n"); #endif return &erb; } /* If there is HTMLLib derived data attached, deal with this */ if (up->stream) { unsigned int context = HtmlReturnContext(up->stream); browser_data * browser = last_browser; /* Should Never Happen...! */ if (!context) { erb.errnum = Utils_Error_Custom_Fatal; StrNCpy0(erb.errmess, lookup_token("NoContxt:Serious internal error - Block is already free or was not HtmlAlloc'd in html_close; must exit immediately.", 0, 0)); return &erb; } /* Ensure that any HStream pointers inside any current browser_data */ /* structures are not part of this stream - if so, clear them. */ while (browser) { /* For now, just a few selected items */ if (browser->selected && HtmlReturnContext(browser->selected) == context) { browser_clear_selection(browser, 0); browser->selected = NULL; } if (browser->highlight && HtmlReturnContext(browser->highlight) == context) { browser_clear_highlight(browser, 0); browser->highlight = NULL; } if (browser->pointer_over && HtmlReturnContext(browser->pointer_over) == context) browser->pointer_over = NULL; browser = browser->previous; } #ifdef TRACE if (up->stream) { if (tl & (1u<<12)) Printf("html_close: Calling HtmlStreamFree on %p\n",up->stream); if (tl & (1u<<18)) Printf("\0212Closing stream %p\0217\n",up->stream); } #endif HtmlStreamFree(up->stream); } /* If there is extra context data allocated, free it */ if (up->context) { #ifdef TRACE if (tl & (1u<<12)) Printf("html_close: free block %p for 'context' field of 'urlstat' structure\n",up->context); #endif HtmlEndParse(up->context); up->context = NULL; } #ifdef TRACE if (up->extradata) { if (tl & (1u<<12)) Printf("html_close: flex_free block %p for 'extradata' field of 'urlstat' structure\n",&up->extradata); flexcount -= flex_size((flex_ptr) &up->extradata); if (tl & (1u<<14)) Printf("** flexcount: %d\n",flexcount); } #endif /* If there is extra flex data attched, free this too */ if (up->extradata) flex_free((flex_ptr) &up->extradata); /* Finally, get rid of the structure itself */ urlstat_remove_entry(up); #ifdef TRACE if (tl & (1u<<6)) Printf("html_close: Successful\n"); #endif return NULL; } /*************************************************/ /* html_get_next_token() */ /* */ /* Gets a chunk of document source from a given */ /* fetch handle, and may generate new HStream */ /* structures as the document is passed over to */ /* the HTML library parser. */ /* */ /* Parameters: Pointer to a browser_data struct */ /* relevant to the fetch or NULL; */ /* */ /* The fetch handle; */ /* */ /* Pointer to int into which the */ /* number of bytes still to be */ /* fetched is played; */ /* */ /* Pointer to int into which the */ /* number of bytes fetched so far is */ /* placed; */ /* */ /* Pointer to an HStream *, into */ /* which the address of the base of */ /* the token list is written, or */ /* NULL to signal 'not ready'; */ /* */ /* Pointer to an int, into which a */ /* reason code is placed: */ /* */ /* 0: Token has been received OK, */ /* 1: We are waiting for something, */ /* 2: A redirect has been detected */ /* (in this case, *remaining will */ /* point at the new URL), */ /* 3: This data is not parseable (in */ /* this case, *remaining holds a */ /* filetype); */ /* */ /* Pointer to pointer to the store */ /* for the whole of the data fetched */ /* so far (if any), be it an HTML */ /* document, image, or whatever; */ /* */ /* Pointer to string holding the URL */ /* that is being fetched; */ /* */ /* 1 if this is an image fetch, else */ /* 0 for HTML or unknown. */ /* */ /* Assumes: That if the browser_data struct */ /* pointer is NULL, the fetch is not */ /* for an internal URL. The other */ /* pointers must NOT be NULL unless */ /* it is specifically stated that */ /* they may be in the parameters */ /* list. */ /*************************************************/ _kernel_oserror * html_get_next_token(browser_data * b, unsigned int handle, int * remaining, int * size, HStream ** token, int * waiting, flex_ptr source, char * url, int image) { _kernel_oserror * e = NULL; int r = 0; urlstat * up; char ref_url[Limits_URL]; #ifdef TRACE if (tl & (1u<<6)) Printf("html_get_next_token: Called\n"); #endif /* Start in the default state of having no HStream to pass back */ /* through *token. */ if (token) *token = NULL; /* Until we know better, signal that we're waiting */ *waiting = 1; /* Ensure a fetch buffer is allocated */ if (!fetch_buffer) { fetch_buffer = malloc(FetchBufferSize); /* See top of this file */ if (!fetch_buffer) { #ifdef TRACE if (tl & (1u<<6)) Printf("html_get_next_token: Exiting with error\n"); #endif return make_no_cont_memory_error(8); } } /* Get the urlstat structure for this fetch */ up = urlstat_find_entry(handle); if (!up) { erb.errnum = Utils_Error_Custom_Fatal; StrNCpy0(erb.errmess, lookup_token("StrNotFd:Internal error: Can't find structure in %0.", 0, "html_get_next_token")); #ifdef TRACE if (tl & (1u<<6)) Printf("html_get_next_token: Exiting with error\n"); #endif return &erb; } /* Only look for an anchor and use url_read_data for URLs which */ /* are not internal or are direct data saves. */ StrNCpy0(ref_url, url); if (image || b->displayed == Display_Fetched_Page || b->save_link) { int status = 0; /* Want to make sure we work on a URL which doesn't */ /* have an anchor in it, so copy over the url to */ /* a local buffer and if there's a '#' marking an */ /* anchor, replace it with a string terminator. */ char * p = fetch_find_name_tag(ref_url); if (p) * p = 0; /* If there isn't an authorisation request in progress, and the */ /* fetch is apparently in progress, and the authorisation status */ /* isn't '1' (which means 'doing'), get some data from the URL */ /* module. The url_read_data call puts the number of bytes read */ /* into r. */ if (!authorising && up->fetching && (up->authorised != 1)) { e = url_read_data(0, /* Flags - must be 0 at present */ handle, /* Session handle */ fetch_buffer, /* Buffer to receive data */ FetchBufferSize, /* The buffer's size */ &status, /* Protocol status */ &r, /* Number of bytes read */ remaining); /* Number of bytes left to get */ /* Deal with cookies */ if (!e && (status & (1u<<16))) e = cookies_process_cookie(b); /* Deal with the size information */ if (*remaining > 0 && !b->data_size) b->data_size = *remaining + r; } } else if (up->fetching) /* Don't repeat this over and over... */ { /* This is an internal URL, so treat specially */ int ok; char * extra = ""; char * tail = ""; int len, exoff, toff; if (*source) flex_free(source); /* Work out the length that the HTML file we're about to generate will be */ switch (b->displayed) { case Display_External_Image: { int protolen; /* Look up the token embedded in the URL */ lookup_token(url + Int_URL_Len, 1, 0); /* Find a ':' separating extra information and point just past it */ exoff = urlutils_internal_extra(url); if (exoff) extra = url + exoff; len = strlen(tokens) + 1; if (*extra) { toff = urlutils_internal_tail(url); if (toff) tail = url + toff; } /* Is this a system variable name for fetching, e.g. Wimp$Scrap? */ protolen = strlen(FileMethod ProtocolSepShort "<"); /* (URLutils.h) */ if ( !strncmp(extra, FileMethod ProtocolSepShort "<", protolen) && extra[strlen(extra) - 1] == '>' ) { int required; _kernel_swi_regs r; /* We'll have to trash the ref_url block. We know ref_url is at */ /* least as long as URL, so extract the system variable name to it. */ strncpy(ref_url, extra + protolen, strlen(extra) - protolen - 1); /* -1 to skip the closing '>' */ ref_url[strlen(extra) - protolen - 1] = 0; /* Now find out how long the expanded form would be */ r.r[0] = (int) ref_url; r.r[1] = (int) NULL; r.r[2] = -1; r.r[3] = 0; r.r[4] = 0; /* _swix will not work correctly for this particular SWI if */ /* requiring the returned R2 value. Something to do with */ /* the call relying on generating an error, but _swix spots */ /* it and pulls out earlier than the call expects. Or some */ /* such thing... */ _kernel_swi(OS_ReadVarVal, &r, &r); required = -r.r[2]; /* Woah - system variable wasn't defined... */ if (!required) { erb.errnum = Utils_Error_Custom_Normal; strcpy(erb.errmess, "<"); strcat(erb.errmess, ref_url); strcat(erb.errmess, ">"); strcat(erb.errmess, lookup_token("NotDefined: not defined.",0,0)); return &erb; } if (required < 0 || required >= sizeof(ref_url) - 1) /* -1 = allow for terminator */ { /* Well, we sort of haven't got enough memory. Ahem. */ return make_no_memory_error(8); } /* Otherwise, expand the variable (_swix is OK here as we don't */ /* want any returned register value). */ _swix(OS_ReadVarVal, _INR(0,4), ref_url, ref_url, sizeof(ref_url), 0, 4); /* Ensure it is terminated correctly */ ref_url[required - 1] = 0; /* Turn it into a fetchable URL */ urlutils_pathname_to_url(ref_url, sizeof(ref_url)); /* Point to this URL */ extra = url = ref_url; } /* Note that this is a very slow function call... */ len = utils_len_printf(tokens, extra, extra, tail); if (len < 0) { /* If the above fails, do our best to calculate the length. */ /* This will always overestimate the size (safer to do this */ /* than underestimate!). */ len = strlen(tokens) + 1; /* For external images, need to fit the extra data in twice, and */ /* try to find a filename separator for a picture caption (put */ /* this in 'tail'). */ if (*extra) len += strlen(extra) * 2 + strlen(tail) + 2; } } break; case Display_Scrap_File: { int found, type; if (*source) flex_free(source); /* Find the file length */ _swix(OS_File, _INR(0,1) | _OUT(0) | _OUT(4) | _OUT(6), 23, /* Read catalogue info for named, stamped object */ Save_ScrapFile, &found, &len, &type); if (found != 1) { erb.errnum = Utils_Error_Custom_Normal; /* Error message will either be 'can't find the page', or, if this is */ /* a frame and the frame source matches the fetching URL, 'can't find */ /* the frame'. */ if ( b->ancestor && b->frame && b->frame->src && browser_fetch_url(b) && !strcmp(b->frame->src, browser_fetch_url(b)) ) { StrNCpy0(erb.errmess, lookup_token("WhatFrame:The sending application could not supply the page contents for this frame.", 0, 0)); } else { StrNCpy0(erb.errmess, lookup_token("WhatScrap:Cannot find the page to load; the sending application may have died.", 0, 0)); } return &erb; } /* Is this a text file? */ if (type == FileType_TEXT) { b->page_is_text = 1; up->type = TYPE_TEXTFILE; } else b->page_is_text = 0; } break; case Display_About_Page: { len = 0; } break; } /* If required, claim memory for the page; complain if this fails */ if (len) ok = flex_alloc(source, len); else ok = 1; if (!ok) { #ifdef TRACE if (tl & (1u<<6)) Printf("html_get_next_token: Exiting with error\n"); #endif return make_no_cont_memory_error(1); } /* Construct the page in the claimed block (or build it in a */ /* new block). */ switch (b->displayed) { case Display_About_Page: { RetError(about_build_page(source)); } break; case Display_External_Image: { memset(*source, 0, len); sprintf(*source, tokens, extra, extra, tail); } break; case Display_Scrap_File: { FILE * file; /* Load and delete the scrap file */ file = fopen(Save_ScrapFile, "rb"); if (!file) RetLastE; if (fread(*source, 1, len, file) < len) { fclose(file); RetLastE; } fclose(file); remove(Save_ScrapFile); } break; } /* Set up fetch flags to say that a fetch has been completed; since */ /* we've filled in the document source store here, say that zero */ /* bytes have been fetched (otherwise code below will try to copy */ /* data out of the fetch_buffer block). */ r = 0; remaining = 0; up->identified = 1; up->allowparse = 1; up->fetched = 1; up->fetching = 0; } /* If there isn't an error, and more than zero bytes have been read, */ /* deal with the data (if any) returned from the above call. */ if (r && !e) { int ok, oldsize; /* 'fetched' is a flag which if set indicates at least 1 byte has been */ /* got so far. If fetched is zero, and there is data in the source */ /* store (i.e. 'source' is not NULL) then free up the store as it does */ /* not hold any valid data (must be from an old fetch). */ if (!up->fetched && *source) { #ifdef TRACE if (tl & (1u<<12)) Printf("html_get_next_token: (1) flex_free block %p which held page source\n",source); flexcount -= flex_size(source); if (tl & (1u<<14)) Printf("** flexcount: %d\n",flexcount); #endif flex_free(source); *source = NULL; } /* Signal that there's definitely data fetched now. */ up->fetched = 1; /* If there's store allocated at this point, it holds valid source; extend */ /* it by the number of bytes read from the url_read_data call. Else, alloc */ /* a new buffer to hold the data. */ #ifdef TRACE if (tl & (1u<<12)) { if (*source) Printf("html_get_next_token: flex_extend by %d to %d for page source store\n",r,flex_size(source) + r); else Printf("html_get_next_token: flex_alloc %d for page source store\n",r); } #endif if (*source) { oldsize = flex_size(source); ok = flex_extend(source, oldsize + r); } else { oldsize = 0; ok = flex_alloc(source, r); } #ifdef TRACE flexcount += r; if (tl & (1u<<14)) Printf("** flexcount: %d\n",flexcount); #endif if (size) *size = oldsize + r; /* Report an error if the allocation failed */ if (!ok) { #ifdef TRACE if (tl & (1u<<6)) Printf("html_get_next_token: Exiting with error\n"); #endif return make_no_cont_memory_error(1); } /* The data block has been created/extended successfully, so copy the */ /* data from the url_read_data call into it. */ #ifdef TRACE if (tl & (1u<<18)) Printf("\0216html_get_next_token: memcpy from %p to %p for %d bytes\0217\n",((char *) (*source)) + oldsize, fetch_buffer, r); #endif memcpy(((char *) (*source)) + oldsize, fetch_buffer, r); } /* If we're not authorising the transfer and data has been fetched, proceed normally */ if (!authorising && up->fetched) { unsigned int hf = 0; HStream * new = NULL; /* If the stream has been identified as HTML... */ if (up->identified) { /* Really need to make our minds up at this point about the */ /* encoding. fetch_start may have set priority to default */ /* to clear out any old HTTP header info, but without */ /* changing the encoding, as the menu should only change */ /* when the document comes in. That's now, folks. */ if (b->encoding_priority == priority_default) { b->encoding = choices.encoding; encoding_update_menus(b); } /* If there's no parsing context, get one by calling HtmlParse - */ /* this initialises the HTML parser, getting it ready to parse a */ /* document (though it need not be present at this stage). */ /* */ /* First time round, this won't be called as the stream hasn't */ /* been identified with HtmlIdentify yet. */ if (up->context == NULL) { up->context = HtmlParse(ref_url, /* Full page URL, so parser can handle relative links */ 0, /* Length of document - zero at present (not known) */ up->type, /* Return type from the HtmlIdentify call */ choices.support_frames, /* 1 to have FRAMESETs parsed, else 0 */ choices.support_object, /* 1 to handle OBJECT etc., else get alt. HTML stream */ choices.support_tables, /* 1 to handle TABLE etc., else 0 */ b->encoding, b->encoding_priority); if (up->context) HtmlSetEncodingCallback(up->context, encoding_changed_by_meta, b); r = *source ? flex_size(source) : 0; } /* If there is new data in the source store (size = r) and no error at */ /* present, attempt to parse the chunk of data with HtmlGetStream. */ if (r && !e) { new = HtmlGetStream(up->context, /* Parser context, from HtmlParse */ (char **) source, /* Pointer to start of the complete document */ r, /* Size of the chunk that has been added */ &hf); /* Flags from HTMLLib, e.g. 'have more data' */ up->stream = new; #ifdef TRACE if (tl & (1u<<18)) Printf("\0211(New stream for %p, %p)\0217\n", b, up->stream); #endif } if (!new) { /* There are no new HTML library structures */ if (up->lasttoken) { /* There is no new data, but lasttoken indicates there are more tokens */ /* left in the token stream from earlier calls that haven't been dealt */ /* with. So move to the next one. */ up->lasttoken = up->lasttoken->next; if (token) *token = up->lasttoken; } } else { /* There are some new HTML library structures. */ if (!(hf & HTML_GOT_MORE_IN_A_TABLE)) { /* The flag is unset, so the structures were added to the main token */ /* stream and not to part of a table structure. */ if (up->lasttoken) { /* Even though there are new structures, we still have older ones */ /* that are not dealt with, so move to the next one (the remote */ /* server is sending us data than we're processing it - yay!). */ up->lasttoken = up->lasttoken->next; if (token) *token = up->lasttoken; } else { /* There are no earlier structures left to deal with, so start on */ /* the first of the new batch. */ up->lasttoken = new; if (token) *token = up->lasttoken; } } else { /* The HTML_GOT_MORE_IN_A_TABLE flag is set, so structures were added to */ /* a table arrangement, as well as (possibly) the main stream after it. */ if (!up->lasttoken) { /* We weren't waiting to process anything from an earlier call, so */ /* start on this new table structure. */ up->lasttoken = new; if (token) *token = up->lasttoken; } else { /* We have undealt with structures from a previous fetch. Now, if */ /* we are already on the same table structure as returned by the */ /* HtmlGetStream call, then stay there (i.e. process the new data */ /* inside the table). Otherwise, move on. */ if (up->lasttoken != new) up->lasttoken = up->lasttoken->next; if (token) *token = up->lasttoken; } } } /* If we've moved on to, or were already on no token, then whether or */ /* not the fetch is still in progress determines whether or not we're */ /* waiting. Otherwise, we aren't waiting for anything. */ if (!up->lasttoken) *waiting = !!up->fetching; else *waiting = 0; } else if (up->authorised != 1) { /* The stream hasn't been identified as HTML, text or whatever, */ /* but there isn't an authorisation in progress. */ int s, o = 0; char * redirect; int code; int type; int parseable; /* Get the fetch status */ if (image || b->displayed == Display_Fetched_Page || b->save_link) { e = url_status(0, handle, &s, NULL, NULL); if (e) return e; } else s = URL_Status_Done; redirect = NULL; type = TYPE_UNKNOWN; parseable = 0; /* HttpStripHeaders, when passed a pointer to some document data, */ /* and an offest into that stream, returns the offset into the */ /* stream at which it starts assuming HTTP style headers (if */ /* there is such a point). */ o = *source ? HttpStripHeaders((char *) *source, flex_size(source)) : 0; /* If o is 0, there were no HTTP headers. If o is -1, there wasn't */ /* enough data to tell. Else, there were headers, and o is the */ /* offset into the stream of the data that follows those headers. */ if (o > 0) { int encoding; #ifdef DUMP_HEADERS { FILE * file; int byte; file = fopen("<Wimp$ScrapDir>.Headers", "ab"); if (file) { if (!image) fprintf(file, "For URL '%s', received header:\r\n\r\n", url); else fprintf(file, "For image '%s', received header:\r\n\r\n", url); for (byte = 0; byte < o; byte++) { fputc((int) (*((char *) (((int) *source) + byte))), file); } fclose(file); } } #endif /* There are HTTP style headers in the data */ /* stream; try to identify that stream. */ code = HtmlIdentify(ref_url, /* Allow relative redirections to work */ (char *) *source, /* Pointer to head of data stream */ flex_size(source), /* Amount of data in the stream */ (s & URL_Status_Done) != 0, /* Is it a complete stream? 1 = yes */ &redirect, /* Will point to a URL if code = 302 */ &type, /* Will hold a filetype */ &parseable, /* Will say if the data is parseable */ &encoding); /* Will say if it specified an encoding */ /* Set the encoding if specified in the HTTP header */ if (encoding && up->allowparse && b->encoding_priority < priority_http) { b->encoding_priority = priority_http; b->encoding = encoding; encoding_update_menus(b); } /* Discard the stuff before the HTTP style headers by moving the data over them */ if (o != flex_size(source)) { #ifdef TRACE if (tl & (1u<<18)) Printf("\0213html_get_next_token: memove from %p to %p for %d bytes\0217\n", *source, (char*) (((int) *source) + o), flex_size(source) - o); #endif memmove(*source, (char*) (((int) *source) + o), flex_size(source) - o); /* Set o to the size of the data stream that is now in use, */ /* and shrink the source store to this size. */ #ifdef TRACE if (tl & (1u<<12)) Printf("html_get_next_token: flex_extend to shrink source code store by %d to %d\n",o,flex_size(source) - o); flexcount -= o; if (tl & (1u<<14)) Printf("** flexcount: %d\n",flexcount); #endif } o = flex_size(source) - o; /* If the size of the store minus o is less than 0, HttpStripHeaders */ /* has failed completely and we must get out before everything else */ /* comes down...! */ if (o < 0) { erb.errnum = Utils_Error_Custom_Fatal; StrNCpy0(erb.errmess, lookup_token("HSHOvrrn:Serious internal error - HttpStripHeaders has failed; must exit immediately.", 0, 0)); show_error_cont(&erb); /* This will cause exit(EXIT_FAILURE) eventually. */ } flex_extend(source, o); /* (Which shrinks the source store) */ /* Interpret the codes returned by HtmlIdentify. */ switch (code) { #ifndef STRICT_PARSER /* Moved permanently - drop through to redirection code. Only */ /* do this if not in a STRICT_PARSER build, as in the latter */ /* case slightly broken links (missing '/'s off the ends, and */ /* so-on) aren't hidden by the browser moving on. */ case 301: /* Redirect to a GET request; this means 'definitely don't */ /* repost POST forms data' unlike 302 where you're supposed */ /* to but we don't because Navigator doesn't and some sites */ /* expect this broken behaviour (see below). So we can just */ /* drop through to the 302 case. */ case 303: #endif /* Redirect; 'redirect' is a pointer to a new URL. */ case 302: { /* Stop the current fetch and free the source store, */ /* remembering to invalidate the anchor pointing to it */ url_stop(0, handle); #ifdef TRACE if (tl & (1u<<12)) Printf("html_get_next_token: (2) flex_free block %p which held page source\n",source); flexcount -= flex_size(source); if (tl & (1u<<14)) Printf("** flexcount: %d\n",flexcount); #endif flex_free(source); *source = NULL; /* Ensure POST requests are now cleared (we shouldn't continue */ /* POSTing to redirected URLs - er, because Netscape Navigator */ /* doesn't, even though we technically should.) */ if (up->method == URL_Method_http_POST) { /* When the headers are built, the POST data (starting with a Content-Type */ /* entry) is put in first and everything else is inserted above it. As the */ /* comments on this code say, this is organised so we can simplify things */ /* here and just chop off everything at Content-Type and below, rather */ /* than having to carefully remove the appropriate header lines and body.� */ if (up->extradata) { int len; int oldbudge; char * strip; oldbudge = flex_set_budge(0); #ifdef DUMP_HEADERS { FILE * file; file = fopen("<Wimp$ScrapDir>.Headers", "ab"); if (file) { fprintf(file, "Redirection from POST\r\nThere is this extra data before stripping POST-specific info:\r\n\r\n[%s]\r\n\r\n", up->extradata); fclose(file); } } #endif strip = strstr(up->extradata, "Content-Type: "); if (strip) { /* How much do we want to keep? */ len = strip - up->extradata + 1; /* + 1 so we can put in a string terminator where the 'C' was */ #ifdef TRACE { int rmv = flex_size((flex_ptr) &up->extradata) - len; flexcount -= rmv; if (tl & (1u<<13)) Printf("** flexcount: %d\n",flexcount); } #endif /* Resize the block */ flex_extend((flex_ptr) &up->extradata, len); *strip = 0; } #ifdef DUMP_HEADERS { FILE * file; file = fopen("<Wimp$ScrapDir>.Headers", "ab"); if (file) { fprintf(file, "Redirection from POST\r\nThere is this extra data after stripping POST-specific info:\r\n\r\n[%s]\r\n\r\n", up->extradata); fclose(file); } } #endif flex_set_budge(oldbudge); } /* Change the fetch method to GET */ up->method = URL_Method_http_GET; } /* Customer specific */ #ifdef CUSTOMER_SPECIAL if ( !strcmp(redirect,"http://www.customer.com/login.html") || !strcmp(redirect,"http://www.customer.com/index.html") ) { // Send out cookie... redirect = (char *) "http://www.customer.com/simple.html"; } #endif /* Set the fetch's urlstat structure to say that */ /* no data has been fetched */ up->fetched = 0; /* Start a fetch on the new URL */ e = url_get_url(URL_GetURL_AgentGiven, /* Use a custom User Agent string */ handle, /* Session handle */ up->method, /* Fetch method */ redirect, /* URL to get */ &up->extradata, /* Extra data for POST etc. */ NULL, /* We're ignoring the returned status */ 2); /* Mode 2 = fetch both header and data */ /* Return any errors that url_get_url generated */ if (e) return e; /* This function returns the address of the new URL in */ /* 'remaining', flagging this with a waiting status of 2 */ /* - and yes, this is quite odd. */ *waiting = 2; *remaining = (int) redirect; /* Not redirect_to, as it may be freed now */ } break; /* Authorise; the server requested authorisation before it */ /* would deliver the page. */ case 401: { char * realm; char host [Limits_HostName]; char username [Limits_AuthUserWrit]; char password [Limits_AuthPassWrit]; int po; /* Try to find the host and realm */ urlutils_host_name_from_url(ref_url, host, sizeof(host)); /* (The realm will lie in the string pointed to by */ /* 'redirect', between two double quotes). */ realm = authorise_read_realm(redirect); /* If we've already tried this, then the authorisation failed, */ /* so display whatever authorisation failure page the server */ /* sent with the 401 response. */ if (up->authorised >= 2) { authorise_forget(host, realm); erb.errnum = Utils_Error_Custom_Message; StrNCpy0(erb.errmess, lookup_token("BadAuthor:Authorisation failed; you must use a valid user name and password.", 0, 0)); return &erb; } /* Ditch any document data got so far, we don't need it now */ /* (it only contains e.g. header information). */ if (source) { flex_free((flex_ptr) source); *source = NULL; } up->fetched = 0; /* Stop the URL module trying to get anything else, and set */ /* the flag to say we're authorising this fetch. */ url_stop(0, handle); up->authorised = 1; /* If there is a user name and / or password available already, */ /* use that and authenticate immediately. */ *username = *password = 0; po = authorise_find_user_name(host, realm); if (po >= 0) { StrNCpy0(username, authorise + po); po = authorise_find_password(host, realm); if (po >= 0) { StrNCpy0(password, authorise + po); } fetch_authorisation_proceed(b, up, realm, url); } /* Otherwise, get this information from a dialogue box and */ /* allow the authentication to happen later, when the user */ /* has done relevant things with the dialogue. */ else { char prompt[Limits_AuthPrompt]; int f, is_ftp; ObjectId dbox; /* Ensure the authorisation dialogue is created and event handlers */ /* are registered for it. */ e = authorise_create_dialogue((void *) b, &dbox); if (e) return e; /* Is this an FTP fetch? */ if (!strncmp(browser_fetch_url(b), FTPmethod, strlen(FTPmethod))) is_ftp = 1; else is_ftp = 0; /* -4 corrects for %s being replaced by host / realm strings, plus */ /* a terminator at the end of the whole lot. */ f = strlen(realm) + strlen(host); /* (But no terminators needed for these, so no '+ 1's) */ if (!is_ftp) lookup_token("Authorise:Please enter a user name and a password for %%s at %%s.",0,0); else lookup_token("AuthorFTP:Please enter a password for %%s.",0,0); f += ((signed int) strlen(tokens)) - 4 + 1; /* Minus 4 for two lots of '%s', plus 1 for terminator */ /* If the string is too big for the prompt or null, put a */ /* simple version in instead. */ if (f <= 0 || f > sizeof(prompt)) { if (is_ftp) lookup_token("AuthorFSh:Please enter a password.",0,0); else lookup_token("AuthorShr:Please enter a user name and a password.",0,0); e = button_set_value(0, dbox, AuthPrompt, tokens); } else { if (is_ftp) sprintf(prompt, tokens, host); else sprintf(prompt, tokens, realm, host); e = button_set_value(0, dbox, AuthPrompt, prompt); } if (e) return e; /* Show the dialogue */ RetError(toolbox_show_object(Toolbox_ShowObject_AsMenu, dbox, Toolbox_ShowObject_Centre, NULL, b->self_id, -1)); /* If using FTP, put the user name in the writable field */ /* and grey it out. Else just empty the field. */ if (is_ftp) { /* Let errors happen silently here */ writablefield_set_value(0, dbox, AuthUserWrit, realm); set_gadget_state(dbox, AuthUserWrit, 1); } else e = writablefield_set_value(0, dbox, AuthUserWrit, ""); if (e) return e; /* Empty the password writable */ RetError(writablefield_set_value(0, dbox, AuthPassWrit, "")); /*set the authorising flag */ menusrc = Menu_Authorise, authorising = 1; } } break; /* Catch anything else 'just in case'. Guess that the data */ /* is parseable and let this drop through to the ordinary */ /* URL handling code. */ default: parseable = TYPE_HTMLFILE; /* ...so no 'break' */ /* An ordinary URL. */ case 200: { /* If the urlstat structure says not to parse this data, flag it as */ /* an unknown type to prevent parsing. */ if (!up->allowparse) parseable = TYPE_UNKNOWN; b->page_is_text = 0; if (b->save_link) { /* If saving out the link, flag as not parseable whatever HTMLLib said */ *waiting = 3; } else { /* If the data is apparently parseable, flag the status as 'waiting' */ /* else flag it as 'not parseable'. */ switch (parseable) { default: case TYPE_UNKNOWN: case TYPE_IMAGEFILE: *waiting = 3; break; /* For a text file, check the filename extension. It may */ /* be that the server is sending garbage Content-Type */ /* headers, and we'll choose the filename extension over */ /* this (initially for "http://www.batalarms.co.uk/"). */ /* */ /* If the server says something is HTML, we'll trust it. */ case TYPE_TEXTFILE: { type = urlutils_filetype_from_url(browser_fetch_url(b)); if (type == FileType_TEXT) { b->page_is_text = 1; *waiting = 1; } else if (type == FileType_HTML) { parseable = TYPE_HTMLFILE; *waiting = 1; } else { parseable = TYPE_UNKNOWN, *waiting = 3; } } break; case TYPE_HTMLFILE: *waiting = 1; break; } } if (parseable == TYPE_IMAGEFILE && !b->save_link) { /* For images, stop the current fetch and 'redirect' to */ /* an internal page which will fetch the image inline. */ /* This is inefficient as you start to fetch the image */ /* twice; on slow servers, something of a killer... */ /* Unfortunately, pressure of time (yet *again*) */ /* precludes a more elegant solution for the moment. */ url_stop(0, handle); #ifdef TRACE if (tl & (1u<<12)) Printf("html_get_next_token: (2) flex_free block %p which held page source\n",source); flexcount -= flex_size(source); if (tl & (1u<<14)) Printf("** flexcount: %d\n",flexcount); #endif flex_free(source); *source = NULL; /* Set the fetch's urlstat structure to say that */ /* no data has been fetched */ up->fetched = 0; /* Want to only offer a 'back to previous page' link if you can indeed go back. */ /* Because this page can itself add to the history, we shouldn't check the */ /* 'history_can_go_backwards' function result as this may return 0, even though */ /* by the time this new page has fetched you *can* go back. So check whether or */ /* not the local history is empty. However, if you've stepped back to the front */ /* of the history, then you shouldn't give a 'back' link... So need to also see */ /* if, should history_can_go_backwards return 0, history_can_go_forwards does */ /* as well. */ if ( !history_empty(b) && ( history_can_go_backwards(b) || ( !history_can_go_backwards(b) && !history_can_go_forwards(b) ) ) ) redirect = Internal_URL ForExternalHImage; /* Have a 'Go back' link */ else redirect = Internal_URL ForExternalNImage; /* No 'Go back' link */ /* Start a fetch on the new URL */ e = url_get_url(URL_GetURL_AgentGiven, /* Use a custom User Agent string */ handle, /* Session handle */ up->method, /* Fetch method */ redirect, /* URL to get */ &up->extradata, /* Extra data for POST etc. */ NULL, /* We're ignoring the returned status */ 2); /* Mode 2 = fetch both header and data */ if (e) return e; b->displayed = Display_External_Image; /* This function returns the address of the new URL in */ /* 'remaining', flagging this with a waiting status of 2 */ /* - and yes, this is quite odd. */ *waiting = 2; *remaining = (int) redirect; } else { *remaining = type; /* Set the type according to the parseable flag */ up->type = parseable; /* Flag that we've identified the stream */ up->identified = 1; } } break; /* If 0, haven't identified the stream yet */ case 0: break; /* Closure of 'switch' statement checking the return code */ /* from the HtmlIdentify call */ } /* Closure of 'if' statement that checked there was recognised data */ /* following HTTP style headers in the data stream */ } /* Closure of 'if' statement that checked the urlstat structure */ /* to see if authorisation was in progress (and only proceeded */ /* if it was not) */ } /* Closure of 'if' statement that followed the url_read_data */ /* function call and associated memory allocation procedures, */ /* and only proceeded if data had been fetched and authoristion */ /* was not flagged as being in progress. */ } /* If we're not in the middle of authorisation, there hasn't */ /* been an error, and the urlstat structure for the fetch */ /* flags that fetching is still in progress, ask the URL */ /* module if things are finished yet. If so, do the relevant */ /* tidying up and unset the 'fetch in progress' flag. */ if (!e && up->authorised != 1 && !authorising && up->fetching) { int s; if (image || b->displayed == Display_Fetched_Page || b->save_link) { e = url_status(0, handle, &s, NULL, NULL); if (e) return e; if (s & URL_Status_Done) { up->fetching = 0; *waiting = 0; } } } /* If we've been passed somewhere to put the size of the store, */ /* and if the store is present, return the size of it. */ if (size) *size = (*source) ? flex_size(source) : 0; #ifdef TRACE if (tl & (1u<<6)) { if (!e) Printf("html_get_next_token: Successful\n"); else Printf("html_get_next_token: Exitting with an error\n"); } #endif /* Exit, passing on any error if there is one */ return (e); }