/*
 * File: SURF.CPP
 * Surf is a Win 95/98/NT open source freeware Internet text web browser.
 *
 * Surf harvests web pages, reducing them to wrapped text files containing
 * only their HTML base, title, and anchor tags, suitable for reading in a
 * text editor or with a conventional web browser like Netscape Navigator
 * or Internet Explorer. Surf can also send queries to many search engines.
 *
 * Surf is executed from the command line inside an MS-DOS prompt window.
 * Surf with no arguments gives a usage message if no list file is found.
 * Surf shows summaries as it fetches pages, and stops upon any keystroke.
 *
 * -----
 *
 * Copyright (C) 1999 Glenn Scheper. This program is free software;
 * you can redistribute it and/or modify it under the terms of the GNU
 * General Public License as published by the Free Software Foundation;
 * either version 2 of the License, or any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details. You should have
 * received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * -----
 *
 * The following examples all indicate a "-<Letter>" as the first
 * MS-DOS command line argument to invoke Surf, which means that
 * Surf will use the default list file name, "list.txt", in the
 * current directory.
 *
 * If you want to name a different list file, supply an exactly
 * specified file name as the first Surf command line argument,
 * and then let the "-<Letter>" be the second Surf argument.
 *
 *
 * ---- ARGUMENT: MINUS F -----
 *
 * Surf can be started by having it fetch a web page at a specified URL,
 * from which it will siphon off all the links to create a new list file.
 * Then you can use any editor or browser to read the locally saved file,
 * and edit the list file to remove asterisks to other links of interest.
 *
 * Usage: SURF -F URL1 [URL2 ...]
 *
 * For example, saying surf -f www.hughes.net/~scheper creates a
 * local file named ~scheper in the current directory, like this:
 *
 * ------------------------------ sample ------------------------------
<BASE HREF=http://www.hughes.net/~scheper/>
<TITLE> Glenn Scheper Knows! </TITLE>
<PRE><A HREF = "http://www.hughes.net/~scheper/"> Glenn Scheper Knows! </A>
 Glenn Scheper Knows!

 Jeannette and Glenn.

 Homepage of Glenn Scheper, featuring a huge collection of
 links to christian, gnostic, mystery, prophecy, revelation,
 and end-times web pages. I also give away a free web browser
 SURF and other Windows 95 freeware, and present some novel
 heretical christian spiritual ideas.

	<a href = #Links >

 Links

	<a href = #Rings >

 Rings
....
 * ---------------------------- end sample ----------------------------
 *
 * Surf maintains a list file that catalogs all of the URLs encountered
 * so far, both those that have been fetched and all other novel URLs.
 *
 * A particular list file name can be specified in the optional first
 * argument to SURF. It must give a fixed file name without wildcards,
 * but the argument may contain a drive letter and it may show a path.
 * If no list file name is specified, the default filename "list.txt"
 * in the current directory is assumed.
 *
 * In the list file, each listed URL appears on one line by itself, in
 * text column one. Additional note lines may follow each URL, indented
 * with a space. If there are any note lines, the first note may start
 * with "#" ( number sign ) to show that the preceding URL has already
 * been fetched, or may start with "*" ( asterisk ) to show that since
 * the time the preceding novel URL was first discovered by SURF in a
 * web page, the user has not authorized SURF to fetch that resource.
 *
 * For example, saying surf -f http://www.hughes.net/~scheper yields:
 *
 * ------------------------------ sample ------------------------------
 SURF URL file: 'list.txt'
 bible biblkwic book code details divide executable git glenn god

 # prevents re-downloading web pages that you already have.
 * prevents downloading novel links. Delete * to SURF them.

	----LOCAL FILES----
http://www.hughes.net/~scheper/
 #246 ~scheper
 Glenn Scheper Knows!
 surf scheper code glenn bible hughes jesus release details txt
 executable god book git version biblkwic divide gutenberg people

http://www.hughes.net/~scheper
 #000 Error.00301

	----NOVEL LINKS----
http://members.tripod.com/~TempleOfGnosis/index.html
 * An area of: Glenn Scheper Knows!

http://www-personal.umich.edu/~airyn/spiritring.html
 * Spirit Ring

http://www.hughes.net/~scheper/anci.htm
 * anci: 693 SURF links: history war section ancient world information archaeology american

http://www.hughes.net/~scheper/astr.htm
 * astr: 1704 SURF links: dream earth life spiritual people world just astrology dreams

http://www.hughes.net/~scheper/bibl66bk.c
 * Source Code

http://www.hughes.net/~scheper/bibl66bk.exe
 * Executable file
....
 * ---------------------------- end sample ----------------------------
 *
 * All of the fetched URLs appear first in the list file, sorted by quality.
 * That helps you organize your reading to start with the best saved pages.
 *
 * In the notes after each fetched URL are first a "#" to block refetching,
 * then three page quality indicator digits that are based on the number of
 * sentences, the number of unique uncommon words, and the number of links
 * found on that page, then the local filename in which the page was saved.
 * The next note shows the title text extracted from the page, and finally
 * two note lines show the most frequent uncommon words used on that page.
 *
 * Unfetched URLs appear last. Every unfetched URL is followed by a text
 * line containing the longest anchor text ever associated with that URL.
 * An asterisk is added to every novel URL that surf encounters, to keep
 * surf from trying to download all the internet, everywhere, all at once.
 *
 * ---- ARGUMENT: None, or just an exact list filename -----
 *
 * You select URLs that have interesting titles for future downloading
 * by editing the list file to remove each asterisk after desired URLs.
 * Then, you execute SURF again to fetch those pages that you selected.
 * URLs not marked by either "#" or "*" will be fetched in a random order.
 * Multiple copies of surf can be run by creating multiple MS-DOS windows.
 *
 * Usage: SURF
 * Usage: SURF [Exact File specification. If none, default is LIST.TXT]
 *
 * ---- ARGUMENT: MINUS A, also MINUS B -----
 *
 * Surf can also be started to have it create a new list file or augment
 * an existing list file by parsing preexisting local HTML files. Those
 * might be files that have been downloaded using surf, or files found
 * in your conventional browser's cache folders. Local files stored by
 * SURF always contain an HTML BASE markup tag, so that relative URLs
 * can later be resolved, but files cached by conventional browsers
 * typically have no BASE tag. If a file has no BASE tag, surf can only
 * extract the URLs in it that are absolute, that is, fully specified.
 *
 * Usage: SURF -A [Filespec, else defaults to *]
 * Usage: SURF -B [Filespec, else defaults to *]
 *
 * Use the -A form to extract all of the novel unfetched anchor URLs from
 * the specified local files for possible further explorations with SURF.
 * Use the -B form if you have a lot of files, and you are only interested
 * in cataloging the fetched URLs, as when you want to next create a links
 * web page using the -W feature. I also use -B to catalog what I have, to
 * use that list file to start future searches, to avoid refetching files.
 *
 * For example, I can review my recent Netscape browsings by saying:
 * surf \review -a "C:\Program Files\Netscape\Users\gscheper\Cache"
 *
 * ---- ARGUMENT: MINUS Q -----
 *
 * Surf can also be started by having it query a number of search engines
 * with some search terms. Because the same terms will be sent to some 40
 * diverse search engines, it's probably best to avoid boolean expressions
 * and complicated syntax. Use lowercase generally, except when searching
 * for proper names. Surf tries to invoke the boolean AND form of queries,
 * but some search engines may treat multiple terms with the boolean OR.
 *
 * Usage: SURF -Q KEYWORDS
 *
 * When invoked with -Q, the keywords are mixed with some 40 predefined
 * partial search engine query URLs. Those complete URLs are then added
 * to the list file. They are added to the list without any asterisks,
 * so that surf will automatically go on to send out all those queries.
 * When those search engine result pages are parsed, all of the anchor
 * URLs are added to the list as usual, which typically consist of not
 * only the matching web page hits, but also the search engine's other
 * internal URLs, additional result page URLs, advertisement URLs, and
 * URLs to perhaps translate or search for similar pages, etc.
 *
 * Surf uses a few simple heuristics to recognize which URLs represent
 * the matching web page hits, so that they can also be listed without
 * an asterisk. After fetching the first search engine result page, or
 * a few, the desired matching web pages will begin to get fetched too.
 *
 * When some fetched page summaries interest you, stop surf by hitting
 * any key, or leave it running and read the local file by opening yet
 * another DOS prompt, or browse and open them using Windows Explorer.
 *
 * Soon there will be a directory full of both query result pages and
 * the desired matching web pages. The query result page filenames all
 * start with an underscore, so if you have no further use for them
 * you can delete them all by saying "del _*". Then you may wish to
 * delete that list file, then make a new list using SURF -A or -B.
 *
 * The list serves as your guide to quality reading material. Start
 * with the first files listed, because they probably have the most
 * apparent sentences, the greatest vocabulary, and the most links.
 *
 * For example, letting "surf -q browser freeware" run until all the hits
 * were exhausted fetched 49 query result pages, 820 web pages, and after
 * rebuilding LIST without the query pages, there were 10,000 novel links.
 *
 * You may wish to experiment with booleans and special punctuation
 * like double quotes, which indicate phrases to some search engines.
 * To enter double quotes, backslash them inside other double quotes.
 * For example, SURF -q "\"Exact phrase match\""
 *
 * ---- ARGUMENT: MINUS R -----
 *
 * Surf can automatically reorganize files into new directories according
 * as they contained words of interest that you edited into the list file.
 *
 * Usage: SURF ListFileSpec -R [Filespec, else defaults to *]
 *
 * Surf expands local file specifications for -A, -B and -R recursively,
 * so "*" will match all files in the current directory, and all files in
 * any directory at any depth under the current directory. That allows you
 * to arrange all your saved web page files in sub-directories according
 * to their predominant topics.
 *
 * Surf's reorganize feature will only move files that contain a BASE tag,
 * so as to prevent surf from recursively destroying a whole file system.
 *
 * Under -R, surf will not rewrite the URL list file, so you can reference
 * a single constant file name to reorganize fetched files again and again.
 *
 * Surf -R used with the following list file will create all three of the
 * needed destination directories ( \Y, \Y\ANCI and \Y\NOMATCH ).
 * ------------------------------ sample ------------------------------
@\y\anci ancient archaeolog* caesar* civil histor* holocaust medieval ...
@\y\nomatch
 * ---------------------------- end sample ----------------------------
 *
 * ---- ARGUMENT: MINUS W -----
 *
 * Surf can convert its list file into an HTML links page for sharing
 * again on the internet. The links will be shown sorted according to
 * their overall page quality, or they can be grouped into categories
 * if some of the '@' reorganizing lines also appear in the list file.
 * The title shows the most-used words counted from the most-used words.
 *
 * Usage: SURF -W > LINKS.HTM
 *
 * For example, SURF -w > LINKS.HTM created this from the earlier list:
 * ------------------------------ sample ------------------------------
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
<HTML>
<HEAD>
<TITLE>2 SURF links: bible biblkwic book code details divide executable git glenn god</TITLE>
</HEAD>
<BODY>
<H1>2 SURF links</H1>
<H2>bible biblkwic book code details divide executable git glenn god</H2>


<p> 1. <a href ="http://www.hughes.net/~scheper/">
 Glenn Scheper Knows!</a><br>
 surf scheper code glenn bible hughes jesus release details txt
 executable god book git version biblkwic divide gutenberg people

<p> 2. <a href ="http://www.hughes.net/~scheper">
No Title</a><br>

<P>
</BODY>
</HTML>
 * ---------------------------- end sample ----------------------------
 *
 * Obviously, some manual intervention would have been desirable to groom
 * that list file before sharing it with the world. In the case above, the
 * original URL that lacked a final / got a 301 redirection, but the original
 * URL was kept in the list to prevent fetching it if ever encountered again.
 *
 * -----
 *
 * Users who embrace a windowed life may try the following
 * method to avoid using the MS-DOS prompt to invoke SURF:
 * Activate Windows Explorer. Use it to create a new folder,
 * enter that folder and create a new text file, which text
 * file contains the surf invocation line as you would type
 * at the command line, and which file name ends with a .BAT
 * extension. Then double click the text file to execute it.
 *
 * -----
 *
 * Download the latest SURF Windows 95/NT/98 executable file at:
 *		<URL:http://www.hughes.net/~scheper/surf.exe>
 *
 * Then copy SURF.EXE to a directory in your path, for example, to the
 * filename C:\WINDOWS\SURF.EXE. That's all the installation it needs.
 *
 * Download the latest SURF C++ source file at:
 *		<URL:http://www.hughes.net/~scheper/surf.cpp>
 *
 * Download the latest SURF read-me ASCII text file at:
 *		<URL:http://www.hughes.net/~scheper/surf.htm>
 *
 * Download the latest pkzip of ( SURF.CPP, SURF.EXE, SURF.HTM ) at:
 *		<URL:http://www.hughes.net/~scheper/surf.zip>
 *
 * Download the GNU General Public License at:
 *		<URL:http://www.fsf.org/copyleft/gpl.html>
 *
 * Contact SURF's author Glenn Scheper at:
 *		<URL:mailto:scheper@hughes.net>.
 *
 * Glenn Scheper's personal home page is at:
 *		<URL:http://www.hughes.net/~scheper/>.
 *
 * -----
 *
 * Compiled under Microsoft Developer Studio and
 * Microsoft Visual C++ 5.0 compiler as follows:
 *
 * Go where you make new VC++ projects. ( I use C:\I\VC. )
 * Make a new directory SURF.
 * Copy SURF.CPP to directory SURF.
 * Execute MS VC++.
 * Do: File | New...
 * Click on the "Projects" tab.
 * Make sure or fix so Location says "C:\I\VC" or whatever.
 * Scroll down to "Win32 Console Application".
 * Tab to "Project Name". Type in "SURF". Click OK.
 * A new workspace for SURF will appear.
 * Click on the "File View" tab.
 * Right-Click on "SURF Files".
 * Select "Add Files to Project...".
 * Select the file "SURF.CPP". Click OK.
 * Do: Build | "Set Active Configuration..."
 * Select "SURF - Win32 release". Click OK.
 * Select Project, Settings...
 * Click on the "C/C++" tab.
 * Select Category=Code Generation, Run-time libraries=Multithreaded.
 * Click Ok.
 * Do: Build, Rebuild all.
 * Copy ./release/SURF.exe to a directory in the path. Try C:\WINDOWS.
 *
 * Note: If you get two undefined function name errors after compiling,
 * it means that you forgot the step to choose Multithreaded libraries.
 */

// Short program name and version token; intended for the http agent line.
char SurfVersion[] = "SURF v1.4";

// Longer version-and-release-date banner; GiveUsageMessage( ) prints it
// in front of the copyright notice.
char SurfVersionAndDate[] = "SURF version 1.4, 1999-07-28.";

#include <sys/types.h>
#include <sys/stat.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <io.h>
#include <fcntl.h>
#include <conio.h>
#include <ctype.h>
#include <direct.h>
#include <afx.h>
#include <afxwin.h>
#include <afxinet.h>
#include <iostream.h>

// A drastic debugging measure: an execution trace macro.
// Change the "#if 0" below to "#if 1" to make every Flow( n ) print its
// number to stderr. As shipped, Flow( n ) expands to nothing at all, so
// its argument is never evaluated.
#if 0
#define Flow( TraceNumber ) fprintf( stderr, " %d, ", TraceNumber )
#else
#define Flow( TraceNumber )
#endif

// Write the command line usage summary to stderr.
//
// The output is three pieces written in order: the synopsis of the
// command forms, one line built from SurfVersionAndDate plus the
// copyright holder, and the warranty / redistribution notice.
// Takes no arguments and returns nothing.
void GiveUsageMessage( )
{
	// Everything that is printed ahead of the version line.
	static const char UsageSynopsis [] =
"Surf fetches HTML web pages using HTTP, creating local files and a URL list.\n"
"Use the Windows 95 MS-DOS prompt, go to a clean directory, try the -Q or -F.\n"
"When surf stops fetching, there may remain more novel URLs in the list file.\n"
"Edit the list file and remove the asterisk after any URLs you wish to fetch.\n"
"SURF [LIST] -Q QUERY KEYS...; Queries several search engines, fetches pages.\n"
"SURF [LIST] -F URL1 [URL2...]; Fetches the web pages for [HTTP://]URL1, etc.\n"
"SURF [LIST]; Resumes fetching non-asterisked URLs recorded in the list file.\n"
"SURF [LIST] -B [*]; Reparses cached HTML files to list all of the BASE URLs.\n"
"SURF [LIST] -A [*]; Like -B but also lists all of the unfetched anchor URLs.\n"
"SURF [LIST] -R [*]; Rearranges local HTML files according to list '@' lines.\n"
"SURF [LIST] -W > LINKS.HTM; Writes an HTML page about the URLs in list file.\n"
"SURF will use fixed filename LIST.TXT if you do not specify a LIST filename.\n";

	// Everything that is printed after the version line.
	static const char UsageNotice [] =
"This software comes with ABSOLUTELY NO WARRANTY; Read surf.htm for details.\n"
"This is free software, and you are welcome to redistribute it under certain\n"
"conditions; Read surf.htm for details. scheper@hughes.net. You can download\n"
"zipped surf.zip or surf.cpp, surf.exe, surf.htm at www.hughes.net/~scheper/.\n";

	fputs( UsageSynopsis, stderr );

	// The only line that needs formatting: the version banner goes first.
	fprintf( stderr, "%s Copyright (C) 1999 Glenn Scheper.\n", SurfVersionAndDate );

	fputs( UsageNotice, stderr );
}

/*
 * Here's a poor-man's glossary of the error conditions that may occur.
 * Don't be worried about 301/302: Surf will automatically follow them.
 *
 * 200 OK request completed
 * 201 CREATED object created, reason = new URI
 * 202 ACCEPTED async completion (TBS)
 * 203 PARTIAL partial completion
 * 204 NO_CONTENT no info to return
 * 300 AMBIGUOUS server couldn't decide what to return
 * 301 MOVED object permanently moved
 * 302 REDIRECT object temporarily moved
 * 303 REDIRECT_METHOD redirection w/ new access method
 * 304 NOT_MODIFIED if-modified-since was not modified
 * 400 BAD_REQUEST invalid syntax
 * 401 DENIED access denied
 * 402 PAYMENT_REQ payment required
 * 403 FORBIDDEN request forbidden
 * 404 NOT_FOUND object not found
 * 405 BAD_METHOD method is not allowed
 * 406 NONE_ACCEPTABLE no response acceptable to client found
 * 407 PROXY_AUTH_REQ proxy authentication required
 * 408 REQUEST_TIMEOUT server timed out waiting for request
 * 409 CONFLICT user should resubmit with more info
 * 410 GONE the resource is no longer available
 * 411 AUTH_REFUSED couldn't authorize client
 * 500 SERVER_ERROR internal server error
 * 501 NOT_SUPPORTED required not supported
 * 502 BAD_GATEWAY error response received from gateway
 * 503 SERVICE_UNAVAIL temporarily overloaded
 * 504 GATEWAY_TIMEOUT timed out waiting for gateway
 *
 * 12001 INTERNET_OUT_OF_HANDLES
 * 12002 INTERNET_TIMEOUT
 * 12003 INTERNET_EXTENDED_ERROR
 * 12004 INTERNET_INTERNAL_ERROR
 * 12005 INTERNET_INVALID_URL
 * 12006 INTERNET_UNRECOGNIZED_SCHEME
 * 12007 INTERNET_NAME_NOT_RESOLVED
 * 12008 INTERNET_PROTOCOL_NOT_FOUND
 * 12009 INTERNET_INVALID_OPTION
 * 12010 INTERNET_BAD_OPTION_LENGTH
 * 12011 INTERNET_OPTION_NOT_SETTABLE
 * 12012 INTERNET_SHUTDOWN
 * 12013 INTERNET_INCORRECT_USER_NAME
 * 12014 INTERNET_INCORRECT_PASSWORD
 * 12015 INTERNET_LOGIN_FAILURE
 * 12016 INTERNET_INVALID_OPERATION
 * 12017 INTERNET_OPERATION_CANCELLED
 * 12018 INTERNET_INCORRECT_HANDLE_TYPE
 * 12019 INTERNET_INCORRECT_HANDLE_STATE
 * 12020 INTERNET_NOT_PROXY_REQUEST
 * 12021 INTERNET_REGISTRY_VALUE_NOT_FOUND
 * 12022 INTERNET_BAD_REGISTRY_PARAMETER
 * 12023 INTERNET_NO_DIRECT_ACCESS
 * 12024 INTERNET_NO_CONTEXT
 * 12025 INTERNET_NO_CALLBACK
 * 12026 INTERNET_REQUEST_PENDING
 * 12027 INTERNET_INCORRECT_FORMAT
 * 12028 INTERNET_ITEM_NOT_FOUND
 * 12029 INTERNET_CANNOT_CONNECT
 * 12030 INTERNET_CONNECTION_ABORTED
 * 12031 INTERNET_CONNECTION_RESET
 * 12032 INTERNET_FORCE_RETRY
 * 12033 INTERNET_INVALID_PROXY_REQUEST
 * 12034 INTERNET_NEED_UI
 * 12036 INTERNET_HANDLE_EXISTS
 * 12037 INTERNET_SEC_CERT_DATE_INVALID
 * 12038 INTERNET_SEC_CERT_CN_INVALID
 * 12039 INTERNET_HTTP_TO_HTTPS_ON_REDIR
 * 12040 INTERNET_HTTPS_TO_HTTP_ON_REDIR
 * 12041 INTERNET_MIXED_SECURITY
 * 12042 INTERNET_CHG_POST_IS_NON_SECURE
 * 12043 INTERNET_POST_IS_NON_SECURE
 * 12044 INTERNET_CLIENT_AUTH_CERT_NEEDED
 * 12045 INTERNET_INVALID_CA
 * 12046 INTERNET_CLIENT_AUTH_NOT_SETUP
 * 12047 INTERNET_ASYNC_THREAD_FAILED
 * 12048 INTERNET_REDIRECT_SCHEME_CHANGE
 * 12110 FTP_TRANSFER_IN_PROGRESS
 * 12111 FTP_DROPPED
 * 12130 GOPHER_PROTOCOL_ERROR
 * 12131 GOPHER_NOT_FILE
 * 12132 GOPHER_DATA_ERROR
 * 12133 GOPHER_END_OF_DATA
 * 12134 GOPHER_INVALID_LOCATOR
 * 12135 GOPHER_INCORRECT_LOCATOR_TYPE
 * 12136 GOPHER_NOT_GOPHER_PLUS
 * 12137 GOPHER_ATTRIBUTE_NOT_FOUND
 * 12138 GOPHER_UNKNOWN_LOCATOR
 * 12150 HTTP_HEADER_NOT_FOUND
 * 12151 HTTP_DOWNLEVEL_SERVER
 * 12152 HTTP_INVALID_SERVER_RESPONSE
 * 12153 HTTP_INVALID_HEADER
 * 12154 HTTP_INVALID_QUERY_REQUEST
 * 12155 HTTP_HEADER_ALREADY_EXISTS
 * 12156 HTTP_REDIRECT_FAILED
 * 12157 INTERNET_SECURITY_CHANNEL_ERROR
 * 12158 INTERNET_UNABLE_TO_CACHE_FILE
 * 12159 INTERNET_TCPIP_NOT_INSTALLED
 * 12160 HTTP_NOT_REDIRECTED
 */

/*
 * History of Glenn Scheper's own versions of SURF:
 * Download latest source and Win-95 executable at:
 * <URL:http://www.hughes.net/~scheper/surf.cpp>
 * <URL:http://www.hughes.net/~scheper/surf.exe>
 *
 * Various "PRE-SURF" program versions parsed HTML files, etc.
 *
 * 12-31-97 First reasonably functional SURF.EXE fetches HTML files
 * at random as given in a URL list file, parses the HTML files and
 * saves the text + links parts on local drive, and grooms URL list.
 *
 * 2-23-98 Fixed lack of ( LPCTSTR ) typecast on a TCHAR named szErr.
 * 2-23-98 Aggressive work adding file vocabulary info to list file.

 * SURF Version 1.2, Mar 22 1998:
 * Started a version number, to fit the GNU licensing requirements.
 * Added GNU style copyright notice to source and to usage message.
 * Totally reorganized command line invocation format with a -FLAG.
 * Fixed random numbers: 32K was too small to span a huge URL list.
 * Changed the sort on cached URLs to use sum of 3 quality indices.
 * Improved the handling of titles and URL notes on AREA and FRAME.
 * Added the URL List -> HTML file and Query URLs -> List features.

 * SURF Version 1.3, April 12 1998:
 * I no longer require a user to type http:// before the url argument.
 * I now output the two lines of most used words when fetching a file.
 * I changed to no-fetch asterisk all but four popular search engines.
 * I fixed the LocalFilename to use \ rather than / as path separator.
 * I fixed so final word of URL annotation is not lost during fetches.
 * I added most-used-of-the-most-used-words line to LIST and LIST.HTM.
 * I added File Naming Control "@path word word..." idea to LIST file.
 * I added File Naming Control and -M option to rearrange local files.
 * I added URL annotation based on title text for AREA and FRAME tags.
 * I changed third quality metric from word count to vocabulary count.

 * SURF Version 1.4, July 28, 1999:
 * Fixed my web address to have a "www." prefix in correct domain name.
 * Fixed bad URL creation that said "An Area of..." or "A Frame of...".
 * Omit the * from "* A Frame of..." note, so SURF will fetch such URL.
 * Fixed possible crash for lack of a check: "&& LastUrlPtrSlot != -1".
 * Convert any HTML BASE tags to effete "xase" in the saved text files.
 * Recognize HTML status 302 as redirect, record URL specially in list.
 * Save redirected URL in list without asterisk, so it will be fetched.
 * The -Q option will now automatically initiate a file fetching phase.
 * Replaced -Q query URLs with an updated collection of search engines.
 * Record search result URLs in list without asterisks when recognized.
 * Allow URL paths with .CGI extensions -- used in some search engines.
 * Added more search engine result parsing heuristics to the -Q smarts.
 * Pick even nicer filename letters: especially avoid [012] and [oilz].
 * I have ripped out the -M command line option that moved files about.
 * There is a new feature -R to rearrange files better than the old -M
 * feature, because -R re-parses the local HTML files instead of using
 * the LIST, so -R can use the file's entire uncommon-words vocabulary
 * to match the user's favorite words entered on @path word word lines.
 * Notice that -R will not be correcting the filenames of URLs in LIST,
 * nor re-write LIST at end, so -R can point to a reusable naming file.
 * So, after doing -R to rearrange files, do -A to recreate a new LIST.
 * The -R feature will not move any file unless it contains a BASE tag.
 * The -R feature now accepts '*' at the end of a word to finish match.
 * The URL fetching phase will not do file rearrangement. Use -R after.
 * Ran lint. Not embarrassing. Checked more return values, added notes.
 * Made the argv[1] optional, assuming file LIST if argv[1] is a -flag.
 * A few users found it cryptic, so I rewrote the entire usage message.
 * Fixed a bug: The -W feature wrote /BODY end tag after /HTML end tag.
 * Trimmed the common words from 1612 to 719 short non-trademark words.
 * Trimmed the CommonWordList format from 3-15 to 3-7 chars plus space.
 * The -Q query topics do character escaping for / ; and ? to %XX form.
 * Added quickie -B variant of -A omitting unfetched urls to run faster.
 * Change title annotation starting with '#' or '*' to a '!' character.
 * Use an unequal ( 9-7-5 ) weighting to sort the three quality digits.
 * Add title words to vocabulary, so AreaOf/FrameOF same for -F and -A.
 * Added a test for keyins during the fetching phase to abort download.
 * Here's a great idea came in from a user: Change the random filenames
 * into meaningful filenames using the final file part of the URL path.
 * Fixed bug: 301 etc. redirected URL lost if bare domain with no path.
 * Aggressive study of best search engine URLs and result page parsing.
 * Added a <PRE> tag, so wrapped text can be viewed nicely in browsers.
 * Added BASE as an URL: anchor, so can browse to original file easily.
 * Fixed a bug that %20 etc. in urlpath part of URL got expanded twice.
 * Homogenized file extensions to *.HTM and LIST.TXT, for Win impaired.
 * Reduced list to 38 world wide search engines, in multiple languages.
 * Prejudice file renaming so all query URLs go into DefaultNamingPath.
 * Fix a bug that www.com creates filename COM, etc., stopping program.
 * Fixed failure to add domain-only anchor lacking final / to URL list.
 * I gave myself months to write the user guide, check for bugs... Bye!
 */

/********************************************************

Known bug with no improvement intended:
Fatal Error believed to be outside of SURF.EXE: Attempting to fetch
<URL:http://wwcol.com/con/> caused this program to hang up forever,
and attempts to kill the program with Ctrl+Alt+Del became confused,
ending in rebooting the Win 95 ( 4.00.950 ) computer. The same thing
including the messy reboot happened when trying to fetch that URL
with Internet Explorer V.3 ( 4.70.1215 ), which last said "Website
found. Waiting for reply". It also happened to GIT.EXE, my other
utility based on the sample program TEAR distributed with MS VC++.
That website server is an "Apache/1.2.6 Red Hat", and I'd suspect
the confusion comes from that path /con/ somehow getting related
to the "console" filename, con, which may never get an end-of-file.
Curiously, Netscape v.4.05 is able to fetch that url without any
problem. The several other programs that failed on the bare /con/
path were able to fetch from /con/index.htm without any problem.
Some other host with no /con/ directory just returned 404 status.
I do not have a workaround, and if that happens to you, you will
have to reboot, therefore losing the contents of the LIST file.
You can reconstruct the LIST file from the already gathered .HTM
files using SURF -A, but there will be no record of the 301, 302,
404, etc. URLs which only existed in the LIST. Before restarting
SURF, note the last URL showing on the console, and manually edit
the LIST to remove that URL, otherwise SURF will halt again later.

Known bug with no improvement intended:
When surf fetched a dictionary file, which had 87000 ascii words
in ascending order, the vocabulary sorting mechanism hit a worst
case, and took just about forever to process that input file.

======

Stuff on the wish list:

Maybe someday: If real anchor text is encountered for any URL that
has already been saved as "an area of..." or "a frame of...", the
real anchor text should replace the SURF-generated anchor text.

Maybe someday: The exact vocabulary list changes between -F or -Q
fetches and a later -A or -B reparse, which affects URL sort order.

Maybe someday: Add some new mode that will compare all the locally
collected files, to remove any redundant files; based on having
the identical base url, or possibly other recognition heuristics.

Maybe someday: Add a mode to refresh all collected files by doing
a query of the host with a header specifying only if the remote
file has changed since the local file's creation date and time.
That should also honor 302's to clean up old files not so fixed,
and should honor 404's etc. to delete files no longer accessible.

Maybe someday: Add a feature to match vocabularies, so user can pick
one file, and other similar files will be accumulated around it, etc.

Maybe someday: If -A or -B is parsing a local file with no BASE tag,
put a URL in LIST such as "file://filename.htm" Otherwise, all local
files without base tags overwrite same "://" and merge into one item.

Maybe someday: Do not compromise on the URL parse, by a too-early test
for no locn && no pathpart in the routine CombineLinkUrlWithBase( ),
because I've seen pages with BASE=query, Anchors=new query part only.

Maybe someday: Support the ftp: and gopher: methods.

Maybe someday: A multithreaded version to really devour Internet.

********************************************************/


	// CommonWordList is the table of common words to ignore.
	// It is one long string made of fixed 8-byte records ( adjacent
	// string literals are concatenated by the compiler ). Each record
	// is a lowercase word of 3-7 chars, then one space, then dots that
	// pad the record out to exactly 8 bytes.
	// Records are kept in ascending ASCII order, between the "! " low
	// sentinel and the "~ " top sentinel. Any word added here must keep
	// both the ordering and the 8-byte width.
	// NOTE(review): this table is said to operate like SortedEntityNames,
	// so it is presumably binary searched the same way - confirm against
	// the lookup routine.
	// This string is first to not overtax compiler heap.

	// I'd appreciate feedback to better adjust this list:
	// ( and for anything else you'd like in this program! )
	// E-mail me: Glenn Scheper - scheper@hughes.net

char CommonWordList [] = {
"! ......"  // low sentinel
"aber ..."
"able ..."
"about .."
"above .."
"accept ."
"across ."
"act ...."
"acts ..."
"actual ."
"add ...."
"added .."
"adding ."
"adds ..."
"after .."
"again .."
"ago ...."
"ahead .."
"aka ...."
"all ...."
"allow .."
"allows ."
"almost ."
"along .."
"als ...."
"also ..."
"alt ...."
"always ."
"among .."
"and ...."
"anti ..."
"any ...."
"anyone ."
"apart .."
"appear ."
"apply .."
"apr ...."
"are ...."
"area ..."
"areas .."
"aren ..."
"around ."
"aside .."
"ask ...."
"asked .."
"asking ."
"assume ."
"auch ..."
"auf ...."
"aug ...."
"aus ...."
"author ."
"avait .."
"ave ...."
"away ..."
"back ..."
"bad ...."
"became ."
"become ."
"been ..."
"before ."
"began .."
"begin .."
"begun .."
"behind ."
"bei ...."
"being .."
"belong ."
"below .."
"beside ."
"beyond ."
"big ...."
"bit ...."
"both ..."
"bottom ."
"bring .."
"brings ."
"build .."
"built .."
"but ...."
"button ."
"byte ..."
"bytes .."
"call ..."
"called ."
"calls .."
"came ..."
"can ...."
"cannot ."
"carry .."
"case ..."
"cause .."
"caused ."
"causes ."
"cease .."
"ceased ."
"ceases ."
"chance ."
"change ."
"check .."
"choose ."
"chosen ."
"clean .."
"clear .."
"click .."
"close .."
"closer ."
"com ...."
"come ..."
"comes .."
"coming ."
"cool ..."
"copy ..."
"could .."
"couldn ."
"count .."
"create ."
"cut ...."
"dans ..."
"das ...."
"date ..."
"dates .."
"day ...."
"days ..."
"dec ...."
"del ...."
"dem ...."
"den ...."
"dept ..."
"der ...."
"des ...."
"did ...."
"didn ..."
"die ...."
"dies ..."
"diese .."
"dir ...."
"does ..."
"doesn .."
"doing .."
"don ...."
"down ..."
"due ...."
"durch .."
"during ."
"each ..."
"early .."
"edited ."
"editor ."
"edu ...."
"eight .."
"eighth ."
"ein ...."
"eine ..."
"einem .."
"einer .."
"either ."
"eleven ."
"else ..."
"end ...."
"ended .."
"ending ."
"ends ..."
"enough ."
"enter .."
"entry .."
"equal .."
"etait .."
"etc ...."
"even ..."
"ever ..."
"every .."
"exact .."
"except ."
"exe ...."
"exist .."
"exists ."
"extra .."
"fall ..."
"fallen ."
"falls .."
"far ...."
"fast ..."
"feb ...."
"feel ..."
"fell ..."
"few ...."
"field .."
"fields ."
"fifth .."
"fifty .."
"file ..."
"files .."
"filled ."
"final .."
"find ..."
"finds .."
"first .."
"fit ...."
"five ..."
"follow ."
"for ...."
"form ..."
"format ."
"forms .."
"found .."
"four ..."
"fourth ."
"frame .."
"frames ."
"free ..."
"fri ...."
"from ..."
"front .."
"ftp ...."
"full ..."
"fully .."
"fur ...."
"future ."
"gave ..."
"get ...."
"gets ..."
"gif ...."
"give ..."
"given .."
"gives .."
"goes ..."
"going .."
"gone ..."
"good ..."
"got ...."
"gov ...."
"haben .."
"had ...."
"hard ..."
"has ...."
"hast ..."
"hath ..."
"have ..."
"haven .."
"having ."
"hear ..."
"heard .."
"hears .."
"held ..."
"hence .."
"her ...."
"here ..."
"herein ."
"high ..."
"higher ."
"highly ."
"him ...."
"his ...."
"hold ..."
"holds .."
"home ..."
"hope ..."
"host ..."
"hot ...."
"hour ..."
"hours .."
"how ...."
"href ..."
"htm ...."
"html ..."
"http ..."
"ihm ...."
"ils ...."
"image .."
"images ."
"inc ...."
"indeed ."
"index .."
"infer .."
"info ..."
"inner .."
"input .."
"inside ."
"into ..."
"invoke ."
"ist ...."
"item ..."
"items .."
"its ...."
"itself ."
"jan ...."
"join ..."
"jpg ...."
"jul ...."
"jun ...."
"key ...."
"knew ..."
"know ..."
"known .."
"knows .."
"las ...."
"last ..."
"lastly ."
"late ..."
"lately ."
"later .."
"latter ."
"lay ...."
"lead ..."
"leads .."
"learn .."
"least .."
"leave .."
"led ...."
"left ..."
"length ."
"les ...."
"less ..."
"lesser ."
"lest ..."
"let ...."
"lib ...."
"like ..."
"liked .."
"likely ."
"likes .."
"liking ."
"line ..."
"lines .."
"link ..."
"linked ."
"links .."
"list ..."
"listed ."
"listen ."
"lists .."
"look ..."
"looked ."
"looks .."
"los ...."
"lot ...."
"lots ..."
"low ...."
"lower .."
"lui ...."
"made ..."
"main ..."
"mainly ."
"mais ..."
"make ..."
"makes .."
"making ."
"many ..."
"mar ...."
"may ...."
"maybe .."
"mean ..."
"means .."
"meant .."
"meet ..."
"meets .."
"menu ..."
"menus .."
"mere ..."
"merely ."
"mid ...."
"midst .."
"might .."
"mind ..."
"mine ..."
"minute ."
"misc ..."
"mixed .."
"moment ."
"mon ...."
"month .."
"months ."
"more ..."
"most ..."
"mostly ."
"moved .."
"moving ."
"much ..."
"multi .."
"must ..."
"myself ."
"nach ..."
"near ..."
"neat ..."
"neatly ."
"need ..."
"needed ."
"needs .."
"net ...."
"never .."
"new ...."
"news ..."
"next ..."
"nice ..."
"nicely ."
"nicht .."
"night .."
"nine ..."
"nineth ."
"ninety ."
"ninth .."
"nobody ."
"non ...."
"none ..."
"noone .."
"nor ...."
"not ...."
"note ..."
"noted .."
"notes .."
"notice ."
"noting ."
"nov ...."
"now ...."
"nur ...."
"occur .."
"occurs ."
"oct ...."
"oder ..."
"off ...."
"often .."
"old ...."
"once ..."
"one ...."
"ones ..."
"only ..."
"onto ..."
"open ..."
"opened ."
"org ...."
"other .."
"others ."
"ought .."
"our ...."
"ours ..."
"out ...."
"outer .."
"over ..."
"owing .."
"own ...."
"page ..."
"pages .."
"part ..."
"partly ."
"past ..."
"path ..."
"per ...."
"place .."
"placed ."
"places ."
"plain .."
"please ."
"plus ..."
"point .."
"points ."
"policy ."
"post ..."
"posted ."
"pour ..."
"pre ...."
"prefer ."
"press .."
"pretty ."
"prev ..."
"print .."
"prior .."
"pro ...."
"pub ...."
"put ...."
"puts ..."
"que ...."
"query .."
"qui ...."
"quick .."
"quite .."
"quot ..."
"raise .."
"rather ."
"read ..."
"reader ."
"reads .."
"ready .."
"real ..."
"really ."
"recent ."
"record ."
"refer .."
"refers ."
"remain ."
"remote ."
"rest ..."
"return ."
"rev ...."
"right .."
"rights ."
"role ..."
"rom ...."
"root ..."
"rule ..."
"rules .."
"run ...."
"said ..."
"same ..."
"san ...."
"sat ...."
"save ..."
"saved .."
"saw ...."
"say ...."
"saying ."
"says ..."
"second ."
"see ...."
"seeing ."
"seek ..."
"seem ..."
"seemed ."
"seems .."
"seen ..."
"sees ..."
"sein ..."
"seine .."
"self ..."
"send ..."
"sends .."
"sent ..."
"sep ...."
"ses ...."
"set ...."
"seven .."
"shall .."
"shalt .."
"she ...."
"should ."
"show ..."
"showed ."
"shown .."
"shows .."
"shtml .."
"sich ..."
"side ..."
"sie ...."
"sin ...."
"since .."
"sind ..."
"site ..."
"sites .."
"six ...."
"sixth .."
"sixty .."
"size ..."
"skip ..."
"some ..."
"soon ..."
"sooner ."
"sort ..."
"sought ."
"source ."
"spite .."
"spoken ."
"stand .."
"stands ."
"start .."
"starts ."
"stay ..."
"still .."
"stood .."
"stuff .."
"sub ...."
"submit ."
"such ..."
"sun ...."
"super .."
"sur ...."
"sure ..."
"surely ."
"table .."
"tabs ..."
"take ..."
"taken .."
"takes .."
"taking ."
"ten ...."
"tend ..."
"tended ."
"tends .."
"tenth .."
"term ..."
"terms .."
"text ..."
"texts .."
"than ..."
"thank .."
"thanks ."
"that ..."
"the ...."
"thee ..."
"their .."
"theirs ."
"them ..."
"then ..."
"thence ."
"there .."
"these .."
"they ..."
"thine .."
"thing .."
"things ."
"think .."
"thinks ."
"third .."
"thirty ."
"this ..."
"those .."
"thou ..."
"though ."
"three .."
"thu ...."
"thus ..."
"thy ...."
"till ..."
"time ..."
"times .."
"title .."
"told ..."
"too ...."
"took ..."
"top ...."
"total .."
"try ...."
"trying ."
"tue ...."
"twelve ."
"twenty ."
"twice .."
"two ...."
"type ..."
"types .."
"uber ..."
"und ...."
"under .."
"une ...."
"unless ."
"uns ...."
"until .."
"unto ..."
"update ."
"upon ..."
"url ...."
"urls ..."
"usage .."
"use ...."
"used ..."
"user ..."
"users .."
"uses ..."
"using .."
"very ..."
"via ...."
"view ..."
"viewed ."
"viewer ."
"views .."
"visit .."
"vol ...."
"von ...."
"wait ..."
"want ..."
"wanted ."
"wants .."
"was ...."
"wasn ..."
"way ...."
"ways ..."
"wed ...."
"week ..."
"weeks .."
"well ..."
"wenn ..."
"went ..."
"were ..."
"what ..."
"when ..."
"where .."
"which .."
"while .."
"who ...."
"whole .."
"whom ..."
"whose .."
"why ...."
"wide ..."
"widely ."
"wie ...."
"will ..."
"wilt ..."
"win ...."
"wir ...."
"with ..."
"within ."
"word ..."
"words .."
"work ..."
"works .."
"worse .."
"worth .."
"would .."
"wouldn ."
"write .."
"writes ."
"wrote .."
"www ...."
"year ..."
"years .."
"yes ...."
"yet ...."
"you ...."
"your ..."
"yours .."
"~ ......" // top sentinel
};

	// The 'LIST' file: Argv[1] may name an existing or a new file that
	// holds URL text lines; ScriptFilename is that name.
	// Each URL is pure text, like http://locn/path;param?query ( no #frag ).
	// Annotation lines, each starting with one space, may follow a URL line.

char *ScriptFilename = 0;
int   ScriptFileByteCount = 0;

	// At most one flag letter argument is given ( -q, -f, -b, -a, -r, -w ):
	//   SURF [LIST.TXT] -Q QUERY KEYS...
	//   SURF [LIST.TXT] -F URL1 [URL2...]
	//   SURF [LIST.TXT]
	//   SURF [LIST.TXT] -B [*]
	//   SURF [LIST.TXT] -A [*]
	//   SURF [LIST.TXT] -R [*]
	//   SURF [LIST.TXT] -W > LINKS.HTM

char ProgramIsDoingFetch          = 0;	// For the -F, -Q, and for no flag
char ProgramIsDoingLocalFileInput = 0;	// For -A, -B and -R flag
char DiscardingAllText            = 1;	// Because -A, -B and -R reparse local files
char ProgramIsDoingWrite          = 0;	// For the -W flag
char ProgramIsDoingMkdir          = 0;	// For the -R flag
char ProgramIsSavingURLs          = 1;	// False for -R or -B flags to save memory

	// The user's query is accumulated here from the command line.
	// Multiple terms are joined with space characters ( "%20" ).

char QueryTopicString[260];

	// SearchEngineQueries is the master list of search engines which I
	// might query. Every entry is a query URL that ends with the "name="
	// of the engine's query parameter.
	// NOTE(review): presumably the query topic string is appended to that
	// trailing "=" - confirm against the code that builds the query URLs.
	// Generally if possible, I've arranged for boolean AND usage.
	// All of the 65 engines listed below worked well and could be
	// parsed by surf, but some got occasional spurious hits, some
	// always use META tag or 302 redirection, some used a boolean
	// OR which made an avalanche of garbage to collect, and a few
	// other marginal ones will all soon be commented out. However
	// I will leave them in the second table, in case anyone wants
	// to manually add the deleted query URLs into the list file.

char * SearchEngineQueries [] = {
	// "http://209.237.128.202/cgi-bin/search.cgi?Angola=yes&Botswana=yes&Lesotho=yes&Malawi=yes&Mozambique=yes&Namibia=yes&RSA=yes&Swaziland=yes&Zambia=yes&Zimbabwe=yes&variations=true&searchbox=",
	// "http://austria.intersearch.net/cgi-bin/search?icon=at.gif&t=Standard%20Suche&q=",
	"http://crawler.de/cgi-bin/suche.C?Maschine=CrawlerNeu&limit=2&Menue=35&anzahl=10&query=",
	// "http://databot.com/results.tmpl$Search?startAt=1&db=search.db&max=30&wobodytextdata=",
	// "http://element.whatuseek.com/cgi-bin/texis/texis/meta?shock=0&arg=",
	// "http://google.com/search?num=100&sa=Google+Search&q=",
	"http://goto.com/d/search/p/newhoo/?Keywords=",
	"http://home.snap.com/search/directory/results/1,61,home-0,00.html?tag=st.sn.fdsb&keyword=",
	// "http://hotbot.com/text/default.asp?SM=MC&DC=100&DE=2&DV=0&RG=all&LG=any&_v=2&OPs=MDRTP&MT=",
	"http://infoseek.go.com/Titles?col=WW&nh=25&sv=IS&lk=1&sf=1&qt=",
	"http://ink.yahoo.co.uk/bin/query_uk?hc=0&hs=0&p=",
	// "http://ink.yahoo.com/bin/query?hc=0&hs=1&p=",
	"http://lcweb.loc.gov/Harvest/cgi-bin/BrokerQuery.pl.cgi?host=lcweb.loc.gov:8501&brokerqueryconfig=BrokerLC.cf&caseflag=off&maxresultflag=100&descflag=on&verbose=on&query=",
	// "http://mexico.web.com.mx/Cgi-Bin/bmexnew.cgi?oper=o&sens=N&tema=Todo&palabra1=",
	"http://odin.ingrid.org/cgi-bin/odin.cgi?from=0&n=20&key=",
	"http://recherche.lokace.com/cgi-bin/lokace?DATA=Web&AFF=1&FROM=all&MOTCLEF=",
	"http://search.dmoz.org/cgi-bin/search?cat=ALL&t=b&fb=0&search=",
	// "http://search.dogpile.com/search?fs=web&to=thirty&q=",
	"http://search.excite.com/search.gw?perPage=50&showSummary=true&search=",
	// NOTE(review): the next URL contains "method=0?target=", a second '?'
	// where an '&' looks intended - confirm before changing the string.
	"http://search.go2net.com/crawler?method=0?target=&region=0&rpp=30&timeout=30&hpe=10&format=power&general=",
	"http://search.kssk.net/cgi-bin/htsearch?boolean=ALL&words=",
	// "http://search.msn.com/results.asp?CO=100&MT=",
	"http://search.nationaldirectory.com/cgi-bin/query?query=",
	"http://search.thunderstone.com/texis/websearch/?max=20&q=",
	"http://search.yahoo.com/bin/search?o=1&za=and&h=s&n=100&p=",
	// "http://search2.virgilio.it/virgilio/owa/v3.search?db=v&op=and&qs=",
	"http://suchen.eule.de/cgi-bin/search.exe?startwith=1&a=3&db=&begriff=",
	"http://webcrawler.com/cgi-bin/WebQuery?showSummary=true&perPage=100&andOr=and&searchText=",
	"http://www.alingo.com/cgi-bin/perl/search.pl?search=",
	"http://www.altavista.com/cgi-bin/query?pg=aq&what=web&fmt=d&q=",
	"http://www.altavista.yellowpages.com.au/cgi-bin/query?what=web&q=",
	// "http://www.anzwers.com.au/cgi-bin/process_search.pl?pageid=search&firstresult=0&location=Australia&query_type=all+the+words&numperpage=100&result_type=detailed&query=",
	// "http://www.anzwers.com.au/cgi-bin/process_search.pl?pageid=search&firstresult=0&location=New%20Zealand&query_type=all+the+words&numperpage=100&result_type=detailed&query=",
	"http://www.arab.net/cgi-bin/AT-arabviewsearch.cgi?mode=simple&sp=sp&search=",
	// "http://www.buscador.com.mx/buscacgi/busca.cgi?buscar=",
	// "http://www.chinavista.com/cgi-local/hyper.cgi?boolean=All%20words&terms=",
	// "http://www.cipotes.com/cgi-bin/buscar.asp?buscar=",
	"http://www.digi-zone.com/search/search.cgi?mh=100&query=",
	"http://www.directhit.com/fcgi-bin/TopTenDemo.fcg?cmd=demo_qry&qry=",
	"http://www.disinfo.com/cgi-bin/htsearch.cgi?config=&restrict=&exclude=&matchesperpage=20&method=and&format=builtin-long&domain=WWW&words=",
	// "http://www.elcano.com/search.asp?A=q&Q=",
	// "http://www.elibrary.com/id/238/118/search.cgi?form=search&src-mags=checked&src-maps=checked&src-books=checked&src-news=checked&src-tvrad=checked&src-pics=checked&subject=index&moreoptions=&query=",
	// "http://www.encis.es/cgi-bin/AT-valencissearch.cgi?sp=sp&search=",
	"http://www.euroferret.com/?B=&THRESHOLD=100&P=",
	"http://www.god.co.uk/cgi-bin/osform.cgi/search?osform_template=s.oft&UN=0&CS=N&PM=N&NL=25&S=",
	"http://www.hiway.gr/b/htsearch?method=and&format=builtin-long&config=en&words=",
	"http://www.icqit.com/default.asp?IWTG=1&ILTG=1&ISTG=1&IGTG=1&MT=",
	"http://www.ilse.nl/?COMMAND=search_for&ANDOR=AND&EXTRACT=short&SEARCH_FOR=",
	// "http://www.infind.com/infind/infind.exe?time=10&query=",
	"http://www.jayde.com/cgi-bin/search.cgi?category_num=none&list_limit=100&brief=No&search_type=normal&query=",
	// "http://www.mamma.com/cgi-bin/parsearch2?lang=1&timeout=6&qtype=0&summaries=on&query=",
	"http://www.mckinley.com/search.gw?c=web&look=magellan&search=",
	// "http://www.metabug.com/cgibin/search.cgi?search_type=new&start=1&search_area=Web&timeout=3&output_number=30&summary=1&match=all&sort=relevance&keywords=",
	"http://www.nathan.de/cgi-bin/n3broker.cgi?LT=nathan.de/result.html&HPP=100&QS=",
	"http://www.northernlight.com/nlquery.fcg?qr=",
	// "http://www.rex.de/search.asp?typ=0&Op=AND&Count=100&js=0&Search=",
	// "http://www.savvysearch.com/search?op=a&cat=&q=",
	"http://www.searchopolis.com/cgi-bin/results.cgi?firstResult=0&displayfirstResult=1&request=",
	// "http://www.searchuk.com/cgi-bin/search?y=1&z=0&w=0&g=0&d=2&related=0&search=",
	"http://www.sofcom.com/cgi-bin/Dir/WWWsearch.cgi?term=",
	"http://www.sol.es/buscarplus.asp?selector=todas%20las%20palabras&donde=webs&formato=formato%20completo&buscar=",
	"http://www.stpt.com/search.asp?whatToSearch=web&query_val=",
	// "http://www.supersnooper.com/Search.dll?SearchString=",
	// "http://www.ukmax.com/search/Default.asp?DB=w&SM=MC&DC=100&DE=2&RG=WW&OPs=R&RD=RG&_v=2&MT=",
	"http://www.worldsearchcenter.com/search.cfm?type=a&limit=200&criteria=",
};

	// This similar list describes how surf is to analyze query results.
	// The entries come in pairs of strings, one pair per results page,
	// and the pairs are listed in ascending order of their path string.
	// However, these entries must allow for many redirection possibilities.
	// So, use an equality test that can discard initial "xxx." from domain.
	// The first string of a pair is an absolute pathpart, which must
	// stricmp exactly.
	// The second string is "\0" + netlocn + "\0" + " " + digit: the
	// netlocn must stricmp past any dot, and the final digit specifies
	// a recognition heuristic.
	// NOTE(review): the leading \0 presumably stops a backward scan over
	// the netlocn - confirm against the matching routine.
	// MatchingMethod:
	// 1: Extract all URLs that contain "http://" in anchor text.
	// 2: Extract all URLs that contain "http://" soon after </A>.
	// 3: Extract all URLs that contain "[0-9]+%" before the <A...>.
	// 4: Extract all URLs that contain "[0-9]+%" soon after </A>.
	// 5: Extract all URLs that contain "[0-9]+." before the <A...>.
	// 6: Extract all URLs that contain "[0-9]+." soon after </A>.
	// 7: Extract all URLs that contain "[0-9]+." in anchor text.
	// 8: Extract all URLs coming soon after an HTML <DT> tag
	// 9: Extract all URLs coming soon after an HTML <LI> tag

char * ParsableSearchEngineResultPages [] = {
	"/", "\000euroferret.com\000 1",
	"/", "\000hotbot.com\000 5",
	"/", "\000ilse.nl\000 2",
	"/", "\000savvysearch.com\000 8",
	"/Cgi-Bin/bmexnew.cgi", "\000mexico.web.com.mx\000 9",
	"/Harvest/cgi-bin/BrokerQuery.pl.cgi", "\000lcweb.loc.gov\000 1",
	"/Search.dll", "\000supersnooper.com\000 7",
	"/Titles", "\000infoseek.go.com\000 2",
	"/b/htsearch", "\000hiway.gr\000 8",
	"/bin/query", "\000ink.yahoo.com\000 9",
	"/bin/query_uk", "\000ink.yahoo.co.uk\000 9",
	"/bin/search", "\000yahoo.com\000 9",
	"/buscacgi/busca.cgi", "\000buscador.com.mx\000 9",
	"/buscarplus.asp", "\000sol.es\000 1",
	"/cgi-bin/AT-arabviewsearch.cgi", "\000arab.net\000 3",
	"/cgi-bin/AT-valencissearch.cgi", "\000encis.es\000 3",
	"/cgi-bin/Dir/WWWsearch.cgi", "\000sofcom.com\000 1",
	"/cgi-bin/WebQuery", "\000webcrawler.com\000 3",
	"/cgi-bin/buscar.asp", "\000cipotes.com\000 9",
	"/cgi-bin/edsa/chris", "\000edsa.com.ph\000 9",
	"/cgi-bin/htsearch", "\000search.kssk.net\000 8",
	"/cgi-bin/htsearch.cgi", "\000disinfo.com\000 1",
	"/cgi-bin/lokace", "\000recherche.lokace.com\000 5",
	"/cgi-bin/n3broker.cgi", "\000nathan.de\000 2",
	"/cgi-bin/odin.cgi", "\000odin.ingrid.org\000 1",
	"/cgi-bin/osform.cgi/search", "\000god.co.uk\000 5",
	"/cgi-bin/parsearch2", "\000mamma.com\000 4",
	"/cgi-bin/perl/search.pl", "\000alingo.com\000 2",
	"/cgi-bin/process_search.pl", "\000anzwers.com.au\000 3",
	"/cgi-bin/query", "\000altavista.com\000 5",
	"/cgi-bin/query", "\000altavista.yellowpages.com.au\000 2",
	"/cgi-bin/query", "\000nationaldirectory.com\000 5",
	"/cgi-bin/results.cgi", "\000searchopolis.com\000 1",
	"/cgi-bin/search", "\000austria.intersearch.net\000 1",
	"/cgi-bin/search", "\000dmoz.org\000 9",
	"/cgi-bin/search", "\000searchuk.com\000 2",
	"/cgi-bin/search.cgi", "\000209.237.128.202\000 1",
	"/cgi-bin/search.cgi", "\000jayde.com\000 5",
	"/cgi-bin/search.exe", "\000suchen.eule.de\000 2",
	"/cgi-bin/suche.C", "\000crawler.de\000 5",
	"/cgi-bin/texis/texis/meta", "\000whatuseek.com\000 9",
	"/cgi-local/hyper.cgi", "\000chinavista.com\000 9",
	"/cgibin/search.cgi", "\000metabug.com\000 2",
	"/cgibin/search_Web.cgi", "\000metabug.com\000 2",
	"/crawler", "\000go2net.com\000 2",
	"/d/search/p/newhoo/", "\000goto.com\000 2",
	"/default.asp", "\000icqit.com\000 4",
	"/fcgi-bin/TopTenDemo.fcg", "\000directhit.com\000 2",
	"/id/238/118/search.cgi", "\000elibrary.com\000 6",
	"/infind/infind.exe", "\000infind.com\000 9",
	"/nlquery.fcg", "\000northernlight.com\000 4",
	"/results.asp", "\000msn.com\000 2",
	"/results.tmpl$Search", "\000databot.com\000 2",
	"/search", "\000dogpile.com\000 9",
	"/search", "\000google.com\000 3",
	"/search", "\000savvysearch.com\000 8",
	"/search.asp", "\000elcano.com\000 2",
	"/search.asp", "\000rex.de\000 2",
	"/search.asp", "\000stpt.com\000 1",
	"/search.cfm", "\000worldsearchcenter.com\000 1",
	"/search.gw", "\000excite.com\000 2",
	"/search.gw", "\000mckinley.com\000 1",
	"/search/Default.asp", "\000ukmax.com\000 2",
	"/search/directory/results/1,61,home-0,00.html", "\000snap.com\000 1",
	"/search/search.cgi", "\000digi-zone.com\000 1",
	"/texis/search", "\000dogpile.com\000 9",
	"/texis/websearch/", "\000thunderstone.com\000 4",
	"/text/default.asp", "\000hotbot.com\000 5",
	"/virgilio/owa/v3.search", "\000virgilio.it\000 2",
};

// A non zero MatchingMethod means a search engine results page is being
// parsed, which influences how SURF processes the anchors in that page.
// Prejudice file renaming operations so query URLS go into default path.

int MatchingMethod       = 0;
int BaseURLHasAQueryPart = 0;

// Uninhibited is edited to zero by hand, to analyze the parse without
// following hit URLs.

int Uninhibited = 1;

// More trickiness used to recognize many search engine target URLs:
// text and tags are counted, so that the recency of certain tags can
// supply a clue. The "Last...At" markers start far in the past ( -1000 ).

int TokenNumber               = 0;
int LastNumWasAtTokenNumber   = -1000;
int LastUrlWasAtTokenNumber   = -1000;
int LastDtTagWasAtTokenNumber = -1000;
int LastLiTagWasAtTokenNumber = -1000;

char EverSeenAnyValidHTMLTags = 0;


// The vocabulary: an array of word item pointers, grown with malloc
// and realloc.
// Each item = ( one short count + one asciz string + another \0 ).
// The strings end in \0\0 to speed comparison with \0\1 words.

char **ppSortedVocabulary = 0;
int    nMallocVocabulary  = 0;
int    nSortedVocabulary  = 0;

// A place to resort the most-used uncommon word list by frequency.

char **ppResortedVocabulary = 0;

	// An insertion-sorted index listing all known URLS.
	// This one is NOT done as a linked-list, like filenames below.

struct UrlPtrs {
	char *pUrl;
	char *pNote;
};

struct UrlPtrs *pSortedUrlPtrs = 0;
int             nMallocUrlPtrs = 0;
int             nSortedUrlPtrs = 0;

	// Every URL lookup sets LastUrlPtrSlot, so MergeNoteAtLastUrlPtrSlot
	// can annotate each novel url with some <A>...</A> anchor text, and
	// can annotate each base url with file quality indices, filename, etc.
	// Also some query target url recognition heuristics need to remove "*".

int  LastUrlPtrSlot          = -1;
char LastUrlPtrIsAQueryMatch = 0;

	// The resort of local URLs needs one extra int, Quality, per entry.

struct UrlPtrsX {
	char *pUrl;
	char *pNote;
	int   Quality;
};

struct UrlPtrsX *pResortedUrlPtrs = 0;
int              nResortedUrlPtrs = 0;

	// Linked lists are maintained by type-casting a ( void** ) atop data.
	// Each Tail starts out pointing at its own Head.
	// First, the unsorted linked list of filenames, after expanding filespecs.

void **FileListHead = 0;
void **FileListTail = (void **) &FileListHead;

	// Next, a linked list of "@..." lines in the order found in LIST.
	// These lines have "@path word word word" to control file names.

void **NamingListHead = 0;
void **NamingListTail = (void **) &NamingListHead;

	// Their text is stored together in one large malloc buffer.

int   NamingListWorstCaseSize = 0;
char *NamingListStringsBuffer = 0;

	// Every pair of pointers locates one word, then its path:
	// NamingListPtrs [j+j] is the word
	// NamingListPtrs [j+j+1] is the path
	// Paths without words are not counted in NamingListTermsCount.

int    NamingListTermsCount = 0;
char **NamingListPtrs       = 0;

	// A renaming path without words is pointed to by DefaultNamingPath:
	// the default file naming location, for bare "@path" lines.

char *DefaultNamingPath     = 0;
char  IHaveFileRenamingData = 0;

	// That scheme now also annotates LIST with words of interest to user.

char *TopTenWordsOfInterest[10];
int   TopTenWordFrequencies[10];
int   nWordsOfInterest = 0;

	// Long filenames are usable with the MFC library of file operators.
	// Local filenames will be chosen based on the urlpath part of URL:

char ProposedFileBaseName[60];
char UniqueFilename[70];


	// One buffer serves as disk read block, tail buffer, conversion
	// buffer, and rescan buffer.

char inbuf[4096 + 1024 + 1024];

	// Variables below this point should be cleared after each file.

	// UrlParts is the internal representation of a parsed URL, used for
	// both the base URL and each anchor ( link ) URL.
	// URL segment lengths are arbitrarily limited here; the overall URL
	// length is limited elsewhere.

struct UrlParts {
	char scheme    [100];	// Method without "://", e.g., "http"
	char netlocn   [300];	// Domain( :port ), e.g., "www.xxx.com:777"
	char urlpath   [300];	// Empty \0 or absolute "/"( ... ) or relative.
	char parameter [300];	// With the introductory ";"
	char query     [300];	// With the introductory "?"
	char fragment  [100];	// With the introductory "#"
};

struct UrlParts BaseUrlParts, LinkUrlParts;

char BaseUrlText[500];	// Arbitrary limit on overall URL size

	// Only one piece of HTML parse state, InsideComment, is kept between
	// disk blocks. Other HTML tags cause input data to be pushed back
	// until next block.

char InsideComment = 0;

	// DiscardingTextNow merges the global bool ( DiscardingAllText, its
	// starting value ) with HTML tag factors.

char DiscardingTextNow = DiscardingAllText;

	// DiscardingDueToTag holds the mu_* enum of a tag starting discard.

int DiscardingDueToTag = 0;

	// OutputCol is for wrapping text sentence and paragraph output.
	// EverOutput is an edge-trigger bool to put <A--base>title</A>.

int OutputCol      = 0;
int EverOutput     = 0;
int BaseTextSaved  = 0;
int TitleTextSaved = 0;

	// Certain types of markup tags serve as a clue to separate words.

char TagDelimitsWords = 0;

	// UnfinishedToken is to rejoin words split by <FONT> tags, etc.
	// SavedText is an index in UnfinishedToken, to next empty char.

char UnfinishedToken[200];
int  SavedText = 0;

	// TitleTextBuffer holds some of text between <TITLE> and </TITLE>.

char TitleTextBuffer[200];
int  TitleTextBufIndex = 0;
char SavingTitleText   = 0;

	// AnchorTextBuffer holds some of text between <A> and </A>.

char AnchorTextBuffer[200];
int  AnchorTextBufIndex = 0;
char SavingAnchorText   = 0;

	// LocalFilename may be current -A input or -F output filename.

char *LocalFilename;


	// ObjectFile is the global file pointer to set, to save the object
	// being fetched.

FILE *ObjectFile = 0;

	// Some statistics gathered about the current file.

int SentenceRecognition  = 0;
int CountAcceptableIdeas = 0;
int CountAcceptableLinks = 0;
int CountAcceptableWords = 0;

	// Only permit -R file renaming for those that contain <BASE> tag.

char ThisInputFileContainedABaseTag = 0;

	// Simplification maps every input byte value ( 0-255 ) to one plain
	// ASCII char. This table rids weird spaces, international characters.
	// Tab, LF, FF, CR ( and 0x04 ) become a space.
	// Convert actual < and > into \1 and \2 until next parse.
	// Accented letters are folded to their unaccented base letter.
	// The symbols I could not map easily, I changed to a bar.
	// I have seen 0x92 used as apostrophe, make single quote.
	// I have seen 0x93, 94 as smart quote, make single quote.
	// I have seen 0xb4 used as apostrophe, make single quote.
	// 0xdd and 0xfd ( Y acute and y acute ) fold to 'Y' and 'y', which
	// agrees with the "Yacute Y" and "yacute y" records in the
	// SortedEntityNames table, and with 0xff ( y diaeresis ) below.
	// Index this table with an unsigned char value: a plain char that is
	// signed would go negative for bytes above 0x7f.
	// NOTE(review): 0xa0 ( nbsp ) maps to a bar here, while the "nbsp"
	// record of SortedEntityNames maps to a space - confirm intended.

char Simplification [256] = {
'|', '|', '|', '|',  ' ', '|', '|', '|',  '|', ' ', ' ', '|',  ' ', ' ', '|', '|',  // 00H = 0
'|', '|', '|', '|',  '|', '|', '|', '|',  '|', '|', '|', '|',  '|', '|', '|', '|',  // 10H = 16
' ', '!', '"', '#',  '$', '%', '&', '\'', '(', ')', '*', '+',  ',', '-', '.', '/',  // 20H = 32
'0', '1', '2', '3',  '4', '5', '6', '7',  '8', '9', ':', ';',  '\1', '=', '\2', '?', // 30H = 48
'@', 'A', 'B', 'C',  'D', 'E', 'F', 'G',  'H', 'I', 'J', 'K',  'L', 'M', 'N', 'O',  // 40H = 64
'P', 'Q', 'R', 'S',  'T', 'U', 'V', 'W',  'X', 'Y', 'Z', '[',  '\\',']', '^', '_',  // 50H = 80
'`', 'a', 'b', 'c',  'd', 'e', 'f', 'g',  'h', 'i', 'j', 'k',  'l', 'm', 'n', 'o',  // 60H = 96
'p', 'q', 'r', 's',  't', 'u', 'v', 'w',  'x', 'y', 'z', '{',  '|', '}', '~', '|',  // 70H = 112 bar is 7c
'|', '|', '|', '|',  '|', '|', '|', '|',  '|', '|', '-', '|',  '|', '|', '|', '|',  // 80H = 128
'|', '|', '\'','\'', '\'','|', '|', '|',  '|', '|', '|', '|',  '|', '|', '|', '|',  // 90H = 144
'|', '|', '|', '|',  '|', '|', '|', '|',  '|', '|', '|', '<',  '~', '-', '|', '|',  // a0H = 160
'|', '|', '2', '3',  '\'','u', '|', '|',  '|', '1', '|', '>',  '|', '|', '|', '|',  // b0H = 176
'A', 'A', 'A', 'A',  'A', 'A', 'A', 'C',  'E', 'E', 'E', 'E',  'I', 'I', 'I', 'I',  // c0H = 192
'D', 'N', 'O', 'O',  'O', 'O', 'O', '*',  'O', 'U', 'U', 'U',  'U', 'Y', 'P', 's',  // d0H = 208 ( ddH was a bar )
'a', 'a', 'a', 'a',  'a', 'a', 'a', 'c',  'e', 'e', 'e', 'e',  'i', 'i', 'i', 'i',  // e0H = 224
'd', 'n', 'o', 'o',  'o', 'o', 'o', '/',  'o', 'u', 'u', 'u',  'u', 'y', 'p', 'y',  // f0H = 240 ( fdH was 'a' )
};

	// SortedEntityNames lists the HTML entity names that are converted
	// ( with malice ) to plain ASCII. It is one long string of 8-char
	// records, made by concatenating adjacent string literals.
	// Each entry must be 8 chars, have a space after match term,
	// have 1-2 replacement character( s ) at ( word + 6 ), word + 7,
	// and be perfectly ASCII sorted to permit the binary search.
	// ( Uppercase names therefore come before all lowercase names. )
	// Symbols that were not easy to map become the vertical bar.
	// The dots prevent my editor conversion of 4 spaces to tabs.

char SortedEntityNames [] = {
"! ....  "  // low sentinel
"AElig AE"
"Aacute A"
"Acirc  A"
"Agrave A"
"Aring  A"
"Atilde A"
"Auml . A"
"Ccedil C"
"ETH .. D"
"Eacute E"
"Ecirc  E"
"Egrave E"
"Euml . E"
"Iacute I"
"Icirc  I"
"Igrave I"
"Iuml . I"
"Ntilde N"
"Oacute O"
"Ocirc  O"
"Ograve O"
"Oslash O"
"Otilde O"
"Ouml . O"
"THORN TH"
"Uacute U"
"Ucirc  U"
"Ugrave U"
"Uuml . U"
"Yacute Y"
"aacute a"
"acirc  a"
"acute  |"
"aelig ae"
"agrave a"
"amp .. &"
"aring  a"
"atilde a"
"auml . a"
"brvbar |"
"ccedil c"
"cedil  |"
"cent . |"
"copy . |"
"curren |"
"deg .. |"
"divide /"
"eacute e"
"ecirc  e"
"egrave e"
"eth .. d"
"euml . e"
"frac12 |"
"frac14 |"
"frac34 |"
"gt ... >"
"iacute i"
"icirc  i"
"iexcl  |"
"igrave i"
"iquest |"
"iuml . i"
"laquo  <"
"lt ... <"
"macr . |"
"micro  u"
"middot |"
"nbsp .  "
"not .. !"
"ntilde n"
"oacute o"
"ocirc  o"
"ograve o"
"ordf . |"
"ordm . |"
"oslash o"
"otilde o"
"ouml . o"
"para . |"
"plusmn |"
"pound  |"
"quot . \""
"raquo  >"
"reg .. |"
"sect . |"
"shy .. |"
"sup1 . 1"
"sup2 . 2"
"sup3 . 3"
"szlig sz"
"thorn th"
"times  *"
"uacute u"
"ucirc  u"
"ugrave u"
"uml .. |"
"uuml . u"
"yacute y"
"yen .. |"
"yuml . y"
"~ ....  " // top sentinel
};

	// The following HTML tag names constitute my recognition universe.
	// They must be perfectly ASCII sorted to permit the binary search.
	// "!doctype", !--, and variants !-, !<space> are not handled here.
	// Do not let SortedTagNames array differ from MarkUps enumeration:
	// the array index of each name here is the value of its mu_* enum,
	// with "!" at mu_lowsentinel and "~" at mu_topsentinel. A tag that
	// is added or removed must be changed in both places, in order.

char * SortedTagNames [] = {
"!", // low sentinel
"a",
"abbrev",
"acronym",
"address",
"align",
"applet",
"area",
"arg",
"au",
"author",
"b",
"banner",
"base",
"basefont",
"bdo",
"bgsound",
"big",
"blink",
"blockquote",
"body",
"bq",
"br",
"caption",
"center",
"cite",
"code",
"col",
"colgroup",
"credit",
"dd",
"del",
"dfn",
"dir",
"div",
"dl",
"dt",
"em",
"embed",
"fig",
"fn",
"font",
"form",
"frame",
"frameset",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"head",
"hr",
"html",
"i",
"image",
"img",
"input",
"ins",
"isindex",
"kbd",
"lang",
"lh",
"li",
"link",
"listing",
"map",
"marquee",
"menu",
"meta",
"nextid",
"nobr",
"noembed",
"noframes",
"noscript",
"note",
"object",
"ol",
"op",
"option",
"overlay",
"p",
"param",
"person",
"plaintext",
"pre",
"q",
"quote",
"s",
"samp",
"script",
"select",
"small",
"span",
"strike",
"strong",
"style",
"sub",
"sup",
"tab",
"table",
"tbody",
"tc",
"td",
"text",
"textarea",
"tfoot",
"th",
"thead",
"title",
"tr",
"tt",
"u",
"ul",
"var",
"wbr",
"xmp",
"~", // top sentinel
// no entry needed here for mu_Comment enum
};

	// MarkUps gives one mu_* enumerator to each recognized HTML tag.
	// The enumerators count up from mu_lowsentinel = 0 in the same order
	// as the strings of SortedTagNames, so each mu_* value is the array
	// index of its tag name. mu_Comment, the last one, comes after
	// mu_topsentinel and has no SortedTagNames entry.
	// Do not let SortedTagNames array differ from MarkUps enumeration.
	// Do not let MarkupTagTypes array differ from MarkUps enumeration.

enum MarkUps {
mu_lowsentinel = 0,
mu_a,
mu_abbrev,
mu_acronym,
mu_address,
mu_align, // common invalid tag
mu_applet,
mu_area,
mu_arg,
mu_au,
mu_author, // common invalid tag
mu_b,
mu_banner,
mu_base,
mu_basefont,
mu_bdo,
mu_bgsound,
mu_big,
mu_blink,
mu_blockquote,
mu_body,
mu_bq,
mu_br,
mu_caption,
mu_center,
mu_cite,
mu_code,
mu_col,
mu_colgroup,
mu_credit,
mu_dd,
mu_del,
mu_dfn,
mu_dir,
mu_div,
mu_dl,
mu_dt,
mu_em,
mu_embed,
mu_fig,
mu_fn,
mu_font,
mu_form,
mu_frame,
mu_frameset,
mu_h1,
mu_h2,
mu_h3,
mu_h4,
mu_h5,
mu_h6,
mu_head,
mu_hr,
mu_html,
mu_i,
mu_image, // common invalid tag
mu_img,
mu_input,
mu_ins,
mu_isindex,
mu_kbd,
mu_lang,
mu_lh,
mu_li,
mu_link,
mu_listing,
mu_map,
mu_marquee,
mu_menu,
mu_meta,
mu_nextid,
mu_nobr,
mu_noembed,
mu_noframes,
mu_noscript,
mu_note,
mu_object,
mu_ol,
mu_op,
mu_option,
mu_overlay,
mu_p,
mu_param,
mu_person,
mu_plaintext,
mu_pre,
mu_q,
mu_quote, // common invalid tag
mu_s,
mu_samp,
mu_script,
mu_select,
mu_small,
mu_span,
mu_strike,
mu_strong,
mu_style,
mu_sub,
mu_sup,
mu_tab,
mu_table,
mu_tbody,
mu_tc,
mu_td,
mu_text, // common invalid tag
mu_textarea,
mu_tfoot,
mu_th,
mu_thead,
mu_title,
mu_tr,
mu_tt,
mu_u,
mu_ul,
mu_var,
mu_wbr,
mu_xmp,
mu_topsentinel,
mu_Comment // This one is also used to get past !doctype...
};

	// Do not let MarkupTagTypes array differ from MarkUps enumeration.
	// Bit meanings used in MarkupTagTypes:
	// 1 = This tag separates words, in case no other whitespace.
	// 2 = This tag separates concepts ( sentences or paragraphs ).
	// 4 = This tag may require URL processing. ( A, AREA, BASE, FRAME )
	// 8 = TAG /TAG delimit content to discard. ( APPLET, OBJECT, SCRIPT, STYLE )
	// 16 = This tag may imply other processing. ( IMG or IMAGE, TITLE, /TITLE )
	// 32 = Tag LI, to distinguish next anchor on certain query result pages.
	// 64 = Tag DT, to distinguish next anchor on certain query result pages.

// MarkupTagTypes: per-tag processing flags, one entry per MarkUps
// enumerator, in exactly the same order ( the trailing comment on each
// line names the enumerator that indexes it ).
// Each value is an OR of the bit flags 1, 2, 4, 8, 16, 32, 64; a value
// of 0 means the tag has no flags set. The two sentinel slots hold -1.
int MarkupTagTypes [] = {
-1, // mu_lowsentinel = 0,
1|4, // mu_a, // Extract <A HREF=URL>, keep URL indications in the text.
1, // mu_abbrev,
1, // mu_acronym,
1|2, // mu_address,
1|2, // mu_align,
1|2|8, // mu_applet, // Discard all content from <APPLET> to </APPLET>.
1|2|4, // mu_area, // Extract <AREA HREF=URL>, keep URL indications in the text.
1, // mu_arg,
1, // mu_au,
1, // mu_author,
1, // mu_b,
1|2, // mu_banner,
1|2|4, // mu_base, // Extract <BASE HREF=URL>, keep URL to resolve local URLs.
0, // mu_basefont,
0, // mu_bdo,
0, // mu_bgsound,
0, // mu_big,
0, // mu_blink,
1|2, // mu_blockquote,
1|2, // mu_body,
1|2, // mu_bq,
1|2, // mu_br,
1|2, // mu_caption,
1|2, // mu_center,
1, // mu_cite,
1, // mu_code,
1|2, // mu_col,
1|2, // mu_colgroup,
0, // mu_credit,
1, // mu_dd,
0, // mu_del,
1, // mu_dfn,
1|2, // mu_dir,
1|2, // mu_div,
1|2, // mu_dl,
1|2|64, // mu_dt,
1, // mu_em,
0, // mu_embed,
0, // mu_fig,
0, // mu_fn,
0, // mu_font,
1|2, // mu_form,
1|2|4, // mu_frame, // Extract <FRAME SRC=URL>, keep URL indications in the text.
1|2, // mu_frameset,
1|2, // mu_h1,
1|2, // mu_h2,
1|2, // mu_h3,
1|2, // mu_h4,
1|2, // mu_h5,
1|2, // mu_h6,
1|2|8, // mu_head, // Discard all content from <HEAD> to </HEAD>.
1|2, // mu_hr,
1|2, // mu_html,
1, // mu_i,
1|2|16, // mu_image, // ( Misspelled ) Extract <IMAGE ALT=TEXT> for the text.
1|2|16, // mu_img, // Extract <IMG ALT=TEXT> for the text.
1|2, // mu_input,
0, // mu_ins,
1|2, // mu_isindex,
0, // mu_kbd,
0, // mu_lang,
1|2, // mu_lh,
1|2|32, // mu_li,
1|2, // mu_link,
1|2, // mu_listing,
1|2, // mu_map,
1|2, // mu_marquee,
1|2, // mu_menu,
0, // mu_meta,
0, // mu_nextid,
1|2, // mu_nobr,
1|2, // mu_noembed,
1|2, // mu_noframes,
1|2, // mu_noscript,
1|2, // mu_note,
1|2|8, // mu_object, // Discard all content from <OBJECT> to </OBJECT>.
1|2, // mu_ol,
0, // mu_op,
1|2, // mu_option,
0, // mu_overlay,
1|2, // mu_p,
1|2, // mu_param,
0, // mu_person,
1|2, // mu_plaintext,
1|2, // mu_pre,
1, // mu_q,
1|2, // mu_quote,
0, // mu_s,
1|2, // mu_samp,
1|2|8, // mu_script, // Discard all content from <SCRIPT> to </SCRIPT>.
1|2, // mu_select,
0, // mu_small,
0, // mu_span,
0, // mu_strike,
1|2, // mu_strong,
1|2|8, // mu_style, // Discard all content from <STYLE> to </STYLE>.
1, // mu_sub,
1, // mu_sup,
0, // mu_tab,
1|2, // mu_table,
1|2, // mu_tbody,
1|2, // mu_tc,
1|2, // mu_td,
1|2, // mu_text,
1|2, // mu_textarea,
1|2, // mu_tfoot,
1|2, // mu_th,
1|2, // mu_thead,
1|2|16, // mu_title, // Save URL annotation text from <TITLE> to </TITLE>.
1|2, // mu_tr,
1, // mu_tt,
0, // mu_u,
1|2, // mu_ul,
1, // mu_var,
1, // mu_wbr,
1|2, // mu_xmp,
-1, // mu_topsentinel,
1|2, //mu_Comment //This mu_Comment is also used to ignore <!DOCTYPE>
};

int UserKeystrokeDetected ( )
{
	// Report whether the user has pressed a key, now or on any earlier call.
	// The answer is sticky: after the first keystroke every call returns 1.
	static int KeySeenBefore = 0;
	if( _kbhit( ) == 0 )
		return KeySeenBefore; // nothing pending; repeat the remembered answer
	switch( _getch( ) )
	{
	case 0:
	case 0xE0:
		// Prefix byte of a two-byte key code: consume the second byte too.
		_getch( );
		break;
	default:
		break;
	}
	KeySeenBefore = 1;
	return 1;
}

// Expand a path specification into file names and append every file
// found to the linked list whose tail link is FileListTail.
//
// depth:     number of path separators ( counted after an optional
//            "X:" drive prefix and one optional leading slash ) that
//            are passed over; the text ending at separator number
//            depth + 1 is the directory pattern expanded by this call.
// GivenPath: the path specification; it is read but not modified.
//
// When such a separator exists, each matching subdirectory name is
// substituted for the pattern component and the function recurses with
// depth + 1. When none exists, GivenPath itself is searched: matching
// subdirectories are re-searched as "<dir>\*" and matching files are
// appended to the list. Exits the program if malloc fails.
void RecursivePathExpansion( int depth, char* GivenPath )
{
	// This adds filenames to the linked list of filenames.
	char* scan = GivenPath;
	char* bgnfile; // start of the path component currently being scanned
	int count = 0; // separators seen so far
	struct _finddata_t FindFileData;
	// NOTE(review): newer C runtimes declare the find handle as intptr_t;
	// confirm a long is wide enough for the target compiler.
	long SearchHandle;
	// Step over a drive prefix and one leading separator; neither is counted.
	if( scan [1] == ':' && isalpha( scan [0] ) )
		scan += 2;
	if( scan [0] == '\\' || scan [0] == '/' )
		scan++;
	bgnfile = scan;
	// The idiom 'if( x ) do ... while ( x );' usually saves a jump opcode
	if( *scan != '\0' )
	do
	{
		if( scan [0] == '\\' || scan [0] == '/' )
		{
			if( ++count > depth )
			{
				// leftpart = GivenPath up to, but not including, this separator.
				char* leftpart = ( char* )malloc( scan + 1 - GivenPath );
				if( leftpart == ( char* )0 )
				{
					fprintf( stderr, "SURF: Malloc failure expanding filenames.\n" );
					exit( 1 );
				}
				strncpy( leftpart, GivenPath, scan - GivenPath );
				leftpart [scan - GivenPath] = '\0';
				if( ( SearchHandle = _findfirst( leftpart, &FindFileData ) ) != -1L )
				{
					do
					{
						if( FindFileData.attrib & _A_SUBDIR )
						{
							// Accept every directory name except "." and "..".
							if( FindFileData.name [0] != '.'
							||( FindFileData.name [1] != '.'|| FindFileData.name [2] != '\0' )
							&& FindFileData.name [1] != '\0' )
							{
								// thing = prefix before this component + found name
								//       + remainder of GivenPath from the separator on.
								int n = ( bgnfile - GivenPath ) + strlen( FindFileData.name ) + strlen( scan ) + 1;
								char* thing = ( char* )malloc( n );
								if( thing == ( char* )0 )
								{
									fprintf( stderr, "SURF: Malloc failure expanding filenames.\n" );
									exit( 1 );
								}
								// strncpy leaves no terminator here; the strcpy that
								// follows writes at the same offset and supplies it.
								if( GivenPath < bgnfile )
									strncpy( thing, GivenPath, bgnfile-GivenPath );
								strcpy( thing + ( bgnfile-GivenPath ), FindFileData.name );
								strcat( thing + ( bgnfile-GivenPath ), scan );
								// printf( "I got here for directory '%s'.\n", thing );
								RecursivePathExpansion( depth + 1, thing );
								free( thing );
							}
						}
					} // do...
					while( _findnext( SearchHandle, &FindFileData ) == 0 );
				}
				// Note: this is reached even when _findfirst returned -1.
				if( _findclose( SearchHandle ) == -1 )
				{
					// I am commenting out and ignoring this error condition:
					// It occurs when no file matched the path specification.
					// fprintf( stderr, "SURF: error from findclose: %s.\n", strerror( errno ) );
					// exit( 1 );
				}
				free( leftpart );
				return;
			}
			bgnfile = scan + 1;
		}
		scan++;
	} // do...
	while( *scan != '\0' );
	// It's final depth time.
	// No separator beyond depth remains, so GivenPath is searched as it stands;
	// bgnfile now marks the start of its last component.
	if( ( SearchHandle = _findfirst( GivenPath, &FindFileData ) ) != -1L )
	{
		do
		{
			if( FindFileData.attrib & _A_SUBDIR )
			{
				// Accept every directory name except "." and "..".
				if( FindFileData.name [0] != '.'
				||( FindFileData.name [1] != '.'|| FindFileData.name [2] != '\0' )
				&& FindFileData.name [1] != '\0' )
				{
					// Room for the prefix, the name, the two bytes of "\*" and a terminator.
					int n = ( bgnfile - GivenPath ) + strlen( FindFileData.name ) + 2 + 1;
					char* thing = ( char* )malloc( n );
					if( thing == ( char* )0 )
					{
						fprintf( stderr, "SURF: Malloc failure expanding filenames.\n" );
						exit( 1 );
					}
					if( GivenPath < bgnfile )
						strncpy( thing, GivenPath, bgnfile-GivenPath );
					strcpy( thing + ( bgnfile-GivenPath ), FindFileData.name );
					strcat( thing + ( bgnfile-GivenPath ), "\\*" );
					// printf( "I got here for directory '%s'.\n", thing );
					RecursivePathExpansion( depth + 1, thing );
					free( thing );
				}
			}
			else
			{
				// A list node is one pointer-sized link followed by the full path text.
				int n = sizeof( void** ) + ( bgnfile - GivenPath ) + strlen( FindFileData.name ) + 1;
				char* item = ( char* )malloc( n );
				char* fullpath = item + sizeof( void** );
				if( item == ( char* )0 )
				{
					fprintf( stderr, "SURF: Malloc failure expanding filenames.\n" );
					exit( 1 );
				}
				if( GivenPath < bgnfile )
					strncpy( fullpath, GivenPath, bgnfile-GivenPath );
				strcpy( fullpath + ( bgnfile-GivenPath ), FindFileData.name );
				// printf( "I got here for file '%s'.\n", fullpath );
				// Hook the node onto the tail, make it the new tail, and end the list.
				*FileListTail = ( void** )item;
				FileListTail = ( void** )item;
				*FileListTail = ( void** )0;
			}
		} // do...
		while( _findnext( SearchHandle, &FindFileData ) == 0 );
	}
	// Note: this is reached even when _findfirst returned -1.
	if( _findclose( SearchHandle ) == -1 )
	{
		// I am commenting out and ignoring this error condition:
		// It occurs when no file matched the path specification.
		// fprintf( stderr, "SURF: error from findclose: %s.\n", strerror( errno ) );
		// exit( 1 );
	}
	return;
}

int IgnoreThisCommonWord( char * word )
{
	// Binary search of CommonWordList, a table of 8-byte records.
	// word points to "alphanumerics\0".
	// Returns 1 if word is in the table, otherwise 0.
	int lo = 0;
	int hi = ( sizeof( CommonWordList ) - 1 - 8 ) >> 3;
	Flow( 11055 );
	for( ;; )
	{
		int probe = ( lo + hi ) >> 1; // never negative, so the shift halves it
		char * entry = CommonWordList + ( probe << 3 ); // 8 bytes per record
		char * w = word;
		int * bound;
		// Walk across whatever leading bytes the two strings share.
		while( *entry == *w )
		{
			entry++;
			w++;
		}
		// A hit needs at least one shared byte, then the record's padding
		// space lined up with the candidate's terminating null.
		if( w != word && *entry == ' ' && *w == '\0' )
			return 1; // Yes, it is a common word
		// Warning - gt/lt on plain chars - ok thru 127.
		// Record greater: pull the upper bound down. Otherwise push the
		// lower bound up. A bound already sitting on probe means the
		// search cannot narrow any further.
		bound = ( *entry > *w ) ? &hi : &lo;
		if( *bound == probe )
			return 0; // No, it is an unusual word
		*bound = probe;
	} // for...
}

void InsertWordBeforeWordNumber( int Before, char *pItem )
{
	// Store pItem at index Before of the sorted pointer array
	// ppSortedVocabulary, growing the array in steps of 2048 slots.
	// Each item = ( one short count + one asciz string + another \0 ).
	// Exits the program if the array cannot be allocated.
	int slot;
	Flow( 10481 );
	if( nSortedVocabulary == nMallocVocabulary )
	{
		// Array is full ( or not yet created ): malloc the first time,
		// realloc every time after that.
		int firstTime = ( nMallocVocabulary == 0 );
		nMallocVocabulary = firstTime ? 2048 : nMallocVocabulary + 2048;
		if( firstTime )
			ppSortedVocabulary = ( char** )malloc( sizeof( char* ) * nMallocVocabulary );
		else
			ppSortedVocabulary = ( char** )realloc( ppSortedVocabulary, sizeof( char* ) * nMallocVocabulary );
	}
	if( ppSortedVocabulary == ( char** ) 0 )
	{
		fprintf( stderr, "SURF: Malloc failure creating vocabulary.\n" );
		exit( 1 );
	}
	// Open a gap at Before by shifting the tail up one slot, last element
	// first. Nothing moves when the new word belongs at the very end.
	for( slot = nSortedVocabulary; slot > Before; slot-- )
		ppSortedVocabulary [slot] = ppSortedVocabulary [slot - 1];
	ppSortedVocabulary [Before] = pItem;
	nSortedVocabulary++;
	// Vocabulary size is the measure used, so large files are not emphasized.
	CountAcceptableWords++;
}

void AddWordToVocabulary( char * start, char * past )
{
	// Count one occurrence of the word occupying [start, past).
	// A new word is inserted into the sorted vocabulary with a count of 1;
	// a word already present has its count incremented ( capped below 32000 ).
	// The byte at past is neither required to be \0 nor modified.
	// Exits the program if malloc fails.
	int lo = 0;
	int hi = nSortedVocabulary;
	int len = past - start;
	// Item layout: short counter, the word, \0, then one sentinel byte.
	char * item = ( char* ) malloc( sizeof( short ) + len + 2 );
	char * text;
	int slot = 0; // where to insert; -1 once the word is found to be a repeat
	Flow( 12228 );
	if( item == ( char* )0 )
	{
		fprintf( stderr, "SURF: Malloc failure listing vocabulary.\n" );
		exit( 1 );
	}
	text = item + sizeof( short );
	*( short* )item = 1; // Counter value if this turns out to be a new word.
	memcpy( text, start, len ); // Always len > 0
	text [len] = '\0';
	// While searching, the candidate ends \0 \1 whereas stored words end
	// \0 \0, so the compare loop below always stops without a length test.
	text [len + 1] = '\1';
	if( hi != 0 )
	{
		// Binary search over the stored words.
		for( ;; )
		{
			int mid = ( lo + hi ) >> 1; // never negative, so the shift halves it
			short * pCount = ( short * ) ( ppSortedVocabulary [mid] );
			unsigned char * have = ( unsigned char * ) ( ppSortedVocabulary [mid] + sizeof( short ) );
			unsigned char * want = ( unsigned char * ) text;
			while( *have == *want )
			{
				have++;
				want++;
			}
			if( *have == '\0' && *want == '\1' )
			{
				// Stopped on the two differing sentinels: exact match.
				// Count the repeat, but never let the counter reach 32000.
				if( ++( *pCount ) == 32000 )
					--( *pCount );
				free( item ); // The copy of a repeated word is not needed.
				slot = -1;
				break;
			}
			if( *have > *want )
			{
				// Candidate sorts lower than the stored word at mid.
				if( hi == mid )
				{
					slot = mid; // goes in front of [mid]
					break;
				}
				hi = mid;
			}
			else
			{
				// Candidate sorts higher than the stored word at mid.
				if( lo == mid )
				{
					slot = mid + 1; // goes right after [mid]
					break;
				}
				lo = mid;
			}
		} // for...
	}
	if( slot >= 0 )
	{
		// Turn the search sentinel into the stored form before saving.
		text [len + 1] = '\0';
		InsertWordBeforeWordNumber( slot, item );
	}
}

// Compare a stored vocabulary word with a query string and report the
// byte each one holds where the comparison stopped: at the first
// difference, or at the end of the stored word.
static void VocabularyStopBytes( char * Stored, char * Query, unsigned char * pStoredStop, unsigned char * pQueryStop )
{
	unsigned char * s = ( unsigned char * )Stored;
	unsigned char * q = ( unsigned char * )Query;
	if( *s == *q )
	{
		do
		{
			s++;
			q++;
		} // do...
		while( *s == *q && *s != '\0' );
	}
	*pStoredStop = *s;
	*pQueryStop = *q;
}

int QuantityOfWordInVocabulary( char * AscizWord )
{
	// Look up AscizWord ( an asciz string of any size ) in the vocabulary.
	// A plain word returns its stored count. A query whose comparison
	// stops on a '*' returns the summed counts of the adjacent entries
	// that match up to that '*'. Returns zero when nothing matches.
	int low = 0;
	int top = nSortedVocabulary;
	unsigned char storedStop;
	unsigned char queryStop;
	Flow( 16229 );
	if( top <= 0 )
		return 0; // Word was not found because vocabulary is empty
	// Here is the binary search.
	for( ;; )
	{
		int mid = ( low + top ) >> 1; // never negative, so the shift halves it
		VocabularyStopBytes( ppSortedVocabulary [mid] + sizeof( short ), AscizWord, &storedStop, &queryStop );
		if( queryStop == '*' ) // matched "word*" up to the asterisk
		{
			// The search may have landed inside a run of matching entries.
			// Walk outward in both directions, summing until a mismatch.
			int sum = *( short* ) ( ppSortedVocabulary [mid] ); // match #1
			int i;
			for( i = mid - 1; i >= 0; i-- )
			{
				VocabularyStopBytes( ppSortedVocabulary [i] + sizeof( short ), AscizWord, &storedStop, &queryStop );
				if( queryStop != '*' )
					break;
				sum += *( short* ) ( ppSortedVocabulary [i] );
			}
			for( i = mid + 1; i < top; i++ )
			{
				VocabularyStopBytes( ppSortedVocabulary [i] + sizeof( short ), AscizWord, &storedStop, &queryStop );
				if( queryStop != '*' )
					break;
				sum += *( short* ) ( ppSortedVocabulary [i] );
			}
			return sum;
		}
		if( storedStop == '\0' && queryStop == '\0' ) // exact match, no '*'
			return *( short* ) ( ppSortedVocabulary [mid] ); // count found in vocabulary
		if( storedStop > queryStop )
		{
			// Query sorts lower than [mid]: bring the upper bound down.
			if( top == mid )
				return 0; // Word was not found in vocabulary
			top = mid;
		}
		else
		{
			// Query sorts higher than [mid]: bring the lower bound up.
			if( low == mid )
				return 0; // Word was not found in vocabulary
			low = mid;
		}
	} // for...
}

// Handle one clear-text token ( start points to its asciz text ).
// In order, this function:
//   1. advances TokenNumber;
//   2. on search engine result pages ( MatchingMethod > 0 ), looks for
//      the clue selected by MatchingMethod: an "http://" token, a
//      percentage such as "100%", or an ordinal such as "1.";
//   3. while SavingTitleText, appends the token to TitleTextBuffer,
//      offers its leading word to the vocabulary, and returns;
//   4. while SavingAnchorText, appends the token to AnchorTextBuffer;
//   5. unless DiscardingTextNow, writes the token to ObjectFile,
//      starting a new line once a line would pass 60 columns;
//   6. unless DiscardingDueToTag, adds the token's words to the
//      vocabulary and updates the sentence recognition counters.
void ProcessToken( char * start )
{
	// By token I mean a text sequence: No whitespace, no tags.
	// Token contains only 1+ Simplified[] characters, then \0.
	int n = strlen( start );
	Flow( 12284 );
	TokenNumber++; // Used to age query hit recognition clues in clear text.
	// Recognize various clues in either clear text preceding, or
	// in the <A>...</A> anchor text of a query result target URL.
	if ( MatchingMethod > 0 // Current page is a known search engine result.
	&& ! DiscardingDueToTag ) // Not currently inside <SCRIPT>, etc.
	{
		// The first clue is that an "http://..." appears in clear text.
		// About 23 of the 38 different search engines contain this clue.
		if( n > 12
		&& start[4] == ':'
		&& start[5] == '/'
		&& start[6] == '/'
		&& start[0] == 'h'
		&& start[1] == 't'
		&& start[2] == 't'
		&& start[3] == 'p' )
		{
			if( SavingAnchorText )
			{
				if( MatchingMethod == 1 ) // Clue: "http://" inside the <A>...</A>
				{
					// As I am inside the anchor text, there may not be a note yet
					// and I will have to wait for the </A> to implement this task.
					LastUrlPtrIsAQueryMatch = 1; // set due to "http://......"
				}
			} else {
				if( MatchingMethod == 2 ) // Clue: "http://" after the </A>
				{
					// Although already past the <A>...</A> anchor text,
					// this is still a good clue for some search engines.
					// Apply some limits about the distance back to last URL.
					// Do it now, because last URL's first note was already saved.
					if( TokenNumber < LastUrlWasAtTokenNumber + 200
					&& LastUrlPtrSlot != -1 ) // be sure there was a prior URL
					{
						// The note text begins after the leading link pointer;
						// a leading '*' there is replaced by a space.
						char * Item = pSortedUrlPtrs[LastUrlPtrSlot].pNote;
						if( Item != ( char* )0
						&& *( Item + sizeof( char* ) ) == '*' && Uninhibited )
							*( Item + sizeof( char* ) ) = ' ';
					}
				}
			}
		}
		// Another clue is that "100%" or "1." etc., appears in clear text.
		if( isdigit( start[ 0 ] ) )
		{
			// Step over the run of digits; what follows decides the clue type.
			char * scan = start;
			do scan++; while( isdigit( *scan ) );
			if( *scan == '%' && scan [1] == '\0' )
			{
				// Percentages are seen BEFORE, or AFTER, but not IN anchor text.
				if( MatchingMethod == 3 ) // Clue: "100%" before the <A>
				{
					// As this "100%" field is seen prior to the anchor text,
					// I will have to wait for <A> to implement this task.
					LastNumWasAtTokenNumber = TokenNumber;
				}
				if( MatchingMethod == 4 ) // Clue: "100%" after the </A>
				{
					// Although already past the <A>...</A> anchor text,
					// this is still a good clue for some search engines.
					// Apply some limits about the distance back to last URL.
					// Do it now, because URL's first note is already saved.
					if( TokenNumber < LastUrlWasAtTokenNumber + 200
					&& LastUrlPtrSlot != -1 ) // be sure there was a prior URL
					{
						char * Item = pSortedUrlPtrs[LastUrlPtrSlot].pNote;
						if( Item != ( char* )0
						&& *( Item + sizeof( char* ) ) == '*' && Uninhibited )
							*( Item + sizeof( char* ) ) = ' ';
					}
				}
			}
			else if( *scan == '.' && scan [1] == '\0' )
			{
				if( SavingAnchorText )
				{
					if( MatchingMethod == 7 ) // Clue: "1." inside the <A>...</A>
					{
						// As I am inside the anchor text, there may not be a note yet
						// and I will have to wait for the </A> to implement this task.
						LastUrlPtrIsAQueryMatch = 1; // set due to the ordinal "1."
					}
				} else {
					// Ordinals are seen BEFORE, or AFTER, and IN anchor text.
					if( MatchingMethod == 5 ) // Clue: "1." before the <A>
					{
						// As this "1." field is seen prior to the anchor text,
						// I will have to wait for <A> to implement this task.
						LastNumWasAtTokenNumber = TokenNumber;
					}
					if( MatchingMethod == 6 ) // Clue: "1." after the </A>
					{
						// Although already past the <A>...</A> anchor text,
						// this is still a good clue for one search engine,
						// which happens to finish like: "reading level: 9."
						// Apply some limits about the distance back to last URL.
						// Do it now, because URL's first note is already saved.
						if( TokenNumber < LastUrlWasAtTokenNumber + 200
						&& LastUrlPtrSlot != -1 ) // be sure there was a prior URL
						{
							char * Item = pSortedUrlPtrs[LastUrlPtrSlot].pNote;
							if( Item != ( char* )0
							&& *( Item + sizeof( char* ) ) == '*' && Uninhibited )
								*( Item + sizeof( char* ) ) = ' ';
						}
					}
				}
			}
		}
	}
	// There ends the search engine result recognition block.
	// Now, I may want to save these tokens for a page title.
	if( SavingTitleText )
	{
		// This diverts text inside <TITLE> ... </TITLE>.
		// The token is dropped from the title if it would not fit the buffer.
		if( ( unsigned ) ( TitleTextBufIndex + n )
		< ( unsigned ) ( sizeof( TitleTextBuffer ) - 2 ) )
		{
			// Tokens after the first are separated by one space.
			if( TitleTextBufIndex != 0 )
			{
				TitleTextBuffer [TitleTextBufIndex] = ' ';
				TitleTextBufIndex++;
			}
			strcpy( TitleTextBuffer + TitleTextBufIndex, start );
			TitleTextBufIndex += n;
		}
		// Starting Jan 98: I now add title words to the vocabulary.
		// Only the run of letters at the very start of the token is used.
		if( isalpha( *start ) )
		{
			// This section cloned and adapted from below
			char WordLike [43]; // Scratch copy; only 3 to 40 letters are accepted below
			char * from = start;
			char * into = WordLike;
			while( isalpha( *from ) )
			{
				*into = *from | ' '; // idiom lowercases, ok for [A-Za-z0-9]
				// Stop advancing at the last slot so an overlong word cannot overrun.
				if( into < WordLike + sizeof( WordLike ) - 1 )
					into++;
				from++;
			}
			if( into > WordLike + 2
			&& into < WordLike + sizeof( WordLike ) - 2 )
			{
				// Accept words ( containing only alphas ) from 3 to 40 chars
				*into = '\0';
				if( ! IgnoreThisCommonWord( WordLike ) )
					AddWordToVocabulary( WordLike, into );
			}
		}
		// Title text is always discarded, being inside <HEAD>.
		return;
	}
	// Now, I may want to save these tokens to annotate novel URLs in list.
	if( SavingAnchorText )
	{
		// This saves a copy of text inside <A> ... </A>.
		// The token is dropped from the copy if it would not fit the buffer.
		if( ( unsigned ) ( AnchorTextBufIndex + n )
		< ( unsigned ) ( sizeof( AnchorTextBuffer ) - 2 ) )
		{
			if( AnchorTextBufIndex != 0 )
			{
				AnchorTextBuffer [AnchorTextBufIndex] = ' ';
				AnchorTextBufIndex++;
			}
			strcpy( AnchorTextBuffer + AnchorTextBufIndex, start );
			AnchorTextBufIndex += n;
		}
	}
	// If I am fetching the file, every clear text token is saved.
	if( ! DiscardingTextNow ) {
		// Before the first output, once base and title are both known,
		// write the heading line that links back to the page.
		if( ! EverOutput
		&& BaseTextSaved
		&& TitleTextSaved )
		{
			EverOutput = 1;
			fprintf( ObjectFile, "<PRE><A HREF = \"%s\"> %s </A>\n", BaseUrlText, TitleTextBuffer );
		}
		// Place a newline ahead of these characters if needed.
		// This means EOF must place a newline if OutputCol > 0.
		if( OutputCol + n > 60 )
		{
			// Lest tokens over 60 chars cause an extra blank line.
			if( OutputCol )
			{
				fputc( '\n', ObjectFile );
				OutputCol = 0;
			}
			// Everywhere else except here, the decision to end a
			// concept ( sentence or paragraph ) should flush token.
		}
		// I prefer one blank to left of even the first word on line.
		fputc( ' ', ObjectFile );
		fputs( start, ObjectFile );
		OutputCol += n + 1;

	}

	// Now let's re-examine the clear text tokens to do an analysis
	// of vocabulary used in each file to categorize them by content.
	// Also try to discern and count apparent well-formed sentences.
	// SentenceRecognition states, as used below: 0 = nothing in progress,
	// 1 = garbage seen, 2 = capitalized first letter seen; it then rises
	// by one for each accepted word.
	if( ! DiscardingDueToTag ) // during non-markup clear text
	{
		char WordLike [43]; // Scratch copy; only 3 to 40 letters are accepted below
		char * from = start;
		char * into;
		for( ;; )
		{
			// Skip non-letters; a period closes any sentence in progress.
			while( !isalpha( *from ) && *from != '\0' )
			{
				if( *from == '.' )
				{
					if( SentenceRecognition >= 3+3 ) // At least One + 3 words
						CountAcceptableIdeas ++; // Period finished sentence
					SentenceRecognition = 0; // Therefore nothing in progress
				}
				++from;
			}
			if( *from == '\0' )
				break;
			// I am definitely sitting on an alpha
			into = WordLike;
			if( SentenceRecognition == 0 ) // Nothing in progress atop new word.
			{
				if( isupper( *from ) )
				{
					SentenceRecognition = 2; // Ok, Initial Uppercase letter
				}
				else
				{
					SentenceRecognition = 1; // Enter terminal garbage state
				}
			}
			*into = *from | ' '; // idiom lowercases, ok for [A-Za-z0-9]
			into++;
			from++;
			while( isalpha( *from ) )
			{
				// I know, this is an excessively restrictive heuristic:
				if( !islower( *from ) )
					SentenceRecognition = 1; // Enter terminal garbage state
				*into = *from | ' '; // idiom lowercases, ok for [A-Za-z0-9]
				// Stop advancing at the last slot so an overlong word cannot overrun.
				if( into < WordLike + sizeof( WordLike ) - 1 )
					into++;
				from++;
			}
			if( into > WordLike + 2
			&& into < WordLike + sizeof( WordLike ) - 2 )
			{
				// Accept words ( containing only alphas ) from 3 to 40 chars
				// Nowadays, I use vocabulary size, instead of counting here.
				// No... CountAcceptableWords ++;
				*into = '\0';
				if( ! IgnoreThisCommonWord( WordLike ) )
					AddWordToVocabulary( WordLike, into );
				if( SentenceRecognition >= 2 ) // Having First Capitalized Word
					SentenceRecognition++; // Above 3, counts all-lower words
			}
			else
			{
				// A word shorter than 3 or longer than 40 letters spoils the sentence.
				SentenceRecognition = 1; // Enter terminal garbage state
			}
			if( *from == '\0' )
				break;
		}
		// No, I decided to count words above, not bytes here:
		// No: CountAcceptableChars += n + 1; // This non-markup clear text + 1 space
	}

	return;
}

void EndAnyOuputParagraph( )
{
	// Called just before the caller writes an <A> or other tag as whole
	// lines of its own. Finishes whatever paragraph is under way so the
	// caller starts on a fresh line after one blank line.
	Flow( 10989 );
	// A partly collected token still belongs to the paragraph being
	// closed, so push it through ProcessToken first.
	if( SavedText > 0 )
	{
		UnfinishedToken [SavedText] = '\0';
		ProcessToken( UnfinishedToken );
		SavedText = 0;
	}
	// With nothing on the current line there is no paragraph to close.
	if( OutputCol <= 0 )
		return;
	// End the current line, then leave one blank line after it.
	fputs( "\n\n", ObjectFile );
	OutputCol = 0;
}

void InsertUrlBeforeUrlPtrSlot( int Before, char *pUrl )
{
	// Called from BinaryLookupInsertionSort if no match.
	// Inserts a private copy of pUrl, with no note attached, at index
	// Before of the sorted array pSortedUrlPtrs.
	// The array of "struct UrlPtrs" grows as required, 2048 slots at a time.
	// Exits the program if either the array or the copy cannot be allocated.
	Flow( 16401 );
	if( nSortedUrlPtrs == nMallocUrlPtrs )
	{
		struct UrlPtrs * pGrown;
		if( nMallocUrlPtrs == 0 )
		{
			// First time, do a malloc.
			nMallocUrlPtrs = 2048;
			pGrown = ( struct UrlPtrs * )malloc( sizeof( struct UrlPtrs ) * nMallocUrlPtrs );
		}
		else
		{
			// Afterwards, do a realloc.
			nMallocUrlPtrs += 2048;
			pGrown = ( struct UrlPtrs * )realloc( pSortedUrlPtrs, sizeof( struct UrlPtrs ) * nMallocUrlPtrs );
		}
		// Without this check a failed allocation would be dereferenced
		// by the slide or the store below.
		if( pGrown == ( struct UrlPtrs * )0 )
		{
			fprintf( stderr, "SURF: Malloc failure adding URLs to list.\n" );
			exit( 1 );
		}
		pSortedUrlPtrs = pGrown;
	}
	// To do insertion sort, slide up the end of array to make room.
	// However, if incoming lines are sorted there is nothing to do.
	if( Before < nSortedUrlPtrs )
	{
		struct UrlPtrs * pPast = pSortedUrlPtrs + nSortedUrlPtrs;
		int n = nSortedUrlPtrs - Before;
		do {
			pPast[0].pUrl = pPast[-1].pUrl;
			pPast[0].pNote = pPast[-1].pNote;
			pPast--;
		} // do...
		while( --n > 0 );
	}
	{
		// The array owns its own copy of the URL text.
		int n = strlen( pUrl ) + 1;
		char* CopyOfUrl = ( char* ) malloc( n );
		if( CopyOfUrl == ( char* )0 )
		{
			fprintf( stderr, "SURF: Malloc failure adding URLs to list.\n" );
			exit( 1 );
		}
		strcpy( CopyOfUrl, pUrl );
		pSortedUrlPtrs[Before].pUrl = CopyOfUrl;
		pSortedUrlPtrs[Before].pNote = ( char* )0; // a new URL has no notes yet
	}
	nSortedUrlPtrs++;
	return;
}

void BinaryLookupInsertionSort( char * pUrl )
{
	// Called from AddLinkUrlPartsToUrlList.
	// Finds pUrl in the sorted URL array, inserting it if it is absent.
	// LastUrlPtrSlot is always left holding the index of pUrl's entry.
	int lo = 0;
	int hi = nSortedUrlPtrs - 1;
	int order;
	int slot;
	Flow( 18875 );
	if( nSortedUrlPtrs == 0 )
	{
		// Empty array: the new URL becomes entry zero.
		InsertUrlBeforeUrlPtrSlot( 0, pUrl );
		LastUrlPtrSlot = 0; // This zero means first. Use -1 for invalidity.
		return;
	}
	// Try the final entry first, so that URLs arriving already in
	// sorted order are appended without any searching.
	order = strcmp( pSortedUrlPtrs[hi].pUrl, pUrl );
	if( order == 0 )
	{
		LastUrlPtrSlot = hi; // the final entry is an exact match
		return;
	}
	if( order < 0 )
	{
		// pUrl sorts after the final entry: append it.
		InsertUrlBeforeUrlPtrSlot( hi + 1, pUrl );
		LastUrlPtrSlot = hi + 1;
		return;
	}
	// pUrl sorts before the final entry: binary search for its place.
	for( ;; )
	{
		int mid = ( lo + hi ) >> 1; // never negative, so the shift halves it
		order = strcmp( pSortedUrlPtrs[mid].pUrl, pUrl );
		if( order == 0 )
		{
			LastUrlPtrSlot = mid; // entry at [mid] is an exact match
			return;
		}
		if( order > 0 )
		{
			// Entry at [mid] exceeds pUrl.
			if( hi == mid )
			{
				slot = mid; // falls between [mid - 1] and [mid]
				break;
			}
			hi = mid;
		}
		else
		{
			// pUrl exceeds the entry at [mid].
			if( lo == mid )
			{
				slot = mid + 1; // falls between [mid] and [mid + 1]
				break;
			}
			lo = mid;
		}
	} // for...
	InsertUrlBeforeUrlPtrSlot( slot, pUrl );
	LastUrlPtrSlot = slot;
}

void MergeNoteAtLastUrlPtrSlot( char *pNote )
{
	// Attach a copy of pNote to the URL entry at LastUrlPtrSlot.
	// Notes form a singly linked list: each node begins with a ( char* )
	// link to the next node ( null in the last one ), followed directly
	// by the note text. Exits the program if malloc fails.
	int bytes = sizeof( char* ) + strlen( pNote ) + 1;
	char* node = ( char* ) malloc( bytes );
	char ** link;
	Flow( 14798 );
	if( node == ( char* )0 )
	{
		fprintf( stderr, "SURF: Malloc failure adding URLs to list.\n" );
		exit( 1 );
	}
	strcpy( node + sizeof( char* ), pNote );
	link = & pSortedUrlPtrs[LastUrlPtrSlot].pNote;
	if( *pNote == '#' )
	{
		// A note that starts with '#' always goes to the head of the list,
		// so that the first '#' note stays first.
		*( char** )node = *link;
		*link = node;
		return;
	}
	// Any other note goes on the end: follow the links to the null one.
	*( char** )node = ( char* )0;
	while( *link != ( char* )0 )
		link = ( char** )*link;
	*link = node;
}

void AddLinkUrlPartsToUrlList( )
{
	// Rebuild the URL text from the pieces held in LinkUrlParts and hand
	// it to BinaryLookupInsertionSort, which sets the slot number.
	// The pieces may have come from a BASE tag, an anchor, or a URL
	// line parsed from the script file. No sanity checks are made here.
	char UrlText [sizeof( LinkUrlParts )]; // grossly worst case size needed
	char * end = UrlText; // always points at the terminator written so far
	Flow( 17911 );
	strcpy( end, LinkUrlParts.scheme );
	end += strlen( end );
	strcpy( end, "://" );
	end += 3;
	strcpy( end, LinkUrlParts.netlocn );
	end += strlen( end );

	if( *LinkUrlParts.urlpath == '\0'
	&& *LinkUrlParts.parameter == '\0'
	&& *LinkUrlParts.query == '\0' )
	{
		// A URL that is only a domain name gets a trailing '/'.
		// This 3/15/98 change, 2 places, might interfere with url fetching.
		strcpy( end, "/" );
	}
	else
	{
		// Otherwise append path, parameter and query exactly as stored.
		strcpy( end, LinkUrlParts.urlpath );
		end += strlen( end );
		strcpy( end, LinkUrlParts.parameter );
		end += strlen( end );
		strcpy( end, LinkUrlParts.query );
	}
	BinaryLookupInsertionSort( UrlText );
}

int LinkUrlPartsIsAcceptable( )
{
	// Heuristic filter that keeps binary resources out of the URL list file.
	// Such URLs will still appear as anchors in the fetched .HTM files.
	// Apply this only to Anchor Urls, not Base Urls,
	// for which I may already have a file to describe.
	//
	// Reads the LinkUrlParts global.
	// Returns 1 if the URL is acceptable, 0 if it should be excluded.
	//
	// Pure guesswork here. Your mileage may vary.
	// This table is likely to become obsolete quickly as new types appear.
	static const char * const RejectedExtensions [] =
	{
		"z",
		"au", "gz", "pl", "rm",
		"aif", "dll", "gif", "hqx", "jpg", "map",
		"mp2", "mpg", "pdf", "vmd", "wav", "zip",
		"jpeg", "java"
	};
	const char * scan = LinkUrlParts.urlpath;
	const char * ext = 0; // first character after the last '.', if any
	char folded [5];      // up to 4 case-folded extension chars + \0
	int len;
	int i;
	Flow( 10981 );

	if( strcmp( LinkUrlParts.scheme, "http" ) != 0 )
		return 0; // Currently, only support http

	if( LinkUrlParts.netlocn [0] == '\0' )
		return 0; // URL has no domain part

	// The urlpath may legitimately be empty (a domain-only URL),
	// so an empty path is not a reason to reject.

	// Find the text after the final '.' anywhere in the path.
	for( ; *scan != '\0'; scan++ )
	{
		if( *scan == '.' )
			ext = scan + 1;
	}

	// No '.' at all: nothing to judge. The old code subtracted a null
	// pointer here, which is undefined and could end in a null dereference.
	if( ext == 0 )
		return 1;

	// Only extensions of 1 to 4 characters appear in the table.
	if( scan - ext > 4 )
		return 1;
	len = ( int ) ( scan - ext );

	// OR-ing in the space bit lowercases ASCII letters and leaves digits alone.
	// Only the characters that really belong to the extension are read.
	for( i = 0; i < len; i++ )
		folded [i] = ( char ) ( ext [i] | ' ' );
	folded [len] = '\0';

	for( i = 0; i < ( int ) ( sizeof( RejectedExtensions ) / sizeof( RejectedExtensions [0] ) ); i++ )
	{
		if( strcmp( folded, RejectedExtensions [i] ) == 0 )
			return 0;
	}
	return 1;
}

void ResortVocabularyByFrequency( )
{
	// Build ppResortedVocabulary: a newly malloc'ed vector holding the same
	// item pointers as ppSortedVocabulary, ordered by descending count.
	// This is called prior to calling OutputTopOfVocabulary( ).
	// Each item = ( one short count + one asciz string + another \0 ).
	// Input words are limited to 1 [a-z] + 0 to 39 [a-z0-9] + \0.
	// Items with equal counts keep their original relative order.
	// The items themselves are shared with ppSortedVocabulary, not copied.
	// Exits the program if the vector cannot be allocated.
	//
	// NOTE(review): any vector already in ppResortedVocabulary is overwritten
	// without being freed; confirm callers free it between calls.
	int n = nSortedVocabulary;
	char **ppItem = ppSortedVocabulary;
	int nResort = 0;
	Flow( 10699 );

	// Callers are expected to guarantee nSortedVocabulary > 0; be safe anyway.
	if( n <= 0 )
		return;

	ppResortedVocabulary = ( char** )malloc( sizeof( char* ) * n );
	// Test the vector just allocated. The old code tested
	// ppSortedVocabulary here, so a failed malloc went unnoticed.
	if( ppResortedVocabulary == ( char** ) 0 )
	{
		fprintf( stderr, "SURF: Malloc failure resorting vocabulary.\n" );
		exit( 1 );
	}

	// Insertion sort: take each original item in turn and place it
	// before the first resorted item that has a strictly smaller count.
	for( ; n > 0; n--, ppItem++ )
	{
		int itemqty = *( short* )*ppItem;
		int slot = 0;
		int k;

		while( slot < nResort
		&& *( short* ) ( ppResortedVocabulary [slot] ) >= itemqty )
			slot++;

		// Slide up [slot] and all above to make room.
		for( k = nResort; k > slot; k-- )
			ppResortedVocabulary [k] = ppResortedVocabulary [k-1];

		ppResortedVocabulary [slot] = *ppItem;
		nResort++;
	}
	return;
}

void OutputTopOfVocabulary( char * CallersBuffer )
{
	// Emit the most frequent vocabulary words as space-separated lines.
	// Caller has recently called ResortVocabularyByFrequency( ), and
	// must guarantee nSortedVocabulary > 0.
	//
	// CallersBuffer == NULL: each finished line is attached as a note
	//   via MergeNoteAtLastUrlPtrSlot( ).
	// CallersBuffer != NULL: only the first line is copied into it.
	//
	// Each item = ( one short count + one asciz string + another \0 ).
	// Input words are limited to 1 [a-z] + 0 to 39 [a-z0-9] + \0.
	char WorkBuffer [120]; // Don't surpass FrequentWords [120].
	char * tail = WorkBuffer;
	int LinesDone = 0;
	int idx = 0;
	Flow( 13677 );

	for( ;; )
	{
		// Skip over the leading short count to reach the word text.
		const char * word = ppResortedVocabulary[idx] + sizeof( short );

		if( tail != WorkBuffer )
			*tail++ = ' ';
		while( ( *tail = *word ) != '\0' )
		{
			tail++;
			word++;
		}

		// A line is finished once it passes column 60; the word that
		// crosses the limit may add about 40 more characters.
		if( tail - WorkBuffer > 60 )
		{
			if( CallersBuffer == ( char* )0 )
				MergeNoteAtLastUrlPtrSlot( WorkBuffer ); // Vocabulary for URL
			else if( LinesDone == 0 )
				strcpy( CallersBuffer, WorkBuffer );

			tail = WorkBuffer;
			// Two lines is the limit; three made the URL list files
			// too huge and unreadable.
			LinesDone++;
			if( LinesDone == 2 )
				break;
		}

		idx++;
		if( idx >= nSortedVocabulary )
			break;
	}

	// Flush a partial last line, as for files with short vocabulary lists.
	if( tail != WorkBuffer )
	{
		if( CallersBuffer == ( char* )0 )
			MergeNoteAtLastUrlPtrSlot( WorkBuffer ); // Vocabulary for URL
		else if( LinesDone == 0 )
			strcpy( CallersBuffer, WorkBuffer );
	}
	return;
}

void FreeUpTheVocabulary( )
{
	// Release all vocabulary storage and return the globals to the
	// empty, unallocated state.
	// The vocabulary is a vector of item pointers, not a linked list.
	// Each item = ( one short count + one asciz string + another \0 ).
	int count = nSortedVocabulary;
	Flow( 17796 );

	if( count > 0 )
	{
		// Free every item through the alphabetized vector,
		// then the vector itself.
		int i;
		for( i = 0; i < count; i++ )
			free( ppSortedVocabulary [i] );
		free( ppSortedVocabulary );

		ppSortedVocabulary = ( char** )0;
		nMallocVocabulary = 0;
		nSortedVocabulary = 0;
	}

	// The frequency-ordered vector (set by ResortVocabularyByFrequency)
	// only pointed at those same items, so it is worthless now.
	if( ppResortedVocabulary != ( char ** )0 )
	{
		free( ppResortedVocabulary );
		ppResortedVocabulary = ( char ** )0;
	}
	return;
}

void TestBaseURLAgainstQueryResultPages( )
{
	// Classify the current BaseUrlParts against the table of known
	// search engine result pages, ParsableSearchEngineResultPages.
	// This serves -F FetchObjectFromUrlToFile, using Base URL before fetching.
	// This serves -R and -A, using HTML Base tag read during local file parse.
	//
	// Outputs (globals):
	//   BaseURLHasAQueryPart = 1 if the base URL has a "?query" part, else 0.
	//   MatchingMethod = table digit value (1 to 9) on a match, else 0.
	//
	// The table holds pairs of strings: [qi] is a urlpath, and [qi+1] is
	// laid out as \0 + netlocn tail + \0 + space + digit.
	BaseURLHasAQueryPart = 0; // First, postulate no query part
	MatchingMethod = 0; // First, postulate a failure to match.
	if ( BaseUrlParts.query [0] == '?' ) // First char is '?', otherwise '\0'.
	{
		// If the URL matches a SURF -Q search engine, its proposed filename
		// later starts with "_", so the DOS command "DEL _*" can rid all
		// search engine result pages.
		const unsigned int nStrings = ( unsigned int )
			( sizeof( ParsableSearchEngineResultPages ) / sizeof( * ParsableSearchEngineResultPages ) );
		unsigned int qi = 0;
		BaseURLHasAQueryPart = 1; // This base URL has a ....?Query.... part.
		do
		{
			// First match the current page's base URL urlpath.
			// Only then invest the effort to match the partial netlocn.
			// (A binary search on the sorted table would have been better.)
			if( stricmp( BaseUrlParts.urlpath, ParsableSearchEngineResultPages [qi] ) == 0 )
			{
				char * scan1 = BaseUrlParts.netlocn;
				// Take the second of two strings, just past an initial \0.
				char * scan2 = ParsableSearchEngineResultPages [qi + 1] + 1;
				char TheDigit;
				scan1 += strlen( scan1 );
				scan2 += strlen( scan2 );
				TheDigit = scan2[ 2 ]; // \0, space, digit: selects heuristic.

				// Compare both strings backwards from their terminators.
				for( ; ; )
				{
					scan2--;
					if( *scan2 == '\0' )
						break; // sufficient match to the tail end of a known netlocn
					if( scan1 == BaseUrlParts.netlocn )
						break; // page's netlocn is shorter than the known one: no match
					scan1--;
					// The characters match if they are identical, or if they
					// are letters that differ only in the ASCII case bit.
					// The parentheses around the XOR matter: without them
					// '&' binds first and the case fold does not work.
					if( *scan1 != *scan2
					&& ( ( ( *scan1 ^ *scan2 ) & ~ ' ' ) != 0
						|| !isalpha( ( unsigned char ) *scan1 ) ) )
						break; // non-matching, case insensitive backwards.
				}
				if( *scan2 == '\0' )
				{
					// This value affects filename, and handling of anchor URLs.
					MatchingMethod = TheDigit - '0'; // convert to value 1 to 9.
					break;
				}
			}
			qi += 2; // Each entry in ParsableSearchEngineResultPages has 2 strings.
		} while( qi < nStrings );
	}
	return;
}

void ProposeFilenameFromBaseURL( )
{
	// Derive a short local file base name from BaseUrlParts and leave it,
	// \0 terminated and at most 8 characters long, in ProposedFileBaseName.
	// This serves -F FetchObjectFromUrlToFile, using Base URL before fetching.
	// This serves -R and -A, using HTML BASE tag read during local file parse.
	//
	// Inputs (globals): BaseUrlParts.urlpath, BaseUrlParts.netlocn,
	// and MatchingMethod (as left by TestBaseURLAgainstQueryResultPages).
	//
	// Only alphanumeric characters are kept. The name comes from the last
	// urlpath segment, or from the netlocn when the path is empty, is just
	// "/", or is a top-level "index.h..." file.
	char * into = ProposedFileBaseName;
	char * LastDot = into; // output position at which the last '.' was seen

	// Save all known search engine result pages into "_filename".
	// User may readily delete them all with a DOS del _* command.
	if ( MatchingMethod > 0 ) // Current page is a known search engine result.
		*into++ = '_'; // So distinguish the local filename with a prefix.

	if( BaseUrlParts.urlpath [0] == '\0'
	|| BaseUrlParts.urlpath [1] == '\0' && BaseUrlParts.urlpath [0] == '/' )
	{
		// The current page's urlpath was empty (or only "/"). Use its netlocn instead.
		// One disadvantage I note is that my BRIEF editor regards .COM
		// files as executables, so I must type whole file name to open.
		char * from;

	UseTheDomainForFilename: ; // the else branch below jumps in here via goto

		from = BaseUrlParts.netlocn;
		// Preserve more information by skipping over any "www." prefix.
		if( ( from [0] | ' ' ) == 'w' // idiom lowercases W to w
		&& ( from [1] | ' ' ) == 'w'
		&& ( from [2] | ' ' ) == 'w'
		&& ( from [3] | ' ' ) == '.' )
			from += 4;
		// Skip over any "_", so filename will not match during "DEL _*"
		while( *from == '_' )
			from++;
		// Add the whole domain name into proposed file name.
		// Each char is stored at *into first, but into only advances for
		// alphanumerics, so any other char is overwritten by the next one.
		while( ( *into=*from )!='\0' && into < ProposedFileBaseName + sizeof( ProposedFileBaseName ) - 1 )
		{
			from++;
			if( isalnum( *into ) )
				into++;
			else if( *into == '.' )
				LastDot = into; // place to perhaps discard .COM, .HTM, etc.
		}
	}
	else
	{
		// Scan to the last part of the urlpath.
		char * from = BaseUrlParts.urlpath;
		while ( *from != '\0' ) from++;
		from--;
		// Ignore any final '/', go back one segment.
		if ( *from == '/' )
			from--;
		// Scan back to the start of that path segment.
		while ( from > BaseUrlParts.urlpath && from[-1] != '/' )
			from--;
		// Avoid the common INDEX.HTM, synonym for user directory.
		if( ( from [0] | ' ' ) == 'i' // idiom lowercases I to i
		&& ( from [1] | ' ' ) == 'n'
		&& ( from [2] | ' ' ) == 'd'
		&& ( from [3] | ' ' ) == 'e'
		&& ( from [4] | ' ' ) == 'x'
		&& ( from [5] | ' ' ) == '.'
		&& ( from [6] | ' ' ) == 'h' )
		{
			// Step onto the '/' before "index.h...", then repeat the
			// backwards scan used above to reach the parent segment.
			from--;
			// Scan back to the start of that path segment.
			while ( from > BaseUrlParts.urlpath && from[-1] != '/' )
				from--;
			if( from == BaseUrlParts.urlpath )
			{
				// This URL was like "domain/index.htm"
				goto UseTheDomainForFilename; // just above
			}
			// NOTE(review): if urlpath began directly with "index.h" (no
			// leading '/'), from is now one below urlpath; confirm paths
			// always start with '/' when they reach here.
		}
		// Skip over a first '~', commonly used before a user name.
		if( *from == '~' )
			from++;
		// Skip over any "_", so it will not get caught in "DEL _*"
		while( *from == '_' )
			from++;
		// Add the final path segment into proposed file name
		// (same keep-only-alphanumerics copy as in the domain case).
		while( ( *into=*from )!='\0' && into < ProposedFileBaseName + sizeof( ProposedFileBaseName ) - 1 )
		{
			from++;
			if( isalnum( *into ) )
				into++;
			else if( *into == '.' )
				LastDot = into; // place to perhaps discard .COM, .HTM, etc.
			else if( *into == '/' )
				break; // to not append the discarded ending "/index.htm"
		}
	}
	// After scrubbing so many letters, let's make sure something remains:
	if( into == ProposedFileBaseName )
		*into++ = 'X'; // Change the empty filename into "X"

	// Because of the difficulty Internet Explorer has with arbitrary
	// file extensions, rectify all filenames to end with .HTM again.
	// And, to prevent confusion when using editors like brief, that
	// use 8+3 file names, ( Well, my copy of brief is ten years old ),
	// rectify the filename to be unique within just the 8 characters.
	// No, -F or -R caller must do that, because -R may prefix a path.

	// If exactly 3 or 4 characters were kept after the last '.', treat
	// them as an extension and drop them. A dot seen at the very start
	// of the name is ignored so the name is not emptied.
	if( LastDot > ProposedFileBaseName
	&& ( into - LastDot == 3 || into - LastDot == 4 ) )
		into = LastDot; // approximate heuristic - discard .COM, .HTML, etc.

	// On Jun 4 1999, I discovered that a domain www.com exists,
	// and the processes above created filenames COM, COM1, etc.
	// which being special files caused SURF an error on fclose.
	if( (into == ProposedFileBaseName + 3)
	&& ( ( ( into [-3] | ' ' ) == 'c'
		&& ( into [-2] | ' ' ) == 'o'
		&& ( into [-1] | ' ' ) == 'm' )
	||  (  ( into [-3] | ' ' ) == 'l'
		&& ( into [-2] | ' ' ) == 'p'
		&& ( into [-1] | ' ' ) == 't' ) ) )
	{
		*into++ = 'X'; // Change the COM or LPT filename into COMX, LPTX
	}
	*into = '\0'; // This path will never come out empty
	ProposedFileBaseName [8] = '\0'; // Nor will it exceed 8 chars
	return;
}

void ProcessForEndOfInputFile( int HttpStatus )
{
	// Called at the end of an input (HTML) file. In order, it:
	//  1. closes the output paragraph and writes the @End marker comment;
	//  2. makes sure the previously added URL has at least one note;
	//  3. picks up to ten file-renaming words found in the vocabulary;
	//  4. adds the page's BASE URL to the URL list and replaces its notes
	//     with "#NNN filename ( words )" plus the title, or with an error
	//     note when HttpStatus is not HTTP_STATUS_OK;
	//  5. appends the most-used vocabulary words as further notes;
	//  6. resets the global HTML parsing state and clears BaseUrlParts.
	//
	// HttpStatus: HTTP_STATUS_OK for a page fetched or read successfully;
	// 1 when the user aborted the download; otherwise a status or error
	// code that is formatted as five decimal digits into the note.
	Flow( 19547 );
	// There may have been some concept being output.
	// I put @ in eof comment, to stop mailto search.
	if( ! DiscardingAllText )
	{
		EndAnyOuputParagraph( );
		fprintf( ObjectFile, "\t<!-- ######## ######## @End ######## ######## -->\n" );
	}

	// Now is the time to record facts about this HTML file I just processed:
	memcpy( ( void* ) & LinkUrlParts, ( void* ) & BaseUrlParts, sizeof( LinkUrlParts ) );

	// Before calling AddLinkUrlPartsToUrlList, make sure any prior added URL
	// got a note, so there will be an asterisk to inhibit further fetching.
	// This code block appears two places: In this case, for EOF - end of file.
	if( nSortedUrlPtrs > 0
	&& LastUrlPtrSlot != -1 // be sure there was a prior URL
	&& pSortedUrlPtrs[LastUrlPtrSlot].pNote == ( char* )0 )
	{
		// So *( ? ) after my BASE URL means URL disagreed with file's BASE tag.
		// Lets see, this could also mean had <A> with no </A> before end of file.
		// Lets see, this could also mean had <A> with no </A> before next <A>.
		MergeNoteAtLastUrlPtrSlot( "* No Anchor Text?" );
	}

	// If the file used any uncommon words, sort them now for later.
	// Then if there were any file naming words, build another list.

	nWordsOfInterest = 0; // Guarantee even if skip next IF clause.
	if( nSortedVocabulary > 0 ) // I must guarantee nSortedVocabulary > 0
	{
		if( IHaveFileRenamingData ) // LIST file contained @path [word...] lines
		{
			// I would prefer the search of a few terms into the big vocabulary.
			// But, I want them ordered by vocabulary frequency, so must resort.
			// Rather than make another linked list, limit to the top ten words.
			int j = 0;
			nWordsOfInterest = 0;
			// NamingListPtrs is indexed with a stride of two ( [j+j] );
			// only the even entries are used here as words.
			if( NamingListTermsCount > 0 ) do
			{
				int n = QuantityOfWordInVocabulary( NamingListPtrs [j+j] );
				if( n > 0 )
				{
					int i = 0;
					// fprintf( stdout, "Word found: %s ( %d )\r\n", NamingListPtrs [j+j], n );
					// Find the first slot whose frequency is smaller than n,
					// keeping the top-ten arrays in descending order.
					while( i < nWordsOfInterest )
					{
						if( n > TopTenWordFrequencies [i] )
							break;
						i++;
					}
					if( i < 10 )
					{
						// Insert into TopTenWordsOfInterest, TopTenWordFrequencies.
						// Slide up range worst case is from [0 to 8] into [1 to 9].
						// When the list is already full, the last entry falls off.
						int k;
						if( nWordsOfInterest < 10 )
							nWordsOfInterest++;
						k = nWordsOfInterest;
						while( --k > i )
						{
							TopTenWordsOfInterest [k] = TopTenWordsOfInterest [k-1];
							TopTenWordFrequencies [k] = TopTenWordFrequencies [k-1];
						}
						TopTenWordsOfInterest [i] = NamingListPtrs [j+j];
						TopTenWordFrequencies [i] = n;
					}
				}
			} // do...
			while( ++j < NamingListTermsCount );
		}
	}

	AddLinkUrlPartsToUrlList( ); // This is the BASE Url of page in LocalFilename.
	{
		// Because I have just parsed the file and have the
		// authoritative word about its contents, if there
		// were other <A> Url notes, free those mallocs now.
		// Each note item begins with a pointer to the next item.
		char * Item = pSortedUrlPtrs[LastUrlPtrSlot].pNote;
		while( Item != ( char* )0 )
		{
			// This is the increment operation to do all items in a linked list.
			char * Next = *( char** )Item;
			free( Item );
			Item = Next;
		}
		// Be sure to null this head pointer just freed:
		pSortedUrlPtrs[LastUrlPtrSlot].pNote = ( char* )0;
	}
	if( HttpStatus == HTTP_STATUS_OK )
	{
		// Always, the first note due to a BASE tag,
		// which BASE tag means that I possess such file,
		// starts with "*", meaning do not refetch file.
		// No, use "#" here, but * on unfetched new links.
		// Then, some terse statistics, local filename.
		{
			char wk [200];
			char * into = wk;
			char * from = LocalFilename; // Normally pathless, but caution.
			*into++ = '#'; // This # in note1 means file was fetched, do not refetch.
			// Add 3 quality digits -- base-4 logs of Ideas/Links/Chars.
			// Each digit counts how many times the value can be shifted
			// right by two bits before reaching zero, capped at '9'.
			{
				char c = '0';
				unsigned int n = ( unsigned int )CountAcceptableIdeas;
				while( n > 0 && c < '9' )
				{
					n >>= 2;
					c++;
				}
				*into++ = c;
			}
			{
				char c = '0';
				unsigned int n = ( unsigned int )CountAcceptableLinks;
				while( n > 0 && c < '9' )
				{
					n >>= 2;
					c++;
				}
				*into++ = c;
			}
			{
				char c = '0';
				unsigned int n = ( unsigned int )CountAcceptableWords;
				while( n > 0 && c < '9' )
				{
					n >>= 2;
					c++;
				}
				*into++ = c;
			}
			*into++ = ' ';
			// Add the localfilename being stored, or read, into note1.
			while( ( *into=*from )!='\0' && into < wk + sizeof( wk ) - 2 ) // room for ')'
				from++, into++;

			// New idea added Dec 11 98: If any word of this fetched/read
			// HTML file's vocabulary list is found on the renaming list,
			// add that word to the "#" note line bearing the local filename.
			// Then end users can use FGREP to locate filenames of interest.

			if( nWordsOfInterest > 0 )
			{
				int i = 0;
				while( i < nWordsOfInterest )
				{
					// Copy this vocabulary word to the note line.
					// Words that would not leave 50 bytes of room are skipped.
					if( into < wk + sizeof( wk ) - 50 ) // Max vocabulary item ~40
					{
						*into++ = ' ';
						if( i == 0 )
							*into++ = '(';
						from = TopTenWordsOfInterest [i];
						while( ( *into = *from ) !='\0' )
							from++, into++;
						// for a diagnostic, see the count too
						// (shown without leading zeros, capped at 999)
						{
							int n = TopTenWordFrequencies [i];
							if ( n > 999 )
								n = 999;
							*into++ = ' ';
							if ( n > 99 )
								*into++ = ( char ) ( '0' + n / 100 % 10 );
							if ( n > 9 )
								*into++ = ( char ) ( '0' + n / 10 % 10 );
							*into++ = ( char ) ( '0' + n % 10 );
						}
					}
					i++;
				}
				// NOTE(review): the ')' is written even if the room test
				// above skipped the first word and so never wrote '('.
				*into++ = ')';
			}
			*into = '\0';
			// The LastUrlPtrSlot was set by AddLinkUrlPartsToUrlList:
			MergeNoteAtLastUrlPtrSlot( wk ); // First BASE Url note: #NNN Filename ( words of interest )
		}
		if( TitleTextBufIndex > 0 )
		{
			// A second note may be document's cleaned-up <TITLE> info.
			TitleTextBuffer[TitleTextBufIndex] = '\0';
			if( TitleTextBuffer[0] == '#'
			|| TitleTextBuffer[0] == '*' )
				TitleTextBuffer[0] = '!';  // Prevent first # or * char in titles
			// The LastUrlPtrSlot was set by AddLinkUrlPartsToUrlList:
			MergeNoteAtLastUrlPtrSlot( TitleTextBuffer ); // Second BASE Url note: title
		}
		else
		{
			MergeNoteAtLastUrlPtrSlot( "No title" ); // Second BASE Url note: no title
		}
	}
	else
	{
		// For errors, or HttpStatus other than 200 == HTTP_STATUS_OK, note as:
		// #000 Error.00404 or
		// #000 Error.12007
		// NOTE(review): this repeats the copy, the URL insertion and the
		// note freeing already performed above; it looks redundant.
		memcpy( ( void* ) & LinkUrlParts, ( void* ) & BaseUrlParts, sizeof( LinkUrlParts ) );
		AddLinkUrlPartsToUrlList( ); // This is the BASE Url that I couldn't fetch right.
		{
			// Because I could not fetch the file, and thus have an
			// authoritative opinion about its contents, if there
			// were other <A> Url notes, free those mallocs now.
			char * Item = pSortedUrlPtrs[LastUrlPtrSlot].pNote;
			while( Item != ( char* )0 )
			{
				// This is the increment operation to do all items in a linked list.
				char * Next = *( char** )Item;
				free( Item );
				Item = Next;
			}
			// Be sure to null this head pointer just freed:
			pSortedUrlPtrs[LastUrlPtrSlot].pNote = ( char* )0;
		}

		if( HttpStatus == 1 )
		{
			// I set it to  1 when aborted download. Make note without # or *.
			MergeNoteAtLastUrlPtrSlot( "Surf stopped download due to user keyin." );
		}
		else
		{
			// This will be the first note due to a fetch error.
			// Keep the "#" marker here, like well-fetched URLs.
			// Characters [11] through [15] of the template are the five
			// decimal digits of HttpStatus, most significant first.
			static char Note [] = "#000 Error.12000";
			Note [11] = ( char ) ( '0' + HttpStatus / 10000 % 10 );
			Note [12] = ( char ) ( '0' + HttpStatus / 1000 % 10 );
			Note [13] = ( char ) ( '0' + HttpStatus / 100 % 10 );
			Note [14] = ( char ) ( '0' + HttpStatus / 10 % 10 );
			Note [15] = ( char ) ( '0' + HttpStatus % 10 );
			// The LastUrlPtrSlot was set by AddLinkUrlPartsToUrlList:
			MergeNoteAtLastUrlPtrSlot( Note ); // First BAD Url note: #000 Code.404
		}
	}

	if( nSortedVocabulary > 0 ) // I must guarantee nSortedVocabulary > 0
	{
		// Add 2 lines of the most-used words, each about 60 chars,
		// to annotate the last base href url describing this page.
		ResortVocabularyByFrequency( ); // Into a separate vector
		OutputTopOfVocabulary( ( char* )0 ); // to URL list due to NULL pointer
		// I don't call FreeUpTheVocabulary( ) because -R may need it.
	}

	// Use EOF opportunity to reset all global HTML parsing state variables.

	InsideComment = 0;
	DiscardingDueToTag = 0;
	DiscardingTextNow = DiscardingAllText;
	OutputCol = 0;
	TagDelimitsWords = 0;
	SavedText = 0;
	TitleTextBufIndex = 0;
	SavingTitleText = 0;
	AnchorTextBufIndex = 0;
	SavingAnchorText = 0;
	// No need to zero. Needed for file renaming... LocalFilename = ( char* )0;
	SentenceRecognition = 0;
	CountAcceptableIdeas = 0;
	CountAcceptableLinks = 0;
	CountAcceptableWords = 0;
	// Do not reset this now, used by caller... LastUrlPtrSlot = -1;
	MatchingMethod = 0;
	EverOutput = 0;
	BaseTextSaved = 0;
	TitleTextSaved = 0;
	LastNumWasAtTokenNumber = -1000;
	LastUrlWasAtTokenNumber = -1000;
	LastDtTagWasAtTokenNumber = -1000;
	LastLiTagWasAtTokenNumber = -1000;
	LastUrlPtrIsAQueryMatch = 0;
	EverSeenAnyValidHTMLTags = 0;

	// Clear the base url before any next file.
	memset( ( void* ) & BaseUrlParts, 0, sizeof( BaseUrlParts ) );
	return;
}

void ProcessAsText( char * start, char * beyond )
{
	// Split the run of text [start, beyond) at spaces and pass each
	// \0-terminated token to ProcessToken( ). The text is modified in place.
	//
	// start:  first character of the run (space or nonspace).
	// beyond: one past the last character; always above start.
	//
	// The last token of a run that reaches beyond is not processed here.
	// It is kept in UnfinishedToken (SavedText = its length so far) so
	// that a later call can extend or finish it.
	//
	// I have five callers, all who recognize the end of a run of text.
	// All have space or nonspace at start, all put beyond above start.
	// Callers No. 1 and 5 have \0 byte at beyond.
	// Callers No. 2, 3, 4 have < byte at beyond.
	// I will overwrite that \0 or <, and on exit leave a < there.
	char * scan = start;
	Flow( 17577 );
	*beyond = ' '; // install specific sentinel
	if( SavedText > 0 )
	{
		// Either be done with, or add on to a partial token.
		if( *scan == ' ' ) // This is never the final sentinel space.
		{
			// The saved prior partial token must be ended.
			UnfinishedToken [SavedText] = '\0';
			ProcessToken( UnfinishedToken );
			SavedText = 0;
		}
		else
		{
			if( TagDelimitsWords )
			{
				// Prior partial token was ended by HTML tag.
				// Join up any phrases except alpha to alpha.
				// That is, a delimiting tag ends alpha runs.
				// The copy stops at a space or when the buffer is full.
				if( ( !isalpha( *scan )
				|| !isalpha( UnfinishedToken [SavedText-1] ) )
				&& ( unsigned ) SavedText < ( unsigned ) ( sizeof( UnfinishedToken ) - 1 ) )
				do
					UnfinishedToken [SavedText++] = *scan++;
				while( *scan != ' ' && ( unsigned ) SavedText < ( unsigned ) ( sizeof( UnfinishedToken ) - 1 ) );
			}
			else
			{
				// Prior partial token gets continued.
				// Append additional characters to it.
				if( ( unsigned ) SavedText < ( unsigned ) ( sizeof( UnfinishedToken ) - 1 ) ) // Need not ask if *scan != ' '.
				do
				{
					UnfinishedToken [SavedText++] = *scan++;
				} // do...
				while( *scan != ' ' && ( unsigned ) SavedText < ( unsigned ) ( sizeof( UnfinishedToken ) - 1 ) );
			}
			// Now suppose this phrase is also a final token as below.
			// In that case, don't yet output unfinished token buffer.
			if( scan != beyond )
			{
				UnfinishedToken [SavedText] = '\0';
				ProcessToken( UnfinishedToken );
				SavedText = 0;
			}
		}
	}
	// Divide the text at all ' ', processing each token.
	// In order to safeguard against multiple whitespace,
	// I will defer applying this skip over spaces until
	// the bottom of the for loop. So, it is needed here.
	if( *scan == ' ' && scan != beyond )
	do
	{
		scan ++;
	} // do...
	while( *scan == ' ' && scan != beyond );
	if( scan != beyond )
	{
		for( ;; )
		{
			char * starttoken = scan;
			// Find the end of this token; the sentinel space at
			// beyond guarantees that this loop terminates.
			do
			{
				scan ++;
			} // do...
			while( *scan != ' ' );
			*scan = '\0'; // Null terminate the token
			// Note that SavedText = 0 when entering next if stmt.
			if( scan == beyond
			&& ( unsigned ) ( scan - starttoken )
			<= ( unsigned ) ( sizeof( UnfinishedToken ) - 1 ) )
			{
				// Save final word as a possibly partial token.
				// That means EOF must trigger the flush of it.
				// Append to partial token, unless was too big.
				scan = starttoken; // Need not ask anything before this do.
				do
					UnfinishedToken [SavedText++] = *scan++;
				while( *scan != '\0' ); // Need not test SavedText overflow.
				// Notice that I have not terminated UnfinishedToken yet.
				TagDelimitsWords = 0;
				break; // because scan had equaled beyond
			}
			else
			{
				// This token is either not the final one, or it is too
				// long to save as a partial token: process it right now.
				ProcessToken( starttoken );
				// However, it could have been a final token, so be careful.
				if( scan != beyond ) // Need not ask if *scan == ' '
				do
				{
					scan ++;
				} // do...
				while( *scan == ' ' && scan != beyond );
				if( scan == beyond )
					break;
			}
		}
	}
	*beyond = '<'; // fix this in case partial tag gets pushed back.
	return;
}

int GetLinkUrlParts( char * url, char * beyond )
{
	// Parse the URL text [url, beyond) into the LinkUrlParts global,
	// following the parse recommended by RFC1808 section 2.4.
	//
	// url:    non-empty \0 terminated param value
	//         of BASE HREF, A HREF, AREA HREF, FRAME SRC.
	// beyond: one past the last character of the URL. The byte there, and
	//         the URL text itself, are modified: each phase plants its own
	//         sentinel at the current end. Both callers are ok with that.
	//
	// Returns 0 on success, or -1 if a part other than the fragment was
	// too long for its field, in which case callers will not consult
	// LinkUrlParts.
	char *start = url;
	char *cease = beyond;
	char overflowed = 0;
	Flow( 14590 );

	// I fill out data of the LinkUrlParts global.
	// All fields start zeroed, so each copy below leaves a final \0.

	memset( ( void* ) & LinkUrlParts, 0, sizeof( LinkUrlParts ) );

	// A parse recommendation is given in RFC1808.

	// 2.4.1 Everything to the right of leftmost '#' is a <fragment>.
	// I don't care about <fragment>, but I will extract them anyway.
	// In same loop I will scan and re-pack out any space characters.
	{ // I need not ask if( start != cease )
		char * from = start;
		char * into;
		*cease = '#'; // install my loop sentinel
		// Enter a first phase that scans without copying as into == from.
		if( *from != '#' && *from != ' ' )
		do {
			from++;
		} // do...
		while ( *from != '#' && *from != ' ' );
		into = from; // to repair cease, even if do not do next phase.
		// Enter a second phase using a similar loop only if required.
		if( *from == ' ' )
		{
			// I do this because HTML documents sometimes get word wrapped.
			// There might remain spurious hyphens to left of these spaces.
			do {
				if( *from != ' ' ) // Omit space characters before the #
					*into++ = *from;
				from++;
			} // do...
			while ( *from != '#' );
		}
		if( from != cease )
		{
			// We have found a #<fragment> on this URL.
			int n = cease - from;
			if( ( unsigned ) n > ( unsigned ) ( sizeof( LinkUrlParts.fragment ) - 1 ) )
			{
				n = sizeof( LinkUrlParts.fragment ) - 1; // truncate if too long
				// I don't care to set overflowed = 1;
			}
			// This loop copies backwards down to index 0, so the
			// stored fragment keeps its leading '#'. The zeroed
			// field supplies the final destination \0.
			do {
				n--;
				LinkUrlParts.fragment [n] = from [n];
			} while ( n > 0 );
		}
		cease = into; // Remove the #<fragment> from URL. Also repair slippage.
	}

	// 2.4.2 One or more [a-zA-Z0-9+.-] followed by a ':' is a <scheme>.
	if( start != cease
	&& ( isalnum( *start ) || *start == '+' || *start == '.' || *start == '-' ) )
	{
		char * from = start;
		int i = 0;
		*cease = '\0'; // install my loop sentinel
		do
		{
			// ASCII [A-Za-z0-9+.-] can all withstand this lowercasing idiom.
			if( ( unsigned ) i < ( unsigned ) ( sizeof( LinkUrlParts.scheme ) - 1 ) ) // truncate if too long
			{
				LinkUrlParts.scheme [i] = *from | ' ';
				i++;
			}
			from++;
		} // do...
		while ( isalnum( *from ) || *from == '+' || *from == '.' || *from == '-' );
		if( *from == ':' )
		{
			// Syntactically, that was a scheme name. Remove <scheme>: from URL.
			start = from + 1; // Need not ask if from==cease
			// NOTE(review): this is also true for a scheme that exactly
			// fills the field without having been truncated.
			if( i == sizeof( LinkUrlParts.scheme ) - 1 )
			{
				overflowed = 1;
			}
		}
		else
		{
			// Syntactically, that was not a scheme name. Erase it again.
			from = start;
			do LinkUrlParts.scheme [--i] = '\0'; while( i > 0 );
		}
	}

	// 2.4.3 If remainder starts "//", everything until "/" is a <netloc>.
	if( start != cease
	&& start[0] == '/'
	&& start[1] == '/' )
	{
		char * from = start + 2;
		int i = 0;
		*cease = '/'; // install my loop sentinel
		if ( *from != '/' )
		do
		{
			if( ( unsigned ) i
			< ( unsigned ) ( sizeof( LinkUrlParts.netlocn ) - 1 ) ) // truncate if too long
			{
				// Lowercase net locations ( i.e., domain names ).
				if( isupper( *from ) )
					LinkUrlParts.netlocn [i] = *from | ' ';
				else
					LinkUrlParts.netlocn [i] = *from;
				i++;
			}
			from++;
		} // do...
		while ( *from != '/' );
		start = from; // Remove <netloc> from left of url.
		// NOTE(review): as with the scheme, an exact fit is reported as overflow.
		if( i == sizeof( LinkUrlParts.netlocn ) - 1 )
		{
			overflowed = 1;
		}
	}

	// 2.4.4 Everything to the right of leftmost '?' is a <query>.
	if( start != cease )
	{
		char * from = start;
		*cease = '?'; // install my loop sentinel
		// We only need the phase that scans
		if( *from != '?' )
		do {
			from++;
		} // do...
		while ( *from != '?' );
		if( from != cease )
		{
			// We have found a ?<query> on this URL.
			int n = cease - from;
			if( ( unsigned ) n > ( unsigned ) ( sizeof( LinkUrlParts.query ) - 1 ) )
			{
				n = sizeof( LinkUrlParts.query ) - 1; // truncate if too long
				overflowed = 1;
			}
			// This loop copies backwards down to index 0, so the
			// stored query keeps its leading '?'. The zeroed
			// field supplies the final destination \0.
			do {
				n--;
				LinkUrlParts.query [n] = from [n];
			} while ( n > 0 );
			cease = from; // Remove the ?<query> from URL.
		}
	}

	// 2.4.5 Everything to the right of leftmost ';' is a <param>.
	if( start != cease )
	{
		char * from = start;
		*cease = ';'; // install my loop sentinel
		// We only need the phase that scans
		if( *from != ';' )
		do {
			from++;
		} // do...
		while ( *from != ';' );
		if( from != cease )
		{
			// We have found a ;<parameter> on this URL.
			int n = cease - from;
			if( ( unsigned ) n > ( unsigned ) ( sizeof( LinkUrlParts.parameter ) - 1 ) )
			{
				n = sizeof( LinkUrlParts.parameter ) - 1; // truncate if too long
				overflowed = 1;
			}
			// This loop copies backwards down to index 0, so the
			// stored parameter keeps its leading ';'. The zeroed
			// field supplies the final destination \0.
			do {
				n--;
				LinkUrlParts.parameter [n] = from [n];
			} while ( n > 0 );
			cease = from; // Remove the ;<parameter> from URL.
		}
	}

	// 2.4.6 Whatever is left is the URL path.
	// Any leading '/' is kept, to remember if path is absolute.
	if( start != cease )
	{
		int n = cease - start;
		if( ( unsigned ) n > ( unsigned ) ( sizeof( LinkUrlParts.urlpath ) - 1 ) )
		{
			n = sizeof( LinkUrlParts.urlpath ) - 1; // truncate if too long
			overflowed = 1;
		}
		// This copy does not discard any chars,
		// and it leaves a final destination \0.
		memcpy( ( void* ) LinkUrlParts.urlpath, ( void* ) start, n );
	}

	// Troubleshooting a failure to follow a 301 or 302 redirected URL:
	// fprintf( stdout, "scheme = <%s>\n", LinkUrlParts.scheme );
	// fprintf( stdout, "netlocn = <%s>\n", LinkUrlParts.netlocn );
	// fprintf( stdout, "urlpath = <%s>\n", LinkUrlParts.urlpath );
	// fprintf( stdout, "parameter = <%s>\n", LinkUrlParts.parameter );
	// fprintf( stdout, "query = <%s>\n", LinkUrlParts.query );
	// fprintf( stdout, "fragment = <%s>\n", LinkUrlParts.fragment );

	if( overflowed )
	{
		// My callers will not consult LinkUrlParts if I return -1.
		return -1;
	}

	// 3.3 An important observation of RFC1808 is that
	// the BASE href must come from final redirection.
	// That will influence how I perform GetURL stuff.

	return 0; // zero = success
}

// Remove a trailing default-port suffix such as ":80" from a net location.
// The suffix bytes are overwritten with zeros, so the string simply ends
// where the suffix began. Net locations shorter than the suffix are left
// untouched (and are never read before their first byte).
static void StripDefaultPortFromNetlocn( char * netlocn, const char * portsuffix )
{
	size_t have = strlen( netlocn );
	size_t want = strlen( portsuffix );
	if( have >= want
	&& strcmp( netlocn + have - want, portsuffix ) == 0 )
	{
		memset( ( void* ) ( netlocn + have - want ), 0, want );
	}
}

// Resolve the link held in LinkUrlParts against BaseUrlParts, following
// the steps of RFC1808 section 4, and leave the merged result in
// LinkUrlParts. Both structures are as prepared by GetLinkUrlParts.
//
// Returns 0 when the merged URL is acceptable: its scheme is http, ftp
// or gopher, its net location is non-empty, and its path is absolute.
// Returns -1 when the link is to be ignored: no netlocn and no path,
// unsupported scheme, merged path too long, or a non-absolute result.
//
// Side effects on LinkUrlParts: inherited parts are copied from the base,
// an empty path on a link with its own scheme or netlocn becomes "/",
// a default port suffix (:80, :21, :70) is removed from netlocn, and a
// leading "/%7e" or "/%7E" in the path is rewritten as "/~".
int CombineLinkUrlWithBase( )
{
	// Here we continue applying rules from section 4 of RFC1808.
	// Link and Base are UrlParts as prepared in GetLinkUrlParts.

	// Troubleshooting a failure to add domain-only type
	// anchor URLs lacking a path part (final /) to URL list:
	// fprintf( stdout, "\nBase Scheme = <%s>\n", BaseUrlParts.scheme );
	// fprintf( stdout, "Base Netlocn = <%s>\n", BaseUrlParts.netlocn );
	// fprintf( stdout, "Base Urlpath = <%s>\n", BaseUrlParts.urlpath );
	// fprintf( stdout, "Base Parameter = <%s>\n", BaseUrlParts.parameter );
	// fprintf( stdout, "Base Query = <%s>\n", BaseUrlParts.query );
	// fprintf( stdout, "Base Fragment = <%s>\n", BaseUrlParts.fragment );
	// fprintf( stdout, "Link Scheme = <%s>\n", LinkUrlParts.scheme );
	// fprintf( stdout, "Link Netlocn = <%s>\n", LinkUrlParts.netlocn );
	// fprintf( stdout, "Link Urlpath = <%s>\n", LinkUrlParts.urlpath );
	// fprintf( stdout, "Link Parameter = <%s>\n", LinkUrlParts.parameter );
	// fprintf( stdout, "Link Query = <%s>\n", LinkUrlParts.query );
	// fprintf( stdout, "Link Fragment = <%s>\n", LinkUrlParts.fragment );

	// Step 1 If BASE=empty, URL = absolute.
	// Step 2a If URL=empty, URL = base URL.

	// For SURF rather, if this link's netlocn
	// and urlpath are both empty, ignore link.
	// This adversely affects rfc1808 examples
	// that should inherit whole base net+path,
	// and along with that parameters or query.
	// <URL:?y> isn't <URL:http://a/b/c/d;p?y>.

	Flow( 10375 );
	if( LinkUrlParts.netlocn [0] == '\0'
	&& LinkUrlParts.urlpath [0] == '\0' )
	{
		// This discards links that have no net location and no path part.
		return -1;
	}

	// Step 2b If URL has scheme, URL = absolute.
	// Otherwise, URL inherits scheme from BASE.
	if( LinkUrlParts.scheme [0] != '\0' )
	{
		// Jul 13 1999 - Fixing a failure to add domain-only type
		// anchor URLs lacking a path part (final /) to URL list:
		// <BASE HREF = "http://yyy.com/nnnnn/">
		// <A HREF="http://xxx.xxx.edu">text</A>
		if( LinkUrlParts.urlpath [0] == '\0' )
		{
			LinkUrlParts.urlpath [0] = '/';
			LinkUrlParts.urlpath [1] = '\0';
		}
		goto TheCheckingAfterTheMerging;
	}
	else
	{
		strcpy( LinkUrlParts.scheme, BaseUrlParts.scheme );
	}

	// Step 3 If URL has netloc, we're all done.
	// Otherwise, URL inherits netloc from BASE.
	if( LinkUrlParts.netlocn [0] != '\0' )
	{
		// Jul 13 1999 - Fixing similar case as above.
		if( LinkUrlParts.urlpath [0] == '\0' )
		{
			LinkUrlParts.urlpath [0] = '/';
			LinkUrlParts.urlpath [1] = '\0';
		}
		goto TheCheckingAfterTheMerging;
	}
	else
	{
		strcpy( LinkUrlParts.netlocn, BaseUrlParts.netlocn );
	}

	// Step 4 If URL urlpath starts /, we're all done.
	if( LinkUrlParts.urlpath [0] == '/' )
	{
		goto TheCheckingAfterTheMerging;
	}

	// Step 5 If URL urlpath is empty, inherit urlpath from BASE.
	if( LinkUrlParts.urlpath [0] == '\0' )
	{
		strcpy( LinkUrlParts.urlpath, BaseUrlParts.urlpath );
		// In that case, empty parameter or query also inherit from BASE.
		if( LinkUrlParts.parameter [0] == '\0' )
		{
			strcpy( LinkUrlParts.parameter, BaseUrlParts.parameter );
		}
		if( LinkUrlParts.query [0] == '\0' )
		{
			strcpy( LinkUrlParts.query, BaseUrlParts.query );
		}
		// And we're done.
		goto TheCheckingAfterTheMerging;
	}

	// Looks like some transplant surgery will be required.
	// To recap, link's urlpath is not empty, nor absolute.
	{
		// wk is large enough for the whole base path plus the whole link
		// path, so the merge itself cannot overflow; the length is checked
		// against LinkUrlParts.urlpath before copying back.
		char wk [4 + sizeof( BaseUrlParts.urlpath ) + sizeof( LinkUrlParts.urlpath )];
		char c, *into = wk, *stop = wk;
		char *from = BaseUrlParts.urlpath;
		// Copy the base path, remembering the position just past its
		// rightmost '/'.
		while( ( *into = c = *from ) != '\0' )
		{
			into++;
			from++;
			if( c == '/' )
				stop = into;
		}
		// Step 6. Remove everything past rightmost "/",
		// or everything if the BASE urlpath has no "/".
		into = stop;

		// Oct.30,1997 version created BASE.COMlinkurl with no /.
		// Remedy that by checking for an empty pathpart of base.
		if( into == wk )
			*into++ = '/';

		// Step 6. Append link URL's urlpath.
		from = LinkUrlParts.urlpath;
		while( ( *into = *from ) != '\0' )
		{
			into++;
			from++;
		}

		// Step 6a,b. Remove all path segments of "./", or any final "."
		*into = '\0'; // Fix sentinel
		// Compact wk in place: into trails from, both start at wk.
		into = from = wk;
		while( ( *into = c = *from ) != '\0' )
		{
			into++;
			from++;
			if( c == '.' )
			{
				// I have the dot.
				if( *from == '/'
				|| *from == '\0' )
				{
					// I have the ./ or a final dot.
					if( into == wk + 1
					|| into [-2] == '/' )
					{
						// And it was an entire path segment.
						into--; // Remove the already copied dot
						if( *from == '/' )
							from++; // Remove the optional slash
					}
				}
			}
		}

		// Step 6c,d. Remove L-to-R any "<nondotdot>/../" and "<final>/.."
		*into = '\0'; // Fix sentinel
		into = from = wk;
		while( ( *into = c = *from ) != '\0' )
		{
			into++;
			from++;
			if( c == '.'
			&& *from == '.' )
			{
				// I have the dot-dot.
				if( from [1] == '/'
				|| from [1] == '\0' )
				{
					// I have the ../ or a final dot dot.
					// Here, into > wk + 1 would protect
					// into [-2] read, but into > wk + 2
					// guarantees a non-empty left path.
					if( into > wk + 2
					&& into [-2] == '/' )
					{
						// I have the slash left of the dot-dot.
						// I must still determine a non-dot-dot
						// and non-empty segment is to the left.
						char * scan = into - 3;
						while( scan >= wk && *scan != '/' )
							scan --;
						// If no / was found, scan is at wk - 1.
						// Invert if statement to protect reads:
						// the left segment is ".." only when it is
						// exactly two chars long and both are dots.
						if( ! ( scan == into - 2 - 3
							&& into [-3] == '.'
							&& into [-4] == '.' ) )
						{
							into = scan + 1; // Leave any / before path.
							from++; // Remove the still uncopied dot.
							if( *from == '/' )
								from++; // Remove the optional slash.
						}
					}
				}
			}
		}
		*into = '\0';
		if( ( unsigned ) ( into - wk )
		> ( unsigned ) ( sizeof( LinkUrlParts.urlpath ) - 1 ) )
		{
			// Combined BASE + LINK url path would overflow.
			return -1;
		}
		strcpy( LinkUrlParts.urlpath, wk );
	}

TheCheckingAfterTheMerging: ;

	// At this point, I verified all the samples
	// given in rfc1808, save those I mentioned.

	// Ignore all URLS except FTP, HTTP, GOPHER scheme: e.g., MAILTO.
	// Also remove any netlocn:port suffix equal to the default port.
	// The first-letter test is only a quick pre-filter before strcmp.
	if( LinkUrlParts.scheme[0] == 'h' )
	{
		if( strcmp( LinkUrlParts.scheme, "http" ) != 0 )
			return -1;
		StripDefaultPortFromNetlocn( LinkUrlParts.netlocn, ":80" );
	}
	else if( LinkUrlParts.scheme[0] == 'f' )
	{
		if( strcmp( LinkUrlParts.scheme, "ftp" ) != 0 )
			return -1;
		StripDefaultPortFromNetlocn( LinkUrlParts.netlocn, ":21" );
	}
	else if( LinkUrlParts.scheme[0] == 'g' )
	{
		if( strcmp( LinkUrlParts.scheme, "gopher" ) != 0 )
			return -1;
		StripDefaultPortFromNetlocn( LinkUrlParts.netlocn, ":70" );
	} else {
		return -1;
	}

	// Ignore all URLS lacking a net location, or lacking a url path.
	// In fact, make sure here that all urlpath parts start with a /.

	// Troubleshooting a failure to follow a 301 or 302 redirected URL:
	// fprintf( stdout, "Scheme = <%s>\n", LinkUrlParts.scheme );
	// fprintf( stdout, "Netlocn = <%s>\n", LinkUrlParts.netlocn );
	// fprintf( stdout, "Urlpath = <%s>\n", LinkUrlParts.urlpath );
	// fprintf( stdout, "Parameter = <%s>\n", LinkUrlParts.parameter );
	// fprintf( stdout, "Query = <%s>\n", LinkUrlParts.query );
	// fprintf( stdout, "Fragment = <%s>\n", LinkUrlParts.fragment );


	// A base urlpath may be entirely empty, having only a domain,
	// but this routine is only processing Anchor Urls, so we ask:
	// If merged URL's netloc is empty, or not an absolute path...
	if( LinkUrlParts.netlocn [0] == '\0'
	|| LinkUrlParts.urlpath [0] != '/' )
	{
		return -1;
	}

	// Convert urlpath starting with the common /%7e or /%7E into /~
	// ( | ' ' lowercases the third character before comparing ).

	if( LinkUrlParts.urlpath [1] == '%'
	&& LinkUrlParts.urlpath [2] == '7'
	&& ( LinkUrlParts.urlpath [3] | ' ' ) == 'e' )
	{
		char * into = LinkUrlParts.urlpath + 1; // Start past the /
		char * from = into + 3;
		*into++ = '~';
		// Slide the rest of the path down two places, NUL included.
		while( ( *into = *from ) != '\0' ) into++, from++;
	}

	// This url parsed ok; is http, ftp, or gopher;
	// its net location and url path are non-empty.
	return 0;
}

// Scan a NUL-terminated tag for " <name>=" or " <name> =", where name is
// given in lowercase and the tag text is matched case-insensitively.
// Returns a pointer to the '=' when found, otherwise a pointer to the
// terminating NUL. The scan never advances past the NUL: after a name
// that is not followed by '=', the current character is re-examined
// rather than skipped, so a tag ending in the bare attribute name
// (e.g. "<A HREF") stops at the terminator.
static char * SeekUrlAttributeEquals( char * from, const char * name )
{
	for( ;; )
	{
		if( *from == '\0' )
			return from;
		if( *from == ' ' )
		{
			const char * want = name;
			char * peek = from + 1;
			// ( | ' ' ) lowercases alphas; a NUL becomes ' ' and so
			// can never match a letter of name.
			while( *want != '\0' && ( *peek | ' ' ) == *want )
			{
				peek++;
				want++;
			}
			if( *want == '\0' )
			{
				// Whole name matched; peek is on the char after it.
				if( *peek == '=' )
					return peek;
				if( *peek == ' ' && peek[1] == '=' )
					return peek + 1;
				// Not an assignment. Resume at peek without skipping it.
				from = peek;
				continue;
			}
		}
		from++;
	}
}

// Extract the URL from one starting <A>, <AREA>, <BASE> or <FRAME> tag.
//
// start      points at the '<' of the tag.
// beyond     points at the '>' of the tag. This byte is overwritten
//            (first with NUL, then with the URL terminator character)
//            and is NOT restored on any return path.
// MarkUpEnum identifies the tag (mu_a, mu_area, mu_base, mu_frame).
//
// Returns 0 only when the URL was parsed, merged with the base, judged
// acceptable, and ProgramIsSavingURLs is set so it was added to the URL
// list. Returns -1 in every other case, including every <BASE> tag.
//
// Unless DiscardingAllText is set, an annotated "<a href = ...>" line
// for the link is also written to ObjectFile.
int ProcessURLTags( char * start, char * beyond, int MarkUpEnum )
{
	// Only these starting tags come from loop that finds tags:
	// <A ...>
	// <AREA ...>
	// <BASE ...>
	// <FRAME ...>

	// Nov 98 -- I decided to not preserve the base href tag.
	// Most pages containing it bungle the interpretation of it,
	// so that it only confuses a later parse of the saved page.
	// Few users actually pointed to somewhere else on purpose.
	// I shall output it as the meaningless tag <xase href....>.
	// Mar 99 -- Now I shall output it as clear text: BASE: ...
	// Apr 99 -- Now I simply omit any <BASE> tag completely.

	static char AorBhrefText [] = {"xase href = "};
	// I shall arbitrarily limit URL length to some convenient length.
	char PrettyUrl [500 + sizeof( TitleTextBuffer )];
	// Start is upon <, beyond is upon >.
	char *from = start;
	char *into;
	Flow( 16981 );
	// I will hurt >, but yet not fix it.
	*beyond = '\0';
	strcpy( PrettyUrl, AorBhrefText );
	into = PrettyUrl + sizeof( AorBhrefText ) - 1; // On the \0
	if( MarkUpEnum == mu_frame )
	{
		// search for " SRC", also require some '='.
		from = SeekUrlAttributeEquals( from, "src" );
	}
	else
	{
		// search for " HREF", also require some '='.
		from = SeekUrlAttributeEquals( from, "href" );
	}
	if( *from == '\0' )
	{
		// Need I fix this? No, nor below... *beyond = '>';
		return -1; // This little piggy has no HREF or SRC =.
	}
	from++; // Step past the '='.
	if( *from == ' ' )		// Never multiple whitespaces, thank me.
		from++;
	{
		char c, term = ' '; // Often ' ' between multiple parameters
		// stopper reserves room at the end of PrettyUrl for the
		// annotation appended further below.
		char *stopper = PrettyUrl + sizeof( PrettyUrl ) - 1
		- sizeof( "> A frame of: x </a\0" ) - strlen( TitleTextBuffer );
		if( *from == '"'
		|| *from == '\'' )
		{
			// A quoted value ends at the matching quote character.
			term = *from;
			from++;
		}
		*beyond = term; // Change sentinel. Look out below!|?
		do
		{
			// this loop must leave room in PrettyUrl to
			// append ( a phrase far below + TitleTextBuffer ):
			// "> An area of: TitleTextBufferContents </a\0"
			// "> A frame of: TitleTextBufferContents </a\0"
			*into = c = *from;
			from++;
			into++;
		} while( c != term && into < stopper );
		if( c != term )
		{
			// Need I fix this? No, nor below... *beyond = '>';
			return -1;		// This little piggy was too long.
		}
		into--;			// To rid space ' or " terminator.
	}
	if( into == PrettyUrl + sizeof( AorBhrefText ) - 1 )
	{
		// Need I fix this? No, nor below... *beyond = '>';
		return -1;		// This little piggy was empty.
	}
	*into = '\0';
	if( MarkUpEnum == mu_base )
	{
		ThisInputFileContainedABaseTag = 1;
		*into = ' ';
		into[1] = '\0';

		// Canonicalize and store the base url to resolve locals.
		// Nov 98, changed to not honor any BASE tags during fetch.
		// However, the -R reorganizing feature will need this info.
		// Oh, yes, the -A and -B will still need this BASE tag too.
		if( ProgramIsDoingLocalFileInput ) // for -R filenames, -A, -B URLs
		{
			// Reset these so the ordinary link process will suffice.
			memset( ( void* ) & BaseUrlParts, 0, sizeof( BaseUrlParts ) );
			if( GetLinkUrlParts( PrettyUrl + sizeof( AorBhrefText ) - 1, into ) == 0 )
			{
				memcpy( ( void* ) & BaseUrlParts, ( void* ) & LinkUrlParts, sizeof( BaseUrlParts ) );
				// When SURF processes an HTML BASE tag that it wrote in each local file,
				// propose the local filename which will be used during SURF -R renaming.
				// This routine also compares URL to a list of known search engine URLs,
				// which will determine if filename starts with "_", and whether we should
				// leave non-fetch asterisks off target URLs returned by a search engine.
				if( ProgramIsDoingLocalFileInput )
				{
					TestBaseURLAgainstQueryResultPages( );
					ProposeFilenameFromBaseURL( ); // for -A or -B processing BASE tag
					// Unique-ing filename still depends on destination directory.
				}
			}
		}

		// Need I fix this? No, nor below... *beyond = '>';
		return -1; // This was a good URL, but the BASE caller doesn't care
	}
	if( ! DiscardingAllText )
	{
		// I have changed my scheme, to leave the </a> tag
		// in the midst of the text, but for non-<A> URLs,
		// I must provide a </a> terminator now, lest none.
		if( MarkUpEnum == mu_a )
		{
			into[0] = ' ';
			into[1] = '\0'; // Don't increment into, as into is used below.
		}
		else
		{
			// I think we may be here for only <AREA> or <FRAME>.
			// Annotate other <FRAME> and <AREA> links as either:
			// "> A frame of: ContentsOfTitleTextBuffer </a\0"
			// "> An area of: ContentsOfTitleTextBuffer </a\0"
			// Needing the excess room in PrettyUrl counted far above.
			char *saveinto = into;
			*into++ = '>';
			*into++ = ' ';
			if( MarkUpEnum == mu_area )
				strcpy( into, "An area of: " );
			else if( MarkUpEnum == mu_frame )
				strcpy( into, "A frame of: " );
			else
				strcpy( into, "A what? of: " );
			into += strlen( into );
			if( TitleTextBufIndex > 0 )
			{
				TitleTextBuffer[TitleTextBufIndex] = '\0';
				strcpy( into, TitleTextBuffer );
				into += strlen( into );
			}
			*into++ = ' ';
			*into++ = '<';
			*into++ = '/';
			*into++ = 'a';
			*into = '\0'; // Don't leave home without it.
			into = saveinto; // to pass into GetLinkUrlParts
		}
		// Overwrite index 3 so that PrettyUrl+3 reads "a href = ...".
		PrettyUrl [3] = 'a'; // Change PrettyUrl "BASE HREF" to "...A HREF".
		// Anchors will be indented, and have a blank line before and after:
		EndAnyOuputParagraph( );
		if( ! EverOutput
		&& BaseTextSaved
		&& TitleTextSaved )
		{
			EverOutput = 1;
			fprintf( ObjectFile, "<PRE><A HREF = \"%s\"> %s </A>\n", BaseUrlText, TitleTextBuffer );
		}
		fprintf( ObjectFile, "\t<%s>\n\n", PrettyUrl+3 );
	}
	// I annotated first, because GetLinkUrlParts is destructive.
	// Canonicalize and process absolute or ( base+relative ) URLs.
	if( GetLinkUrlParts( PrettyUrl + sizeof( AorBhrefText ) - 1, into ) == 0
	&& CombineLinkUrlWithBase( ) == 0
	&& LinkUrlPartsIsAcceptable( ) ) // This is an A/Area/Frame Anchor URL
	{
		CountAcceptableLinks++;
		// Before calling AddLinkUrlPartsToUrlList, make sure any prior added URL
		// got a note, so there will be an asterisk to inhibit further fetching.
		// This code block appears two places: In this case, for a new <A>.
		if( nSortedUrlPtrs > 0
		&& LastUrlPtrSlot != -1 // be sure there was a prior URL
		&& pSortedUrlPtrs[LastUrlPtrSlot].pNote == ( char* )0 )
		{
			// So *( ? ) after my BASE URL means URL disagreed with file's BASE tag.
			// Lets see, this could also mean had <A> with no </A> before end of file.
			// Lets see, this could also mean had <A> with no </A> before next <A>.
			MergeNoteAtLastUrlPtrSlot( "* No Anchor Text?" );
		}
		// Fit a fully specified URL from this <A HREF...> into the sorted list.
		if( ProgramIsSavingURLs ) // Only false for -R, -B flags to save memory
		{
			AddLinkUrlPartsToUrlList( ); // This URL from A, Area, Frame, etc.
			LastUrlWasAtTokenNumber = TokenNumber; // To age query hit recognition clues.
			// Need I fix this? No, nor above... *beyond = '>';
			return 0; // This was a good URL, so an Anchor caller can divert text
		}
	}
	// Need I fix this? No, nor above... *beyond = '>';
	return -1; // This was not a good, nice, absolute Anchor URL
}

int PotentialTagNameLookup( char * word )
{
	// Caller passes a pointer to a word
	// in buffer, ending in space or \2.
	// May see final \0, need not match.
	// If word is no tag name, return 0.
	// else return index in enumeration.
	int low = 0;
	int top = ( sizeof( SortedTagNames )/sizeof( *SortedTagNames ) - 1 );
	Flow( 10462 );
	for( ;; )
	{
		int mid = low + top >> 1; // always positive, safe to shift for / 2
		char * tptr = SortedTagNames [mid];
		char * scan = word;
		// My idiom ( | ' ' ) lowercases alphas without hurting numbers.
		if( *tptr == ( *scan | ' ' ) )
		{
			// a variant string comparison
			do
			{
				tptr++;
				scan++;
			} // do...
			while( *tptr == ( *scan | ' ' ) );
			// Test for this exact match condition.
			// This time the table entry ends with a null,
			// and the candidate string ends with a space,
			// or a \2 standing for the > to be converted.
			if( *tptr == '\0' && ( *scan == ' ' || *scan == '\2' ) )
				return mid;
		}
		if( *tptr > ( *scan | ' ' ) ) // Warning - gt/lt on signed chars - ok thru 127.
		{
			// Word was lower. Move down in table.
			if( top == mid )
				return 0;
			top = mid;
		}
		else
		{
			// Word was higher. Move up in table.
			if( low == mid )
				return 0;
			low = mid;
		}
	} // for...
}

char * PotentialEntityLookup( char * word )
{
	// Caller passes a pointer to "alphanumerics\0".
	// If alphanumerics is no entity name, return 0.
	// else return pointer to matching 8 char block.
	int low = 0;
	int top = ( sizeof( SortedEntityNames ) - 1 - 8 ) >> 3;
	Flow( 11790 );
	for( ;; )
	{
		int mid = low + top >> 1; // always positive, safe to shift for / 2
		char * eptr = SortedEntityNames + ( mid << 3 ); // safe to shift for * 8
		char * scan = word;
		if( *eptr == *scan )
		{
			// a variant string comparison
			do
			{
				eptr++;
				scan++;
			} // do...
			while( *eptr == *scan );
			// Test for this exact match condition.
			// This time the table entry ends with a space,
			// and the candidate string ends with a null.
			if( *eptr == ' ' && *scan == '\0' )
				return SortedEntityNames + ( mid << 3 );
		}
		if( *eptr > *scan ) // Warning - gt/lt on signed chars - ok thru 127.
		{
			// Word was lower. Move down in table.
			if( top == mid )
				return ( char* )0;
			top = mid;
		}
		else
		{
			// Word was higher. Move up in table.
			if( low == mid )
				return ( char* )0;
			low = mid;
		}
	} // for...
}

int MultiPassInbufConversion( int nbuffered )
{
	// Process input from ( inbuf+4 )[0] to ( inbuf+4 )[nbuffered-1].
	// Check for unsafe garbage or binary, returning -1 to abort.
	// Tail of intermediate disk buffer lacking final whitespace,
	// also incomplete tokens, can be copied back down to inbuf+4
	// to join the next input buffer for processing on next loop.
	// Return count of unprocessed characters sitting at inbuf+4.
	int FinalSpace = nbuffered;
	Flow( 16540 );
	// locate final whitespace ( anything <= ' ' ) to break buffer.
	( inbuf+4 )[-1] = ' '; // sentinel
	// Warning - gt/lt on signed chars would NOT be ok: over 127 in next loop:
	do FinalSpace--; while( ( unsigned char ) ( inbuf+4 )[FinalSpace] > ' ' );
	( inbuf+4 )[FinalSpace] = ' '; // revise to exact sentinel byte
	// Stop processing unsafe garbage, or probable binary data.
	if( FinalSpace == -1
	|| nbuffered - FinalSpace > 1000 )	// because 1Kb unsafe to copy back
		return -1;
	{
		// Scan to simplify character content, check for binary.
		int WierdBytes = 0;
		int i = FinalSpace;
		do
		{
			unsigned char c = ( inbuf+4 ) [i];
			unsigned char d = Simplification [c]; // Warning - array math on UNsigned chars - ok thru 255.
			if( d != c )
			{
				// Replace character with a simplified character.
				( inbuf+4 ) [i] = d;
				if( d > ' ' )
					WierdBytes ++;  // Don't count <, >, tabs, newlines.
			}
		} // do...
		while( --i >= 0 );
		// Stop processing unsafe garbage, or probable binary data.
		if( ( WierdBytes << 2 ) > FinalSpace  // over 1/4 of bytes were wierd
		&& ! EverSeenAnyValidHTMLTags ) // Keep trying for foreign HTML pages
			return -1;
	}

	// Process ( inbuf+4 ) [0] through space byte at ( inbuf+4 ) [FinalSpace].
	// During this process convert "&#255;" and "&sup3;" type of entities.
	{
		int PastCount = FinalSpace + 1;
		int i = 0, o = 0, PastAmp;
		char c;
		// This outer loop encloses several processing states.
		for( ;; )
		{
			// During the first state, copy until any '&' or any ' '.
			// Trim leading ' ' and duplicate ' ' but keep final ' '.
			if( ( inbuf [o++] = c = ( inbuf+4 ) [i++] ) != '&' )
			do
			{
				if( c == ' ' )
				{
					if( o == 1
					|| inbuf [o-2] == ' ' )
						o--;
					if( i == PastCount ) // Watch for all-important ' ' sentinel
						break;
				}
			} // do...
			while( ( inbuf [o++] = c = ( inbuf+4 ) [i++] ) != '&' );

			if( i == PastCount )  // Watch for all-important ' ' sentinel
				break;

			// During the next state, process '&' and certain next bytes.
			PastAmp = o;
			// Examine ( and keep copying ) one next byte.
			inbuf [o++] = c = ( inbuf+4 ) [i++];
			if( isalpha( c ) )
			{
				// If started off &[a-zA-Z], then accept following [a-zA-Z0-9]*.
				// Keep copying bytes in case not a valid entity name.
				if( isalnum( inbuf [o++] = c = ( inbuf+4 ) [i++] ) )
				do
				{
				} // do...
				while( isalnum( inbuf [o++] = c = ( inbuf+4 ) [i++] ) );
				o--; i--;	// put back the non-alnum
				// Now compare to the list of all valid entity names.
				inbuf [o] = '\0';
				{
					// In buffer ends on space, no partial names.
					char * cp = PotentialEntityLookup( inbuf + PastAmp );
					if( cp != ( char* )0 )
					{
						// Find the desired new chars at ( +6, ) +7 past ptr.
						o = PastAmp - 1;
						// This cannot grow, as original entity was larger.
						if( cp [6] != ' ' )
							inbuf [o++] = cp [6];
						inbuf [o++] = cp [7];
						// Here, I added code to prevent duplicate spaces.
						if( inbuf [o-1] == ' '
						&& ( o == 1 || inbuf [o-2] == ' ' ) )
							o--;
						// Consume optional ';' after this valid conversion.
						if( c == ';' )
							i++;
					}
				}
			}
			else if( c == '#' && isdigit( ( inbuf+4 ) [i] ) )
			{
				// If started off &#[0-9], then accept following [0-9]*.
				// Keep copying bytes in case not a valid number.
				if( isdigit( inbuf [o++] = c = ( inbuf+4 ) [i++] ) )
				do
				{
				} // do...
				while( isdigit( inbuf [o++] = c = ( inbuf+4 ) [i++] ) );
				o--; i--;	// put back the non-digit
				// Force final null for atoi.
				inbuf [o] = '\0';
				{
					// I will accept a number from 1 to 255,
					// but only after I have homogenized it.
					int number = atoi( inbuf + PastAmp + 1 );
					if( number >= 1
					&& number <= 255 )
					{
						o = PastAmp - 1;
						inbuf [o++] = Simplification [number]; // Warning - number is not a char.
						// Here, I added code to prevent duplicate spaces.
						if( inbuf [o-1] == ' '
						&& ( o == 1 || inbuf [o-2] == ' ' ) )
							o--;
						// Consume optional ';' after this valid conversion.
						if( c == ';' )
							i++;
					}
				}
			}
			else
			{
				// Else, is no entity, copy verbatim.
				o--; i--;	// put back the non-suitable until next loop
			}
		} // for...
		// Null-terminate the homogenized and converted string in inbuf.
		inbuf [o] = '\0';
		// Henceforth, process data starting at ( inbuf+0 ).
		// This is quantity of bytes to process: return o;
		// That leaves stuff from ( inbuf+4 ) [FinalSpace+1]
		// to ( inbuf+4 ) [nbuffered-1] unused by this loop.
	}
	// Scan buffer to locate and process markup tags versus text runs.
	// If there is a final incomplete tag, push it back for next loop,
	// but not if that area exceeds 1000 bytes or is merely a comment.

	// Use another multi-phase outer loop as done above.
	// Process from ( inbuf+0 ) [0] to null sentinel byte.

	// Recognize these tags: <VALID, </VALID, <!, and any > as an ending.
	{
		char *scan = inbuf;
		char *start = scan;
		int MarkUpEnum = 0;
		if( InsideComment )
		{
			// Resume looking for end of a comment that split buffers.
			InsideComment = 0; // Reset global bool.
			MarkUpEnum = mu_Comment;	// Restore approx state from prior call.
			goto inner2continue;		// I'm just a little sorry.
		}

		// This outer loop encloses several processing states.
		for( ;; )
		{
			// start in text mode, passing through all printables.
			while( *scan >= ' ' ) // Warning - gt/lt on signed chars - ok thru 127.
				scan++;
			// Get rid of the easy \0 end-of-inbuf possibility first.
			if( *scan == '\0' )
			{
				// Process the final few words of text in buffer.
				if( scan > start )
					ProcessAsText( start, scan ); // This is No. 1 of 5 callers
				break;
			}
			// Nothing matters in text mode except < with a valid tag.
			if( *scan != '\1' ) // My signal for <
			{
				*scan = '>'; // convert \2 back into >.
				continue;
			}
			*scan = '<'; // convert \1 back into <.
			// Scan is sitting on <. Try to recognize markup.
			if( isalpha( scan [1] ) )
			{
				// Text not yet scanned may contain [\0\1\2].
				// In buffer ends on space, no partial names.
				MarkUpEnum = PotentialTagNameLookup( scan + 1 );
				if( MarkUpEnum == 0 )
					continue; // No match to any known [A-Za-z]... tags
				EverSeenAnyValidHTMLTags = 1;
				if( scan > start )
				{
					// Process the words of text before this tag.
					ProcessAsText( start, scan ); // This is No. 2 of 5 callers
					start = scan;
				}
			}
			else if( scan [1] == '/'
			&& isalpha( scan [2] ) )
			{
				// Text not yet scanned may contain [\0\1\2].
				// In buffer ends on space, no partial names.
				MarkUpEnum = PotentialTagNameLookup( scan + 2 );
				if( MarkUpEnum == 0 )
					continue; // No match to any known /[A-Za-z]... tags
				if( scan > start )
				{
					// Process the words of text before this tag.
					ProcessAsText( start, scan ); // This is No. 3 of 5 callers
					start = scan;
				}
			}
			else if( scan [1] == '!' )
			{
				// It seems unnecessary to be very delicate.
				// Hence, I shall parse any <! as a comment.
				MarkUpEnum = mu_Comment;
				if( scan > start )
				{
					// Process the words of text before this comment.
					ProcessAsText( start, scan ); // This is No. 4 of 5 callers
					start = scan;
				}
			}
			else
			{
				// I cannot make a tag of this <. Treat it as text.
				continue;
			}
			// Continue in non-text mode, searching for some ending.
			// Scan==start are sitting on < starting a valid markup.
			scan++;

inner2continue: ;
			while( *scan >= ' ' ) // Warning - gt/lt on signed chars - ok thru 127.
				scan++;
			// Handle the in-tag \0 end-of-inbuf possibility.
			if( *scan == '\0' )
			{
				if( scan - start > 1000 )
				{
					// Bypass giant incomplete tag as unsafe data.
					// If it is not a comment, output it to alert.
					if( MarkUpEnum == mu_Comment )
					{
						InsideComment = 1; // simplified global bit
					}
					else
					{
						ProcessAsText( start, scan ); // This is No. 5 of 5 callers
					}
				}
				else
				{
					// From *start to *scan-1 inclusive must be
					// copied up, possibly overlapping, to just
					// below the lowest tail byte, which starts
					// currently with ( inbuf+4 ) [FinalSpace+1].
					do ( inbuf+4 ) [FinalSpace--] = *--scan;
					while ( start < scan );
				}
				break;
			}
			// Nothing matters in markup mode except >.
			// I did code to honor single and double quotes during markup,
			// but bad quoting causes loss of content. Now I take first >.
			if( *scan != '\2' ) // My signal for >
			{
				*scan = '<'; // convert \1 back into <.
				goto inner2continue;
			}
			*scan = '>'; // convert \2 back into >.

			// Process this just slightly validated markup tag.
			// Start is sitting on <, and scan is sitting on >.
			int mutt = MarkupTagTypes [MarkUpEnum];

			if( mutt & 1 )
			{
				// 1 = This tag separates words, in case no other whitespace.
				TagDelimitsWords = 1;
			}
			if( mutt & 2 )
			{
				// 2 = This tag separates concepts ( sentences or paragraphs ).
				// There may have been some concept being output.
				if( SavedText > 0 )
				{
					UnfinishedToken [SavedText] = '\0';
					ProcessToken( UnfinishedToken );
					SavedText = 0;
				}
				if( OutputCol > 0 )
				{
					fputc( '\n', ObjectFile );
					fputc( '\n', ObjectFile );
					OutputCol = 0;
				}
				SentenceRecognition = 0; // Therefore nothing in progress
			}
			if( mutt & 4 )
			{
				// 4 = This tag may require URL processing.
				// My interest is ( A, AREA, BASE, FRAME ).
				if( start [1] != '/' )
				{
					// As there was no "/", we are processing starting tags.
					LastUrlPtrIsAQueryMatch = 0; // Starting <A> resets clue
					if( ProcessURLTags( start, scan, MarkUpEnum ) != -1 )
					{
						// A return value 0 implied a nice, absolute URL.
						// That set me up to call MergeNoteAtLastUrlPtrSlot.
						if( MarkUpEnum == mu_a )
						{
							// This was specifically the <A> starting tag.
							// Start capturing text between <A> ... </A>,
							// which I may record as a note for this URL.
							SavingAnchorText = 1;
							// If by chance a prior <A> had no </A>, it's
							// no problem, I merely reset the index here
							// and will overwrite text already collected.
							// Prefix an "*" to the anchor test which may
							// be annotated as the first note of this URL,
							// to keep surf from fetching every novel URL.
							AnchorTextBuffer [0] = '*';
							AnchorTextBufIndex = 1;
							// Now, while I am processing a <A HREF=URL>,
							// promote any clues from preceeding clear text
							// into asterisk-removing actions at this URL.
							// I can't remove '*' now, just set a flag so </A>
							// will do it, while also examining existing notes.
							if( MatchingMethod > 0 ) // Current page is a known search engine result.
							{
								// MatchingMethod == 3 // Clue: "100%" before the <A>
								// MatchingMethod == 5 // Clue: "1." before the <A>
								if( TokenNumber < LastNumWasAtTokenNumber + 8 )
								{
									LastUrlPtrIsAQueryMatch = 1; // due to 100%
									// Disarm clue, lest I see translate, etc.
									LastNumWasAtTokenNumber = -1000;
								}
								// MatchingMethod == 8 // Clue: <DT> before the <A>
								if( TokenNumber < LastDtTagWasAtTokenNumber + 8 )
								{
									LastUrlPtrIsAQueryMatch = 1; // due to <DT>
									// Disarm clue, lest I see translate, etc.
									LastDtTagWasAtTokenNumber = -1000;
								}
								// MatchingMethod == 9 // Clue: <LI> before the <A>
								if( TokenNumber < LastLiTagWasAtTokenNumber + 8 )
								{
									LastUrlPtrIsAQueryMatch = 1; // due to <LI>
									// Disarm clue, lest I see translate, etc.
									LastLiTagWasAtTokenNumber = -1000;
								}
							}
						}
						else if( MarkUpEnum == mu_frame )
						{
							// As <FRAME> tags do not have any </FRAME> tag,
							// I cannot save some text as with <A> ... </A>.
							// This is a good bold new step, 11/98:
							// Always remove the asterisk from any
							// <FRAME> url, so SURF auto-fetches it
							char * Item = pSortedUrlPtrs[LastUrlPtrSlot].pNote;
							if( Item != ( char* )0 )
							{
								// On 3/99, I did this to override old notes:
								// As there is already an existing note, just
								// remove any asterisk, but don't touch a '#'.
								if( *( Item + sizeof( char* ) ) == '*' )
									*( Item + sizeof( char* ) ) = ' ';
							}
							else
							{
								// Add " A frame of: <TitleTextBufferContents>"
								char Annotation [40 + sizeof( TitleTextBuffer )];
								strcpy( Annotation, " A frame of: " );
								strcat( Annotation, TitleTextBuffer );
								MergeNoteAtLastUrlPtrSlot( Annotation ); // For Frame
							}

						}
						else if( MarkUpEnum == mu_area )
						{
							// As <AREA> tags do not have any </AREA> tag,
							// I cannot save some text as with <A> ... </A>.
							// If no other notes were recorded for this URL,
							// add "* An area of: <TitleTextBufferContents>"
							if( pSortedUrlPtrs[LastUrlPtrSlot].pNote == ( char* )0 )
							{
								char Annotation [40 + sizeof( TitleTextBuffer )];
								// Oops... SURF really goes far without the *'s!
								if( MarkUpEnum == mu_area )
								{
									strcpy( Annotation, "* An area of: " );
									strcat( Annotation, TitleTextBuffer );
									MergeNoteAtLastUrlPtrSlot( Annotation ); // For Area
								}
							}
						}
					}
				}
				else
				{
					// As there was a "/", we are processing ending tags.
					// Ending any mutt = 4 tag
					// Syntactically, /area, /frame, or /base could exist,
					// but only the </A> end tag makes sense in HTML text.
					// So, let's take SavingAnchorText to know it is </A>.
					if( SavingAnchorText )
					{
						// There may have been some concept being output.
						// In particular, get the final word before </A>.
						if( SavedText > 0 )
						{
							UnfinishedToken [SavedText] = '\0';
							ProcessToken( UnfinishedToken );
							SavedText = 0;
						}
						// End saving of text to annotate an anchor URL.
						SavingAnchorText = 0;
						if( ! DiscardingTextNow )
						{
							// Insert " </A>" right in the text,
							// sort of like ProcessToken would.
							if( OutputCol + 5 > 60 )
							{
								fputc( '\n', ObjectFile );
								OutputCol = 0;
							}
							fputs( " </a>", ObjectFile );
							OutputCol += 5;
						}
						// I already processed UnfinishedToken above,
						// so the entire <A>...</A> anchor text has
						// been collected. Use text to annotate URL.
						AnchorTextBuffer[AnchorTextBufIndex] = '\0';
						if( AnchorTextBufIndex > 0 )
						{
							// Now the decision to annotate that last URL
							// with the <A>...</A> text shall be that the
							// URL either has no note, or has a note that
							// is shorter than the currently created note.
							// Examine the item holding the first note.
							char* Item = pSortedUrlPtrs[LastUrlPtrSlot].pNote;
							if( Item == ( char* )0 )
							{
								// This URL in <A> has never had a note yet.
								MergeNoteAtLastUrlPtrSlot( AnchorTextBuffer );
							}
							else if( *( char** )Item == ( char* )0
							&& *( Item + sizeof( char* ) ) == '*'
							&& strlen( Item + sizeof( char* ) )
							< ( unsigned ) AnchorTextBufIndex )
							{
								// URL has only one note, and it is shorter.
								// Also, that note like mine has a '*' mark.
								// First discard the older lesser note item.
								free( Item );
								pSortedUrlPtrs[LastUrlPtrSlot].pNote = ( char* )0;
								MergeNoteAtLastUrlPtrSlot( AnchorTextBuffer ); // First note = revised anchor text
							}
							// Now, after annotating the last URL, let's see if some
							// clues apply to have me remove a first note's asterisk.
							if( LastUrlPtrIsAQueryMatch // Implies MatchingMethod > 0
							&& LastUrlPtrSlot != -1 ) // be sure there was a prior URL
							{
								// Current page is a search engine result,
								// and this <A> URL locates a target page.
								// I know the following ItemB != ( char* )0
								char * ItemB = pSortedUrlPtrs[LastUrlPtrSlot].pNote;
								if( *( ItemB + sizeof( char* ) ) == '*' && Uninhibited )
									*( ItemB + sizeof( char* ) ) = ' ';
							}
							LastUrlPtrIsAQueryMatch = 0; // disarm my trigger
						}
						// There ended the if( AnchorTextBufIndex > 0
					}
					// There ended the if ( SavingAnchorText
				}
				// There ended the </...> as for </A> interest
			}
			// There ended the mutt & 4 interest ( A, AREA, BASE, FRAME ).
			if( mutt & 8 )
			{
				// 8 = TAG /TAG delimit content to discard.
				// Interest is ( APPLET, OBJECT, SCRIPT, STYLE ).
				if( start [1] == '/' )
				{
					// End the same type of discard block.
					if( DiscardingDueToTag == MarkUpEnum )
					{
						DiscardingDueToTag = 0;
						DiscardingTextNow = DiscardingAllText;
					}
				}
				else
				{
					// Start an outermost discard block.
					if( DiscardingDueToTag == 0 )
					{
						DiscardingDueToTag = MarkUpEnum;
						DiscardingTextNow = 1;
						// There may have been some concept being output.
						// That's easily handled: make sure each 8 has 2.
					}
				}
			}
			if( mutt & 16 )
			{
				// 16 = This tag may imply other processing.
				// Interest is ( TITLE, /TITLE ).
				if( MarkUpEnum == mu_title )
				{
					if( start[1] == '/' )
					{
						// Ending </Title>.
						if( SavingTitleText )
						{
							SavingTitleText = 0;
							if( ! DiscardingAllText
							&& TitleTextBufIndex > 0 )
							{
								TitleTextBuffer[TitleTextBufIndex] = '\0';
								TitleTextSaved = 1;
								EndAnyOuputParagraph( );
								fprintf( ObjectFile, "<TITLE> %s </TITLE>\n", TitleTextBuffer );
								if( ! EverOutput
								&& BaseTextSaved
								&& TitleTextSaved )
								{
									EverOutput = 1;
									fprintf( ObjectFile, "<PRE><A HREF = \"%s\"> %s </A>\n", BaseUrlText, TitleTextBuffer );
								}
							}
						}
					}
					else
					{
						// Starting <Title>.
						SavingTitleText = 1;
						TitleTextBufIndex = 0;
					}
				}
			}
			if( mutt & 32
			&& MatchingMethod == 9 // Clue: <LI> before the <A>
			&& start[1] != '/' ) // Exclude DMOZ's incorrect <LI>...</LI> usage
			{
				// 32 = The LIST ITEM tag, "<LI>"
				// Used to recognize some query match target URLS
				LastLiTagWasAtTokenNumber = TokenNumber;
				// Since I would not have -A perform less well than -F,
				// write an HTML <LI> tag, just on search result pages.
				if( ! DiscardingTextNow )
				{
					EndAnyOuputParagraph( );
					fprintf( ObjectFile, "\t<LI>\n" );
				}
			}
			if( mutt & 64
			&& MatchingMethod == 8 // Clue: <DT> before the <A>
			&& start[1] != '/' ) // Exclude any incorrect <DT>...</DT> usage
			{
				// 64 = The LIST ITEM tag, "<DT>"
				// Used to recognize some query match target URLS
				LastDtTagWasAtTokenNumber = TokenNumber;
				// Since I would not have -A perform less well than -F,
				// write an HTML <DT> tag, just on search result pages.
				if( ! DiscardingTextNow )
				{
					EndAnyOuputParagraph( );
					fprintf( ObjectFile, "\t<DT>\n" );
				}
			}
			MarkUpEnum = 0; // Having finished with this tag

			// Resume text mode loop, searching for next < startup.
			// Put scan==start just past > ending the valid markup.
			scan++;
			start = scan;
		} // for...
	}
	// That leaves ( inbuf+4 ) [FinalSpace+1] to ( inbuf+4 ) [nbuffered-1] undone.
	( inbuf+4 ) [nbuffered] = '\0'; // for this printf
	{
		int noffset = nbuffered - 1 - FinalSpace;
		// Non-overlapping copy of leftovers to start of ( inbuf+4 ).
		if( noffset > 0 )
			memcpy( ( inbuf+4 ), ( inbuf+4 ) + FinalSpace + 1, noffset );
		( inbuf+4 ) [noffset] = '\0'; // for this printf
		return noffset;
	}
}

void ReadScriptFileToUrlList( )
{
	// Purpose: import the existing list file, ScriptFilename, line by line.
	//
	// Each non-empty line is classified by its first character:
	//   alphanumeric  - treated as a URL: parsed with GetLinkUrlParts and,
	//                   if LinkUrlPartsIsAcceptable, added to the URL list;
	//   '@'           - a user naming-list line, appended (without the '@')
	//                   to the tail of the NamingList linked list;
	//   anything else - leading blanks and tabs are stripped and the rest,
	//                   if any, is merged as a note at LastUrlPtrSlot.
	// An empty line sets LastUrlPtrSlot to -1, so that following note lines
	// are not attached to the earlier URL.
	//
	// Lines may end in CR, LF, or CR+LF. The file is read in 4096-byte
	// blocks into the global inbuf; a line cut off by the end of a block
	// is held over and completed by the next block.
	//
	// Side effects: sets ScriptFileByteCount to the number of bytes read,
	// grows NamingListWorstCaseSize for each '@' line stored.
	// Any open, read, close, or malloc failure prints to stderr and exits.
	int hfi = _open( ScriptFilename, _O_BINARY|_O_RDONLY, _S_IREAD|_S_IWRITE );
	Flow( 12151 );
	if( hfi == -1 )
	{
		fprintf( stderr, "SURF: cannot open file %s: %s.\n", ScriptFilename, strerror( errno ) );
		exit( 1 );
	}
	// One of my favorite file input methods is to read whole
	// next disk block just past the tail of unprocessed data.
	// It would be even better if I can achieve int alignment.
	int nread;
	int nSkipAFew = 0;	// Pad bytes at inbuf[0..] before the held-over line
	int nBuffered = 0;	// Bytes in inbuf, counting the pad bytes
	ScriptFileByteCount = 0;
	// --Outer loop--
	// The outer processes disk blocks until end of file.
	for( ;; )
	{
		nread = _read( hfi, inbuf + nBuffered, 4096 );
		// EOF gives 0; Errors -1.
		if( nread < 0 )
		{
			fprintf( stderr, "SURF: error reading file %s: %s.\n", ScriptFilename, strerror( errno ) );
			exit( 1 );
		}
		nBuffered += nread;
		ScriptFileByteCount += nread;
		// Process text from inbuf [nSkipAFew] to inbuf [nBuffered-1]
		if( nBuffered <= nSkipAFew )
		{
			// At EOF with nothing held over
			break;
		}
		{
			char *start = inbuf + nSkipAFew;
			char *stop = inbuf + nBuffered;
			char *scan = start;
			// The sentinels let the inner loop run without a bounds test:
			// it always stops on the '\r' at stop, and the 'Z' after it
			// keeps that '\r' from being taken as half of a CR+LF pair.
			stop[0] = '\r'; // Install loop sentinel 1
			stop[1] = 'Z'; // Install loop sentinel 2
			// --Middle  loop--
			// The middle loop processes this buffer.
			for( ;; )
			{
				// --Inner loop--
				// The inner loop looks for a newline.
				if( *scan != '\r' && *scan != '\n' )
				{
					do
					{
						scan++;
					} // do...
					while( *scan != '\r' && *scan != '\n' );
				}
				else
				{
					// For any empty line, stop adding to last URL.
					// Otherwise, my SURF legends add to prior URL.
					LastUrlPtrSlot = -1;
				}

				// Check for sentinel. If reached end of disk block,
				// this line ( start to scan-1 ) is likely incomplete.
				// If incomplete, some safe size, and !EOF, keep it.
				// I will accept CR alone or LF alone except at the
				// end of the disk block where only CR+LF is enough.
				// NOTE(review): 1204 may have been meant as 1024 - confirm.

				if( scan >= stop - 1				// Postpone for single CR or LF too.
				&& scan - start < 1204			// Dont postpone if dangerously big.
				&& nread != 0 )					// Dont postpone if now end of file.
				{
					// Postpone its processing until the next disk block.
					// Choose a pad of 0..3 bytes so that pad + line length
					// is a multiple of sizeof( int ); the next _read then
					// lands on an int-aligned offset within inbuf.
					int nHeldOver = ( int )( stop - start );
					nSkipAFew = ( int )( ( start - stop ) & ( sizeof( int ) - 1 ) );
					nBuffered = nSkipAFew + nHeldOver;
					if( nHeldOver > 0 )
					{
						// Move only the held-over line, to just past the pad.
						// The regions can overlap (in either direction), so
						// memmove is required; memcpy would be undefined.
						// Copying from "stop - nBuffered" as before could also
						// read a few bytes in front of inbuf when the line
						// already began at inbuf; the pad bytes are never
						// examined, so they need not be copied at all.
						memmove( inbuf + nSkipAFew, start, nHeldOver );
					}
					// Break out of the middle loop processing this buffer.
					break;
				}
				// This is where to ProcessLineOfText( start, scan );
				if( start < scan )
				{
					char c = *scan; // Save the newline char
					*scan = '\0'; // terminate the entire line
					// The <ctype.h> functions are only defined for EOF and
					// unsigned char values, so convert before classifying.
					if( isalnum( ( unsigned char ) *start ) )
					{
						// Treat this line as a URL
						// This parse has BaseUrlParts full of all zeros.
						if( GetLinkUrlParts( start, scan ) == 0
						&& LinkUrlPartsIsAcceptable( ) ) // This is a Scriptfile URL
						{
							// Here, don't worry if the prior URL had no notes
							AddLinkUrlPartsToUrlList( );
						}
					}
					else if( start [0] == '@' ) // Only in first input column
					{
						// Save user's lines of LIST starting with a "@",
						// in order, by appending to tail of linked list.
						// Each item is a next-pointer followed directly
						// by the asciz text of the line.
						char * TextData = start + 1; // don't store '@'
						int n = sizeof( void** ) + strlen( TextData ) + 1;
						char* item = ( char* )malloc( n );
						char* itemtext = item + sizeof( void** );
						if( item == ( char* )0 )
						{
							fprintf( stderr, "SURF: Malloc failure saving '@' lines.\n" );
							exit( 1 );
						}
						NamingListWorstCaseSize += n; // loosey goosey
						strcpy( itemtext, TextData );
						// Hook this item to old tail item, or to head if none.
						*NamingListTail = ( void** )item;
						// Revise the tail pointer to new tail, i.e., this item.
						NamingListTail = ( void** )item;
						// Place a null pointer in new tail, to terminate list.
						*NamingListTail = ( void** )0;
					}
					else
					{
						// Strip any type of leading whitespaces.
						while( *start == ' ' || *start == '\t' )
							start++;
						// Treat this line as a note continuation line;
						// but only if there were some non-whitespaces.
						if( LastUrlPtrSlot != -1 // be sure there was a prior URL
						&& start < scan )
						{
							// The LastUrlPtrSlot was set by AddLinkUrlPartsToUrlList:
							MergeNoteAtLastUrlPtrSlot( start ); // During input of LIST file
						}
					}
					*scan = c; // Repair the newline char
				}
				// Jump over the CR or LF newline byte
				scan++;
				// Jump by two in order to count CR+LF as one newline.
				if( scan[-1] == '\r' && scan[0] == '\n' )
					scan++;
				// check for sentinel
				if( scan >= stop )
				{
					// The whole buffer was consumed; nothing is held over.
					nSkipAFew = 0;
					nBuffered = 0;
					// Break out of the middle loop processing this buffer.
					break;
				}
				start = scan;
			} // for...
		}
		if( nread == 0 )
		{
			// At EOF
			break;
		}
	} // for...
	if( _close( hfi ) == -1 )
	{
		fprintf( stderr, "SURF: error closing file %s: %s.\n", ScriptFilename, strerror( errno ) );
		exit( 1 );
	}
	return;
}

void SplitOutTheFetchedUrls( )
{
	// Purpose: move every already-fetched URL (one whose first note starts
	// with '#') out of pSortedUrlPtrs into the new array pResortedUrlPtrs,
	// then classify each moved URL against the user's NamingList words.
	//
	// This was the top half of WriteUrlListToScriptFile( ),
	// but I split it off for old -M effort to rename files.
	//
	// Phase 1 orders the moved items by descending Quality, a weighted sum
	// of the three characters that follow the '#'. Items of equal Quality
	// keep their original relative order. Moved slots in pSortedUrlPtrs
	// have pUrl and pNote nulled; the strings now belong to the new array.
	//
	// Phase 2 tallies the words of notes 3 and later into the vocabulary,
	// and overwrites .Quality with the lowest NamingListPtrs word index
	// matched by any of those words (NamingListTermsCount if none match).
	//
	// pResortedUrlPtrs and nResortedUrlPtrs are released and cleared later
	// in WriteUrlListToScriptFile( ). A malloc failure prints and exits.
	int i, n;
	Flow( 16496 );
	// Warning: nSortedUrlPtrs could be 0, so allocate one spare element;
	// the request is then never for zero bytes.
	pResortedUrlPtrs = ( struct UrlPtrsX * ) malloc (
		sizeof( struct UrlPtrsX ) * ( nSortedUrlPtrs + 1 ) ); // Worst case
	nResortedUrlPtrs = 0; // To count items relinked.
	if( pResortedUrlPtrs == ( struct UrlPtrsX * ) 0 )
	{
		fprintf( stderr, "SURF: Malloc failure outputing URL list.\n" );
		exit( 1 );
	}
	// Process original list to transfer all the visited ( '#' ) links
	// to a new list, where they are resorted by some other criteria.
	n = 0;
	while( n < nSortedUrlPtrs )
	{
		// This loop examines item: pSortedUrlPtrs[n].
		// Examine the first note for this url, if any.
		// If its first char is '#', move to new list.
		// A note item is a next-pointer followed by the asciz note text.
		char * ItemNote1 = pSortedUrlPtrs[n].pNote;
		if( ItemNote1 != ( char* )0
		&& *( ItemNote1 + sizeof( char* ) ) == '#' )
		{
			// This insertion sort is ordered by the 3 digits that
			// follow the '#' in first note: ( Ideas, Links, Words ).
			char *itemkey = ItemNote1 + sizeof( char* ) + 1;
			// Jan 3, 1999: Changed to an unequal weighting of these digits.
			// Stop at the terminator, so that a hand-edited note with
			// fewer than 3 characters after '#' is never read beyond its
			// end. A full-length key gives the same sum as before.
			int Quality = 0;
			int k;
			if( itemkey[0] != '\0' )
			{
				Quality = 9 * itemkey[0];
				if( itemkey[1] != '\0' )
				{
					// itemkey[2] is at worst the terminator, adding 0.
					Quality += 7 * itemkey[1] + 5 * itemkey[2];
				}
			}

			// Find the first slot holding a smaller Quality; the item
			// goes there. With no such slot, i ends at the list's end.
			i = 0;
			while( i < nResortedUrlPtrs
			&& pResortedUrlPtrs[i].Quality >= Quality )
			{
				i++;
			}
			// Slide up [i] and all above to make room.
			// Array holds the structs, not pointers to structs,
			// so copy the three members.
			for( k = nResortedUrlPtrs; k > i; k-- )
			{
				pResortedUrlPtrs[k].Quality = pResortedUrlPtrs[k-1].Quality;
				pResortedUrlPtrs[k].pUrl = pResortedUrlPtrs[k-1].pUrl;
				pResortedUrlPtrs[k].pNote = pResortedUrlPtrs[k-1].pNote;
			}
			// Put this outer loop's item at the vacated ( or end ) slot.
			pResortedUrlPtrs[i].Quality = Quality;
			pResortedUrlPtrs[i].pUrl = pSortedUrlPtrs[n].pUrl;
			pResortedUrlPtrs[i].pNote = pSortedUrlPtrs[n].pNote;
			nResortedUrlPtrs++;

			// Nullify the moved item out of original list.
			pSortedUrlPtrs[n].pUrl = ( char* )0;
			pSortedUrlPtrs[n].pNote = ( char* )0;
		}
		n++;
	}

	// Rather than disrupt that work with a bunch of linked lists to
	// keep separate lists of files containing a NamingListPtrs word,
	// Put into .Quality the lowest index to any equal NamingListPtrs word.

	// Process the new list to re-catalog the most-used words,
	// for a most-used-of-the-most-used-words in LIST and HTM.
	i = 0;
	while( i < nResortedUrlPtrs )
	{
		char * ItemNote = pResortedUrlPtrs[i].pNote;
		int NoteCounter = 0;
		int BestMatch = NamingListTermsCount; // Meaning: no term matched
		while( ItemNote != ( char* )0 )
		{
			// This is the increment operation to do all items in a linked list.
			char * Next = *( char** )ItemNote;
			// Do not tally first or second notes,
			// which are filename and title lines.
			NoteCounter++;
			if( NoteCounter > 2 )
			{
				char * start = ItemNote + sizeof( char* );
				char * past = start;
				// Scan the asciz record to tally each word in it.
				// A word ends at any char <= ' ', or at the terminator.
				while ( *past != '\0' )
				{
					past++;
					if( *past <= ' ' )
					{
						// There should never be zero length cases,
						// but I see fit to defend my code from it.
						// For example, user may have edited lines.
						// In fact, to defend myself from a random
						// file being used as a list file, such as
						// some grep output that I mistakenly used,
						// skip any words not starting with alphas.
						// The <ctype.h> functions need an unsigned
						// char value, hence the cast.
						if( past > start + 1
						&& isalpha ( ( unsigned char ) *start ) )
						{
							AddWordToVocabulary( start, past );
							// Only terms with an index below the best
							// so far can still improve on it.
							if( BestMatch > 0 )
							{
								int j = 0;
								do
								{
									// NamingListPtrs [j+j] is term j's word.
									char * from = NamingListPtrs [j+j];
									char * into = start;
									if( *from == *into )
									{
										do
										{
											from++; into++;
										} // do...
										while( *from == *into && *from != '\0' );
										// Now test comparison loop outcome:
										// the term matches if it reached a
										// '*' ( prefix wildcard ), or if it
										// ended exactly where the word ends.
										if( *from == '*'
										|| ( *from == '\0' && into == past ) )
										{
											BestMatch = j;
											break;
										}
									}
								} // do...
								while( ++j < BestMatch );
							}
						}
						start = past + 1;
					}
				}
			}
			ItemNote = Next;
		}
		// Having run through all the notes for this URL,
		// put in .Quality the index of earliest matching term.
		pResortedUrlPtrs[i].Quality = BestMatch;
		i++;
	}
	return;
}

void AnalysisToRenameLocalFile( )
{
	// Pick a destination path for the page just handled and rename
	// LocalFilename to "<path><ProposedFileBaseName>.htm".
	//
	// The naming list holds pairs: NamingListPtrs [2t] is term t's word and
	// NamingListPtrs [2t+1] is its path. Consecutive terms that share one
	// path pointer form a group; the group whose words total the most
	// occurrences in the page vocabulary supplies the path. If no group
	// scores, or the base URL has a query part, DefaultNamingPath is used
	// when there is one. With neither a scoring group nor a default path,
	// nothing is renamed.
	//
	// If the target name already exists, a numeric suffix 1, 2, 3... is
	// tried, written so that the name part stays within 8 characters.
	// A failed rename is reported on stdout and is not fatal.
	char *groupPath = 0;	// Path shared by the group now being summed
	int groupHits = 0;		// Occurrences counted for that group so far
	char *winnerPath = 0;	// Path of the best group seen
	int winnerHits = 0;		// Its total; 0 means no group has scored
	int term;

	Flow( 13809 );

	// Only a page with some uncommon vocabulary can score any group.
	if( nSortedVocabulary > 0 )
	{
		for( term = 0; term < NamingListTermsCount; term++ )
		{
			char *wordOfTerm = NamingListPtrs [term + term];
			char *pathOfTerm = NamingListPtrs [term + term + 1];
			int hits;

			// A change of path pointer opens a new group.
			if( term == 0
			|| pathOfTerm != groupPath )
			{
				groupPath = pathOfTerm;
				groupHits = 0;
			}

			hits = QuantityOfWordInVocabulary( wordOfTerm );
			groupHits += hits;

			// The group closes at the last term, or when the next
			// term carries a different path. Then judge its total.
			if( term + 1 == NamingListTermsCount
			|| NamingListPtrs [term + term + 3] != groupPath )
			{
				if( groupHits > winnerHits )
				{
					winnerHits = groupHits;
					winnerPath = groupPath;
				}
			}
		}
	}

	// Without a scoring path or a default path there is nowhere to move to.
	if( winnerHits <= 0
	&& DefaultNamingPath == ( char* )0 )
	{
		return;
	}

	{
		char target [260];
		char *dst = target;
		char *src;
		char *nameStart;

		// Rule of Jun 4 1999: any URL with a query part ( search engine
		// results, web rings, etc. ) goes to the default path if one exists.
		if( BaseURLHasAQueryPart
		&& DefaultNamingPath != ( char* )0 )
		{
			src = DefaultNamingPath;
		}
		else if( winnerHits > 0 )
		{
			src = winnerPath;
		}
		else
		{
			src = DefaultNamingPath;
		}

		// Copy the path, keeping 16 bytes in reserve for the file name.
		while( ( *dst = *src ) != '\0' )
		{
			if( dst >= target + sizeof( target ) - 1 - 16 )
				break;
			src++;
			dst++;
		}

		// Append the proposed base name, keeping 8 bytes in reserve.
		// dst is left on the terminator ( or on the cut-off position ),
		// which is where the extension or a numeric suffix is written.
		nameStart = dst;
		src = ProposedFileBaseName;
		while( ( *dst = *src ) != '\0' )
		{
			if( dst >= target + sizeof( target ) - 1 - 8 )
				break;
			src++;
			dst++;
		}
		strcpy( dst, ".htm" );

		if( _access( target, 00 ) == 0 ) // Desired file name already exists
		{
			// Vary the name with a numeric suffix until it is unused.
			int attempt;
			for( attempt = 1; ; attempt++ )
			{
				// width is where the suffix ends, relative to dst. It is
				// the digit count, cut back ( possibly below zero ) so the
				// suffix overwrites the tail of a name that would otherwise
				// exceed 8 characters.
				int room = 8 - ( int )( dst - nameStart );
				int width;
				int rest;
				int pos;

				if( attempt < 10 )
					width = 1;
				else if( attempt < 100 )
					width = 2;
				else if( attempt < 1000 )
					width = 3;
				else
					width = 4;
				if( width > room )
					width = room;

				// Write up to four decimal digits, lowest digit last.
				rest = attempt;
				pos = width - 1;
				while( rest > 0 && pos >= width - 4 )
				{
					dst [pos] = '0' + rest % 10;
					rest /= 10;
					pos--;
				}

				if( attempt >= 10000 )
				{
					// Give up rather than loop forever on one name.
					fprintf( stderr, "Error: 10000 same filenames.\n" );
					exit( 1 );
				}

				strcpy( dst + width, ".htm" );
				if( _access( target, 00 ) != 0 )
					break;
			}
		}

		if( rename( LocalFilename, target ) != 0 )
		{
			// Write -R errors to stdout, so user can collect them using ">"
			// However, do not exit( ).
			fprintf( stdout, "SURF: cannot rename %s to %s: %s.\n", LocalFilename, target, strerror( errno ) );
		}
	}
	return;
}


void WriteUrlListToScriptFile( )
{
	// Purpose: rewrite the list file, ScriptFilename, from the in-memory
	// URL data, and free that data as it is written.
	// When ProgramIsDoingWrite is set, an HTML rendering of the fetched
	// URLs is also written to stdout.
	//
	// This is always preceded by a call to SplitOutTheFetchedUrls( ).
	// Output the URLs and notes to LIST and HTM for -W, and free all.
	//
	// Order of the list file output:
	//   1. introductory text, with a line of frequent words;
	//   2. the user's '@' naming-list lines, if any;
	//   3. "----LOCAL FILES----": the fetched URLs from pResortedUrlPtrs,
	//      grouped by the naming-list path their .Quality index refers to,
	//      with a final "All The Rest" group;
	//   4. "----NOVEL LINKS----": the URLs still left in pSortedUrlPtrs.
	//
	// On return pResortedUrlPtrs and pSortedUrlPtrs are freed and nulled,
	// and their counts are zero. The NamingList itself is not freed here.
	// A failure to create or close the list file prints and exits.
	int i, j, jordinal;
	FILE * fs = fopen( ScriptFilename, "wb" );
	Flow( 11702 );
	if( fs == ( FILE* )0 )
	{
		fprintf( stderr, "SURF: cannot create %s: %s.\n", ScriptFilename, strerror( errno ) );
		exit( 1 );
	}
	// Start with introductory text, and for HTML a nice file header.
	{
		// Get one line of most-used-of-the-most-used-words to adorn files.
		char FrequentWords [120]; // As big as OutputTopOfVocabulary's WorkBuffer
		FrequentWords [0] = '\0';
		if( nSortedVocabulary > 0 ) // I must guarantee nSortedVocabulary > 0
		{
			// Copy here 1 line of the most-used words, about 60 chars,
			// to describe files to annotate the LIST and -W HTML file.
			ResortVocabularyByFrequency( ); // Into a separate vector
			OutputTopOfVocabulary( FrequentWords ); // Passing a Non-NULL pointer
			FreeUpTheVocabulary( ); // This call is for -W, not for an HTML file.
		}

		// Prefix some introductory text to the URL LIST file.
		// Every line of it starts with a space.
		fprintf( fs,
		" SURF URL file: '%s'\r\n"
		" %s\r\n"
		"\r\n"
		" # prevents re-downloading web pages that you already have.\r\n"
		" * prevents downloading novel links. Delete * to SURF them.\r\n"
		, ScriptFilename, FrequentWords );

		// Prefix an HTML header if also creating that output:
		if( ProgramIsDoingWrite )
		{
			// Make a TITLE: "1234 Surf links: topic topic topic...".
			// Make a H1 line: "1234 Surf links".
			// Make a H2 line: "topic topic topic...".
			// I use binary mode for stdout: then I can put \r\n uniformly.
			fprintf( stdout,
			"<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\r\n"
			"<HTML>\r\n"
			"<HEAD>\r\n"
			"<TITLE>%d SURF links: %s</TITLE>\r\n"
			"</HEAD>\r\n"
			"<BODY>\r\n"
			"<H1>%d SURF links</H1>\r\n"
			"<H2>%s</H2>\r\n"
			" \r\n", nResortedUrlPtrs, FrequentWords, nResortedUrlPtrs, FrequentWords );
		}
	}

	// If there were any, output list of user controls to relocate files.
	// Each list item is a next-pointer followed directly by its text;
	// the '@' that was dropped when the line was stored is put back here.
	if( NamingListHead != ( void** )0 )
	{
		void** Run = NamingListHead;
		fprintf( fs, "\r\n" );
		do // I proved above that ( Run != ( void** )0 )
		{
			char * TextData = ( char* )Run + sizeof( void** );
			fprintf( fs, "@%s\r\n", TextData );
			{
				// This is the increment operation to do all items in a linked list.
				void** next = ( void** )*Run;
				// No, I'm NOT the last user of NamingList... free( Run );
				Run = next;
			}
		} // do...
		while ( Run != ( void** )0 );
	}

	// This marker helps automate stream edits, and helps user understand.
	fprintf( fs, "\r\n\t----LOCAL FILES----\r\n" );

	// Process the new list to output all the old ( '#' ) links.
	// Outer loop, j, will re-run the new list multiple times
	// to collect by order of the .Quality keys from the NamingList.
	// URLs whose most-used-words show no matches bear .Quality max.

	// But first, clone a piece of that loop to prepare local
	// anchors into local anchors in the HTML output version:
	// <UL>
	// <LI><A HREF="#keyword">keyword</A>
	// </UL>
	if( ProgramIsDoingWrite
	&& IHaveFileRenamingData )
	{
		// Add a list of in-page anchors for the HTML.
		// Process from 0 to NamingListTermsCount - 1.
		// Then do one more loop to do "All The Rest".
		jordinal = 0;
		j = 0;
		fprintf( stdout, "<BR>Selections<UL>\r\n" );
		while( j <= NamingListTermsCount )
		{
			// Rather than index J one by one through all the
			// categorization words, group together all those
			// words that point to the same file storage path.
			// The group is terms jmin..jmax; NamingListPtrs [t+t] is
			// term t's word and NamingListPtrs [t+t+1] is its path.
			int jmin = j;
			int jmax = j;
			do
			{
				j++; // Forcing final outer iteration to stop
			} // do...
			while( j < NamingListTermsCount // protecting next access
				&& NamingListPtrs [j+j+1]
				== NamingListPtrs [jmin+jmin+1] );
			jmax = j - 1;

			// The anchor name is the group's words run together.
			fprintf( stdout, "<LI><A HREF=\"#" );
			if( jmin == NamingListTermsCount )
			{
				fprintf( stdout, "AllTheRest" );
			}
			else
			{
				int k = jmin;
				do
				{
					fprintf( stdout, "%s", NamingListPtrs [k+k] );
				} // do...
				while( ++k <= jmax );
			}
			// The visible link text is the ordinal, then the words spaced apart.
			fprintf( stdout, "\">( %d )", ++jordinal );
			if( jmin == NamingListTermsCount )
			{
				fprintf( stdout, " All The Rest" );
			}
			else
			{
				int k = jmin;
				do
				{
					fprintf( stdout, " %s", NamingListPtrs [k+k] );
				} // do...
				while( ++k <= jmax );
			}
			fprintf( stdout, "</A>\r\n" );
		}
		fprintf( stdout, "</UL>\r\n" );
	}

	// Finally, the promised loop to output fetched URLs.
	j = 0;
	jordinal = 0;
	while( j <= NamingListTermsCount )
	{
		// Process from 0 to NamingListTermsCount - 1.
		// Then do one more loop to do "All The Rest".
		// Rather than index J one by one through all the
		// categorization words, group together all those
		// words that point to the same file storage path.
		int jmin = j;
		int jmax = j;
		do
		{
			j++; // Forcing final outer iteration to stop
		} // do...
		while( j < NamingListTermsCount // protecting next access
			&& NamingListPtrs [j+j+1]
			== NamingListPtrs [jmin+jmin+1] );
		jmax = j - 1;

		if( NamingListTermsCount > 0 )
		{
			// If there is a NamingList, show terms in LIST file;
			fprintf( fs, "\r\n [%d]", ++jordinal );
			if( jmin == NamingListTermsCount )
			{
				fprintf( fs, " All The Rest" );
			}
			else
			{
				int k = jmin;
				do
				{
					fprintf( fs, " %s", NamingListPtrs [k+k] );
				} // do...
				while( ++k <= jmax );
			}
			fprintf( fs, "\r\n\r\n" );
			if( ProgramIsDoingWrite )
			{
				// Make in-page anchors for the HTML.
				// The name is built exactly as the HREF in the
				// Selections list above, so the two agree.
				fprintf( stdout, "\r\n<H2><A NAME=\"" );
				if( jmin == NamingListTermsCount )
				{
					fprintf( stdout, "AllTheRest" );
				}
				else
				{
					int k = jmin;
					do
					{
						fprintf( stdout, "%s", NamingListPtrs [k+k] );
					} // do...
					while( ++k <= jmax );
				}
				fprintf( stdout, "\">( %d )", jordinal );
				if( jmin == NamingListTermsCount )
				{
					fprintf( stdout, " All The Rest" );
				}
				else
				{
					int k = jmin;
					do
					{
						fprintf( stdout, " %s", NamingListPtrs [k+k] );
					} // do...
					while( ++k <= jmax );
				}
				fprintf( stdout, "</A></H2>\r\n" );
			}
		}
		// Output every fetched URL whose .Quality index falls in this
		// group and has not been output by an earlier group.
		i = 0;
		while( i < nResortedUrlPtrs )
		{
			// Cloned guts from the loop below.
			// Then I added HTML-to-stdout work.
			// Now the guts are conditional on J.
			if( pResortedUrlPtrs[i].Quality <= jmax )
			{
				// Mark as done with a value above any group index,
				// because pUrl and the notes are freed just below.
				pResortedUrlPtrs[i].Quality = 12345678; // Never come back here.
				fputs( pResortedUrlPtrs[i].pUrl, fs );
				fputc( '\r', fs );
				fputc( '\n', fs );
				if( ProgramIsDoingWrite )
				{
					// For HTML, prefix an ordinal to orient user;
					// Output the URL inside an Anchor <A...> tag.
					fprintf( stdout,
					"\r\n<p> %d. <a href =\"%s\">\r\n",
					i + 1, pResortedUrlPtrs[i].pUrl );
				}
				free( pResortedUrlPtrs[i].pUrl ); // I last user, so free it.
				// If there are notes for this url,
				// put each one on a new line starting with a space or tab char.
				{
					char * ItemNote = pResortedUrlPtrs[i].pNote;
					int NoteCounter = 0; // Only counted when writing HTML
					while( ItemNote != ( char* )0 )
					{
						// This is the increment operation to do all items in a linked list.
						// Fetch the link before the item is freed below.
						char * Next = *( char** )ItemNote;
						fputc( ' ', fs );
						fputs( ItemNote + sizeof( char* ), fs );
						fputc( '\r', fs );
						fputc( '\n', fs );
						if( ProgramIsDoingWrite )
						{
							NoteCounter++;
							// Skip the first note, such as "#329 32/9tn8it.htm"
							if( NoteCounter != 1 )
							{
								fputc( ' ', stdout );
								fputs( ItemNote + sizeof( char* ), stdout );
								// Add End-Anchor </A> tag after note #2 == title.
								if( NoteCounter == 2 )
									fputs( "</a><br>", stdout );
								fputc( '\r', stdout );
								fputc( '\n', stdout );
							}
						}
						free( ItemNote ); // I'm last user, so free it.
						ItemNote = Next;
					}
					if( ProgramIsDoingWrite )
					{
						// Ensure there is always an End-Anchor </A> tag.
						if( NoteCounter < 2 )
							fputs( "No Title</a><br>\r\n", stdout );
					}
				}
				// A blank line separates this URL's entry from the next.
				fputc( '\r', fs );
				fputc( '\n', fs );
			}
			i++;
		}
	}


	// I am finished with the resorting vector.
	free( pResortedUrlPtrs ); // Created recently in SplitOutTheFetchedUrls.
	pResortedUrlPtrs = ( struct UrlPtrsX * ) 0; // Not that anyone ever looks.
	nResortedUrlPtrs = 0;

	// This marker helps automate stream edits, and helps user understand.
	fprintf( fs,	"\t----NOVEL LINKS----\r\n" );

	// And I can wrap up the HTML listing output now.
	// The novel links that follow go to the list file only.
	if( ProgramIsDoingWrite )
	{
		// Enabling Windows 95 accessibility option "high contrast" makes
		// status line fat; This <P> helps Netscape not hide final text.
		fprintf( stdout,
		"\r\n"
		"<P>\r\n"
		"</BODY>\r\n"
		"</HTML>\r\n" );
	}

	// Reprocess original list to output all the new ( '*' ) links.
	i = 0;
	while( i < nSortedUrlPtrs )
	{
		// Skip over any items that were nulled out of this list.
		if( pSortedUrlPtrs[i].pUrl != ( char* )0 )
		{
			char * ItemNote = pSortedUrlPtrs[i].pNote;
			// Problem: Many unfetched URLs glut memory during -F work.
			// So, as a way to compact my huge URL list, I let SURF -B
			// invocation discard any URL that has an asterisk in note.
			// It will not remove unfetched urls with asterisk removed.
			// So a URL is written when ProgramIsSavingURLs is set, or
			// it has no note, or its first note does not start with '*'.
			int writeurl = ( ProgramIsSavingURLs
				|| ItemNote == ( char * )0
				|| *( char * )( ItemNote + sizeof( char* ) ) != '*' );
			if( writeurl )
			{
				fputs( pSortedUrlPtrs[i].pUrl, fs );
				fputc( '\r', fs );
				fputc( '\n', fs );
			}
			// The URL and its notes are freed whether written or not.
			free( pSortedUrlPtrs[i].pUrl );
			// If there are notes for this url,
			// put each one on a new line starting with a space or tab char.
			while( ItemNote != ( char* )0 )
			{
				// This is the increment operation to do all items in a linked list.
				char * Next = *( char** )ItemNote;
				if( writeurl )
				{
					fputc( ' ', fs );
					fputs( ItemNote + sizeof( char* ), fs );
					fputc( '\r', fs );
					fputc( '\n', fs );
				}
				free( ItemNote );
				ItemNote = Next;
			}
			if( writeurl )
			{
				fputc( '\r', fs );
				fputc( '\n', fs );
			}
		}
		i++;
	}
	if( fclose( fs ) == -1 )
	{
		fprintf( stderr, "SURF: error closing file %s: %s.\n", ScriptFilename, strerror( errno ) );
		exit( 1 );
	}
	// Restore the empty condition of the list
	if( pSortedUrlPtrs != ( struct UrlPtrs * )0 )
		free( pSortedUrlPtrs );
	pSortedUrlPtrs = ( struct UrlPtrs * )0;
	nMallocUrlPtrs = 0;
	nSortedUrlPtrs = 0;
	return;
}

void FreeUpTheFileNamingLists( )
{
	// Releases the linked list of naming records plus the two stores
	// derived from it (the strings buffer and the term/path pointer
	// array), and resets the related counters and flags.
	// It is kept separate from WriteUrlListToScriptFile( ) so that
	// the -R file renaming work can run without re-writing the list.
	// NOTE(review): PrepareFileSavingControls( ) points DefaultNamingPath
	// into NamingListStringsBuffer, and that pointer is not reset here;
	// confirm nothing reads it after this call.
	void** Node;

	Flow( 15771 );

	// With no naming records there is nothing to do; as before, the
	// derived stores and counters are left untouched in that case.
	if( NamingListHead == ( void** )0 )
		return;

	// The first word of every node is the link to its successor,
	// so pick that up before the node itself is released.
	Node = NamingListHead;
	while( Node != ( void** )0 )
	{
		void** Following = ( void** )*Node;
		free( Node ); // This is the last user of the naming list.
		Node = Following;
	}

	// Leave head and tail in the consistent "empty list" state,
	// in case the list is ever built up again.
	NamingListHead = ( void** ) 0;
	NamingListTail = ( void** ) & NamingListHead;

	// The contiguous strings wad; free( ) of a null pointer is a no-op.
	free( NamingListStringsBuffer );
	NamingListStringsBuffer = ( char* )0;
	NamingListWorstCaseSize = 0;

	// The array of term/path pointer pairs.
	free( NamingListPtrs );
	NamingListPtrs = ( char** )0;
	NamingListTermsCount = 0;

	IHaveFileRenamingData = 0;
	return;
}


// FetchObjectFromUrlToFile: fetch one URL over HTTP and run the returned
// text through the page parser, writing the result to a new local file.
//
// pUrl - NUL-terminated URL text. It is copied into inbuf before parsing,
//        so the caller's string itself is not modified by this routine.
//
// Steps, in order:
//  1. Parse pUrl with GetLinkUrlParts, rebuild a canonical URL in inbuf,
//     and save it as BaseUrlText / BaseUrlParts. If the URL is rejected
//     or too long for BaseUrlText, print a message and return before any
//     file has been created.
//  2. Build UniqueFilename from ProposedFileBaseName plus ".htm", varying
//     a numeric suffix until _access( ) reports the name is unused, then
//     open it as ObjectFile and write a <BASE HREF=...> line.
//  3. Send an HTTP GET with automatic redirection disabled. For status
//     301/302/303 the "Location: " header URL is added to the URL list.
//     For status 200 the body is read and passed block by block to
//     MultiPassInbufConversion until EOF, rejection, or a keystroke.
//  4. Call ProcessForEndOfInputFile( dwRet ), release the WinINet objects,
//     close ObjectFile, remove the local file unless the final status is
//     HTTP_STATUS_OK, and call FreeUpTheVocabulary( ).
//
// The routine calls exit( 1 ) if the local file cannot be created or
// closed, or after 1000 same-named files. WinINet exceptions are caught
// and reported to stderr and to ObjectFile.
void FetchObjectFromUrlToFile( char * pUrl )
{
	// This routine started from the MSVC++ 5.0 sample code program "TEAR".
	// See the VC++ help page "Steps in a Typical HTTP Client Application".
	CInternetSession session( SurfVersion, 1, PRE_CONFIG_INTERNET_ACCESS );
	CHttpConnection* pServer = NULL;
	CHttpFile* pFile = NULL;
	DWORD dwRet = 0;	// Stays 0 unless a status code or WinINet error replaces it

	Flow( 11957 );

	// Canonicalize this Url and save as Base Url to resolve local Anchor Urls.
	// Reset Base Url so that ordinary anchor URL parsing process will suffice.
	memset( ( void* ) & BaseUrlParts, 0, sizeof( BaseUrlParts ) );

	// Because GetLinkUrlParts is destructive, make a copy of URL to fetch:
	// NOTE(review): this copy is unbounded; inbuf's capacity is not visible
	// here, so confirm callers cannot pass a URL longer than inbuf.
	strcpy( inbuf, pUrl );				// First misuse of inbuf

	if( GetLinkUrlParts( inbuf, inbuf + strlen( inbuf ) ) != 0 )
	{
		fprintf( stderr, "SURF: Unsuitable Url: \"%s\".\n", pUrl );
		return;
	}
	{
		// Reconstitute a perfect Url.
		// Each "if ... do ... while" one-liner below appends the string at
		// 'from' onto 'into', leaving 'into' on the copied '\0' terminator.
		char * from = LinkUrlParts.scheme;
		char * into = inbuf; // Second misuse of inbuf
		if( ( *into=*from )!='\0' ) do from++, into++; while( ( *into=*from )!='\0' );
		*into++ = ':';
		*into++ = '/';
		*into++ = '/';
		from = LinkUrlParts.netlocn;
		if( ( *into=*from )!='\0' ) do from++, into++; while( ( *into=*from )!='\0' );

		// Often, URL's urlpath may be empty, having only a net location.
		// I wonder if all servers would match "domain.x/" to "domain.x".

		// The reason I worry, is that Netscape Navigator will not resolve
		// relative urls correctly when browsing these locally saved files
		// as annotated with BASE HREF="domain.x" held on local hard drive.
		// I'm going to try appending a '/' to URLS that are just a domain name.
		// But this 3/15/98 change, 2 places, might interfere with url fetching.
		// By the way, that may no longer be a problem, since adding the 302
		// redirection work: E.g. hughes.net redirects ~scheper to ~scheper/
		if( *LinkUrlParts.urlpath == '\0'
		&& *LinkUrlParts.parameter == '\0'
		&& *LinkUrlParts.query == '\0' )
		{
			*into++ = '/';
			*into = '\0';
		}
		else
		{
			from = LinkUrlParts.urlpath;
			if( ( *into=*from )!='\0' ) do from++, into++; while( ( *into=*from )!='\0' );
			from = LinkUrlParts.parameter;
			if( ( *into=*from )!='\0' ) do from++, into++; while( ( *into=*from )!='\0' );
			from = LinkUrlParts.query;
			if( ( *into=*from )!='\0' ) do from++, into++; while( ( *into=*from )!='\0' );
		}
		// Reject a rebuilt URL that would not fit, with its terminator, in BaseUrlText.
		// NOTE(review): this test runs after the text was assembled in inbuf,
		// so it protects BaseUrlText only, not inbuf itself.
		if( into >= inbuf + sizeof( BaseUrlText ) )
		{
			fprintf( stderr, "SURF: Unsuitable Url: \"%s\".\n", pUrl );
			return;
		}
		strcpy( BaseUrlText, inbuf ); // Done with misuse of inbuf
		BaseTextSaved = 1;

		// This BaseUrlParts will be used to reconstitute incomplete Anchor Urls:
		memcpy( ( void* ) & BaseUrlParts, ( void* ) & LinkUrlParts, sizeof( BaseUrlParts ) );
	}
	// During -F, the final path part of URL is known in advance,
	// So make it into a local filename in the current directory.
	// With -F there's no final renaming according to vocabulary.
	// User should apply -R to do that, and -B or -A to fix list.
	{
		char * from = ProposedFileBaseName;
		char * into = UniqueFilename;
		// During -F or -Q or SURF with no flag, the URL is known in advance,
		// so use it to propose the local filename which will hold this file.
		// This routine also compares URL to a list of known search engine URLs.
		// A match will make result filename start with "_", and also make me
		// omit fetch-stop asterisks for hit URLs returned by a search engine.
		TestBaseURLAgainstQueryResultPages( );
		ProposeFilenameFromBaseURL( ); // Make local filename like URL for -F
		// Here is where the -F function makes its "UniqueFilename".
		// Change to trim to 8 including uniqueness, always add .htm:
		char * FirstChar = into;
		// Copy the proposed base name, stopping early enough to leave room
		// in UniqueFilename for a suffix, the ".htm" extension and the '\0'.
		while( ( *into=*from )!='\0' && into < UniqueFilename + sizeof( UniqueFilename ) - 1 - 8 )
			from++, into++;
		// leave 'into' sitting on the '\0', for further unique naming tries.
		into [0] = '.';
		into [1] = 'h';
		into [2] = 't';
		into [3] = 'm';
		into [4] = '\0';
		if( _access( UniqueFilename, 00 ) == 0 ) // Desired file name already exists
		{
			// Test filenames for existence, vary until have a unique filename.
			int i = 1;
			do {
				// Try to append numerical suffix, if filename shorter than 8.
				// j is the number of suffix digits placed after 'into';
				// k is how many characters remain before the base reaches 8.
				int j =  i < 10 ? 1 : i < 100 ? 2 : 3;
				int k = 8 - ( into - FirstChar );
				if( j > k )
					j = k; // Shift suffix over filename to be under 8 chars.
				// Digits are written right to left, ending at into [j - 1].
				into [j - 1] = '0' + i % 10;
				if( i >= 10 )
					into [j - 2] = '0' + i / 10 % 10;
				if( i >= 100 )
					into [j - 3] = '0' + i / 100 % 10;
				if( i >= 1000 )
				{
					// Prevent infinite loop at 1000-th fetch of same name.
					fprintf( stderr, "Error: 1000 same filenames.\n" );
					exit( 1 );
				}
				into [j + 0] = '.';
				into [j + 1] = 'h';
				into [j + 2] = 't';
				into [j + 3] = 'm';
				into [j + 4] = '\0';
				i ++;
			} while( _access( UniqueFilename, 00 ) == 0 );
		}
		ObjectFile = fopen( UniqueFilename, "w" );
		if( ObjectFile == ( FILE* )0 )
		{
			fprintf( stderr, "SURF: cannot create %s: %s.\n", UniqueFilename, strerror( errno ) );
			exit( 1 );
		}
		LocalFilename = UniqueFilename;

		// I start each locally cached file with an HTML <BASE...> tag,
		// so local anchors can be resolved correctly when the file is
		// viewed in a browser, or when the file is reanalyzed by SURF.
		// And also add as an anchor, so user can browse original page.
		fprintf( ObjectFile, "<BASE HREF=%s>\n", BaseUrlText );
	}

	// Give a terse indication of progress in urls and filenames:
	fprintf( stderr, "\n%s\n", BaseUrlText ); // Filename will come from notes

	try
	{
		CString strServerName;
		CString strObject;
		INTERNET_PORT nPort;
		DWORD dwServiceType;
		if( AfxParseURL( BaseUrlText, dwServiceType, strServerName, strObject, nPort ) )
		{
			// On March 30, 1999 I discovered that the AfxParseURL routine
			// will re-expand any '%' character held in the urlpath part
			// of url, but not in the netlocn, parameter or query, so I'll
			// have to compact them back out again. I don't want to modify
			// the outside work, lest I have to handle spaces etc. in URLs.
			// printf( "AfxParseUrl was given '%s'.\n", BaseUrlText );
			// printf( "AfxParseUrl made strServerName into '%s'.\n", ( LPCTSTR ) strServerName );
			// printf( "AfxParseUrl made strObject into '%s'.\n", ( LPCTSTR ) strObject );
			{
				// So, breakdown. I finally have to manipulate a CString object.
				// Surprise. CString [] do not provide an lvalue for assignments.
				// i is the read index and j the write index of an in-place
				// compaction; j never gets ahead of i.
				int i = 0;
				int j = 0;
				int n = strObject.GetLength( );
				while ( i < n )
				{
					strObject.SetAt( j, strObject[ i ] );
					// Look for "%25" ( '%' ) followed by a USASCII 20-7f value
					if( strObject[ i ] == '%'
					&& i + 4 < n
					&& strObject[ i + 1 ] == '2'
					&& strObject[ i + 2 ] == '5'
					&& strObject[ i + 3 ] >= '2'
					&& strObject[ i + 3 ] <= '7'
					&& isxdigit( strObject[ i + 4 ] ) )
					{
						// Convert the %25 back into a simple % character.
						// In other words, drop out the 2 characters "25".
						i += 2;
					}
					i++;
					j++;
				}
				// I don't suppose nulls will terminate CString objects. So,
				strObject = strObject.Left ( j );
				// printf( "I made strObject back into '%s'.\n", ( LPCTSTR ) strObject );
			}
			// Currently, I only support the http: method.
			// For any other service type nothing is requested and dwRet stays 0.
			if( dwServiceType == INTERNET_SERVICE_HTTP )
			{
				pServer = session.GetHttpConnection( strServerName, nPort );
				pFile = pServer->OpenRequest( CHttpConnection::HTTP_VERB_GET,
					strObject, NULL, 1, NULL, "HTTP/1.0",
					INTERNET_FLAG_EXISTING_CONNECT | INTERNET_FLAG_NO_AUTO_REDIRECT );
				pFile->AddRequestHeaders( _T( "Accept: text/*\r\nUser-Agent: SURF\r\n" ) );
				pFile->SendRequest( );

				// There are sites with lists of URLs that are all local
				// and return a URL redirect code 302 which your typical
				// browser quietly fetches, showing only that final URL.
				// Also, some search engines return URLS like self?other.
				// Before I put in the No Auto Redirect flag above, SURF
				// would record such sites' misleading unredirected URLs.
				// The HTML pages' contents associated with the 302 code
				// usually show the URL, but to be safe I will parse the
				// header for the URL, as was shown in the TEAR program.

				pFile->QueryInfoStatusCode( dwRet );
				if( dwRet == HTTP_STATUS_MOVED			// 301 object permanently moved
				|| dwRet == HTTP_STATUS_REDIRECT		// 302 object temporarily moved
				|| dwRet == HTTP_STATUS_REDIRECT_METHOD ) // 303 redirection w/ new access method
				{
					CString strHeader;
					pFile->QueryInfo( HTTP_QUERY_RAW_HEADERS_CRLF, strHeader );
					char *LocnLine = strstr ( ( LPCTSTR ) strHeader, "Location: " );
					if( LocnLine != ( char* )0 )
					{
						// p1..p2 brackets the header value: it starts just past
						// the 10 characters of "Location: " and ends at the
						// first control character (below ' ').
						char *p1 = LocnLine + 10;
						char *p2 = LocnLine + 10;
						while ( ( unsigned char )*p2 >= ' ' ) p2++;
						if( GetLinkUrlParts( p1, p2 ) != -1 )
						{
							// I have to append a '/' to URLS that are just a bare domain name.
							// Otherwise, CombineLinkUrlWithBase rejects them, because it was
							// written for the assumption that it was analyzing an Anchor URL.
							// ( So I pondered, saying, what good was an anchor to the root? )
							if( *LinkUrlParts.urlpath == '\0'
							&& *LinkUrlParts.parameter == '\0'
							&& *LinkUrlParts.query == '\0' )
							{
								LinkUrlParts.urlpath [0] = '/';
								LinkUrlParts.urlpath [1] = '\0';
							}
							if( CombineLinkUrlWithBase( ) != -1 )
							{
								// Fit this fully specified url into the sorted list.
								// I will be omitting the asterisk on this URL's note
								// so that SURF will continue to fetch this page from
								// the redirected location automatically. Therefore I
								// do not need old notes, as I will soon see the page.
								AddLinkUrlPartsToUrlList( ); // Add the redirected URL.
								{
									// NOTE(review): presumably AddLinkUrlPartsToUrlList
									// sets LastUrlPtrSlot to the slot just added or
									// found; confirm it cannot still be -1 here.
									char * Item = pSortedUrlPtrs[LastUrlPtrSlot].pNote;
									// Delete old notes in case note1 starts with a '*'.
									// But if note1 has '#', I already fetched the page.
									// In that case, don't fiddle with the new URL item.
									if( Item == ( char* )0
									|| *( Item + sizeof( char* ) ) != '#' )
									{
										while( Item != ( char* )0 )
										{
											// This is the increment operation to do all items in a linked list.
											char * Next = *( char** )Item;
											free( Item );
											Item = Next;
										}
										// Be sure to null this head pointer just freed:
										pSortedUrlPtrs[LastUrlPtrSlot].pNote = ( char* )0;
										MergeNoteAtLastUrlPtrSlot( pUrl ); // First note = original URL with no asterisk
									}
								}
							}
						}
					}
				}

				if( dwRet == HTTP_STATUS_OK )
				{
					// Clone and adapt the successful file parsing loop...
					// One of my favorite file input methods is to read whole
					// next disk block just past the tail of unprocessed data.
					// noffset counts the bytes at inbuf+4 carried over from the
					// previous pass, as returned by MultiPassInbufConversion.
					int noffset = 0;
					for( ;; )
					{
						int nread;
						// Adapt my loop to the stream read of the sample code.

						char *cp = pFile->ReadString( ( inbuf+4 ) + noffset, 4096 );
						// EOF gives null ptr.
						if( cp == ( char* )0 )
							nread = 0;
						else
							nread = strlen( cp );
						if( nread <= 0 )
						{
							if( noffset == 0 )
							{
								// At EOF, and nothing was left from prior loop.
								break;
							}
							// Tail data from prior loop lacks a space at EOF.
							// Append a final space character to assist parse.
							( inbuf+4 ) [noffset] = ' ';
							noffset++;
						}

						// This may copy back some tail of data partly processed.
						noffset = MultiPassInbufConversion( noffset + nread );
						if( noffset == -1 )
							break;  // unacceptable garbage or binary data
						if( nread == 0 )
							break;  // Input had reached EOF

						// Ever get stuck in a long file? I'm fixing that now.
						if( UserKeystrokeDetected( ) )
						{
							fprintf( stderr, "SURF: Stopping download of %s.\n", BaseUrlText );
							// Any non-200 value makes the code below discard the partial file.
							dwRet = 1; // Change from HTTP_STATUS_OK
							break;
						}
					} // for...
				}
				pFile->Close( );
				pServer->Close( );
			}
		}
	}
	catch ( CInternetException* pEx )
	{
		// catch errors from WinINet
		TCHAR szErr[1024];
		dwRet = pEx->m_dwError; // To format the 12000 errors like HTTP statuses
		pEx->GetErrorMessage( szErr, 1024 );
		fprintf( stderr, "Error: ( %d ) %s.\n", pEx->m_dwError, ( LPCTSTR )szErr );
		{
			// I should get all words of the error message, but get this:
			char ISaid [] = "Error";
			AddWordToVocabulary( ISaid, ISaid + sizeof( ISaid ) - 1 );
		}
		fprintf( ObjectFile, "\nError: ( %d ) %s.\n", pEx->m_dwError, ( LPCTSTR )szErr );
		pEx->Delete( );
	}

	// Flush any internal parse buffers, also restore all states.
	ProcessForEndOfInputFile( dwRet );

	// Release the WinINet request and connection objects if they were
	// created, whether the request succeeded or an exception was caught.
	if ( pFile != NULL )
		delete pFile;
	if ( pServer != NULL )
		delete pServer;
	session.Close( );
	if( fclose( ObjectFile ) == -1 )
	{
		fprintf( stderr, "SURF: error closing file %s: %s.\n", UniqueFilename, strerror( errno ) );
		exit( 1 );
	}
	ObjectFile = ( FILE * ) 0;
	if ( dwRet == HTTP_STATUS_OK )
	{
		// This section is deleted because of filename in list notes.
		// With -F there's no final renaming according to vocabulary.
		// User should apply -R to do that, and -B or -A to fix list.
	}
	else
	{
		// The -F or No <-> fetching yielded a non-200 status. Often 302, 404.
		// Such items will remain in the list, but do not keep such files.
		// Whenever SURF -A or -B rebuild LIST afresh from the local files,
		// the data for such URLs will be lost, and if their '*' is removed,
		// they will be resurfed, an unfortunate waste of internet bandwidth.
		if( remove( UniqueFilename ) == -1 )
		{
			fprintf( stderr, "SURF: cannot remove %s: %s.\n", UniqueFilename, strerror( errno ) );
			// no need to... exit( 1 );
		}
	}
	FreeUpTheVocabulary( );
	return;
}

int RandomlyFetchOneURLFromUrlList( )
{
	// Choose one URL at random, fetch that Internet object, mark URL.
	// Returns 0 after attempting one fetch. Returns -1 to stop the
	// caller's loop when the URL list is empty or when every URL in it
	// is already marked with '*' ( do not fetch ) or '#' ( fetched ).
	int i;
	int n;
	Flow( 12520 );

	// An empty list has nothing to fetch. Testing this first also keeps
	// the modulo below from ever dividing by zero.
	if( nSortedUrlPtrs <= 0 )
		return -1;

	// The 2/98 version didn't fetch randomly when had over 32K links,
	// so two rand( ) results are combined to cover a wider range.
	// The arithmetic is unsigned: a signed rand( ) * rand( ) overflows
	// ( undefined behaviour, possibly a negative index ) wherever
	// RAND_MAX is larger than about 46340.
	{
		unsigned long Pick = ( unsigned long )rand( ) * ( ( unsigned long )RAND_MAX + 1UL );
		Pick += ( unsigned long )rand( );
		i = ( int )( Pick % ( unsigned long )nSortedUrlPtrs );
	}
	n = i;

	// Search for the next URL that was not fetched
	for( ;; ) {
		// Fall out of loop when i locate an unfetched URL to fetch.
		// Do not fetch if "*" is the first character of the first note.
		// Nor if "#" is the first character of the first note.
		// Each note is a link pointer followed by its text, hence the
		// sizeof( char* ) offset to reach the first character.
		if( pSortedUrlPtrs[i].pNote == ( char* )0
		|| ( *( pSortedUrlPtrs[i].pNote + sizeof( char* ) ) != '*'
		  && *( pSortedUrlPtrs[i].pNote + sizeof( char* ) ) != '#' ) )
			break; // That URL at random i was acceptable
		if( ++i == nSortedUrlPtrs )
			i = 0; // Keep looking: increment and wrap around
		if( i == n ) // Back to the original random choice.
			return -1; // signal no more URLs to fetch with -1
	}
	// Before this file is fetched, mark out this slot [i] as fetched.
	// This is necessary for files without "nice" urls, but generally
	// ProcessForEndOfInputFile will replace with a note starting '#',
	// and I do ProcessForEndOfInputFile via FetchObjectFromUrlToFile.
	{
		static char Blurb [] = "#( FETCHED )";
		char* Extant = pSortedUrlPtrs[i].pNote;
		// Room for the link pointer plus the blurb text and its '\0'.
		char* Item = ( char* ) malloc( sizeof( char* ) + sizeof( Blurb ) );
		if( Item == ( char* )0 )
		{
			fprintf( stderr, "SURF: Malloc failure adding URLs to list.\n" );
			exit( 1 );
		}
		// Push the new note onto the front of this URL's note list.
		*( char** )Item = Extant;
		strcpy( Item + sizeof( char* ), Blurb );
		pSortedUrlPtrs[i].pNote = Item;
	}

	LastUrlPtrSlot = -1;				// Reset before parsing each file
	FetchObjectFromUrlToFile( pSortedUrlPtrs[i].pUrl );

	// Output fetched page's analysis, so user can gauge productivity.
	// Notice that my i has become obsolete, if some links were added.
	// ProcessForEndOfInputFile will write a first note with '#' mark,
	// and I do ProcessForEndOfInputFile via FetchObjectFromUrlToFile.
	if( LastUrlPtrSlot != -1 ) // be sure there was a prior URL
	{
		char * Item = pSortedUrlPtrs[LastUrlPtrSlot].pNote;
		while( Item != ( char* )0 )
		{
			// This is the increment operation to do all items in a linked list.
			char * Next = *( char** )Item;
			fprintf( stdout, " %s\n", Item + sizeof( char* ) );
			Item = Next;
		}
	}
	return 0;
}

void PrepareFileSavingControls( )
{
	// Converts the naming list ( the "@path word..." lines that were set
	// aside by ReadScriptFileToUrlList( ) ) into two lookup stores:
	//
	//  NamingListStringsBuffer - one record per line, laid out as
	//      path\ NUL  term NUL  term NUL ...  NUL
	//    that is, the path ( always ending in a backslash ), then each
	//    lowercased match term, then one extra NUL closing the record.
	//    One more NUL after the last record ends the whole list.
	//  NamingListPtrs - pairs of pointers into that buffer, one pair per
	//    term: [2k] locates the term, [2k+1] locates its record's path.
	//
	// A record with no terms supplies DefaultNamingPath; a second such
	// record is a fatal error. Directories named along each path are
	// created when ProgramIsDoingMkdir is set. The naming list itself is
	// read but not freed here.
	// My caller has checked and guarantees NamingListWorstCaseSize > 0.
	// Exits the program on malloc failure, on mkdir failure, on a second
	// default path, or if the term count disagrees with the rescan.
	void** Run;
	char *fill, *TopOfRecord;
	Flow( 17508 );
	// I loosely counted the bytes needed when saving these lines.
	// Hold paths and words as a wad of contiguous asciiz strings.
	NamingListStringsBuffer = ( char* )malloc( NamingListWorstCaseSize );
	if( NamingListStringsBuffer == ( char* )0 )
	{
		fprintf( stderr, "SURF: Malloc failure making file saving list.\n" );
		exit( 1 );
	}
	// I use, and run through items in NamingList, but do not empty it.
	Run = NamingListHead;
	fill = NamingListStringsBuffer;
	if( Run != ( void** )0 )
	do
	{
		// Each list node is a link pointer followed by the line's text.
		char * TextData = ( char* )Run + sizeof( void** );
		TopOfRecord = fill;
		// fprintf( stdout, "Filenaming control: <%s>\r\n", TextData );
		// first skip any space or tab chars.
		while( *TextData == ' ' || *TextData == '\t' )
			TextData++;
		// second, process all non-space chars as a file path.
		for( ;; )
		{
			// Stop at every path separator character;
			// Make that directory if it doesn't exist.
			// Oh, I must call this at end of path too.
			*fill = *TextData;
			if( *TextData == '/' || *TextData == '\\'
			|| *TextData == ' ' || *TextData == '\t' || *TextData == '\0' )
			{
				*fill = '\0'; // asciz terminate the partial path
				// Skip an empty path so far, and skip a bare drive
				// designator ( text ending in ':' ). The first test also
				// guarantees fill [-1] lies inside this record.
				if( *TopOfRecord != '\0'
				&& fill [-1] != ':'
				&& _access( TopOfRecord, 00 ) != 0 )
				{
					// fprintf( stdout, "Need directory %s.\n", TopOfRecord );
					// If it does not exist, try to make directory.
					// Only the SURF -R feature makes directories.
					if( ProgramIsDoingMkdir )
					{
						if( _mkdir( TopOfRecord ) == -1 )
						{
							fprintf( stderr, "SURF: cannot make directory %s: %s.\n", TopOfRecord, strerror( errno ) );
							exit( 1 );
						}
					}
				}
				else
				{
					// Strictly, I ought to check if filename is a directory.
					// fprintf( stdout, "Have directory %s.\n", TopOfRecord );
				}
				*fill = '\\'; // regulate the path separator char
			}
			fill++;
			if( *TextData == ' ' || *TextData == '\t' || *TextData == '\0' )
				break;
			TextData++;
		}
		*fill++ = '\0'; // asciz terminate the overall path
		// Weird loop construction above guaranteed \ at end.
		// process rest of record copying and counting terms.
		for( ;; )
		{
			// This loop does not guarantee terms after path.
			if( *TextData == '\0' )
				break;
			// skip whitespace; no rather, skip all non alphabetics
			// The <ctype.h> functions require a value representable as
			// unsigned char; a plain char holding a byte >= 0x80 may be
			// negative, so it is cast before the call.
			while( *TextData != '\0' && !isalpha( ( unsigned char )*TextData ) )
				TextData++;
			if( *TextData == '\0' )
				break;
			// collect letters -- only [a-z] will ever match in the vocabulary.
			// Oops. Permit a final asterisk character to match rest of a word.
			while( isalpha( ( unsigned char )*TextData ) )
			{
				*fill = *TextData | ' '; // Lowercase any [A-Z] in match terms
				fill++;
				TextData++;
			}
			if( *TextData == '*' )
			{
				*fill = *TextData; // Keep a final '*' which matches rest of word.
				fill++;
				TextData++;
			}
			NamingListTermsCount++;
			IHaveFileRenamingData = 1;
			*fill++ = '\0'; // asciz terminate the term
		}
		*fill++ = '\0'; // double-null terminate each record
		// Advance from this record ( still NOT freed! ) to next record.
		Run = ( void** )*Run;
	} // do...
	while ( Run != ( void** )0 );
	*fill++ = '\0'; // triple-null end of list in case I want to use it.
	// I need memory to set up two pointers for each match term.
	if( NamingListTermsCount != 0 )
	{
		NamingListPtrs = ( char** )malloc( NamingListTermsCount * 2 * sizeof( char* ) );
		if( NamingListPtrs == ( char** )0 )
		{
			fprintf( stderr, "SURF: Malloc failure making file saving list.\n" );
			exit( 1 );
		}
	}
	// Rescan the Wad o' Text to hook up pointers to term and path.
	{
		int i = 0;
		char * scan = NamingListStringsBuffer;
		// Process records until the triple null.
		while( *scan != 0 )
		{
			TopOfRecord = scan;
			// skip over the directory path string
			while( *scan != 0 )
				scan++;
			scan++;
			if( *scan == 0 )
			{
				// There were no asciz words found for this path
				if( DefaultNamingPath != ( char* )0 )
				{
					fprintf( stderr, "SURF: More than one default path: <%s> and <%s>.\r\n",
					DefaultNamingPath, TopOfRecord );
					exit( 1 );
				}
				DefaultNamingPath = TopOfRecord;
				IHaveFileRenamingData = 1;
			}
			else
			do {
				// Now process any asciz words found
				// every pair of pointers will locate one term, then a path.
				NamingListPtrs [i++] = scan; // one word
				NamingListPtrs [i++] = TopOfRecord; // then its path
				// skip over the word
				while( *scan != 0 )
					scan++;
				scan++;
			} while( *scan != 0 );
			// Step past the extra null that closes this record.
			scan++;
		}
		// Self-check: the rescan must find exactly the terms counted above.
		if( i != 2 * NamingListTermsCount )
		{
			fprintf( stderr, "SURF: Program error #1.\r\n" );
			exit( 1 );
		}
	}
	// Show me...
	// { int i; for( i=0; i < NamingListTermsCount; i++ ) fprintf( stdout, "<%s>=<%s>\r\n", NamingListPtrs [i+i], NamingListPtrs [i+i+1] ); }
	return;
}

int main( int argc, char* argv [] )
{
	// Using MFC API for Internet access forces me to do things like this.
	// As noted, compiler Settings: C/C++, Code Generation: Multithreaded.
	if ( !AfxWinInit( ::GetModuleHandle( NULL ), NULL, ::GetCommandLine( ), 0 ) )
	{
		fprintf( stderr, "MFC Failed to initialize.\n" );
		exit( 1 );
	}
	// Notice that reliance upon MFC makes surf only run under Windows 95.
	// You must invoke SURF from the Windows95 MS-DOS Prompt command line.
	Flow( 10352 );

#if 0
	// Enable these selfchecks once after table edits. Tested for Version 1.4.
	if ( SortedTagNames [mu_lowsentinel] [0] != '!'
	||  SortedTagNames [mu_topsentinel] [0] != '~'
	||  MarkupTagTypes [mu_lowsentinel] != -1
	||  MarkupTagTypes [mu_topsentinel] != -1
	||  ( sizeof( SortedEntityNames ) & 7 ) != 1
	||  ( sizeof( CommonWordList ) & 7 ) != 1 )
	{
		fprintf( stderr, "Bad Tables.\n" );
		exit( 1 );
	}
	// More program debugging selfchecks on tables.
	{
		int i;
		for( i=0; i < sizeof( SortedTagNames ) / sizeof( *SortedTagNames ) - 1; i ++ )
			if( strcmp( SortedTagNames [i], SortedTagNames [i+1] ) >= 0 )
			{
				fprintf( stderr, "Bad TagNames %d.\n", i );
				exit( 1 );
			}
		for( i=0; i < sizeof( SortedEntityNames ) - 1 - 8; i += 8 )
			if( strcmp( SortedEntityNames + i, SortedEntityNames + i + 8 ) >= 0 )
			{
				fprintf( stderr, "Bad EntNames %d.\n", i );
				exit( 1 );
			}
		for( i=0; i < sizeof( CommonWordList ) - 1 - 8; i += 8 )
			if( strcmp( CommonWordList + i, CommonWordList + i + 8 ) >= 0 )
			{
				fprintf( stderr, "Bad ComnWds %d.\n", i );
				exit( 1 );
			}
	}
#endif // if 0

	{
		int NextArgc = argc; // none
		char* CmdLetters = "- ";
		char CmdLetter;
		ScriptFilename = "list.txt";

		// Analyze SURF's command line arguments.
		if( argc == 1 )
		{
			// argc == 1 means no arguments: Just "SURF".
			// If there exists a file named "LIST.TXT", surf will process it.
			// Otherwise, lack of arguments gets the usage instructions.
			if( _access( ScriptFilename, 00 ) != 0 )
			{
				GiveUsageMessage( );
				exit( 1 );
			}
		}
		else
		{
			// There are arguments. Either "SURF LIST ..." or "SURF -LETTER ..."
			if( argv [1] [0] == '-' )
			{
				// The invocation was "SURF -CMD ...". Supply the filename LIST.
				CmdLetters = argv [1];
				NextArgc = 2;
			}
			else
			{
				// The invocation was "SURF XXX ..." Use user-supplied filename
				if( argc == 2 )
				{
					// The invocation was "SURF FILENAME". Use default ' ' activity.
					ScriptFilename = argv [1];
				}
				else if( argv [2] [0] == '-' )
				{
					// The invocation was "SURF FILENAME -CMD ...".
					ScriptFilename = argv [1];
					CmdLetters = argv [2];
					NextArgc = 3;
				}
				else
				{
					// The invocation was "SURF XXX XXX ..."
					GiveUsageMessage( );
					fprintf( stderr, "SURF: Invocation SURF XXX XXX... is meaningless.\n" );
					exit( 1 );
				}
			}
		}
		CmdLetter = ( char ) ( CmdLetters [1] | ' ' ); // ASCII idiom lowercases a-z
		if( CmdLetter != 'q'
		&&  CmdLetter != 'f'
		&&  CmdLetter != ' ' // Default meaning there was no -flag argument
		&&  CmdLetter != 'a'
		&&  CmdLetter != 'b'
		&&  CmdLetter != 'r'
		&&  CmdLetter != 'w' )
		{
			// The invocation had an invalid command letter
			GiveUsageMessage( );
			fprintf( stderr, "SURF: Invocation had an invalid command letter.\n" );
			exit( 1 );
		}
		{
			// Try to access ScriptFilename, supplying no filespec expansion.
			if( _access( ScriptFilename, 00 ) == 0 )
			{
				// If list file exists, import it now. Later may re-write update.
				// Valid lines of the URL list file start http: or @ or <space>.
				ReadScriptFileToUrlList( );
				// ReadScriptFileToUrlList also extracted the "@path ..." lines:
				if( CmdLetter == 'r' )
				{
					if( NamingListWorstCaseSize == 0 )
					{
						fprintf( stderr, "SURF: With -R, the file %s must contain \"@path [word...]\" lines.\n", ScriptFilename );
						exit( 1 );
					}
				}
				else
				{
					if( nSortedUrlPtrs == 0			 // Was this a real LIST with real URLs?
					&& ScriptFileByteCount > 2000 )			 // Excuse very small files with no URLs
					{
						// This check is to avoid re-writing non-list files.
						// But, accept a nearly empty list file with 0 urls.
						fprintf( stderr, "SURF: File %s had no 'http://url...' lines.\n", ScriptFilename );
						exit( 1 );
					}
				}

			}
			else
			{
				// If URL file doesn't exist, I will create it, except for -R.
				if( CmdLetter == 'r' )
				{
					fprintf( stderr, "SURF: With -R, the file %s must previously exist.\n", ScriptFilename );
					exit( 1 );
				}
				else
				{
					FILE * fs = fopen( ScriptFilename, "wb" );
					if( fs == ( FILE* )0
					|| fclose( fs ) == -1 )
					{
						fprintf( stderr, "SURF: cannot create file %s: %s.\n", ScriptFilename, strerror( errno ) );
						exit( 1 );
					}
				}
			}
		}

		// -R uses @path word... lines to reorganize local files.
		// -A,B,F,Q,W use @path word... lines to categorize URLs.
		// -W also uses @path word... lines to annotate HTM file.
		
		if( CmdLetter == 'r' )
		{
			ProgramIsDoingMkdir = 1; // Before calling PrepareFileSavingControls
		}
		if( NamingListWorstCaseSize > 0 )
		{
			PrepareFileSavingControls( );
		}
		

		if( CmdLetter == 'a' // -A, -B, and -R might have filename specs
		||  CmdLetter == 'b' // -b is a fast variant of -a operation
		||  CmdLetter == 'r' )
		{
			if ( NextArgc < argc )
			{
				// Commands a and r normally have filespec* arguments.
				int ai = NextArgc;
				while ( ai < argc )
				{
					// If filespec finds any directory,
					// I'll analyze all files under it.
					RecursivePathExpansion( 0, argv [ai] );
					ai++;
				}
			}
			else
			{
				// I will supply "*" when no filespec is given.
				// Notice that that will go into subdirectories!
				RecursivePathExpansion( 0, "*" );
			}
		}
		else if( CmdLetter == 'f' )
		{
			// Command f normally has some arguments.
			if ( NextArgc == argc )
			{
				// The invocation lacked required arguments
				GiveUsageMessage( );
				fprintf( stderr, "SURF: Invocation requires arguments after -f.\n" );
				exit( 1 );
			}
		}
		else if( CmdLetter == 'q' )
		{
			// I could say that 'q' should have some arguments too,
			// but SURF -Q could also continue a stopped -Q fetch.
			if ( NextArgc == argc
			&& nSortedUrlPtrs == 0 )
			{
				// The invocation lacked required arguments
				GiveUsageMessage( );
				fprintf( stderr, "SURF: Invocation normally requires arguments after -q.\n" );
				exit( 1 );
			}
		}
		else
		{
			// Commands <noletter> and w normally have no arguments.
			if ( NextArgc < argc )
			{
				// The invocation had unexpected arguments
				GiveUsageMessage( );
				fprintf( stderr, "SURF: Invocation requires no arguments after -w.\n" );
				exit( 1 );
			}
		}
		// Those were the commands. Any other args are file*, url or query.

		switch( CmdLetter )
		{

			// Default value is ' ' when no -FLAGS are given

		case ' ': // Surf the list file

		LetOptionQGoAndDo: ;

			if( nSortedUrlPtrs == 0 )
			{
				fprintf( stderr, "SURF: There were no valid URLs to fetch in %s.\n", ScriptFilename );
				fprintf( stderr, "( Valid URL lines must start with an 'http://...' in first column. )\n" );
			}

		LetOptionFGoAndDo: ;

			if( nSortedUrlPtrs == 0 )
			{
				exit( 1 );
			}

			ProgramIsDoingFetch = 1; // For the non-flag invocation, and for -Q too

			// I select the URLs to fetch in a random order,
			// to avoid being too burdensome a net citizen.
			// Seed randoms from the system clock.
			srand( ( unsigned )time( ( time_t* )0 ) );
			// Reset these when writing to a local file.
			DiscardingTextNow = DiscardingAllText = 0;

			for( ;; )
			{
				if( RandomlyFetchOneURLFromUrlList( ) == -1 )
				{
					fprintf( stderr, "SURF: No more URLS to fetch in %s.\n", ScriptFilename );
					fprintf( stderr, "SURF: You must remove the '*' after URLs you want to fetch next.\n" );
					break;
				}
				if( UserKeystrokeDetected( ) )
				{
					fprintf( stderr, "SURF: Stopping due to your keystroke.\n" );
					break;
				}
			}

			// These two calls ( Split and Write... ) must be used as a pair.
			SplitOutTheFetchedUrls( );
			WriteUrlListToScriptFile( );
			FreeUpTheFileNamingLists( );
			break;

		case 'b': // -B is a fast variant of -A to next do -W.

			ProgramIsSavingURLs = 0; // Only false for the -R flag; Now -B too.
			// Case B falls into case A:

		case 'a': // Augment list with local FILE1, FILE2,...

			ProgramIsDoingLocalFileInput = 1; // For the -A flag
			// Process the file list, and free all mallocs.
			if ( FileListHead == ( void** )0 )
			{
				fprintf( stderr, "SURF: -B or -A: No matching local filenames were found.\n" );
				// But do not exit program for that.
			}
			else do
			{
				char * InputFilename = LocalFilename = ( char* )FileListHead + sizeof( void** );
				int hfi = _open( InputFilename, _O_BINARY|_O_RDONLY, _S_IREAD|_S_IWRITE );
				LastUrlPtrSlot = -1;		// Reset before parsing each file
				if( hfi == -1 )
				{
					fprintf( stderr, "SURF: cannot open to read file %s: %s.\n", InputFilename, strerror( errno ) );
					// But do not exit program for that.
				}
				else
				{
					// One of my favorite file input methods is to read whole
					// next disk block just past the tail of unprocessed data.
					int noffset = 0;
					// Give a terse indication of progress in filenames:
					fprintf( stderr, "%s\n", InputFilename );
					for( ;; )
					{
						int nread = _read( hfi, ( inbuf+4 ) + noffset, 4096 );
						if( nread <= 0 )
						{
							// EOF gives 0; Errors -1.
							if( nread < 0 )
							{
								fprintf( stderr, "SURF: error reading file %s: %s.\n", InputFilename, strerror( errno ) );
								exit( 1 );
							}
							if( noffset == 0 )
							{
								// At EOF, and nothing was left from prior loop.
								break;
							}
							// Tail data from prior loop lacks a space at EOF.
							// Append a final space character to assist parse.
							( inbuf+4 ) [noffset] = ' ';
							noffset++;
						}
						// This may copy back some tail of data partly processed.
						noffset = MultiPassInbufConversion( noffset + nread );
						if( noffset == -1 )
							break;  // unacceptable garbage or binary data
						if( nread == 0 )
							break;  // Input had reached EOF
					} // for...
					if( _close( hfi ) == -1 )
					{
						fprintf( stderr, "SURF: error closing file %s: %s.\n", InputFilename, strerror( errno ) );
						exit( 1 );
					}
					// Flush any internal parse buffers, also restore all states.
					// This case parsing local files did not use FetchObjectFromUrlToFile,
					// So after reading the file this case calls ProcessForEndOfInputFile.
					ProcessForEndOfInputFile( HTTP_STATUS_OK ); // Pass 200 to see filename

					// This -A will not be burdened with file renaming like -R option.
					FreeUpTheVocabulary( );
				}
				{
					// This is the increment operation to do all items in a linked list.
					void** next = ( void** )*FileListHead;
					free( FileListHead );
					FileListHead = next;
				}
				if( UserKeystrokeDetected( ) )
				{
					fprintf( stderr, "SURF: Stopping due to your keystroke.\n" );
					break;
				}
			} // do...
			while ( FileListHead != ( void** )0 );
			// These two calls ( Split and Write... ) must be used as a pair.
			SplitOutTheFetchedUrls( );
			WriteUrlListToScriptFile( );
			FreeUpTheFileNamingLists( );
			break;

		case 'f': // Fetch web pages from URL1, URL2...

			ProgramIsDoingFetch = 1; // For the -F flag

			// I use binary mode for stdout: then I can put \r\n uniformly.
			if( _setmode( _fileno( stdout ), _O_BINARY ) == -1 )
			{
				fprintf( stderr, "SURF: cannot set binary mode on stdout: %s.\n", strerror( errno ) );
				exit( 1 );
			}

			{
				int ai = NextArgc;
				while ( ai < argc )
				{
					// Reset these when writing to a local file.
					DiscardingTextNow = DiscardingAllText = 0;
					// Fetch the single Internet object.
					if( strnicmp( argv [ai], "http://", 7 ) == 0 )
					{
						// The user was not a lazy typist.
						FetchObjectFromUrlToFile( argv [ai] );
					}
					else
					{
						// Allow that user need not type in "http://" atop url.
						// Remember, right now surf only processes HTTP method.
						char WithHttp [300]; // This is no limit if you type it.
						strcpy ( WithHttp, "http://" );
						strncpy ( WithHttp + 7, argv[ai], sizeof( WithHttp ) - 1 - 7 );
						// Curiously, strncpy doesn't guarantee a final \0.
						WithHttp [sizeof( WithHttp ) - 1] = '\0';
						FetchObjectFromUrlToFile( WithHttp );
					}
					// I do ProcessForEndOfInputFile via FetchObjectFromUrlToFile.
					if( UserKeystrokeDetected( ) )
					{
						fprintf( stderr, "SURF: Stopping due to your keystroke.\n" );
						break;
					}
					ai++;
				}
			}
			goto LetOptionFGoAndDo; // Having built a list, go fetch them
			// unreachable: break;

		case 'q': // Add search engine queries to list, then run list
			{
				// First, let's assemble the query terms with "+" for spaces.
				// Wait a minute -- I may have confused some engines. Try %20.
				char * into = QueryTopicString;
				int ai = NextArgc;
				while ( ai < argc )
				{
					char * from = argv [ai];
					while( ( *into=*from ) != '\0'
					&& into < QueryTopicString + sizeof( QueryTopicString ) - 3 )
					{
						// Convert embedded spaces into the escaped form %20 (not plus signs):
						if( *from == ' ' )
						{
							// changed from just one + character: *into++ = '+';
							// trying with the normally escaped space character:
							*into++ = '%';
							*into++ = '2';
							*into = '0';	// increments below
						}
						// Add other mandatory url processing into a %nn form.
						// RFC1738 says this about HTTP: Within the <path> and
						// <searchpart> components, "/", ";", "?" are reserved.
						// So, hdump reveals the hex values: 2F=/, 3B=;, 3F=?.
						if( *from == '/' )
						{
							*into++ = '%';
							*into++ = '2';
							*into = 'F';	// increments below
						}
						if( *from == ';' )
						{
							*into++ = '%';
							*into++ = '3';
							*into = 'B';	// increments below
						}
						if( *from == '?' )
						{
							*into++ = '%';
							*into++ = '3';
							*into = 'F';	// increments below
						}

						from++, into++; // Copy one search query word
					}
					// Separate query words with %20 (an escaped space) instead of whitespace.
					if( into < QueryTopicString + sizeof( QueryTopicString ) - 3 )
					{
						// changed from just one + character: *into++ = '+';
						// trying with the normally escaped space character:
						*into++ = '%';
						*into++ = '2';
						*into++ = '0';	 // increment this one myself
					}
					ai++;
				}
				// No: into [-1] = '\0'; // Discard final "+", which IS present.
				into [-3] = '\0'; // Discard final "%20", which IS present.
			}
			{
				// Now let's run through the predefined search engine queries.
				int qi = 0;
				for( ;; )
				{
					char JoinedSearchUrl [500];
					strcpy( JoinedSearchUrl, SearchEngineQueries [qi] );
					strcat( JoinedSearchUrl, QueryTopicString );

					// Canonicalize and process absolute or ( base+relative ) URLs.
					if( GetLinkUrlParts( JoinedSearchUrl, JoinedSearchUrl + strlen( JoinedSearchUrl ) ) == 0
					&& CombineLinkUrlWithBase( ) == 0 ) // Don't say && LinkUrlPartsIsAcceptable( ) )
					{
						AddLinkUrlPartsToUrlList( ); // URL from SearchEngineQ... plus QueryTopics.
						MergeNoteAtLastUrlPtrSlot( "SURF auto-query URL" ); // For name of search engine
					}
					// Originally, there were pairs of ( url, note ). Now just ( url ).
					qi ++;
					if( ( unsigned ) qi >= ( unsigned ) ( sizeof( SearchEngineQueries ) / sizeof( *SearchEngineQueries ) ) )
						break; // do not exceed max array index ( if well paired )
				}
			}
			// Test for a keystroke after adding all URLs, but before fetching.
			if( UserKeystrokeDetected( ) )
			{
				fprintf( stderr, "SURF: Stopping due to your keystroke.\n" );
				break;
			}
			goto LetOptionQGoAndDo; // Having built a list, go fetch them
			// unreachable: break;

		case 'r': // Revise file storage according to word list

			ProgramIsDoingLocalFileInput = 1; // For the -R flag
			ProgramIsSavingURLs = 0; // Only false for the -R flag
			// Process the file list, and free all mallocs.
			if ( FileListHead == ( void** )0 )
			{
				fprintf( stderr, "SURF: -R: No matching local filenames were found.\n" );
				// But do not exit program for that.
			}
			else do
			{
				char * InputFilename = LocalFilename = ( char* )FileListHead + sizeof( void** );
				int hfi = _open( InputFilename, _O_BINARY|_O_RDONLY, _S_IREAD|_S_IWRITE );
				LastUrlPtrSlot = -1;		// Reset before parsing each file
				ThisInputFileContainedABaseTag = 0; // Reset for -R before each file
				if( hfi == -1 )
				{
					fprintf( stderr, "SURF: cannot open to read file %s: %s.\n", InputFilename, strerror( errno ) );
					// But do not exit program for that.
				}
				else
				{
					// One of my favorite file input methods is to read whole
					// next disk block just past the tail of unprocessed data.
					int noffset = 0;
					// Give a terse indication of progress in filenames:
					fprintf( stderr, "%s\n", InputFilename );
					for( ;; )
					{
						int nread = _read( hfi, ( inbuf+4 ) + noffset, 4096 );
						if( nread <= 0 )
						{
							// EOF gives 0; Errors -1.
							if( nread < 0 )
							{
								fprintf( stderr, "SURF: error reading file %s: %s.\n", InputFilename, strerror( errno ) );
								exit( 1 );
							}
							if( noffset == 0 )
							{
								// At EOF, and nothing was left from prior loop.
								break;
							}
							// Tail data from prior loop lacks a space at EOF.
							// Append a final space character to assist parse.
							( inbuf+4 ) [noffset] = ' ';
							noffset++;
						}
						// This may copy back some tail of data partly processed.
						noffset = MultiPassInbufConversion( noffset + nread );
						if( noffset == -1 )
							break;  // unacceptable garbage or binary data
						if( nread == 0 )
							break;  // Input had reached EOF
					} // for...
					if( _close( hfi ) == -1 )
					{
						fprintf( stderr, "SURF: error closing file %s: %s.\n", InputFilename, strerror( errno ) );
						exit( 1 );
					}
					// Flush any internal parse buffers, also restore all states.
					// This case parsing local files did not use FetchObjectFromUrlToFile,
					// So after reading the file this case calls ProcessForEndOfInputFile.
					ProcessForEndOfInputFile( HTTP_STATUS_OK ); // Pass 200 to see filename

					// This -R will be burdened with file renaming, not like -A option.
					// Rename file based on vocabulary sorted in ProcessForEndOfInputFile
					if( IHaveFileRenamingData
					&& ThisInputFileContainedABaseTag )
					{
						// LIST file contained @path word lines;
						// This local file contained a <BASE> tag, so
						// it is safe for -R to rename this local file.
						// Such test was added lest the recursive action
						// of "SURF -R *" in root \ destroy file system.
						// All files saved by SURF contain a <BASE> tag.
						AnalysisToRenameLocalFile( );
					}
					FreeUpTheVocabulary( );
				}
				{
					// This is the increment operation to do all items in a linked list.
					void** next = ( void** )*FileListHead;
					free( FileListHead );
					FileListHead = next;
				}
				if( UserKeystrokeDetected( ) )
				{
					fprintf( stderr, "SURF: Stopping due to your keystroke.\n" );
					break;
				}
			} // do...
			while ( FileListHead != ( void** )0 );
			// Do not write back the glutted list for the -R renaming usage:
			// Not for the -R flag: SplitOutTheFetchedUrls( );
			// Not for the -R flag: WriteUrlListToScriptFile( );
			FreeUpTheFileNamingLists( );
			break;

		case 'w': // Write an HTML file of fetched links

			ProgramIsDoingWrite = 1; // For the -W flag

			// I use binary mode for stdout: then I can put \r\n uniformly.
			if( _setmode( _fileno( stdout ), _O_BINARY ) == -1 )
			{
				fprintf( stderr, "SURF: cannot set binary mode on stdout: %s.\n", strerror( errno ) );
				exit( 1 );
			}

			// These two calls ( Split and Write... ) must be used as a pair.
			SplitOutTheFetchedUrls( );
			WriteUrlListToScriptFile( );
			FreeUpTheFileNamingLists( );

			if( fclose( stdout ) == -1 )
			{
				fprintf( stderr, "SURF: error closing stdout: %s.\n", strerror( errno ) );
				exit( 1 );
			}
			break;

		default:
			break;
		}
	}
	return 0; // exit( 0 )
}

// End of file: SURF.CPP

