/*
 * html2setext.c - convert HTML files to setext (structured enhanced text)
 * Written 24/07/94 by Andrew Pam
 * Copyright (c) 1994 Serious Cybernetics
 *
 * May be freely distributed and used free of charge.  No warranty,
 * no guarantees, no support.  Share and enjoy!
 *
 * Date     Who Description
 * -------- --- -----------
 * 24/07/94 ADP Created
 * 25/07/94 ADP Implemented SGML entities
 * 01/08/94 ADP Implemented word-wrap and HTML tags
 * 07/08/94 ADP Implemented hyperlinks as endnotes
 *
 * To do:
 *	provide support for internal links using <A NAME> tags
 *
 * Tested with:
 *	cc under SCO Unix
 *	gcc under SunOS
 *	Borland C++ under DOS
 */

#include <ctype.h>	/* isspace, toupper */
#include <stdio.h>	/* fgets, fputs, fprintf, sprintf */
#include <stdlib.h>	/* malloc, free, atoi */
#include <string.h>	/* strlen, strcat, strcmp, strncmp, strchr, strdup */

#define FALSE		(0)		/* boolean false value */
#define TRUE		(!FALSE)	/* boolean true value */

/* flag helpers: arguments are parenthesised so any lvalue/expression is safe */
#define RESET(x, y)	((x) &= ~(y))	/* reset flag bit */
#define SET(x, y)	((x) |= (y))	/* set flag bit */
#define TEST(x, y)	((x) & (y))	/* test flag bit */

#define BUFSIZE		(1024)		/* number of chars to buffer */
#define MAXENT		(8)		/* maximum length of an SGML entity */
#define MAXTAG		(256)		/* maximum length of an SGML tag */
#define MAXURL		(128)		/* maximum length of a URL */
#define MAXWORD		(64)		/* maximum length of a word */

#define LMARGIN		(2)		/* left margin */
#define RMARGIN		(68)		/* right margin */

#define S_BREAK		(1 << 0)	/* force a line break */
#define S_FLUSH		(1 << 1)	/* force return to leftmost column */
#define S_NOINDENT	(1 << 2)	/* disable indentation */
#define S_ANCHOR	(1 << 3)	/* inside an <A> tag */
#define S_H1		(1 << 4)	/* inside an <H1> tag */
#define S_H2		(1 << 5)	/* inside an <H2> tag */
#define S_HEAD		(1 << 6)	/* inside a <HEAD> tag */
#define S_PRE		(1 << 7)	/* inside a <PRE> tag */

/* ---- single characters written to, or recognised in, the text ---- */
#define QUOTE        '"'      /* quotation mark character */
#define CH_FILL      ' '      /* fill character for indents */
#define CH_BOLD      '*'      /* bold character */
#define CH_H1        '='      /* heading level 1 */
#define CH_H2        '-'      /* heading level 2 */
#define CH_ITALIC    '~'      /* italic character */
#define CH_ULINE     '_'      /* underline character */
#define CH_GT        '>'      /* "greater than" character */
#define CH_LT        '<'      /* "less than" character */
#define CH_UML       'e'      /* "umlaut" character */

/* ---- SGML delimiters ---- */
#define OPEN_ENTITY  '&'      /* start of SGML entity */
#define CLOSE_ENTITY ';'      /* end of SGML entity */
#define NUM_ENTITY   '#'      /* numeric SGML entity */
#define FRAGMENT     '#'      /* URL fragment */
#define OPEN_TAG     '<'      /* start of SGML tag */
#define CLOSE_TAG    '>'      /* end of SGML tag */

/* ---- tag names (upper case); END_TAG is pasted in front for closers ---- */
#define END_TAG      "/"      /* SGML closing tag */
#define A_TAG        "A"      /* anchor tag */
#define BOLD_TAG     "B"      /* bold tag */
#define BODY_TAG     "BODY"   /* body tag */
#define BREAK_TAG    "BR"     /* line break tag */
#define DDEF_TAG     "DD"     /* definition tag */
#define DTERM_TAG    "DT"     /* term tag */
#define EM_TAG       "EM"     /* emphasis tag */
#define HEAD_TAG     "HEAD"   /* header tag */
#define HRULE_TAG    "HR"     /* horizontal rule tag */
#define HREF_TAG     "HREF"   /* hypertext reference tag */
#define H1_TAG       "H1"     /* heading level 1 tag */
#define H2_TAG       "H2"     /* heading level 2 tag */
#define H3_TAG       "H3"     /* heading level 3 tag */
#define H4_TAG       "H4"     /* heading level 4 tag */
#define H5_TAG       "H5"     /* heading level 5 tag */
#define H6_TAG       "H6"     /* heading level 6 tag */
#define ITALIC_TAG   "I"      /* italic tag */
#define LIST_TAG     "LI"     /* list element tag */
#define PARA_TAG     "P"      /* paragraph break */
#define PRE_TAG      "PRE"    /* preformatted text tag */
#define STRONG_TAG   "STRONG" /* strong emphasis tag */
#define TITLE_TAG    "TITLE"  /* title tag */
#define UL_TAG       "UL"     /* unordered list tag */

/* ---- entity names (compared as written, without '&' and ';') ---- */
#define GT_ENT       "gt"     /* "greater than" entity */
#define LT_ENT       "lt"     /* "less than" entity */
#define UML_ENT      "uml"    /* "umlaut" entity */

/* ---- output strings ---- */
#define NEWLINE      "\n"     /* newline */
#define SENTENCE     ".:!?"   /* end of sentence characters */
#define SE_BULLET    "*"      /* setext bullet */
#define SE_EOF       "$$\n"   /* setext end of file marker */
#define SE_FORMAT    "[%d]"   /* setext hyperlink printf format */
#define SE_LINK      ".. "    /* setext hyperlink prefix */
/* setext horizontal rule: indent followed by a run of underscores */
#define SE_RULE      "    ____________________________________________________________"

/* Truth values are kept in plain ints (see TRUE/FALSE). */
typedef int Boolean;

/*
 * One remembered hyperlink target.  The nodes form a singly linked list
 * kept in order of first appearance; a node's position in the list is
 * its endnote number.
 */
struct href
{
	struct href	*next;		/* following node, NULL at the tail */
	char		*url;		/* the link's URL text */
};

typedef struct href Href;

/* First node of the hyperlink list; NULL while the list is empty. */
Href	*href_head = NULL;

void heading(int col, char *out, int state)
/*
 * Append a setext heading underline to <out>: <col> copies of the
 * underline character followed by a newline.  The character is CH_H1
 * when <state> has S_H1 set and CH_H2 otherwise.
 */
{
	char	*tail = out + strlen(out);	/* current end of <out> */
	int	mark;				/* underline character */

	if (TEST(state, S_H1))
		mark = CH_H1;
	else
		mark = CH_H2;

	for (; col; col--)
		*tail++ = (char) mark;

	/* terminate the underline with the line end */
	strcpy(tail, NEWLINE);
}

void wordwrap(char *in, char *out, int *state)
/*
 * Word-wrap one word from <in> and concatenate the result to <out>.
 *
 * Outside preformatted text (S_PRE clear) leading whitespace is stripped
 * and, for a non-empty word, the line is wrapped at RMARGIN, indented to
 * LMARGIN (unless S_NOINDENT) and separated from the previous word by one
 * space, or two after a word ending in one of SENTENCE.  Inside S_PRE the
 * word is appended verbatim.
 *
 * S_FLUSH ends the current line if it holds any text, S_BREAK always ends
 * it; both flags are cleared in <state> once honoured.  While S_H1 or
 * S_H2 is set, every line ended by a wrap or a flush is underlined via
 * heading().
 *
 * The column position and the end-of-sentence flag are static, so they
 * carry over between calls.  <out> is not bounds-checked.
 */
{
	static	int	col;		/* column position */
	static	int	sentence;	/* end of sentence flag */
		size_t	len;		/* length of the word */
		size_t	i;		/* index of last significant char */
		int	margin;		/* leftmost column that may hold text */

	/* S_NOINDENT is never modified in here, so this stays valid */
	margin = TEST(*state, S_NOINDENT) ? 0 : LMARGIN;

	if (!TEST(*state, S_PRE))
	{
		/* strip the whitespace (cast: <ctype.h> needs unsigned char values) */
		while (isspace((unsigned char) *in))
			in++;

		if (*in)
		{
			len = strlen(in);

			/*
			 * wrap if right margin exceeded; at column 0 a wrap
			 * cannot help and would only emit an empty line
			 */
			if ((col > 0) &&
			    (((size_t) col + len) >= (size_t) RMARGIN))
			{
				strcat(out, NEWLINE);
				if (TEST(*state, S_H1 | S_H2))
					heading(col, out, *state);
				col = 0;
			}

			/* indent */
			if (!TEST(*state, S_NOINDENT))
				while (col < LMARGIN)
				{
					strcat(out, " ");
					col++;
				}

			/* separate words with a space or two */
			if (col > margin)
			{
				if (sentence)
				{
					strcat(out, " ");
					col++;
					sentence = FALSE;
				}
				strcat(out, " ");
				col++;
			}

			/* check for end of sentence, looking past a closing quote */
			i = len - 1;
			if ((in[i] == QUOTE) && i)
				i--;
			sentence = (strchr(SENTENCE, in[i]) != NULL);
		}
	}

	/* output the word */
	strcat(out, in);
	col += (int) strlen(in);

	if (TEST(*state, S_FLUSH))
	{
		/* only end the line if something has been written on it */
		if (col > margin)
		{
			strcat(out, NEWLINE);
			if (TEST(*state, S_H1 | S_H2))
				heading(col, out, *state);
			col = 0;
		}
		RESET(*state, S_FLUSH);
	}

	if (TEST(*state, S_BREAK))
	{
		strcat(out, NEWLINE);
		col = 0;
		RESET(*state, S_BREAK);
	}
}

void capitalise(char *s)
/*
 * Convert the string <s> to upper case in place, except for the portions
 * enclosed in QUOTE characters, which are left unchanged.
 */
{
	Boolean	quote = FALSE;		/* currently inside quotes? */

	for (; *s; s++)
	{
		if (*s == QUOTE)
			quote = !quote;
		else if (!quote)
			/* cast: toupper() is undefined for negative char values */
			*s = (char) toupper((unsigned char) *s);
	}
}

void xtract_url(char *in, char *out)
/* look for an HREF and extract the URL */
{
	in++;
	while (*in)
	{
		/* skip whitespace */
		while (isspace(*in))
			in++;

		if (!strncmp(in, HREF_TAG, sizeof(HREF_TAG) - 1))
		{
			in = strchr(in, QUOTE) + 1;
			while (*in && (*in != QUOTE))
				*(out++) = *(in++);
			*out = 0;
		}

		/* skip non-whitespace */
		while (*in && !isspace(*in))
			in++;
	}
}

int store_href(char *url)
/*
 * Store <url> in the hyperlink list unless an identical string is already
 * there; returns the zero-based position of <url> in the list.
 *
 * A new node and a private copy of <url> are allocated with malloc();
 * output_hrefs() releases them.  If memory runs out a message is written
 * to stderr and the program exits with EXIT_FAILURE.
 */
{
	int	i = 0;			/* list counter */
	size_t	size;			/* bytes needed for the URL copy */
	Href	*href = href_head;	/* node being examined */
	Href	*last = NULL;		/* node before <href>, if any */

	/* check if this URL is already stored in the linked list */
	while (href && strcmp(href->url, url))
	{
		i++;
		last = href;
		href = href->next;
	}

	if (!href)
	{
		/* not found: append a new node at the tail */
		size = strlen(url) + 1;
		href = malloc(sizeof *href);
		if (href)
			href->url = malloc(size);
		if (!href || !href->url)
		{
			fputs("html2setext: out of memory\n", stderr);
			exit(EXIT_FAILURE);
		}
		memcpy(href->url, url, size);
		href->next = NULL;

		if (!last)
			href_head = href;
		else
			last->next = href;
	}

	return i;
}

char *xlate_tag(char *in, char *word, char *out, int *state)
/*
 * Translate HTML tag from  and concatenate result to 
 * modifies  as appropriate, may also concatenate to 
 * returns new position of  pointer
 */
{
		char	*wp;		/* word string pointer */
	static	char	url[MAXURL];	/* URL buffer */

	capitalise(in);
	wp = word + strlen(word);

	if (!strncmp(in, A_TAG, sizeof(A_TAG) - 1))
		xtract_url(in, url);

	else if (!strcmp(in, END_TAG A_TAG) && *url && (*url != FRAGMENT))
	{
		sprintf(wp, SE_FORMAT, store_href(url) + 1);
		wp += strlen(wp);
		*url = 0;
	}

	else if (!strcmp(in, BOLD_TAG) || !strcmp(in, END_TAG BOLD_TAG) ||
		 !strcmp(in, STRONG_TAG) || !strcmp(in, END_TAG STRONG_TAG))
	{
		*(wp++) = CH_BOLD;
		*(wp++) = CH_BOLD;
	}

	else if (!strcmp(in, BREAK_TAG))
	{
		SET(*state, S_BREAK);
		*wp = 0;
		wordwrap(word, out, state);
		wp = word;
	}

	else if (!strcmp(in, DTERM_TAG))
		SET(*state, S_NOINDENT);

	else if (!strcmp(in, DDEF_TAG))
	{
		SET(*state, S_FLUSH);
		*wp = 0;
		wordwrap(word, out, state);
		wp = word;
		RESET(*state, S_NOINDENT);
	}

	else if (!strcmp(in, EM_TAG) || !strcmp(in, END_TAG EM_TAG))
		*(wp++) = CH_ULINE;

	else if (!strcmp(in, H1_TAG) || !strcmp(in, H2_TAG) ||
	    !strcmp(in, H3_TAG) || !strcmp(in, H4_TAG) ||
	    !strcmp(in, H5_TAG) || !strcmp(in, H6_TAG))
	{
		SET(*state, S_BREAK | S_FLUSH | S_NOINDENT);
		*wp = 0;
		wordwrap(word, out, state);
		wp = word;
		if (in[1] == '1')
			SET(*state, S_H1);
		else if (in[1] == '2')
			SET(*state, S_H2);
	}

	else if (!strcmp(in, END_TAG H1_TAG) || !strcmp(in, END_TAG H2_TAG) ||
		 !strcmp(in, END_TAG H3_TAG) || !strcmp(in, END_TAG H4_TAG) ||
		 !strcmp(in, END_TAG H5_TAG) || !strcmp(in, END_TAG H6_TAG))
	{
		SET(*state, S_BREAK | S_FLUSH);
		*wp = 0;
		wordwrap(word, out, state);
		wp = word;
		RESET(*state, S_H1 | S_H2 | S_NOINDENT);
	}

	else if (!strcmp(in, HEAD_TAG) || !strcmp(in, TITLE_TAG))
		SET(*state, S_HEAD);

	else if (!strcmp(in, END_TAG HEAD_TAG) ||
		 !strcmp(in, END_TAG TITLE_TAG) || !strcmp(in, BODY_TAG))
	{
		RESET(*state, S_HEAD);
		wp = word;
	}

	else if (!strcmp(in, HRULE_TAG))
	{
		SET(*state, S_FLUSH);
		*wp = 0;
		wordwrap(word, out, state);
		wp = word;
		/* Note: this could overflow the out buffer! */
		strcat(out, SE_RULE NEWLINE NEWLINE);
	}

	else if (!strcmp(in, ITALIC_TAG) || !strcmp(in, END_TAG ITALIC_TAG))
		*(wp++) = CH_ITALIC;

	else if (!strcmp(in, LIST_TAG))
	{
		SET(*state, S_FLUSH | S_NOINDENT);
		*wp = 0;
		wordwrap(word, out, state);
		wp = word;
		wordwrap(SE_BULLET, out, state);
		RESET(*state, S_NOINDENT);
	}

	else if (!strcmp(in, PARA_TAG) ||
		 !strcmp(in, UL_TAG) || !strcmp(in, END_TAG UL_TAG))
	{
		SET(*state, S_BREAK | S_FLUSH);
		*wp = 0;
		wordwrap(word, out, state);
		wp = word;
	}

	else if (!strcmp(in, PRE_TAG))
		SET(*state, S_PRE);

	else if (!strcmp(in, END_TAG PRE_TAG))
		RESET(*state, S_PRE);

	return wp;
}

char *xlate_entity(char *in, char *out)
/*
 * Translate SGML entity from  and concatenate result to 
 * returns new position of  pointer
 */
{
	if (!strcmp(in, GT_ENT))
		*(out++) = CH_GT;
	else if (!strcmp(in, LT_ENT))
		*(out++) = CH_LT;
	else if (*in == NUM_ENTITY)
		*(out++) = (char) atoi(in + 1);
	else if (!strcmp(in + 1, UML_ENT))
	{
		/* handle all umlauts by suffixing character with an 'e' */
		*(out++) = *in;
		*(out++) = CH_UML;
	}

	return out;
}

static char *filter_put(char *dst, const char *limit, const char *text)
/* copy chars of <text> to <dst> while <dst> is below <limit>, dropping the rest; returns the new end of the destination */
{
	while (*text && (dst < limit))
		*(dst++) = *(text++);
	return dst;
}

void filter(char *in, char *out)
/*
 * Filter from <in> to <out>, processing SGML entities and HTML tags.
 *
 * Characters are collected into a word, a tag or an entity name.  A
 * completed entity is translated by xlate_entity() into the tag or word
 * being collected, a completed tag is handed to xlate_tag(), and a word
 * is passed to wordwrap() when whitespace arrives or the word buffer is
 * full (words are discarded while S_HEAD is set).  The whitespace char
 * itself becomes the first char of the next word.
 *
 * An '&' that turns out not to start an entity -- whitespace, '&', '<'
 * or '>' arrives, or the name outgrows its buffer, before ';' -- is put
 * back as literal text.  Text that does not fit into the tag or word
 * buffer is dropped rather than written past the end.
 *
 * All collection state is static, so input may be fed in arbitrary
 * pieces over several calls.
 *
 * Note: Words are terminated by whitespace; the input stream must end
 *       with a whitespace character or the last word may not be output.
 */
{
	static	int	state;		/* HTML state flags */
	static	char	entity[MAXENT];	/* entity string */
	static	char	*ep;		/* entity string pointer */
	static	char	tag[MAXTAG];	/* tag string */
	static	char	*tp;		/* tag string pointer */
	static	char	word[MAXWORD];	/* word string */
	static	char	*wp = word;	/* word string pointer */
	static	const char amp[] = { OPEN_ENTITY, 0 };	/* literal '&' */
		char	xl[4];		/* translated entity (at most 2 chars) */
		char	*xe;		/* end of translated entity */
		int	ch;		/* current char as unsigned char value */

	while (*in)
	{
		ch = (unsigned char) *in;

		if (ep && (ch != CLOSE_ENTITY) &&
		    (isspace(ch) || (ch == OPEN_ENTITY) ||
		     (ch == OPEN_TAG) || (ch == CLOSE_TAG) ||
		     (ep >= (entity + MAXENT - 1))))
		{
			/* not an entity after all: restore it as literal text */
			*ep = 0;
			ep = NULL;
			if (tp)
			{
				tp = filter_put(tp, tag + MAXTAG - 1, amp);
				tp = filter_put(tp, tag + MAXTAG - 1, entity);
			}
			else
			{
				wp = filter_put(wp, word + MAXWORD - 1, amp);
				wp = filter_put(wp, word + MAXWORD - 1, entity);
			}
			/* look at the same char again, now outside the entity */
			continue;
		}

		if (ep && (ch == CLOSE_ENTITY))
		{
			*ep = 0;
			ep = NULL;
			/* translate into a scratch buffer, then copy with bounds */
			xe = xlate_entity(entity, xl);
			*xe = 0;
			if (tp)
				tp = filter_put(tp, tag + MAXTAG - 1, xl);
			else
				wp = filter_put(wp, word + MAXWORD - 1, xl);
		}
		else if (tp && (ch == CLOSE_TAG))
		{
			*wp = *tp = 0;
			wp = xlate_tag(tag, word, out, &state);
			tp = NULL;
		}
		else if (ch == OPEN_ENTITY)
		{
			ep = entity;
		}
		else if (ch == OPEN_TAG)
		{
			tp = tag;
		}
		else if (ep)
		{
			/* room is guaranteed by the check at the top of the loop */
			*(ep++) = *in;
		}
		else if (tp)
		{
			/* keep one position free for the terminator */
			if (tp < (tag + MAXTAG - 1))
				*(tp++) = *in;
		}
		else if (!isspace(ch) && (wp < (word + MAXWORD - 1)))
		{
			*(wp++) = *in;
		}
		else
		{
			/* whitespace, or the word buffer is full: emit the word */
			if (!TEST(state, S_HEAD))
			{
				*wp = 0;
				wordwrap(word, out, &state);
			}
			wp = word;
			*(wp++) = *in;
		}

		in++;
	}
}

void output_hrefs(void)
/*
 * Write every stored hyperlink to stdout as a numbered setext link line
 * (SE_LINK, "[n]", the URL), preceded by one empty line if there are any
 * links.  Each node and its URL are freed as they are written, which
 * leaves the list empty.
 */
{
	Href	*node;			/* node being written */
	int	number = 1;		/* endnote number, starting at 1 */

	if (href_head)
		fputs(NEWLINE, stdout);

	while ((node = href_head) != NULL)
	{
		printf("%s" SE_FORMAT " %s" NEWLINE,
		       SE_LINK, number, node->url);
		number++;

		/* unlink before releasing the node */
		href_head = node->next;
		free(node->url);
		free(node);
	}
}

int main(void)
/* read blocks from stdin, process and write to stdout */
{
	char	in[BUFSIZE];		/* input buffer */
	char	out[BUFSIZE];		/* output buffer */

	while (fgets(in, BUFSIZE, stdin))
	{
		*out = 0;
		filter(in, out);
		fputs(out, stdout);
	}
	fputs(NEWLINE, stdout);
	output_hrefs();
	fputs(SE_EOF, stdout);
	return 0;
}

/* end of html2setext.c */