%{
/*
 * html2setext.y - HTML parser and setext converter
 * Created 15/12/94 by avatar@sericyb.com.au (Andrew Pam)
 * Copyright (c) 1994-1997 Serious Cybernetics
 *
 * Date     Who Description
 * -------- --- -----------
 * 16/12/94 ADP Added <?> and <?>
 * 17/01/95 ADP Added word wrap
 * 18/01/95 ADP Added setext headings
 * 18/02/95 ADP Fixed line spacing and indentation
 * 24/02/95 ADP Added nested unordered lists
 * 02/03/95 ADP Added nested ordered lists
 * 28/04/96 ADP Added <?> including ALT parameter
 * 01/05/96 ADP Added SGML quote entity and <?> support
 * 05/05/96 ADP Added <?>, <?> and <?>
 * 25/07/97 ADP Added <?> support
 *
 * TODO: <?> (hyperlink) and <?> support
 *
 * NOTE(review): "<?>" marks HTML tag names that were missing from the copy
 * of this history that was reviewed; restore them from the upstream file.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef __GNUC__
void yyerror(const char *msg);
int yylex(void);
#endif

#define FALSE   (0)
#define TRUE    (!FALSE)

#define MARGIN  (68)    /* Column after which a line is wrapped */
#define MAXLINE (128)   /* Size of the output line buffer, including NUL */

/*
 * NOTE(review): the reviewed copy of this file had runs of whitespace
 * collapsed; confirm the spaces inside S_HRULE and S_INDENT against the
 * upstream file before relying on the exact output layout.
 */
#define S_HRULE  " ____________________________________________________________\n"
#define S_BOLD   "**"
#define S_EOF    "$$"
#define S_INDENT " "
#define S_LIST   "* "
#define S_QUOTE  "> "

#define CH_EMPH '_'
#define CH_ITAL '~'
#define CH_H1   '='
#define CH_H2   '-'

typedef int BOOL;

typedef struct listn {
    struct listn *next; /* Pointer to next list node */
    int num;            /* Ordered list numbering, 0 for unordered */
} LISTN;

BOOL body = FALSE;      /* TRUE inside the document body */
BOOL center = FALSE;    /* TRUE between CENTER and END_CENTER */
BOOL flush = FALSE;     /* TRUE if flush left */
BOOL pre = FALSE;       /* TRUE between PRE and END_PRE */
BOOL quote = FALSE;     /* TRUE between BQUOTE and END_BQUOTE */
BOOL sentence = FALSE;  /* TRUE at end of sentence */
int blanks = 0;         /* Number of blank lines skipped */
int depth = 0;          /* Indentation depth */
char heading = (char) 0;    /* Current heading level */
char line[MAXLINE] = "";    /* Output line buffer */
LISTN *listp = NULL;    /* Head of list numbering list */
int yydebug = 1;        /* NOTE(review): presumably the parser trace flag - confirm */

/*
 * Number of leading characters of `line' that are a quote marker, indent or
 * list bullet rather than text.  The word wrapper never breaks inside it.
 */
static size_t prefixlen = 0;

/* Print `n' copies of `ch' followed by a newline (setext heading underline) */
static void underline(char ch, int n)
{
    while (n-- > 0)
        putchar(ch);
    putchar('\n');
}

/*
 * Trim trailing spaces and output the current line.
 *
 * Centres the line when `center' is set (except for level 1 and 2 headings),
 * updates the blank line counter, empties the buffer, clears `sentence' and,
 * for level 1 and 2 headings, prints an underline as long as the text.
 */
void output(void)
{
    int len = (int) strlen(line);
    int pad;

    /* Truncate after the last non-space character */
    while (len > 0 && line[len - 1] == ' ')
        len--;
    line[len] = (char) 0;

    if (center && len && (heading != '1') && (heading != '2'))
        for (pad = (MARGIN - len) / 2; pad > 0; --pad)
            putchar(' ');
    puts(line);

    if (*line)
        blanks = 0;
    else
        blanks++;

    *line = (char) 0;
    prefixlen = 0;
    sentence = FALSE;

    if (heading == '1')
        underline(CH_H1, len);
    else if (heading == '2')
        underline(CH_H2, len);
}

/* Output the current line only if it is not empty */
static void coutput(void)
{
    if (*line)
        output();
}

/* Flush the current line, then emit blank lines until `count' were skipped */
static void skip(int count)
{
    coutput();
    while (blanks < count)
        output();
}

/*
 * Append `s' to the line buffer without ever writing past its end.
 * When the buffer fills up the line is output as it stands (a forced break)
 * and the remainder of `s' continues on a fresh line.
 */
static void append(const char *s)
{
    size_t used = strlen(line);

    while (*s) {
        size_t room = (size_t) (MAXLINE - 1) - used;
        size_t n = strlen(s);

        if (n <= room) {
            memcpy(line + used, s, n + 1);
            return;
        }
        memcpy(line + used, s, room);
        line[MAXLINE - 1] = (char) 0;
        s += room;
        output();
        used = 0;
    }
}

/*
 * Append one S_INDENT for every nesting level beyond the first.
 * Does nothing when depth is 1 or less.
 */
void indent(void)
{
    int i;

    for (i = 1; i < depth; i++)
        append(S_INDENT);
}

/*
 * Write the prefix of a fresh line: the quote marker inside a block quote,
 * otherwise the indentation unless flush left.  Records the prefix length.
 */
static void startline(void)
{
    if (quote)
        append(S_QUOTE);
    else if (!flush) {
        append(S_INDENT);
        if (depth)
            indent();
    }
    prefixlen = strlen(line);
}

/*
 * Append the string `s' to the current line, word wrapping at MARGIN.
 *
 * Outside PRE, once the line exceeds MARGIN it is broken at the last space
 * that follows the line prefix; the text after that space is carried over to
 * a new, freshly prefixed line.  If there is no such space the overlong line
 * is output unbroken.
 */
void wraps(char *s)
{
    char tail[MAXLINE];
    char *brk;

    /* Punctuation inside a word is not the end of a sentence */
    if (*s != '"')
        sentence = FALSE;

    if (!*line)
        startline();

    append(s);

    if (pre || (strlen(line) <= MARGIN))
        return;

    /* A space inside the prefix is not a usable break point */
    brk = strrchr(line, ' ');
    if ((brk == NULL) || ((size_t) (brk - line) <= prefixlen)) {
        output();
        return;
    }

    /* Save the tail first: output() and startline() reuse the buffer */
    strcpy(tail, brk + 1);
    *brk = (char) 0;
    output();
    startline();
    append(tail);
}

/* Append the single character `c' to the current line with word wrap */
void wrap(char c)
{
    char s[2];

    s[0] = c;
    s[1] = (char) 0;
    wraps(s);
}

/*
 * Handle inter-word white space: literal inside PRE, two spaces after the
 * end of a sentence, otherwise a single space unless the line is empty or
 * already ends in one.  A line that has reached the margin is output instead.
 */
void space(void)
{
    size_t len = strlen(line);

    if (pre)
        /* Spaces are literal inside PRE */
        append(" ");
    else if (sentence) {
        /* Two spaces at the end of a sentence */
        if ((len + 1) >= MARGIN)
            output();
        else {
            append("  ");
            sentence = FALSE;
        }
    }
    /* Otherwise check for existing characters */
    else if (len && (line[len - 1] != ' ')) {
        if (len >= MARGIN)
            output();
        else
            append(" ");
    }
}

/*
 * Push a new list level; `num' is the first item number of an ordered list
 * or 0 for an unordered list.  Exits the program if memory runs out.
 */
void newlist(int num)
{
    LISTN *newp = malloc(sizeof *newp);

    if (newp == NULL) {
        fprintf(stderr, "Out of memory.\n");
        exit(EXIT_FAILURE);
    }
    newp->next = listp;
    newp->num = num;
    listp = newp;
}

/* Pop the innermost list level, if there is one */
void freelist(void)
{
    LISTN *oldp = listp;

    if (oldp) {
        listp = oldp->next;
        free(oldp);
    }
}
%}

/* Token values: a single character, or a string supplied by the lexer */
%union {
    char c;
    char *s;
}

%token NEWLINE SPACE SGML_AMP SGML_GT SGML_LT SGML_QUOT
%token ADDRESS END_ADDRESS END_ANCHOR BASE BODY END_BODY BOLD END_BOLD
%token BQUOTE END_BQUOTE BREAK CENTER END_CENTER CITE END_CITE
%token DDEF DLIST END_DLIST DTERM EMPH END_EMPH FONT END_FONT
%token HEAD END_HEAD HRULE HTML END_HTML ITAL END_ITAL LINK LIST
%token PARA END_PARA PRE END_PRE OLIST END_OLIST TITLE END_TITLE
%token ULIST END_ULIST
%token <c> SENTENCE TEXT SGML_UML HEADING END_HEADING
%token <s> SGML_NUMERIC SGML_ENTITY ANCHOR HTML_TAG IMAGE

%%

file:     { body = TRUE; } data
        | title { body = TRUE; } data
        | head data body data
        | HTML data head data body data END_HTML data
        ;

head:     HEAD heads END_HEAD
        ;

heads:    /* empty */
        | heads text
        | heads BASE
        | heads title
        | heads LINK
        ;

title:    TITLE data END_TITLE
        ;

body:     BODY { body = TRUE; flush = FALSE; }
          data
          END_BODY { body = FALSE; flush = TRUE; }
        ;

data:     /* empty */
        | data text
        ;

text:     NEWLINE {
              /* A newline ends the line inside PRE, else it is white space */
              if (body) {
                  if (pre)
                      output();
                  else
                      space();
              }
          }
        | SPACE { if (body) space(); }
        | SENTENCE { if (body) wrap($1); sentence = TRUE; }
        | TEXT { if (body) wrap($1); }
        | sgml
        | html
        ;

        /* Entities are text, so like TEXT they are only emitted in the body */
sgml:     SGML_AMP { if (body) wrap('&'); }
        | SGML_GT { if (body) wrap('>'); }
        | SGML_LT { if (body) wrap('<'); }
        | SGML_QUOT { if (body) wrap('"'); }
          /*
           * NOTE(review): IMAGE and HTML_TAG free their string value;
           * confirm whether the lexer also allocates the SGML_NUMERIC,
           * SGML_ENTITY and ANCHOR values, which are not freed here.
           */
        | SGML_NUMERIC { if (body) wrap(atoi($1)); }
          /* Emits the token's character followed by 'e' */
        | SGML_UML { if (body) { wrap($1); wrap('e'); } }
        | SGML_ENTITY {
              if (body) {
                  wrap('&');
                  wraps($1);
              }
              yyerror("Unknown SGML entity");
          }
        ;

html:     anchor
        | pre
        | ADDRESS { coutput(); } data END_ADDRESS
        | BQUOTE { coutput(); quote = TRUE; }
          data
          END_BQUOTE { coutput(); quote = FALSE; }
        | CENTER { coutput(); center = TRUE; }
          data
          END_CENTER { coutput(); center = FALSE; }
        | CITE data END_CITE
        | BREAK { output(); }
        | PARA { skip(1); }
        | END_PARA
        | HRULE { coutput(); puts(S_HRULE); }
        | HEADING { flush = TRUE; skip(2); heading = $1; }
        | END_HEADING {
              /* Flush while `heading' is still set so it gets underlined */
              coutput();
              if ($1 != heading)
                  yyerror("Heading level mismatch");
              heading = (char) 0;
              output();
              flush = FALSE;
          }
        | DLIST { coutput(); } dlist END_DLIST { skip(1); }
        | OLIST { coutput(); newlist(1); depth++; }
          list
          END_OLIST { depth--; freelist(); skip(1); }
        | ULIST { coutput(); newlist(0); depth++; }
          list
          END_ULIST { depth--; freelist(); skip(1); }
        | BOLD { wraps(S_BOLD); } data END_BOLD { wraps(S_BOLD); }
        | EMPH { wrap(CH_EMPH); } data END_EMPH { wrap(CH_EMPH); }
        | ITAL { wrap(CH_ITAL); } data END_ITAL { wrap(CH_ITAL); }
        | FONT data END_FONT
        | IMAGE {
              /* Use the token's string if there is one, else a placeholder */
              space();
              if ($1) {
                  wraps($1);
                  free($1);
              } else
                  wraps("[IMAGE]");
              space();
          }
        | HTML_TAG {
              const char *tag = $1 ? $1 : "";
              /* sizeof counts the NUL, so the buffer fits the message exactly */
              char *errmsg = malloc(strlen(tag) + sizeof "Unknown HTML tag <>");

              if (errmsg) {
                  sprintf(errmsg, "Unknown HTML tag <%s>", tag);
                  yyerror(errmsg);
                  free(errmsg);
              } else
                  yyerror("Unknown HTML tag");
              free($1);
          }
        ;

anchor:   ANCHOR data END_ANCHOR { /* yyerror($1); */ }
        ;

dbody:    text
        | DTERM { coutput(); flush = TRUE; }
        | DDEF { flush = FALSE; coutput(); }
        ;

dlist:    /* empty */
        | dlist dbody
        ;

lbody:    text
        | LIST {
              /* Start a new item with its number or bullet as line prefix */
              coutput();
              if (listp && listp->num)
                  sprintf(line, "%d ", listp->num++);
              else
                  strcpy(line, S_LIST);
              if (depth)
                  indent();
              prefixlen = strlen(line);
          }
        ;

list:     /* empty */
        | list lbody
        ;

pre:      PRE { pre = flush = TRUE; }
          data
          END_PRE { pre = flush = FALSE; }
        ;

%%

/*
 * Parse the input and write the converted text to standard output.
 * Returns 0 on success, EXIT_FAILURE if the parse failed.
 */
int main(void)
{
    if (yyparse()) {
        fprintf(stderr, "Parse failed.\n");
        return EXIT_FAILURE;
    }
    coutput();
    puts(S_EOF);
    return 0;
}