%{
/*
* html.l - HTML lexer
* Created 14/12/94 by avatar@sericyb.com.au (Andrew Pam)
* Copyright (c) 1994-1996 Serious Cybernetics
*
* Date Who Description
* -------- --- -----------
* 15/12/94 ADP Added SGML entities and HTML tags
* 16/12/94 ADP Added and
* 17/01/95 ADP Added NEWLINE, SENTENCE and SPACE
* 18/02/95 ADP Added , & and SGML comments
* 24/02/95 ADP Added
* 07/02/96 ADP Added
* 28/04/96 ADP Added , allowed parameters on tag,
* supported ALT parameter of <IMG> tag
*/
/* fprintf() and stderr, used by yyerror() */
#include <stdio.h>
/* strdup() and strchr(), used by the tag parameter actions */
#include <string.h>
/* Expected to supply the token codes and yylval used by the rules */
#include "y.tab.h"

/*
 * yyerror() is called from rule actions before its definition at the end
 * of this file, so its prototype must be visible to every compiler.
 */
void yyerror(const char *msg);
#ifdef __GNUC__
int yyparse (void);
#endif

#define FALSE (0)
#define TRUE (!FALSE)
/* Value of param while the value of an ALT= parameter is awaited */
#define ALT (1)

/* Scratch pointer used by the quoted parameter action */
char *p;
/* Current input line, counted from 1 and bumped on every newline */
int lineno = 1;
/*
 * param: non-zero while the next parameter value should be captured.
 * tag:   token returned when the closing > of the current tag is reached.
 */
int param, tag;
%}
/*
 * Exclusive start conditions.  The visible actions switch to TAG when a
 * tag that may carry parameters is opened, to TAGQ on a double quote
 * inside such a tag, back to TAG on the closing quote, and back to
 * INITIAL on a closing angle bracket.
 * NOTE(review): none of the rules in this copy carries a start condition
 * prefix and no action begins COMMENT - presumably both were lost when
 * the file was extracted; confirm against a pristine copy.
 */
%x COMMENT TAG TAGQ
/* Heading levels: a single digit from 1 to 6 */
hl [1-6]
/*
 * Whitespace: space, tab or newline.
 * NOTE(review): neither hl nor ws is referenced by any visible rule.
 */
ws [ \t\n]
%%
/*
 * SGML entities.
 *  - Numeric entity: returns SGML_NUMERIC with yylval.s pointing two
 *    characters into yytext.
 *  - The next three rules return SGML_AMP, SGML_GT and SGML_LT with no
 *    value.
 *  - A vowel followed by uml; returns SGML_UML with the vowel (yytext[1])
 *    in yylval.c.
 *  - Any other entity returns SGML_ENTITY with yylval.s pointing just past
 *    the ampersand.  The pointer is into yytext (no copy is made) and the
 *    text still ends with the semicolon.
 *  - A lone ampersand is reported through yyerror and returned as TEXT.
 * NOTE(review): the first four patterns look as if they were entity-decoded
 * in this copy.  The numeric rule skips two characters that its visible
 * pattern does not contain, and the bare ampersand pattern appears twice,
 * so the last rule can never match as written.  Confirm against a pristine
 * copy.
 */
[0-9]+; { yylval.s = yytext+2; return SGML_NUMERIC; }
& { return SGML_AMP; }
> { return SGML_GT; }
< { return SGML_LT; }
&[aeiou]uml; { yylval.c = yytext[1]; return SGML_UML; }
&[^; \t\n]+; { yylval.s = yytext+1; return SGML_ENTITY; }
& { yylval.c = '&'; yyerror("Invalid ampersand"); return TEXT; }
/*
 * SGML comment.
 * Newlines only advance lineno, every other character except a closing
 * angle bracket is discarded (empty action), and the closing angle bracket
 * returns the scanner to INITIAL.
 * NOTE(review): the rule that opens a comment and the start condition
 * prefixes are not visible; the stray double quote at the start of the
 * first rule is presumably what is left of them.  Confirm against a
 * pristine copy.
 */
"\n { lineno++; }
[^>\n]
\> { BEGIN INITIAL; }
/*
 * Simple HTML tags.
 * Each rule maps one literal tag, or several alternatives joined with |,
 * to a single token and sets no value in yylval.
 * NOTE(review): almost every pattern in this copy is an empty or split
 * string literal, so the tag text appears to have been stripped, and the
 * END_ULIST rule has no visible rule for the matching opening tag.  The
 * patterns must be restored from a pristine copy before this section can
 * be built.
 */
"
" { return BREAK; }
"" { return DDEF; }
"" { return DTERM; }
"
" { return HRULE; }
"" { return LIST; }
"" { return ADDRESS; }
"" { return END_ADDRESS; }
"" |
"" { return BOLD; }
"" |
"" { return END_BOLD; }
"" { return BQUOTE; }
"
" { return END_BQUOTE; }
"" { return CENTER; }
"" { return END_CENTER; }
"" { return CITE; }
"" { return END_CITE; }
"" |
"" |
"" { return PRE; }
"" |
"</XMP>" | /* NOTE: replace "<" with a "<" character */
"" { return END_PRE; }
"" { return DLIST; }
"
" { return END_DLIST; }
"" |
"" { return EMPH; }
"" |
"" { return END_EMPH; }
"" { return HEAD; }
"" { return END_HEAD; }
"" { return HTML; }
"" { return END_HTML; }
"" { return ITAL; }
"" { return END_ITAL; }
"" { return OLIST; }
"
" { return END_OLIST; }
"" { return PARA; }
"
" { return END_PARA; }
"" { return TITLE; }
"" { return END_TITLE; }
"" { return END_ULIST; }
/*
 * Heading tags.
 * The character passed in yylval.c is taken from yytext[2] for HEADING
 * and from yytext[3] for END_HEADING.
 * NOTE(review): both patterns are reduced to a lone double quote in this
 * copy; presumably they were built on the hl definition so that the
 * indexed character is the heading level digit.  Confirm against a
 * pristine copy.
 */
" { yylval.c = yytext[2]; return HEADING; }
" { yylval.c = yytext[3]; return END_HEADING; }
/*
 * HTML tags with parameters.
 *  - The first two rules return END_BODY and END_ANCHOR with no value.
 *  - The tag opening rule enters TAG, records HTML_TAG in tag, clears
 *    param and stores a strdup copy of the matched text after its first
 *    character in yylval.s.
 *  - ALT= sets param; any other = clears it, so only the value that
 *    follows ALT= is captured.
 *  - While param is set, the next unquoted word or quoted string replaces
 *    yylval.s with a strdup copy of that text (an empty quoted string
 *    yields an empty copy) and param is cleared again.
 *  - Blanks are skipped, newlines only advance lineno, and a double quote
 *    switches between TAG and TAGQ.
 *  - The first closing angle bracket rule returns to INITIAL and returns
 *    the token saved in tag; the second reports an unmatched bracket
 *    through yyerror and returns it as TEXT.
 * NOTE(review): the start condition prefixes and part of the tag opening
 * pattern are missing from this copy, so which rule belongs to which state
 * is inferred from the BEGIN calls - confirm against a pristine copy.
 * NOTE(review): every strdup result overwrites yylval.s without releasing
 * the previous copy and is not checked for a null return; verify who owns
 * and frees these strings.
 * NOTE(review): the quoted string pattern excludes the double quote, so
 * the strchr search for one looks as if it can never succeed.
 */
"" { return END_BODY; }
"" { return END_ANCHOR; }
" \t\n]* { BEGIN TAG; tag=HTML_TAG; param = FALSE;
yylval.s = strdup(yytext+1); }
"ALT=" { param = ALT; }
= { param = FALSE; }
[^=> \t\n\"]+ { if (param) { yylval.s = strdup(yytext);
param = FALSE; }
}
[ \t]+
\n { lineno++; }
\" { BEGIN TAGQ; }
[^\n\"]+ { if (param)
{ yylval.s = strdup(yytext);
p = strchr(yylval.s, '"'); if (p) *p = 0;
param = FALSE;
}
}
\n { lineno++; }
\" { BEGIN TAG; if (param)
{ yylval.s = strdup(""); param = FALSE; }
}
\> { BEGIN INITIAL; return tag; }
\> { yylval.c = '>'; yyerror("Unmatched '>'"); return TEXT; }
/*
 * Any other text.
 *  - A newline advances lineno and is returned as NEWLINE.
 *  - A single space or tab is returned as SPACE.
 *  - A full stop, colon, exclamation mark or question mark is returned as
 *    SENTENCE with the character in yylval.c.
 *  - Any remaining single character is returned as TEXT with the
 *    character in yylval.c.  This catch-all must stay last so that the
 *    more specific single character rules above take priority.
 */
\n { lineno++; return NEWLINE; }
[ \t] { return SPACE; }
[\.:!?] { yylval.c = *yytext; return SENTENCE; }
. { yylval.c = *yytext; return TEXT; }
%%
/*
 * yywrap - report that there is no further input.
 *
 * Always returns 1, so scanning terminates at the end of the first
 * input file.
 */
int yywrap(void)
{
    enum { NO_MORE_INPUT = 1 };

    return NO_MORE_INPUT;
}
/*
 * yyerror - write a diagnostic to stderr.
 *
 * msg: description of the problem.
 *
 * The message is prefixed with the current value of lineno and followed
 * by the text currently held in yytext.
 */
void yyerror(const char *msg)
{
    const int current_line = lineno;
    const char *const token_text = yytext;

    fprintf(stderr, "Line %d: %s at '%s'\n", current_line, msg, token_text);
}