%{
/*
 * html.l - HTML lexer
 * Created 14/12/94 by avatar@sericyb.com.au (Andrew Pam)
 * Copyright (c) 1994-1996 Serious Cybernetics
 *
 * Date     Who Description
 * -------- --- -----------
 * 15/12/94 ADP Added SGML entities and HTML tags
 * 16/12/94 ADP Added and
 * 17/01/95 ADP Added NEWLINE, SENTENCE and SPACE
 * 18/02/95 ADP Added , & and SGML comments
 * 24/02/95 ADP Added
 * 07/02/96 ADP Added
 * 28/04/96 ADP Added <CENTER>, allowed parameters on <BODY> tag,
 *              supported ALT parameter of <IMG> tag
 *
 * NOTE(review): several log entries above appear to have lost the tag
 * names they referred to (probably stripped as markup at some point);
 * restore them from version history if it is available.
 *
 * Overview of the scanner:
 *   - INITIAL state recognises SGML entities, fixed tags, and text.
 *   - COMMENT state swallows everything up to the next '>'.
 *   - TAG state scans the inside of a tag that may carry parameters and
 *     returns the token stored in `tag` when the closing '>' is reached.
 *   - TAGQ state scans a double-quoted parameter value inside a tag.
 */

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include "y.tab.h"

#ifdef __GNUC__
void yyerror(const char *msg);
int yyparse (void);
#endif

#define FALSE (0)
#define TRUE (!FALSE)

/* Value of `param` meaning "the next parameter value belongs to ALT=" */
#define ALT (1)

char *p;            /* scratch pointer used by the TAGQ value rule     */
int lineno = 1;     /* current input line, reported by yyerror()       */
int param, tag;     /* param: pending parameter kind (FALSE or ALT)    */
                    /* tag:   token to return at the closing '>'       */
%}

%x COMMENT TAG TAGQ

    /* Heading levels */
hl  [1-6]

    /* Whitespace */
ws  [ \t\n]

%%

    /* SGML entities.  For the string-valued tokens yylval.s points   */
    /* into yytext (past the leading "&" or "&#"), so it is only      */
    /* valid until the next call to yylex().                          */

&#[0-9]+;           { yylval.s = yytext+2; return SGML_NUMERIC; }
&amp;               { return SGML_AMP; }
&gt;                { return SGML_GT; }
&lt;                { return SGML_LT; }
&[aeiou]uml;        { yylval.c = yytext[1]; return SGML_UML; }
&[^; \t\n]+;        { yylval.s = yytext+1; return SGML_ENTITY; }
&                   {
                        /* A bare '&' is reported but still passed on as text. */
                        yylval.c = '&';
                        yyerror("Invalid ampersand");
                        return TEXT;
                    }

    /* SGML comment: skipped entirely; ends at the first '>' seen.    */

"<!--"              { BEGIN COMMENT; }
<COMMENT>\n         { lineno++; }
<COMMENT>[^>\n]     ;
<COMMENT>\>         { BEGIN INITIAL; }

    /* Simple HTML tags */

"<BR>"              { return BREAK; }
"<DD>"              { return DDEF; }
"<DT>"              { return DTERM; }
"<HR>"              { return HRULE; }
"<LI>"              { return LIST; }

"<ADDRESS>"         { return ADDRESS; }
"</ADDRESS>"        { return END_ADDRESS; }
"<B>"               |
"<STRONG>"          { return BOLD; }
"</B>"              |
"</STRONG>"         { return END_BOLD; }
"<BLOCKQUOTE>"      { return BQUOTE; }
"</BLOCKQUOTE>"     { return END_BQUOTE; }
"<CENTER>"          { return CENTER; }
"</CENTER>"         { return END_CENTER; }
"<CITE>"            { return CITE; }
"</CITE>"           { return END_CITE; }
"<CODE>"            |
"<XMP>"             |
"<PRE>"             { return PRE; }
"</CODE>"           |
"</XMP>"            |
"</PRE>"            { return END_PRE; }
"<DL>"              { return DLIST; }
"</DL>"             { return END_DLIST; }
"<EM>"              |
"<U>"               { return EMPH; }
"</EM>"             |
"</U>"              { return END_EMPH; }
"<HEAD>"            { return HEAD; }
"</HEAD>"           { return END_HEAD; }
"<HTML>"            { return HTML; }
"</HTML>"           { return END_HTML; }
"<I>"               { return ITAL; }
"</I>"              { return END_ITAL; }
"<OL>"              { return OLIST; }
"</OL>"             { return END_OLIST; }
"<P>"               { return PARA; }
"</P>"              { return END_PARA; }
"<TITLE>"           { return TITLE; }
"</TITLE>"          { return END_TITLE; }
"<UL>"              { return ULIST; }
"</UL>"             { return END_ULIST; }

    /* Heading tags: yylval.c carries the level digit '1'..'6'.       */

"<H"{hl}>           { yylval.c = yytext[2]; return HEADING; }
"</H"{hl}>          { yylval.c = yytext[3]; return END_HEADING; }

    /* HTML tags with parameters.  Each opener records the token to   */
    /* return in `tag` and switches to the TAG state.  Openers that   */
    /* end in {ws} push that whitespace character back with yyless()  */
    /* so the TAG-state rules see it (and a newline gets counted).    */

"<BODY"             {
                        /* No trailing {ws} in this pattern, so there is
                           nothing to push back before entering TAG. */
                        BEGIN TAG; tag=BODY; param = FALSE;
                    }
"</BODY>"           { return END_BODY; }
"<A"{ws}            { BEGIN TAG; tag=ANCHOR; param = FALSE; yyless(yyleng-1); }
"</A>"              { return END_ANCHOR; }
"<DL"{ws}           { BEGIN TAG; tag=DLIST; param = FALSE; yyless(yyleng-1); }
"<HR"{ws}           { BEGIN TAG; tag=HRULE; param = FALSE; yyless(yyleng-1); }
"<IMG"{ws}          {
                        /* yylval.s stays NULL unless an ALT= value is seen. */
                        BEGIN TAG; tag=IMAGE; param = FALSE;
                        yylval.s = NULL;
                        yyless(yyleng-1);
                    }
"<LINK"{ws}         { BEGIN TAG; tag=LINK; param = FALSE; yyless(yyleng-1); }
"<OL"{ws}           { BEGIN TAG; tag=OLIST; param = FALSE; yyless(yyleng-1); }
"<UL"{ws}           { BEGIN TAG; tag=ULIST; param = FALSE; yyless(yyleng-1); }
\<[^> \t\n]*        {
                        /* Any other tag: yylval.s is a strdup()ed copy of
                           the text following '<'. */
                        BEGIN TAG; tag=HTML_TAG; param = FALSE;
                        yylval.s = strdup(yytext+1);
                    }

<TAG>"ALT="         { param = ALT; }
<TAG>=              { param = FALSE; }
<TAG>[^=> \t\n\"]+  {
                        /* Unquoted word: kept only when it is the value
                           of a pending ALT= parameter. */
                        if (param) {
                            yylval.s = strdup(yytext);
                            param = FALSE;
                        }
                    }
<TAG>[ \t]+         ;
<TAG>\n             { lineno++; }
<TAG>\"             { BEGIN TAGQ; }
<TAGQ>[^\n\"]+      {
                        /* Quoted text up to the next newline or quote:
                           kept only for a pending ALT= parameter. */
                        if (param) {
                            yylval.s = strdup(yytext);
                            /* The pattern already excludes '"'; this
                               truncation is purely defensive. */
                            p = strchr(yylval.s, '"');
                            if (p) *p = 0;
                            param = FALSE;
                        }
                    }
<TAGQ>\n            { lineno++; }
<TAGQ>\"            {
                        BEGIN TAG;
                        /* param still set here means no value text was
                           captured, so store an empty string. */
                        if (param) {
                            yylval.s = strdup("");
                            param = FALSE;
                        }
                    }
<TAG>\>             { BEGIN INITIAL; return tag; }

\>                  {
                        /* A '>' outside any tag is reported but still
                           passed on as text. */
                        yylval.c = '>';
                        yyerror("Unmatched '>'");
                        return TEXT;
                    }

    /* Any other text */

\n                  { lineno++; return NEWLINE; }
[ \t]               { return SPACE; }
[\.:!?]             { yylval.c = *yytext; return SENTENCE; }
.                   { yylval.c = *yytext; return TEXT; }

%%

/*
 * yywrap - called by the scanner at end of input.
 * Returns 1, so scanning always stops at the end of the first input file.
 */
int yywrap(void)
{
    /* Always terminate at end of first input file */
    return 1;
}

/*
 * yyerror - report a scanning/parsing error on stderr.
 * msg: description of the problem.
 * The message is prefixed with the current line number (lineno) and
 * followed by the text of the token being scanned (yytext).
 */
void yyerror(const char *msg)
{
    fprintf(stderr, "Line %d: %s at '%s'\n", lineno, msg, yytext);
}