/* Flex scanner definition for MIME messages. */
%{
|
|
/*
|
|
* Copyright (c) 2004 Jann Fischer. All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* 3. Neither the name of the University nor the names of its contributors
|
|
* may be used to endorse or promote products derived from this software
|
|
* without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
* SUCH DAMAGE.
|
|
*/
|
|
|
|
/**
|
|
* This is a lexer file for parsing MIME compatible messages. It is intended
|
|
* to satisfy at least RFC 2045 (Format of Internet Message Bodies). It still
|
|
* has quite a few problems:
|
|
*
|
|
* - The parsing could probably be done in a more elegant way
|
|
* - I don't know what performance impact REJECT has on the parser
|
|
*/
|
|
#include <stdio.h>
#include <string.h>
#include <strings.h>
#include <ctype.h>
#include <errno.h>

#include "mimeparser.h"
#include "mimeparser.tab.h"
|
|
|
|
#define NAMEOF(v) #v
|
|
/*
 * BC(x) is a wrapper around flex's BEGIN() macro: it switches the scanner
 * to start condition x and records the same value in the lexer state's
 * `condition` field so that it can be inspected later (e.g. for debugging).
 *
 * It must be used inside yylex() (or any function where `yyscanner` and
 * whatever BEGIN() needs are in scope) and is written as a single statement,
 * so callers terminate it with their own semicolon: `BC(header);`.
 * The internal variable carries a distinct name so that it does not shadow
 * an `lstate` variable declared by the surrounding action.
 */
#define BC(x) do { \
	struct lexer_state *bc_lstate_ = yyget_extra(yyscanner); \
	BEGIN(x); \
	bc_lstate_->condition = (x); \
} while (0)
|
|
|
|
/* ZERO(x) fills sizeof(x) bytes starting at x with '\0'. */
#define ZERO(x) memset(x, '\0', sizeof(x))
|
|
|
|
/* Buffer size constant; not referenced by the scanner code in this file. */
#define PREALLOC_BUFFER 100000

/* Discard any earlier definition of YY_BUF_SIZE and use 64 KiB instead. */
#undef YY_BUF_SIZE
#define YY_BUF_SIZE 65536
|
|
|
|
/*
 * Kind of header whose value is currently being scanned. The header-name
 * rule stores one of these in lexer_state.header_state; the <headervalue>
 * rules use it to decide how the value is tokenized.
 */
enum header_states
{
	STATE_MAIL = 0,	/* MIME-Version and every header not listed below */
	STATE_CTYPE,	/* Content-Type */
	STATE_CDISP,	/* Content-Disposition */
	STATE_CENC,	/* Content-Transfer-Encoding */
	STATE_MIME	/* not assigned by any rule in this file */
};
|
|
|
|
|
|
|
|
%}
|
|
|
|
%option reentrant
%option yylineno
%option bison-bridge
|
|
|
|
/*
 * Start conditions. All of them are inclusive (%s), so the two catch-all
 * rules at the end of the rules section are active in every condition.
 * `comment` and `endoffile` are declared but no rule in this file uses them.
 */
%s headers
%s header
%s headervalue
%s tspecialvalue
%s comment
%s body
%s postamble
%s preamble
%s boundary
%s endboundary
%s endoffile
|
|
|
|
/* One character of a plain word in a structured header value. */
STRING [a-zA-Z0-9\-\.\_]
/* One character allowed between double quotes and in boundary lines. */
TSPECIAL [a-zA-Z0-9)(<>@,;:/\-.=_\+'? ]
/*
 * One character of an unquoted value. Note that ",-." is a range from ','
 * to '.', which happens to contain exactly ',', '-' and '.'.
 */
TSPECIAL_LITE [a-zA-Z0-9)(<>@,-._+'?\[\]]
|
|
|
|
%%
|
|
|
|
<INITIAL,headers>^[a-zA-Z]+[a-zA-Z0-9\-\_]* {
	/*
	 * A header field name at the beginning of a line. The name is
	 * duplicated into yylval (the consumer owns the copy), the scanner
	 * moves to <header> to look for the colon, and the kind of header is
	 * remembered in header_state so the value can be tokenized
	 * accordingly. Name comparison is case-insensitive.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	yylval_param->string = strdup(yytext);
	lstate->current_pos += yyleng;
	BC(header);

	if (strcasecmp(yytext, "Content-Type") == 0) {
		lstate->header_state = STATE_CTYPE;
		return CONTENTTYPE_HEADER;
	}
	if (strcasecmp(yytext, "Content-Transfer-Encoding") == 0) {
		lstate->header_state = STATE_CENC;
		return CONTENTENCODING_HEADER;
	}
	if (strcasecmp(yytext, "Content-Disposition") == 0) {
		lstate->header_state = STATE_CDISP;
		return CONTENTDISPOSITION_HEADER;
	}

	/* MIME-Version and all remaining headers share STATE_MAIL. */
	lstate->header_state = STATE_MAIL;
	if (strcasecmp(yytext, "MIME-Version") == 0) {
		return MIMEVERSION_HEADER;
	}
	return MAIL_HEADER;
}
|
|
|
|
<INITIAL,headers>. {
	/*
	 * Any single character (other than a newline) that did not start a
	 * header name: account for it and hand it to the parser as ANY.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->current_pos += yyleng;
	return ANY;
}
|
|
|
|
<headers>^(\r\n|\n) {
	/*
	 * An empty line terminates a header block. If we are still in the
	 * envelope and a boundary string is known, what follows is the
	 * preamble; in every other case it is a body. The start offset of
	 * the following section is the position just after this newline.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->lineno++;
	lstate->current_pos += yyleng;

	if (lstate->is_envelope != 0 && lstate->boundary_string != NULL) {
		lstate->is_envelope = 0;
		lstate->preamble_start = lstate->current_pos;
		BC(preamble);
	} else {
		lstate->body_start = lstate->current_pos;
		BC(body);
	}

	return ENDOFHEADERS;
}
|
|
|
|
<header>\: {
	/* The colon after a header name; the value follows. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->current_pos += yyleng;
	BC(headervalue);
	return COLON;
}
|
|
|
|
<header>(\r\n|\n) {
	/*
	 * A line break directly after a header name, i.e. a header line
	 * without a colon. Go back to <headers> and report EOL. The line
	 * counter is advanced here like in every other rule that consumes
	 * a newline.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->lineno++;
	lstate->current_pos += yyleng;
	BC(headers);
	return EOL;
}
|
|
|
|
<headervalue>(\n|\r\n)[\ \t]+ {
	/*
	 * Folded header line (newline followed by blanks): skipped without
	 * producing a token, but the consumed newline still counts as a line.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->lineno++;
	lstate->current_pos += yyleng;
}
|
|
|
|
<headervalue>.+|(.+(\n|\r\n)[\ \t]+.+)+ {
	/*
	 * Whole (possibly folded) value of an unstructured header. Only used
	 * for STATE_MAIL and STATE_CENC headers; for the other header kinds
	 * the match is rejected so that the finer-grained <headervalue>
	 * rules tokenize the value instead.
	 *
	 * Leading whitespace is skipped through a local pointer (yytext
	 * itself is left untouched), and each byte is converted to unsigned
	 * char before being passed to isspace(), which is required for
	 * 8-bit data. An all-whitespace value yields an empty string.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);
	char *value = yytext;

	if (lstate->header_state != STATE_MAIL &&
	    lstate->header_state != STATE_CENC) {
		REJECT;
	}

	lstate->current_pos += yyleng;

	while (isspace((unsigned char)*value)) {
		value++;
	}

	yylval_param->string = strdup(value);
	lstate->lineno += count_lines(value);
	return WORD;
}
|
|
|
|
<headervalue,tspecialvalue>(\r\n|\n) {
	/* End of one header line: back to <headers>, report EOL. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->lineno++;
	lstate->current_pos += yyleng;
	BC(headers);
	return EOL;
}
|
|
|
|
<headervalue>;|;(\r\n|\n)[\ \t]+ {
	/*
	 * Parameter separator, optionally followed by a folded line break.
	 * Any newline contained in the match is added to the line counter.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->current_pos += yyleng;
	lstate->lineno += count_lines(yytext);
	return SEMICOLON;
}
|
|
|
|
<headervalue>\= {
	/* The '=' between a parameter name and its value. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->current_pos += yyleng;
	return EQUAL;
}
|
|
|
|
<headervalue>\" {
	/*
	 * Opening double quote: switch to <tspecialvalue> for the quoted
	 * text and return the quote character itself as the token.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->current_pos += yyleng;
	BC(tspecialvalue);
	return *yytext;
}
|
|
|
|
<headervalue>{STRING}+|{TSPECIAL_LITE}+ {
	/*
	 * An unquoted word in a structured header value. A copy of the text
	 * is passed to the parser in yylval.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	yylval_param->string = strdup(yytext);
	lstate->current_pos += yyleng;
	lstate->lineno += count_lines(yytext);
	return WORD;
}
|
|
|
|
<headervalue>[\ |\t]+ {
	/*
	 * Blanks inside a header value are skipped without a token.
	 * NOTE(review): inside a character class '|' is a literal, so this
	 * rule also silently drops '|' characters -- confirm whether that is
	 * intended or whether [\ \t]+ was meant.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->current_pos += yyleng;
}
|
|
|
|
<tspecialvalue>{TSPECIAL}+ {
	/*
	 * Text between double quotes. A copy of the text is passed to the
	 * parser in yylval.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	yylval_param->string = strdup(yytext);
	lstate->current_pos += yyleng;
	lstate->lineno += count_lines(yytext);
	return TSPECIAL;
}
|
|
|
|
<tspecialvalue>\" {
	/*
	 * Closing double quote: return to <headervalue> and hand the quote
	 * character itself to the parser.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->current_pos += yyleng;
	BC(headervalue);
	return *yytext;
}
|
|
|
|
<body>^\-\-{TSPECIAL}+\-\- {
	/*
	 * A body line that starts and ends with two dashes: a candidate for
	 * the closing boundary. It is only accepted when it equals
	 * endboundary_string and a body is currently open; anything else is
	 * rejected so that the remaining <body> rules handle the text.
	 *
	 * On acceptance the body that just ended is reported (BODY) with its
	 * offsets, the matched text is pushed back with yyless(0) and the
	 * <endboundary> rules scan it again to produce the ENDBOUNDARY token.
	 *
	 * current_pos is deliberately NOT advanced here: the text is scanned
	 * again (after yyless(0) or REJECT) by a rule that accounts for it,
	 * so adding yyleng here as well would count the same bytes twice.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	if (lstate->endboundary_string == NULL ||
	    strcmp(lstate->endboundary_string, yytext) != 0 ||
	    !lstate->body_start) {
		REJECT;
	}

	/* The body ends right before the boundary line. */
	yylval_param->position.opaque_start = lstate->body_opaque_start;
	yylval_param->position.start = lstate->body_start;
	yylval_param->position.end = lstate->current_pos;

	lstate->body_opaque_start = 0;
	lstate->body_start = 0;
	lstate->body_end = 0;

	yyless(0);
	BC(endboundary);
	return BODY;
}
|
|
|
|
<body,preamble>^\-\-{TSPECIAL}+ {
	/*
	 * A line starting with two dashes: a candidate for a part boundary.
	 * Only text equal to boundary_string is accepted; everything else is
	 * rejected and left to the other rules.
	 *
	 * If a body or a preamble is currently open, that section is reported
	 * first (BODY / PREAMBLE, ending at the current position), the
	 * boundary text is pushed back with yyless(0) and scanned again in
	 * <boundary>. Otherwise the boundary itself is returned.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	if (lstate->boundary_string == NULL ||
	    strcmp(lstate->boundary_string, yytext) != 0) {
		REJECT;
	}

	if (lstate->body_start) {
		yylval_param->position.opaque_start = lstate->body_opaque_start;
		yylval_param->position.start = lstate->body_start;
		yylval_param->position.end = lstate->current_pos;

		lstate->body_opaque_start = 0;
		lstate->body_start = 0;
		lstate->body_end = 0;

		yyless(0);
		BC(boundary);
		return BODY;
	}

	if (lstate->preamble_start) {
		yylval_param->position.start = lstate->preamble_start;
		yylval_param->position.end = lstate->current_pos;

		lstate->preamble_start = 0;
		lstate->preamble_end = 0;

		yyless(0);
		BC(boundary);
		return PREAMBLE;
	}

	/* No open section: consume the boundary and return it directly. */
	yylval_param->string = strdup(yytext);
	lstate->current_pos += yyleng;
	BC(boundary);
	return BOUNDARY;
}
|
|
|
|
<body>(\r\n|\n) {
	/* Line break inside a body: counted, no token. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->lineno++;
	lstate->current_pos += yyleng;
}

<body>\r {
	/* A carriage return that is not followed by a line feed. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->current_pos += yyleng;
}

<body>[^\r\n]+ {
	/* Body text up to the next CR or LF: counted, no token. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->current_pos += yyleng;
}
|
|
|
|
<body><<EOF>> {
	/*
	 * End of input while scanning a body.
	 *  - no open body:               stop scanning;
	 *  - open body, boundary known:  return POSTAMBLE;
	 *  - open body, no boundary:     report the body, which extends to
	 *                                the end of the input.
	 *
	 * NOTE(review): the POSTAMBLE branch neither fills yylval nor clears
	 * body_start, so another yylex() call at EOF takes the same branch
	 * again -- confirm that the parser never asks for a further token.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	if (!lstate->body_start) {
		yyterminate();
	}

	if (lstate->boundary_string != NULL) {
		return POSTAMBLE;
	}

	yylval_param->position.opaque_start = 0;
	yylval_param->position.start = lstate->body_start;
	yylval_param->position.end = lstate->current_pos;
	lstate->body_start = 0;
	return BODY;
}
|
|
|
|
<preamble,postamble>(\r\n|\n) {
	/* Line break inside the preamble or postamble: counted, no token. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->lineno++;
	lstate->current_pos += yyleng;
}
|
|
|
|
<boundary>[^\r\n]+ {
	/* The boundary line itself (everything up to the line break). */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	yylval_param->string = strdup(yytext);
	lstate->current_pos += yyleng;
	return BOUNDARY;
}

<endboundary>[^\r\n]+ {
	/* The closing boundary line (everything up to the line break). */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	yylval_param->string = strdup(yytext);
	lstate->current_pos += yyleng;
	return ENDBOUNDARY;
}
|
|
|
|
<boundary>(\r\n|\n) {
	/*
	 * Line break after a boundary line: the headers of the next part
	 * follow. The position right after the newline is remembered as the
	 * opaque start of that part.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->lineno++;
	lstate->current_pos += yyleng;
	lstate->body_opaque_start = lstate->current_pos;
	BC(headers);
	return EOL;
}
|
|
|
|
<endboundary>(\r\n|\n) {
	/*
	 * Line break after the closing boundary: whatever follows is the
	 * postamble. No token is produced.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->lineno++;
	lstate->current_pos += yyleng;
	BC(postamble);
}
|
|
|
|
<preamble>. {
	/* Any preamble character: counted, no token. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->current_pos += yyleng;
}

<postamble>. {
	/* Any postamble character: counted, no token. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->current_pos += yyleng;
}
|
|
|
|
(\r\n|\n) {
	/* Fallback for a line break not handled by any rule above. */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->lineno++;
	lstate->current_pos += yyleng;
	return EOL;
}
|
|
|
|
. {
	/*
	 * Fallback for any character not handled by a rule above: the
	 * character code itself is the token. The byte is converted through
	 * unsigned char so that values >= 0x80 are never returned as a
	 * negative number, which the parser would take for end of input.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->current_pos += yyleng;
	return (int)(unsigned char)*yytext;
}
|
|
|
|
|
|
%%
|
|
|
|
void reset_lexer_state(void *yyscanner, struct parser_state *pstate)
|
|
{
|
|
struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
|
|
struct lexer_state *lstate = &(pstate->lstate);
|
|
|
|
yyset_extra((void*)lstate, yyscanner);
|
|
BEGIN(0);
|
|
lstate->header_state = STATE_MAIL;
|
|
lstate->lineno = 0;
|
|
lstate->current_pos = 1;
|
|
lstate->condition = 0;
|
|
|
|
lstate->is_envelope = 1;
|
|
|
|
lstate->message_len = 0;
|
|
lstate->buffer_length = 0;
|
|
|
|
/* temporary marker variables */
|
|
lstate->body_opaque_start = 0;
|
|
lstate->body_start = 0;
|
|
lstate->body_end = 0;
|
|
lstate->preamble_start = 0;
|
|
lstate->preamble_end = 0;
|
|
lstate->postamble_start = 0;
|
|
lstate->postamble_end = 0;
|
|
}
|
|
|
|
void
|
|
PARSER_setbuffer(char *string, yyscan_t scanner)
|
|
{
|
|
struct lexer_state *lstate = yyget_extra(scanner);
|
|
lstate->message_buffer = string;
|
|
yy_scan_string(string, scanner);
|
|
}
|
|
|
|
void
|
|
PARSER_setfp(FILE *fp, yyscan_t yyscanner)
|
|
{
|
|
/* looks like a bug in bison 2.2a -- the wrong code is generated for yyset_in !! */
|
|
struct yyguts_t * yyg = (struct yyguts_t*)yyscanner;
|
|
yyg->yyin_r = fp;
|
|
|
|
if (0) {
|
|
/* This is just to make a compiler warning go away */
|
|
yyunput(0, NULL, yyscanner);
|
|
}
|
|
}
|
|
|
|
/**
 * Counts how many lines a given string represents in the message (in case of
 * folded header values, for example, or a message body).
 *
 * Every '\n' in the string counts as one line, so a CRLF pair counts once.
 *
 * @param txt  NUL-terminated text; a NULL pointer is treated like an empty
 *             string
 * @return number of '\n' characters in txt
 */
int
count_lines(char *txt)
{
	int lines = 0;

	if (txt == NULL) {
		return 0;
	}

	for (const char *p = txt; *p != '\0'; p++) {
		if (*p == '\n') {
			lines++;
		}
	}

	return lines;
}
|