asterisk/main/minimime/mimeparser.l

485 lines
12 KiB
Plaintext

%{
/*
* Copyright (c) 2004 Jann Fischer. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/**
* This is a lexer file for parsing MIME compatible messages. It is intended
* to satisfy at least RFC 2045 (Format of Internet Message Bodies). It still
* has quite a few problems:
*
* - The parsing could probably be done in a more elegant way
* - I don't know what performance impact REJECT has on the parser
*/
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
#include "mimeparser.h"
#include "mimeparser.tab.h"
/* NAMEOF() turns its argument into a string literal. */
#define NAMEOF(v) #v
/*
 * BC() is a debug wrapper for lex' BEGIN() macro: it switches the scanner to
 * start condition x and records the same value in lexer_state.condition.
 *
 * It expects a variable named yyscanner to be in scope.  The expansion has
 * no trailing semicolon, so "BC(x);" is exactly one statement and is safe
 * in an unbraced if/else.  The temporary gets its own name so that it does
 * not shadow an lstate variable of the caller.
 */
#define BC(x) do { \
	struct lexer_state *bc_lstate_ = yyget_extra(yyscanner); \
	BEGIN(x); \
	bc_lstate_->condition = (x); \
} while (0)
/* ZERO() clears an object; x must be an array, because sizeof(x) is used. */
#define ZERO(x) memset((x), '\0', sizeof(x))
#define PREALLOC_BUFFER 100000
/* Replace the YY_BUF_SIZE value defined earlier with 64 KiB. */
#undef YY_BUF_SIZE
#define YY_BUF_SIZE 65536
/*
 * Kind of header whose value is being scanned.  The header-name rule stores
 * one of these in lexer_state.header_state; the header value rules use it to
 * decide whether the value is returned as one opaque WORD or tokenized.
 */
enum header_states
{
/* MIME-Version and every header without special handling */
STATE_MAIL = 0,
/* Content-Type */
STATE_CTYPE,
/* Content-Disposition */
STATE_CDISP,
/* Content-Transfer-Encoding */
STATE_CENC,
/* not assigned by any rule in this file */
STATE_MIME
};
%}
%option reentrant
%option yylineno
%option bison-bridge
/*
 * Start conditions.  They are declared with %s, i.e. inclusive: rules that
 * carry no start condition (the two catch-all rules at the end of the rules
 * section) stay active in every one of them.  The comment and endoffile
 * conditions are declared, but no rule in this file names them.
 */
%s headers
%s header
%s headervalue
%s tspecialvalue
%s comment
%s body
%s postamble
%s preamble
%s boundary
%s endboundary
%s endoffile
/*
 * Named character classes.
 *   STRING         characters of an unquoted word in a header value
 *   TSPECIAL       characters accepted inside a quoted value and in boundary
 *                  lines; the class includes the space character
 *   TSPECIAL_LITE  wider class for unquoted words; note that ",-." inside
 *                  the brackets is a range covering comma, hyphen and dot
 */
STRING [a-zA-Z0-9\-\.\_]
TSPECIAL [a-zA-Z0-9)(<>@,;:/\-.=_\+'? ]
TSPECIAL_LITE [a-zA-Z0-9)(<>@,-._+'?\[\]]
%%
<INITIAL,headers>^[a-zA-Z]+[a-zA-Z0-9\-\_]* {
	/*
	 * A header field name at the beginning of a line.  A strdup()ed copy
	 * of the name is stored in yylval and the scanner moves on to the
	 * header condition.
	 */
	struct lexer_state *st = yyget_extra(yyscanner);
	int kind = STATE_MAIL;
	int token = MAIL_HEADER;

	yylval_param->string = strdup(yytext);
	st->current_pos += yyleng;
	BC(header);

	/*
	 * The structured MIME headers get their own token and header state;
	 * MIME-Version gets its own token but keeps the generic state, and
	 * every other header is reported as a plain mail header.
	 */
	if (strcasecmp(yytext, "Content-Type") == 0) {
		kind = STATE_CTYPE;
		token = CONTENTTYPE_HEADER;
	} else if (strcasecmp(yytext, "Content-Transfer-Encoding") == 0) {
		kind = STATE_CENC;
		token = CONTENTENCODING_HEADER;
	} else if (strcasecmp(yytext, "Content-Disposition") == 0) {
		kind = STATE_CDISP;
		token = CONTENTDISPOSITION_HEADER;
	} else if (strcasecmp(yytext, "MIME-Version") == 0) {
		token = MIMEVERSION_HEADER;
	}

	st->header_state = kind;
	return token;
}
<INITIAL,headers>. {
	/*
	 * Any other single character where a header name was expected is
	 * accounted for and reported to the parser as ANY.
	 */
	struct lexer_state *st = yyget_extra(yyscanner);

	st->current_pos += yyleng;
	return ANY;
}
<headers>^(\r\n|\n) {
	/*
	 * An empty line ends the header block.  While scanning the envelope
	 * of a message that has a boundary string the preamble follows;
	 * in every other case a body follows.
	 */
	struct lexer_state *st = yyget_extra(yyscanner);

	st->lineno++;
	st->current_pos += yyleng;

	if (st->is_envelope != 0 && st->boundary_string != NULL) {
		st->is_envelope = 0;
		st->preamble_start = st->current_pos;
		BC(preamble);
	} else {
		st->body_start = st->current_pos;
		BC(body);
	}
	return ENDOFHEADERS;
}
<header>\: {
	/* The colon after a header name: the header value comes next. */
	struct lexer_state *st = yyget_extra(yyscanner);

	st->current_pos += yyleng;
	BC(headervalue);
	return COLON;
}
<header>(\r\n|\n) {
	/*
	 * A line break directly after a header name, i.e. a header line
	 * without a colon.  Report EOL and go back to scanning header names.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	BC(headers);
	/* The newline is consumed here, so it has to be counted like in all other newline rules. */
	lstate->lineno++;
	lstate->current_pos += yyleng;
	return EOL;
}
<headervalue>(\n|\r\n)[\ \t]+ {
	/*
	 * Folding whitespace: a line break followed by blanks or tabs
	 * continues the current header value.  It is skipped without
	 * returning a token.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	/* Exactly one newline is consumed by this match; keep lineno in step. */
	lstate->lineno++;
	lstate->current_pos += yyleng;
}
<headervalue>.+|(.+(\n|\r\n)[\ \t]+.+)+ {
	/*
	 * The complete value, folded continuation lines included, of a
	 * header that is not tokenized any further.  This applies to the
	 * STATE_MAIL and STATE_CENC header states only; for every other state
	 * the match is rejected so that the finer grained headervalue rules
	 * get to see the input.
	 *
	 * The value is returned as WORD, with leading whitespace stripped,
	 * as a strdup()ed copy in yylval (an empty string if nothing is left).
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);
	const char *value = yytext;

	if (lstate->header_state != STATE_MAIL && lstate->header_state != STATE_CENC) {
		REJECT;
	}
	lstate->current_pos += yyleng;
	/*
	 * Count the newlines of the whole match, so that a line break that
	 * sits inside the skipped leading whitespace is not lost.
	 */
	lstate->lineno += count_lines(yytext);
	/*
	 * Skip leading whitespace through a private cursor and leave the
	 * scanner-owned yytext pointer alone.  The cast is required because
	 * isspace() is only defined for unsigned char values and EOF, and
	 * message data may contain bytes with the high bit set.
	 */
	while (*value != '\0' && isspace((unsigned char)*value)) {
		value++;
	}
	yylval_param->string = strdup(value);
	return WORD;
}
<headervalue,tspecialvalue>(\r\n|\n) {
	/* A line break ends the current header line; header names follow. */
	struct lexer_state *st = yyget_extra(yyscanner);

	st->current_pos += yyleng;
	st->lineno++;
	BC(headers);
	return EOL;
}
<headervalue>;|;(\r\n|\n)[\ \t]+ {
	/*
	 * Parameter separator, optionally followed by a folded line break.
	 * Any newline that is part of the match is added to the line count.
	 */
	struct lexer_state *st = yyget_extra(yyscanner);

	st->current_pos += yyleng;
	st->lineno += count_lines(yytext);
	return SEMICOLON;
}
<headervalue>\= {
	/* The equals sign between a parameter name and its value. */
	struct lexer_state *st = yyget_extra(yyscanner);

	st->current_pos += yyleng;
	return EQUAL;
}
<headervalue>\" {
	/*
	 * Opening double quote of a quoted value.  The quote character itself
	 * is the token; its content is scanned in the tspecialvalue condition.
	 */
	struct lexer_state *st = yyget_extra(yyscanner);

	st->current_pos += yyleng;
	BC(tspecialvalue);
	return *yytext;
}
<headervalue>{STRING}+|{TSPECIAL_LITE}+ {
	/*
	 * An unquoted word inside a structured header value, returned as
	 * WORD with a strdup()ed copy of the text in yylval.
	 */
	struct lexer_state *st = yyget_extra(yyscanner);

	st->current_pos += yyleng;
	st->lineno += count_lines(yytext);
	yylval_param->string = strdup(yytext);
	return WORD;
}
<headervalue>[\ |\t]+ {
	/*
	 * Blanks and tabs between the tokens of a header value are skipped
	 * without returning a token.
	 *
	 * NOTE(review): inside a character class the pipe is a literal, so
	 * this rule also swallows pipe characters.  That looks unintended,
	 * but how the grammar would treat a pipe token is not visible from
	 * here - confirm before narrowing the class.
	 */
	struct lexer_state *st = yyget_extra(yyscanner);

	st->current_pos += yyleng;
}
<tspecialvalue>{TSPECIAL}+ {
	/*
	 * The text between double quotes, returned as TSPECIAL with a
	 * strdup()ed copy in yylval.
	 */
	struct lexer_state *st = yyget_extra(yyscanner);

	yylval_param->string = strdup(yytext);
	st->lineno += count_lines(yytext);
	st->current_pos += yyleng;
	return TSPECIAL;
}
<tspecialvalue>\" {
	/*
	 * Closing double quote: return the quote character as the token and
	 * continue with ordinary header value scanning.
	 */
	struct lexer_state *st = yyget_extra(yyscanner);

	st->current_pos += yyleng;
	BC(headervalue);
	return *yytext;
}
<body>^\-\-{TSPECIAL}+\-\- {
	/*
	 * A line inside a body that starts and ends with two dashes.  Only a
	 * line that is exactly the configured end boundary, seen while a body
	 * is open, is handled here; anything else is rejected and left to
	 * the remaining rules.
	 *
	 * On a match the body that ends in front of the boundary is reported
	 * as BODY.  The boundary text itself is pushed back with yyless(0)
	 * and scanned again in the endboundary condition.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	if (lstate->endboundary_string == NULL
	    || strcmp(lstate->endboundary_string, yytext) != 0
	    || !lstate->body_start) {
		REJECT;
	}

	/*
	 * current_pos is deliberately not advanced here.  The text is scanned
	 * a second time after yyless(0), and the rule that finally consumes
	 * it adds its length; advancing here as well would count the
	 * boundary twice.  The body therefore ends at the current position.
	 */
	yylval_param->position.opaque_start = lstate->body_opaque_start;
	yylval_param->position.start = lstate->body_start;
	yylval_param->position.end = lstate->current_pos;
	lstate->body_opaque_start = 0;
	lstate->body_start = 0;
	lstate->body_end = 0;
	yyless(0);
	BC(endboundary);
	return BODY;
}
<body,preamble>^\-\-{TSPECIAL}+ {
	/*
	 * A line that starts with two dashes.  It only counts as a boundary
	 * if it is exactly the configured boundary string; every other match
	 * is rejected.
	 *
	 * When a body or a preamble is still open, that region is reported
	 * first (BODY or PREAMBLE) and the boundary text is pushed back with
	 * yyless(0), so the next call sees it again.  With nothing open, the
	 * boundary itself is returned.
	 */
	struct lexer_state *st = yyget_extra(yyscanner);

	if (st->boundary_string == NULL || strcmp(st->boundary_string, yytext) != 0) {
		REJECT;
	}

	if (st->body_start) {
		yylval_param->position.opaque_start = st->body_opaque_start;
		yylval_param->position.start = st->body_start;
		yylval_param->position.end = st->current_pos;
		st->body_opaque_start = 0;
		st->body_start = 0;
		st->body_end = 0;
		yyless(0);
		BC(boundary);
		return BODY;
	}

	if (st->preamble_start) {
		yylval_param->position.start = st->preamble_start;
		yylval_param->position.end = st->current_pos;
		st->preamble_start = 0;
		st->preamble_end = 0;
		yyless(0);
		BC(boundary);
		return PREAMBLE;
	}

	BC(boundary);
	yylval_param->string = strdup(yytext);
	st->current_pos += yyleng;
	return BOUNDARY;
}
<body>(\r\n|\n) {
	/* A line break inside a body: counted, no token returned. */
	struct lexer_state *st = yyget_extra(yyscanner);

	st->lineno++;
	st->current_pos += yyleng;
}
<body>\r {
	/* A carriage return without a following line feed is plain body data. */
	struct lexer_state *st = yyget_extra(yyscanner);

	st->current_pos += yyleng;
}
<body>[^\r\n]+ {
	/* Body content up to the next line break: only the position advances. */
	struct lexer_state *st = yyget_extra(yyscanner);

	st->current_pos += yyleng;
}
<body><<EOF>> {
	/*
	 * End of input while scanning a body.  With no body open the scanner
	 * terminates.  Without a boundary string the open body is reported
	 * as BODY, reaching up to the current position; with a boundary
	 * string POSTAMBLE is returned instead.
	 *
	 * NOTE(review): the POSTAMBLE branch neither fills in yylval nor
	 * clears body_start, so another call at end of input would take the
	 * same branch again - confirm that the parser stops after this token.
	 */
	struct lexer_state *st = yyget_extra(yyscanner);

	if (!st->body_start) {
		yyterminate();
	}
	if (st->boundary_string != NULL) {
		return POSTAMBLE;
	}

	yylval_param->position.opaque_start = 0;
	yylval_param->position.start = st->body_start;
	yylval_param->position.end = st->current_pos;
	st->body_start = 0;
	return BODY;
}
<preamble,postamble>(\r\n|\n) {
	/* A line break in the preamble or postamble: counted, no token. */
	struct lexer_state *st = yyget_extra(yyscanner);

	st->current_pos += yyleng;
	st->lineno++;
}
<boundary>[^\r\n]+ {
	/*
	 * The text of a boundary line, returned as BOUNDARY with a
	 * strdup()ed copy in yylval.
	 */
	struct lexer_state *st = yyget_extra(yyscanner);

	st->current_pos += yyleng;
	yylval_param->string = strdup(yytext);
	return BOUNDARY;
}
<endboundary>[^\r\n]+ {
	/*
	 * The text of an end boundary line, returned as ENDBOUNDARY with a
	 * strdup()ed copy in yylval.
	 */
	struct lexer_state *st = yyget_extra(yyscanner);

	st->current_pos += yyleng;
	yylval_param->string = strdup(yytext);
	return ENDBOUNDARY;
}
<boundary>(\r\n|\n) {
	/*
	 * The line break that ends a boundary line.  The headers of the next
	 * part follow, and the position right behind the line break is
	 * remembered as the opaque start of that part.
	 */
	struct lexer_state *st = yyget_extra(yyscanner);

	st->lineno++;
	st->current_pos += yyleng;
	st->body_opaque_start = st->current_pos;
	BC(headers);
	return EOL;
}
<endboundary>(\r\n|\n) {
	/*
	 * The line break that ends an end boundary line: the postamble
	 * follows.  No token is returned.
	 */
	struct lexer_state *st = yyget_extra(yyscanner);

	st->lineno++;
	st->current_pos += yyleng;
	BC(postamble);
}
<preamble>. {
	/* Preamble content is skipped; only the position is advanced. */
	struct lexer_state *st = yyget_extra(yyscanner);

	st->current_pos += yyleng;
}
<postamble>. {
	/* Postamble content is skipped; only the position is advanced. */
	struct lexer_state *st = yyget_extra(yyscanner);

	st->current_pos += yyleng;
}
(\r\n|\n) {
	/*
	 * Fallback for a line break that no rule above has claimed in the
	 * current start condition: count it and report EOL.
	 */
	struct lexer_state *st = yyget_extra(yyscanner);

	st->current_pos += yyleng;
	st->lineno++;
	return EOL;
}
. {
	/*
	 * Fallback for a single character that no rule above has claimed in
	 * the current start condition: the character code is the token.
	 *
	 * The value goes through unsigned char first.  Where plain char is
	 * signed, a byte with the high bit set would otherwise become a
	 * negative token number, and the parser treats every token that is
	 * not positive as end of input.
	 */
	struct lexer_state *lstate = yyget_extra(yyscanner);

	lstate->current_pos += yyleng;
	return (int)(unsigned char)*yytext;
}
%%
/**
 * Attaches the lexer state embedded in pstate to the scanner and puts both
 * back into their starting configuration: start condition 0, generic header
 * state, line 0, position 1, envelope flag set, all length counters and
 * region markers cleared.
 *
 * Fields that are not assigned below (for example the boundary strings and
 * the message buffer pointer) are left as they are.
 */
void reset_lexer_state(void *yyscanner, struct parser_state *pstate)
{
	/*
	 * yyg is not referenced by name in this function; presumably the
	 * BEGIN() macro of the reentrant scanner relies on it being in scope.
	 */
	struct yyguts_t *yyg = (struct yyguts_t *)yyscanner;
	struct lexer_state *fresh = &pstate->lstate;

	yyset_extra((void *)fresh, yyscanner);
	BEGIN(0);

	/* scanner bookkeeping */
	fresh->condition = 0;
	fresh->header_state = STATE_MAIL;
	fresh->is_envelope = 1;
	fresh->lineno = 0;
	fresh->current_pos = 1;
	fresh->message_len = 0;
	fresh->buffer_length = 0;

	/* temporary marker variables */
	fresh->body_opaque_start = 0;
	fresh->body_start = 0;
	fresh->body_end = 0;
	fresh->preamble_start = 0;
	fresh->preamble_end = 0;
	fresh->postamble_start = 0;
	fresh->postamble_end = 0;
}
/**
 * Makes the scanner read from the NUL-terminated string given and stores
 * the pointer to it in the lexer state as the message buffer.  The buffer
 * handle returned by yy_scan_string() is not kept.
 */
void
PARSER_setbuffer(const char *string, yyscan_t scanner)
{
	struct lexer_state *st = yyget_extra(scanner);

	st->message_buffer = string;
	(void)yy_scan_string(string, scanner);
}
/**
 * Makes the scanner read from the stdio stream fp.
 *
 * The input stream member of the scanner is written directly.  According to
 * the original author this works around wrong code being generated for
 * yyset_in (attributed to "bison 2.2a" in the original note).
 */
void
PARSER_setfp(FILE *fp, yyscan_t scanner)
{
	struct yyguts_t *guts = (struct yyguts_t *)scanner;

	guts->yyin_r = fp;

	/* Never executed: the reference only silences a compiler warning about yyunput. */
	if (0) {
		yyunput(0, NULL, scanner);
	}
}
/**
 * Returns the number of line feed characters in the NUL-terminated string
 * txt.  The scanner uses it to advance its line counter by the number of
 * lines a matched text spans (folded header values, for example).
 */
int
count_lines(char *txt)
{
	const char *cursor = txt;
	int newlines = 0;

	/* Hop from one line feed to the next until none is left. */
	while ((cursor = strchr(cursor, '\n')) != NULL) {
		newlines++;
		cursor++;
	}
	return newlines;
}