mime-parsefull.cc

mime-parsefull.cc (15206B)
      1 /* -*- Mode: c++; -*- */
      2 /*  --------------------------------------------------------------------
      3  *  Filename:
      4  *    mime-parsefull.cc
      5  *  
      6  *  Description:
      7  *    Implementation of main mime parser components
      8  *
      9  *  Authors:
     10  *    Andreas Aardal Hanssen <andreas-binc curly bincimap spot org>
     11  *
     12  *  Bugs:
     13  *
     14  *  ChangeLog:
     15  *
     16  *  --------------------------------------------------------------------
     17  *  Copyright 2002-2005 Andreas Aardal Hanssen
     18  *
     19  *  This program is free software; you can redistribute it and/or modify
     20  *  it under the terms of the GNU General Public License as published by
     21  *  the Free Software Foundation; either version 2 of the License, or
     22  *  (at your option) any later version.
     23  *
     24  *  This program is distributed in the hope that it will be useful,
     25  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
     26  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     27  *  GNU General Public License for more details.
     28  *
     29  *  You should have received a copy of the GNU General Public License
     30  *  along with this program; if not, write to the Free Software
     31  *  Foundation, Inc., 59 Temple Street #330, Boston, MA 02111-1307, USA.
     32  *  --------------------------------------------------------------------
     33  */
     34 #ifdef HAVE_CONFIG_H
     35 #include <config.h>
     36 #endif
     37 
     38 #include "mime.h"
     39 #include "mime-utils.h"
     40 #include "convert.h"
     41 #include "io.h"
     42 #include <string>
     43 #include <vector>
     44 #include <map>
     45 #include <exception>
     46 #include <iostream>
     47 
     48 #include <string.h>
     49 #include <ctype.h>
     50 #include <stdio.h>
     51 #include <errno.h>
     52 
     53 using namespace ::std;
     54 
     55 int crlffile = 0;
     56 char crlfdata[4096];
     57 unsigned int crlftail = 0;
     58 unsigned int crlfhead = 0;
     59 unsigned int crlfoffset = 0;
     60 char lastchar = '\0';
     61 
     62 //------------------------------------------------------------------------
     63 bool fillInputBuffer(void)
     64 {
     65   char raw[1024];
     66   
     67   ssize_t nbytes;
     68   for (;;) {
     69     nbytes = read(crlffile, raw, sizeof(raw) - 1);
     70     if (nbytes <= 0) {
     71       // FIXME: If ferror(crlffile) we should log this.
     72       return false;
     73     }
     74     else break;
     75   }
     76   
     77   for (ssize_t i = 0; i < nbytes; ++i) {
     78     const char c = raw[i];
     79     switch (c) {
     80     case '\r':
     81       if (lastchar == '\r') {
     82 	crlfdata[crlftail++ & 0xfff] = '\r';
     83 	crlfdata[crlftail++ & 0xfff] = '\n';
     84       }
     85       break;
     86     case '\n':
     87       crlfdata[crlftail++ & 0xfff] = '\r';
     88       crlfdata[crlftail++ & 0xfff] = '\n';
     89       break;
     90     default:
     91       if (lastchar == '\r') {
     92 	crlfdata[crlftail++ & 0xfff] = '\r';
     93 	crlfdata[crlftail++ & 0xfff] = '\n';
     94       }
     95 
     96       crlfdata[crlftail++ & 0xfff] = c;
     97       break;
     98     }
     99       
    100     lastchar = c;
    101   }
    102 
    103   return true;
    104 }
    105 
    106 
    107 //------------------------------------------------------------------------
    108 void Binc::MimeDocument::parseFull(int fd) const
    109 {
    110   if (allIsParsed)
    111     return;
    112 
    113   allIsParsed = true;
    114 
    115   crlffile = fd;
    116   crlfReset();
    117 
    118   headerstartoffsetcrlf = 0;
    119   headerlength = 0;
    120   bodystartoffsetcrlf = 0;
    121   bodylength = 0;
    122   size = 0;
    123   messagerfc822 = false;
    124   multipart = false;
    125 
    126   int bsize = 0;
    127   MimePart::parseFull("", bsize);
    128 
    129   // eat any trailing junk to get the correct size
    130   char c;
    131   while (crlfGetChar(c));
    132 
    133   size = crlfoffset;
    134 }
    135 
    136 //------------------------------------------------------------------------
    137 int Binc::MimePart::parseFull(const string &toboundary, int &boundarysize) const
    138 {
    139   string name;
    140   string content;
    141   char cqueue[4];
    142   memset(cqueue, 0, sizeof(cqueue));
    143 
    144   bool quit = false;
    145   char c;
    146   bool eof = false;
    147 
    148   headerstartoffsetcrlf = crlfoffset;
    149 
    150   while (!quit && !eof) {
    151     // read name
    152     while (1) {
    153       if (!crlfGetChar(c)) {
    154 	eof = true;
    155 	break;
    156       }
    157 
    158       if (c == '\n') ++nlines;
    159       if (c == ':') break;
    160       if (c == '\n') {
    161 	// If we encounter a \n before we got to the first ':', then
    162 	// if the line is not empty, rewind back to the start of the
    163 	// line and assume we're at the start of the body. If not,
    164 	// just skip the line and assume we're at the start of the
    165 	// body.
    166 	string ntmp = name;
    167 	trim(ntmp);
    168 	if (ntmp != "")
    169 	  for (int i = name.length() - 1; i >= 0; --i)
    170 	    crlfUnGetChar();
    171 
    172 	quit = true;
    173 	name = "";
    174 	break;
    175       }
    176 
    177       name += c;
    178 
    179       if (name.length() == 2 && name.substr(0, 2) == "\r\n") {
    180 	name = "";
    181 	quit = true;
    182 	break;
    183       }
    184     }
    185 
    186     if (name.length() == 1 && name[0] == '\r') {
    187       name = "";
    188       break;
    189     }
    190 
    191     if (quit || eof) break;
    192 
    193     while (!quit) {
    194       if (!crlfGetChar(c)) {
    195 	quit = true;
    196 	break;
    197       }
    198 
    199       if (c == '\n') ++nlines;
    200 
    201       for (int i = 0; i < 3; ++i)
    202 	cqueue[i] = cqueue[i + 1];
    203       cqueue[3] = c;
    204 
    205       if (strncmp(cqueue, "\r\n\r\n", 4) == 0) {
    206 	quit = true;
    207 	break;
    208       }
    209 
    210       if (cqueue[2] == '\n') {
    211 
    212 	// guess the mime rfc says what can not appear on the beginning
    213 	// of a line.
    214 	if (!isspace(cqueue[3])) {
    215 	  if (content.length() > 2)
    216 	    content.resize(content.length() - 2);
    217 
    218 	  trim(content);
    219 	  h.add(name, content);
    220 
    221 	  name = c;
    222 	  content = "";
    223 	  break;
    224 	}
    225       }
    226 
    227       content += c;
    228     }
    229   }
    230 
    231   if (name != "") {
    232     if (content.length() > 2)
    233       content.resize(content.length() - 2);
    234     h.add(name, content);
    235   }
    236 
    237   // Headerlength includes the seperating CRLF. Body starts after the
    238   // CRLF.
    239   headerlength = crlfoffset - headerstartoffsetcrlf;
    240   bodystartoffsetcrlf = crlfoffset;
    241   bodylength = 0;
    242 
    243   // If we encounter the end of file, we return 1 as if we found our
    244   // parent's terminal boundary. This will cause a safe exit, and
    245   // whatever we parsed until now will be available.
    246   if (eof)
    247     return 1;
    248 
    249   // Do simple parsing of headers to determine the
    250   // type of message (multipart,messagerfc822 etc)
    251   HeaderItem ctype;
    252   if (h.getFirstHeader("content-type", ctype)) {
    253     vector<string> types;
    254     split(ctype.getValue(), ";", types);
    255 
    256     if (types.size() > 0) {
    257       // first element should describe content type
    258       string tmp = types[0];
    259       trim(tmp);
    260       vector<string> v;
    261       split(tmp, "/", v);
    262       string key, value;
    263 
    264       key = (v.size() > 0) ? v[0] : "text";
    265       value = (v.size() > 1) ? v[1] : "plain";
    266       lowercase(key);
    267       
    268       if (key == "multipart") {
    269 	multipart = true;
    270 	lowercase(value);
    271 	subtype = value;
    272       } else if (key == "message") {
    273 	lowercase(value);
    274 	if (value == "rfc822")
    275 	  messagerfc822 = true;
    276       }
    277     }
    278 
    279     for (vector<string>::const_iterator i = types.begin();
    280 	 i != types.end(); ++i) {
    281       string element = *i;
    282       trim(element);
    283 
    284       if (element.find("=") != string::npos) {
    285 	string::size_type pos = element.find('=');
    286 	string key = element.substr(0, pos);
    287 	string value = element.substr(pos + 1);
    288 	
    289 	lowercase(key);
    290 	trim(key);
    291 
    292 	if (key == "boundary") {
    293 	  trim(value, " \"");
    294 	  boundary = value;
    295 	}
    296       }
    297     }
    298   }
    299 
    300   bool foundendofpart = false;
    301   if (messagerfc822) {
    302     // message rfc822 means a completely enclosed mime document. we
    303     // call the parser recursively, and pass on the boundary string
    304     // that we got. when parse() finds this boundary, it returns 0. if
    305     // it finds the end boundary (boundary + "--"), it returns != 0.
    306     MimePart m;
    307 
    308     // parsefull returns the number of bytes that need to be removed
    309     // from the body because of the terminating boundary string.
    310     int bsize;
    311     if (m.parseFull(toboundary, bsize))
    312       foundendofpart = true;
    313 
    314     // make sure bodylength doesn't overflow    
    315     bodylength = crlfoffset;
    316     if (bodylength >= bodystartoffsetcrlf) {
    317       bodylength -= bodystartoffsetcrlf;
    318       if (bodylength >= (unsigned int) bsize) {
    319 	bodylength -= (unsigned int) bsize;
    320       } else {
    321 	bodylength = 0;
    322       }
    323     } else {
    324       bodylength = 0;
    325     }
    326 
    327     nbodylines += m.getNofLines();
    328 
    329     members.push_back(m);
    330 
    331   } else if (multipart) {
    332     // multipart parsing starts with skipping to the first
    333     // boundary. then we call parse() for all parts. the last parse()
    334     // command will return a code indicating that it found the last
    335     // boundary of this multipart. Note that the first boundary does
    336     // not have to start with CRLF.
    337     string delimiter = "--" + boundary;
    338 
    339     char *delimiterqueue = 0;
    340     int endpos = delimiter.length();
    341     delimiterqueue = new char[endpos];
    342     int delimiterpos = 0;
    343     bool eof = false;
    344 
    345     // first, skip to the first delimiter string. Anything between the
    346     // header and the first delimiter string is simply ignored (it's
    347     // usually a text message intended for non-mime clients)
    348     do {    
    349       if (!crlfGetChar(c)) {
    350 	eof = true;
    351 	break;
    352       }
    353 
    354       if (c == '\n')
    355 	++nlines;
    356 
    357       delimiterqueue[delimiterpos++ % endpos] = c;
    358 
    359       // Fixme: Must also check for all parents' delimiters.
    360     } while (!compareStringToQueue(delimiter, delimiterqueue, delimiterpos, endpos));
    361 
    362     delete delimiterqueue;
    363 
    364     if (!eof)
    365       boundarysize = delimiter.size();
    366 
    367     // Read two more characters. This may be CRLF, it may be "--" and
    368     // it may be any other two characters.
    369     char a;
    370     if (!crlfGetChar(a))
    371       eof = true;
    372 
    373     if (a == '\n')
    374       ++nlines; 
    375 
    376     char b;
    377     if (!crlfGetChar(b))
    378       eof = true;
    379     
    380     if (b == '\n')
    381       ++nlines;
    382     
    383     // If we find two dashes after the boundary, then this is the end
    384     // of boundary marker.
    385     if (!eof) {
    386       if (a == '-' && b == '-') {
    387 	foundendofpart = true;
    388 	boundarysize += 2;
    389 	
    390 	if (!crlfGetChar(a))
    391 	  eof = true;
    392 	
    393 	if (a == '\n')
    394 	  ++nlines; 
    395 	
    396 	if (!crlfGetChar(b))
    397 	  eof = true;
    398 	
    399 	if (b == '\n')
    400 	  ++nlines;
    401       }
    402 
    403       if (a == '\r' && b == '\n') {
    404 	// This exception is to handle a special case where the
    405 	// delimiter of one part is not followed by CRLF, but
    406 	// immediately followed by a CRLF prefixed delimiter.
    407 	if (!crlfGetChar(a) || !crlfGetChar(b))
    408 	  eof = true; 
    409 	else if (a == '-' && b == '-') {
    410 	  crlfUnGetChar();
    411 	  crlfUnGetChar();
    412 	  crlfUnGetChar();
    413 	  crlfUnGetChar();
    414 	} else {
    415 	  crlfUnGetChar();
    416 	  crlfUnGetChar();
    417 	}
    418 
    419 	boundarysize += 2;
    420       } else {
    421 	crlfUnGetChar();
    422 	crlfUnGetChar();
    423       }
    424     }
    425 
    426     // read all mime parts.
    427     if (!foundendofpart && !eof) {
    428       bool quit = false;
    429       do {
    430 	MimePart m;
    431 
    432 	// If parseFull returns != 0, then it encountered the multipart's
    433 	// final boundary.
    434 	int bsize = 0;
    435 	if (m.parseFull(boundary, bsize)) {
    436 	  quit = true;
    437 	  boundarysize = bsize;
    438 	}
    439 
    440 	members.push_back(m);
    441 	nlines += m.getNofLines();
    442 
    443       } while (!quit);
    444     }
    445 
    446     if (!foundendofpart && !eof) {
    447       // multipart parsing starts with skipping to the first
    448       // boundary. then we call parse() for all parts. the last parse()
    449       // command will return a code indicating that it found the last
    450       // boundary of this multipart. Note that the first boundary does
    451       // not have to start with CRLF.
    452       string delimiter = "\r\n--" + toboundary;
    453 
    454       char *delimiterqueue = 0;
    455       int endpos = delimiter.length();
    456       delimiterqueue = new char[endpos];
    457       int delimiterpos = 0;
    458       bool eof = false;
    459 
    460       // first, skip to the first delimiter string. Anything between the
    461       // header and the first delimiter string is simply ignored (it's
    462       // usually a text message intended for non-mime clients)
    463       do {    
    464 	if (!crlfGetChar(c)) {
    465 	  eof = true;
    466 	  break;
    467 	}
    468 
    469 	if (c == '\n')
    470 	  ++nlines;
    471 
    472 	delimiterqueue[delimiterpos++ % endpos] = c;
    473 
    474 	// Fixme: Must also check for all parents' delimiters.
    475       } while (!compareStringToQueue(delimiter, delimiterqueue, delimiterpos, endpos));
    476 
    477       delete delimiterqueue;
    478 
    479       if (!eof)
    480 	boundarysize = delimiter.size();
    481 
    482       // Read two more characters. This may be CRLF, it may be "--" and
    483       // it may be any other two characters.
    484       char a;
    485       if (!crlfGetChar(a))
    486 	eof = true;
    487 
    488       if (a == '\n')
    489 	++nlines; 
    490 
    491       char b;
    492       if (!crlfGetChar(b))
    493 	eof = true;
    494     
    495       if (b == '\n')
    496 	++nlines;
    497     
    498       // If we find two dashes after the boundary, then this is the end
    499       // of boundary marker.
    500       if (!eof) {
    501 	if (a == '-' && b == '-') {
    502 	  foundendofpart = true;
    503 	  boundarysize += 2;
    504 	
    505 	  if (!crlfGetChar(a))
    506 	    eof = true;
    507 	
    508 	  if (a == '\n')
    509 	    ++nlines; 
    510 	
    511 	  if (!crlfGetChar(b))
    512 	    eof = true;
    513 	
    514 	  if (b == '\n')
    515 	    ++nlines;
    516 	}
    517 
    518 	if (a == '\r' && b == '\n') {
    519 	  // This exception is to handle a special case where the
    520 	  // delimiter of one part is not followed by CRLF, but
    521 	  // immediately followed by a CRLF prefixed delimiter.
    522 	  if (!crlfGetChar(a) || !crlfGetChar(b))
    523 	    eof = true; 
    524 	  else if (a == '-' && b == '-') {
    525 	    crlfUnGetChar();
    526 	    crlfUnGetChar();
    527 	    crlfUnGetChar();
    528 	    crlfUnGetChar();
    529 	  } else {
    530 	    crlfUnGetChar();
    531 	    crlfUnGetChar();
    532 	  }
    533 
    534 	  boundarysize += 2;
    535 	} else {
    536 	  crlfUnGetChar();
    537 	  crlfUnGetChar();
    538 	}
    539       }
    540     }
    541 
    542     // make sure bodylength doesn't overflow    
    543     bodylength = crlfoffset;
    544     if (bodylength >= bodystartoffsetcrlf) {
    545       bodylength -= bodystartoffsetcrlf;
    546       if (bodylength >= (unsigned int) boundarysize) {
    547 	bodylength -= (unsigned int) boundarysize;
    548       } else {
    549 	bodylength = 0;
    550       }
    551     } else {
    552       bodylength = 0;
    553     }
    554 
    555   } else {
    556     // If toboundary is empty, then we read until the end of the
    557     // file. Otherwise we will read until we encounter toboundary.
    558     string _toboundary; 
    559     if (toboundary != "") {
    560       _toboundary = "\r\n--";
    561       _toboundary += toboundary;
    562     }
    563 
    564     char *boundaryqueue = 0;
    565     int endpos = _toboundary.length();
    566     if (toboundary != "")
    567       boundaryqueue = new char[endpos];
    568     int boundarypos = 0;
    569 
    570     boundarysize = 0;
    571 
    572     string line;
    573     int nchars = 0;
    574     while (crlfGetChar(c)) {
    575       if (c == '\n') { ++nbodylines; ++nlines; }
    576       nchars++;
    577 
    578       if (toboundary == "")
    579 	continue;
    580 
    581       // find boundary
    582       boundaryqueue[boundarypos++ % endpos] = c;
    583       
    584       if (compareStringToQueue(_toboundary, boundaryqueue, boundarypos, endpos)) {
    585 	boundarysize = _toboundary.length();
    586 	break;
    587       }
    588     }
    589 
    590     delete boundaryqueue;
    591  
    592     if (toboundary != "") {
    593       char a;
    594       if (!crlfGetChar(a))
    595 	eof = true;
    596 
    597       if (a == '\n')
    598 	++nlines;
    599       char b;
    600       if (!crlfGetChar(b))
    601 	eof = true;
    602 
    603       if (b == '\n') 
    604 	++nlines;
    605 
    606       if (a == '-' && b == '-') {
    607 	boundarysize += 2;
    608 	foundendofpart = true;
    609 	if (!crlfGetChar(a))
    610 	  eof = true;
    611 
    612 	if (a == '\n')
    613 	  ++nlines;
    614 
    615 	if (!crlfGetChar(b))
    616 	  eof = true;
    617 	  
    618 	if (b == '\n')
    619 	  ++nlines;
    620       }
    621 
    622       if (a == '\r' && b == '\n') {
    623 	// This exception is to handle a special case where the
    624 	// delimiter of one part is not followed by CRLF, but
    625 	// immediately followed by a CRLF prefixed delimiter.
    626 	if (!crlfGetChar(a) || !crlfGetChar(b))
    627 	  eof = true; 
    628 	else if (a == '-' && b == '-') {
    629 	  crlfUnGetChar();
    630 	  crlfUnGetChar();
    631 	  crlfUnGetChar();
    632 	  crlfUnGetChar();
    633 	} else {
    634 	  crlfUnGetChar();
    635 	  crlfUnGetChar();
    636 	}
    637 
    638 	boundarysize += 2;
    639       } else {
    640 	crlfUnGetChar();
    641 	crlfUnGetChar();
    642       }
    643     }
    644 
    645     // make sure bodylength doesn't overflow    
    646     bodylength = crlfoffset;
    647     if (bodylength >= bodystartoffsetcrlf) {
    648       bodylength -= bodystartoffsetcrlf;
    649       if (bodylength >= (unsigned int) boundarysize) {
    650 	bodylength -= (unsigned int) boundarysize;
    651       } else {
    652 	bodylength = 0;
    653       }
    654     } else {
    655       bodylength = 0;
    656     }
    657   }
    658 
    659   return (eof || foundendofpart) ? 1 : 0;
    660 }
	bincimap
	Log \| Files \| Refs \| LICENSE