mime-parsefull.cc (15206B)
1 /* -*- Mode: c++; -*- */ 2 /* -------------------------------------------------------------------- 3 * Filename: 4 * mime-parsefull.cc 5 * 6 * Description: 7 * Implementation of main mime parser components 8 * 9 * Authors: 10 * Andreas Aardal Hanssen <andreas-binc curly bincimap spot org> 11 * 12 * Bugs: 13 * 14 * ChangeLog: 15 * 16 * -------------------------------------------------------------------- 17 * Copyright 2002-2005 Andreas Aardal Hanssen 18 * 19 * This program is free software; you can redistribute it and/or modify 20 * it under the terms of the GNU General Public License as published by 21 * the Free Software Foundation; either version 2 of the License, or 22 * (at your option) any later version. 23 * 24 * This program is distributed in the hope that it will be useful, 25 * but WITHOUT ANY WARRANTY; without even the implied warranty of 26 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 27 * GNU General Public License for more details. 28 * 29 * You should have received a copy of the GNU General Public License 30 * along with this program; if not, write to the Free Software 31 * Foundation, Inc., 59 Temple Street #330, Boston, MA 02111-1307, USA. 32 * -------------------------------------------------------------------- 33 */ 34 #ifdef HAVE_CONFIG_H 35 #include <config.h> 36 #endif 37 38 #include "mime.h" 39 #include "mime-utils.h" 40 #include "convert.h" 41 #include "io.h" 42 #include <string> 43 #include <vector> 44 #include <map> 45 #include <exception> 46 #include <iostream> 47 48 #include <string.h> 49 #include <ctype.h> 50 #include <stdio.h> 51 #include <errno.h> 52 53 using namespace ::std; 54 55 int crlffile = 0; 56 char crlfdata[4096]; 57 unsigned int crlftail = 0; 58 unsigned int crlfhead = 0; 59 unsigned int crlfoffset = 0; 60 char lastchar = '\0'; 61 62 //------------------------------------------------------------------------ 63 bool fillInputBuffer(void) 64 { 65 char raw[1024]; 66 67 ssize_t nbytes; 68 for (;;) { 69 nbytes = read(crlffile, raw, sizeof(raw) - 1); 70 if (nbytes <= 0) { 71 // FIXME: If ferror(crlffile) we should log this. 72 return false; 73 } 74 else break; 75 } 76 77 for (ssize_t i = 0; i < nbytes; ++i) { 78 const char c = raw[i]; 79 switch (c) { 80 case '\r': 81 if (lastchar == '\r') { 82 crlfdata[crlftail++ & 0xfff] = '\r'; 83 crlfdata[crlftail++ & 0xfff] = '\n'; 84 } 85 break; 86 case '\n': 87 crlfdata[crlftail++ & 0xfff] = '\r'; 88 crlfdata[crlftail++ & 0xfff] = '\n'; 89 break; 90 default: 91 if (lastchar == '\r') { 92 crlfdata[crlftail++ & 0xfff] = '\r'; 93 crlfdata[crlftail++ & 0xfff] = '\n'; 94 } 95 96 crlfdata[crlftail++ & 0xfff] = c; 97 break; 98 } 99 100 lastchar = c; 101 } 102 103 return true; 104 } 105 106 107 //------------------------------------------------------------------------ 108 void Binc::MimeDocument::parseFull(int fd) const 109 { 110 if (allIsParsed) 111 return; 112 113 allIsParsed = true; 114 115 crlffile = fd; 116 crlfReset(); 117 118 headerstartoffsetcrlf = 0; 119 headerlength = 0; 120 bodystartoffsetcrlf = 0; 121 bodylength = 0; 122 size = 0; 123 messagerfc822 = false; 124 multipart = false; 125 126 int bsize = 0; 127 MimePart::parseFull("", bsize); 128 129 // eat any trailing junk to get the correct size 130 char c; 131 while (crlfGetChar(c)); 132 133 size = crlfoffset; 134 } 135 136 //------------------------------------------------------------------------ 137 int Binc::MimePart::parseFull(const string &toboundary, int &boundarysize) const 138 { 139 string name; 140 string content; 141 char cqueue[4]; 142 memset(cqueue, 0, sizeof(cqueue)); 143 144 bool quit = false; 145 char c; 146 bool eof = false; 147 148 headerstartoffsetcrlf = crlfoffset; 149 150 while (!quit && !eof) { 151 // read name 152 while (1) { 153 if (!crlfGetChar(c)) { 154 eof = true; 155 break; 156 } 157 158 if (c == '\n') ++nlines; 159 if (c == ':') break; 160 if (c == '\n') { 161 // If we encounter a \n before we got to the first ':', then 162 // if the line is not empty, rewind back to the start of the 163 // line and assume we're at the start of the body. If not, 164 // just skip the line and assume we're at the start of the 165 // body. 166 string ntmp = name; 167 trim(ntmp); 168 if (ntmp != "") 169 for (int i = name.length() - 1; i >= 0; --i) 170 crlfUnGetChar(); 171 172 quit = true; 173 name = ""; 174 break; 175 } 176 177 name += c; 178 179 if (name.length() == 2 && name.substr(0, 2) == "\r\n") { 180 name = ""; 181 quit = true; 182 break; 183 } 184 } 185 186 if (name.length() == 1 && name[0] == '\r') { 187 name = ""; 188 break; 189 } 190 191 if (quit || eof) break; 192 193 while (!quit) { 194 if (!crlfGetChar(c)) { 195 quit = true; 196 break; 197 } 198 199 if (c == '\n') ++nlines; 200 201 for (int i = 0; i < 3; ++i) 202 cqueue[i] = cqueue[i + 1]; 203 cqueue[3] = c; 204 205 if (strncmp(cqueue, "\r\n\r\n", 4) == 0) { 206 quit = true; 207 break; 208 } 209 210 if (cqueue[2] == '\n') { 211 212 // guess the mime rfc says what can not appear on the beginning 213 // of a line. 214 if (!isspace(cqueue[3])) { 215 if (content.length() > 2) 216 content.resize(content.length() - 2); 217 218 trim(content); 219 h.add(name, content); 220 221 name = c; 222 content = ""; 223 break; 224 } 225 } 226 227 content += c; 228 } 229 } 230 231 if (name != "") { 232 if (content.length() > 2) 233 content.resize(content.length() - 2); 234 h.add(name, content); 235 } 236 237 // Headerlength includes the seperating CRLF. Body starts after the 238 // CRLF. 239 headerlength = crlfoffset - headerstartoffsetcrlf; 240 bodystartoffsetcrlf = crlfoffset; 241 bodylength = 0; 242 243 // If we encounter the end of file, we return 1 as if we found our 244 // parent's terminal boundary. This will cause a safe exit, and 245 // whatever we parsed until now will be available. 246 if (eof) 247 return 1; 248 249 // Do simple parsing of headers to determine the 250 // type of message (multipart,messagerfc822 etc) 251 HeaderItem ctype; 252 if (h.getFirstHeader("content-type", ctype)) { 253 vector<string> types; 254 split(ctype.getValue(), ";", types); 255 256 if (types.size() > 0) { 257 // first element should describe content type 258 string tmp = types[0]; 259 trim(tmp); 260 vector<string> v; 261 split(tmp, "/", v); 262 string key, value; 263 264 key = (v.size() > 0) ? v[0] : "text"; 265 value = (v.size() > 1) ? v[1] : "plain"; 266 lowercase(key); 267 268 if (key == "multipart") { 269 multipart = true; 270 lowercase(value); 271 subtype = value; 272 } else if (key == "message") { 273 lowercase(value); 274 if (value == "rfc822") 275 messagerfc822 = true; 276 } 277 } 278 279 for (vector<string>::const_iterator i = types.begin(); 280 i != types.end(); ++i) { 281 string element = *i; 282 trim(element); 283 284 if (element.find("=") != string::npos) { 285 string::size_type pos = element.find('='); 286 string key = element.substr(0, pos); 287 string value = element.substr(pos + 1); 288 289 lowercase(key); 290 trim(key); 291 292 if (key == "boundary") { 293 trim(value, " \""); 294 boundary = value; 295 } 296 } 297 } 298 } 299 300 bool foundendofpart = false; 301 if (messagerfc822) { 302 // message rfc822 means a completely enclosed mime document. we 303 // call the parser recursively, and pass on the boundary string 304 // that we got. when parse() finds this boundary, it returns 0. if 305 // it finds the end boundary (boundary + "--"), it returns != 0. 306 MimePart m; 307 308 // parsefull returns the number of bytes that need to be removed 309 // from the body because of the terminating boundary string. 310 int bsize; 311 if (m.parseFull(toboundary, bsize)) 312 foundendofpart = true; 313 314 // make sure bodylength doesn't overflow 315 bodylength = crlfoffset; 316 if (bodylength >= bodystartoffsetcrlf) { 317 bodylength -= bodystartoffsetcrlf; 318 if (bodylength >= (unsigned int) bsize) { 319 bodylength -= (unsigned int) bsize; 320 } else { 321 bodylength = 0; 322 } 323 } else { 324 bodylength = 0; 325 } 326 327 nbodylines += m.getNofLines(); 328 329 members.push_back(m); 330 331 } else if (multipart) { 332 // multipart parsing starts with skipping to the first 333 // boundary. then we call parse() for all parts. the last parse() 334 // command will return a code indicating that it found the last 335 // boundary of this multipart. Note that the first boundary does 336 // not have to start with CRLF. 337 string delimiter = "--" + boundary; 338 339 char *delimiterqueue = 0; 340 int endpos = delimiter.length(); 341 delimiterqueue = new char[endpos]; 342 int delimiterpos = 0; 343 bool eof = false; 344 345 // first, skip to the first delimiter string. Anything between the 346 // header and the first delimiter string is simply ignored (it's 347 // usually a text message intended for non-mime clients) 348 do { 349 if (!crlfGetChar(c)) { 350 eof = true; 351 break; 352 } 353 354 if (c == '\n') 355 ++nlines; 356 357 delimiterqueue[delimiterpos++ % endpos] = c; 358 359 // Fixme: Must also check for all parents' delimiters. 360 } while (!compareStringToQueue(delimiter, delimiterqueue, delimiterpos, endpos)); 361 362 delete delimiterqueue; 363 364 if (!eof) 365 boundarysize = delimiter.size(); 366 367 // Read two more characters. This may be CRLF, it may be "--" and 368 // it may be any other two characters. 369 char a; 370 if (!crlfGetChar(a)) 371 eof = true; 372 373 if (a == '\n') 374 ++nlines; 375 376 char b; 377 if (!crlfGetChar(b)) 378 eof = true; 379 380 if (b == '\n') 381 ++nlines; 382 383 // If we find two dashes after the boundary, then this is the end 384 // of boundary marker. 385 if (!eof) { 386 if (a == '-' && b == '-') { 387 foundendofpart = true; 388 boundarysize += 2; 389 390 if (!crlfGetChar(a)) 391 eof = true; 392 393 if (a == '\n') 394 ++nlines; 395 396 if (!crlfGetChar(b)) 397 eof = true; 398 399 if (b == '\n') 400 ++nlines; 401 } 402 403 if (a == '\r' && b == '\n') { 404 // This exception is to handle a special case where the 405 // delimiter of one part is not followed by CRLF, but 406 // immediately followed by a CRLF prefixed delimiter. 407 if (!crlfGetChar(a) || !crlfGetChar(b)) 408 eof = true; 409 else if (a == '-' && b == '-') { 410 crlfUnGetChar(); 411 crlfUnGetChar(); 412 crlfUnGetChar(); 413 crlfUnGetChar(); 414 } else { 415 crlfUnGetChar(); 416 crlfUnGetChar(); 417 } 418 419 boundarysize += 2; 420 } else { 421 crlfUnGetChar(); 422 crlfUnGetChar(); 423 } 424 } 425 426 // read all mime parts. 427 if (!foundendofpart && !eof) { 428 bool quit = false; 429 do { 430 MimePart m; 431 432 // If parseFull returns != 0, then it encountered the multipart's 433 // final boundary. 434 int bsize = 0; 435 if (m.parseFull(boundary, bsize)) { 436 quit = true; 437 boundarysize = bsize; 438 } 439 440 members.push_back(m); 441 nlines += m.getNofLines(); 442 443 } while (!quit); 444 } 445 446 if (!foundendofpart && !eof) { 447 // multipart parsing starts with skipping to the first 448 // boundary. then we call parse() for all parts. the last parse() 449 // command will return a code indicating that it found the last 450 // boundary of this multipart. Note that the first boundary does 451 // not have to start with CRLF. 452 string delimiter = "\r\n--" + toboundary; 453 454 char *delimiterqueue = 0; 455 int endpos = delimiter.length(); 456 delimiterqueue = new char[endpos]; 457 int delimiterpos = 0; 458 bool eof = false; 459 460 // first, skip to the first delimiter string. Anything between the 461 // header and the first delimiter string is simply ignored (it's 462 // usually a text message intended for non-mime clients) 463 do { 464 if (!crlfGetChar(c)) { 465 eof = true; 466 break; 467 } 468 469 if (c == '\n') 470 ++nlines; 471 472 delimiterqueue[delimiterpos++ % endpos] = c; 473 474 // Fixme: Must also check for all parents' delimiters. 475 } while (!compareStringToQueue(delimiter, delimiterqueue, delimiterpos, endpos)); 476 477 delete delimiterqueue; 478 479 if (!eof) 480 boundarysize = delimiter.size(); 481 482 // Read two more characters. This may be CRLF, it may be "--" and 483 // it may be any other two characters. 484 char a; 485 if (!crlfGetChar(a)) 486 eof = true; 487 488 if (a == '\n') 489 ++nlines; 490 491 char b; 492 if (!crlfGetChar(b)) 493 eof = true; 494 495 if (b == '\n') 496 ++nlines; 497 498 // If we find two dashes after the boundary, then this is the end 499 // of boundary marker. 500 if (!eof) { 501 if (a == '-' && b == '-') { 502 foundendofpart = true; 503 boundarysize += 2; 504 505 if (!crlfGetChar(a)) 506 eof = true; 507 508 if (a == '\n') 509 ++nlines; 510 511 if (!crlfGetChar(b)) 512 eof = true; 513 514 if (b == '\n') 515 ++nlines; 516 } 517 518 if (a == '\r' && b == '\n') { 519 // This exception is to handle a special case where the 520 // delimiter of one part is not followed by CRLF, but 521 // immediately followed by a CRLF prefixed delimiter. 522 if (!crlfGetChar(a) || !crlfGetChar(b)) 523 eof = true; 524 else if (a == '-' && b == '-') { 525 crlfUnGetChar(); 526 crlfUnGetChar(); 527 crlfUnGetChar(); 528 crlfUnGetChar(); 529 } else { 530 crlfUnGetChar(); 531 crlfUnGetChar(); 532 } 533 534 boundarysize += 2; 535 } else { 536 crlfUnGetChar(); 537 crlfUnGetChar(); 538 } 539 } 540 } 541 542 // make sure bodylength doesn't overflow 543 bodylength = crlfoffset; 544 if (bodylength >= bodystartoffsetcrlf) { 545 bodylength -= bodystartoffsetcrlf; 546 if (bodylength >= (unsigned int) boundarysize) { 547 bodylength -= (unsigned int) boundarysize; 548 } else { 549 bodylength = 0; 550 } 551 } else { 552 bodylength = 0; 553 } 554 555 } else { 556 // If toboundary is empty, then we read until the end of the 557 // file. Otherwise we will read until we encounter toboundary. 558 string _toboundary; 559 if (toboundary != "") { 560 _toboundary = "\r\n--"; 561 _toboundary += toboundary; 562 } 563 564 char *boundaryqueue = 0; 565 int endpos = _toboundary.length(); 566 if (toboundary != "") 567 boundaryqueue = new char[endpos]; 568 int boundarypos = 0; 569 570 boundarysize = 0; 571 572 string line; 573 int nchars = 0; 574 while (crlfGetChar(c)) { 575 if (c == '\n') { ++nbodylines; ++nlines; } 576 nchars++; 577 578 if (toboundary == "") 579 continue; 580 581 // find boundary 582 boundaryqueue[boundarypos++ % endpos] = c; 583 584 if (compareStringToQueue(_toboundary, boundaryqueue, boundarypos, endpos)) { 585 boundarysize = _toboundary.length(); 586 break; 587 } 588 } 589 590 delete boundaryqueue; 591 592 if (toboundary != "") { 593 char a; 594 if (!crlfGetChar(a)) 595 eof = true; 596 597 if (a == '\n') 598 ++nlines; 599 char b; 600 if (!crlfGetChar(b)) 601 eof = true; 602 603 if (b == '\n') 604 ++nlines; 605 606 if (a == '-' && b == '-') { 607 boundarysize += 2; 608 foundendofpart = true; 609 if (!crlfGetChar(a)) 610 eof = true; 611 612 if (a == '\n') 613 ++nlines; 614 615 if (!crlfGetChar(b)) 616 eof = true; 617 618 if (b == '\n') 619 ++nlines; 620 } 621 622 if (a == '\r' && b == '\n') { 623 // This exception is to handle a special case where the 624 // delimiter of one part is not followed by CRLF, but 625 // immediately followed by a CRLF prefixed delimiter. 626 if (!crlfGetChar(a) || !crlfGetChar(b)) 627 eof = true; 628 else if (a == '-' && b == '-') { 629 crlfUnGetChar(); 630 crlfUnGetChar(); 631 crlfUnGetChar(); 632 crlfUnGetChar(); 633 } else { 634 crlfUnGetChar(); 635 crlfUnGetChar(); 636 } 637 638 boundarysize += 2; 639 } else { 640 crlfUnGetChar(); 641 crlfUnGetChar(); 642 } 643 } 644 645 // make sure bodylength doesn't overflow 646 bodylength = crlfoffset; 647 if (bodylength >= bodystartoffsetcrlf) { 648 bodylength -= bodystartoffsetcrlf; 649 if (bodylength >= (unsigned int) boundarysize) { 650 bodylength -= (unsigned int) boundarysize; 651 } else { 652 bodylength = 0; 653 } 654 } else { 655 bodylength = 0; 656 } 657 } 658 659 return (eof || foundendofpart) ? 1 : 0; 660 }