Check-in [c9c3f95db4]

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Improved decoding of headers.
Timelines: family | ancestors | descendants | both | python
Files: files | file ages | folders
SHA1: c9c3f95db458a72ca43edb99ae99679d046889c7
User & Date: tinus 2017-01-04 21:20:51
Context
2017-01-06
15:40
Take into account the scenario where two encoded-words are separated by a single space; in that situation, the space should be ignored. check-in: 76b4f8baff user: MCO tags: python
2017-01-04
21:20
Improved decoding of headers. check-in: c9c3f95db4 user: tinus tags: python
15:13
Since email.header.decode_header is so strict, preprocess the headers before we decode them. check-in: e28f89a2db user: tinus tags: python
Changes

Changes to mailjanitor.py.

15
16
17
18
19
20
21

22
23
24
25
26



27


28
29
30
31
32
33
34
35


36
37




38
39
40
41
42
43
44
45


46
47
48
49
50
51
52
53
54
55
56
57
import time
import json
import sqlite3
import re

# Charset used to interpret raw byte-string headers and decoded parts that
# do not declare a charset of their own.
DEFAULT_MAIL_ENCODING = "iso-8859-1"


# Matches one "=?...?=" chunk, as short as possible.
# Backslashes are doubled so the literal contains no invalid string escapes;
# the compiled pattern itself is unchanged.
ENCODED_HEADER_PATTERN = re.compile(u'=\\?(.+?)\\?=')
# Matches a folded line break: a newline sequence followed by spaces/tabs.
UNFOLD_HEADER_PATTERN = re.compile(u'(\r\n|\r|\n)[ \t]+')

def decode_header_part(match):
    """Decode the text of one regex match into unicode.

    Written to be used as the replacement callback of re.sub(): the full
    matched text is split into (bytes, charset) parts by
    email.header.decode_header() and reassembled with
    email.header.make_header().
    """
    # TODO: the result should be unicode; which means the input string should also be unicode
    raw_word = match.group(0)
    decoded_parts = email.header.decode_header(raw_word)
    assembled = email.header.make_header(decoded_parts)
    return unicode(assembled)



def decode_header(message_header):
    """Decode a raw mail header value into unicode text.

    A byte string is first interpreted as DEFAULT_MAIL_ENCODING. Each
    encoded chunk is decoded on its own via decode_header_part; if that
    fails, the whole header is handed to email.header.decode_header()
    as a fallback. Folded line breaks are then replaced by a single space.

    Returns None when message_header is None or when the decoded text is
    the empty string; otherwise the decoded text with surrounding
    whitespace stripped (the emptiness check happens before stripping).
    """
    if message_header is None:
        return None
    elif isinstance(message_header, str):
        message_header = unicode(message_header, DEFAULT_MAIL_ENCODING)

    try:
        # Since email.header.decode_header is too strict, call it for every separate encoded part
        # See also http://stackoverflow.com/questions/7331351/python-email-header-decoding-utf-8
        result = re.sub(ENCODED_HEADER_PATTERN, decode_header_part, message_header)
    except Exception:
        # Best-effort fallback. Only Exception is caught so that
        # KeyboardInterrupt and SystemExit are no longer swallowed here.
        result = u''
        # TODO: test if above code works in all cases; if so, remove code below; if not, then
        #  use re.sub() to replace each bit matching =\?(.+?)\?= (see RFC2047)
        # See also http://stackoverflow.com/questions/7331351/python-email-header-decoding-utf-8
        for decode in email.header.decode_header(message_header.replace("\r\n", " ")):
            if result != u'':
                result = result + u" "
            # decode is a (text, charset) pair; charset may be None
            result = result + unicode(decode[0], decode[1] or DEFAULT_MAIL_ENCODING, 'replace')
    # also replace (\r\n|\r|\n)\s+ by single space
    result = re.sub(UNFOLD_HEADER_PATTERN, u' ', result)
    return None if result == u'' else result.strip()

def mime_structure(msg, indent=0):
    """Return an indented outline of the content types in a message.

    Each line holds one content type, prefixed by `indent` tab characters
    and terminated by a newline; the parts of a multipart message follow
    their container, one level deeper.
    """
    lines = ["\t" * indent + msg.get_content_type() + "\n"]
    if msg.is_multipart():
        # Recurse into every sub-part with one extra level of indentation.
        lines.extend(mime_structure(child, indent + 1)
                     for child in msg.get_payload())
    return "".join(lines)







>
|



|
>
>
>
|
>
>







|
>
>
|
|
>
>
>
>
|
<
<
<
<
<
<
<
>
>
|
<
<
<
|







15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50







51
52
53



54
55
56
57
58
59
60
61
import time
import json
import sqlite3
import re

# Charset used to interpret raw byte-string headers.
DEFAULT_MAIL_ENCODING = "iso-8859-1"

# https://tools.ietf.org/html/rfc2047#section-2
# Groups: 1 = charset, 2 = encoding, 3 = encoded text.
# Backslashes meant for the regex engine are doubled so the literal contains
# no invalid string escapes; the compiled pattern itself is unchanged.
ENCODED_HEADER_PATTERN = re.compile(u'=\\?([^ \t\r\n()<>@,;:"/[\\]?.=]+)\\?(\\w+)\\?([^ ?]+)\\?=')
# Matches a folded line break: a newline sequence followed by spaces/tabs.
UNFOLD_HEADER_PATTERN = re.compile(u'(\r\n|\r|\n)[ \t]+')

def decode_header_part(match):
    """Return the unicode text for one matched encoded-word.

    Meant as the replacement callback for re.sub(): the whole matched text
    is parsed by email.header.decode_header() and the resulting parts are
    recombined through email.header.make_header().
    """
    return unicode(
        email.header.make_header(
            email.header.decode_header(match.group(0))
        )
    )

def decode_header(message_header):
    """Decode a raw mail header value into unicode text.

    A byte string is first interpreted as DEFAULT_MAIL_ENCODING. The header
    is unfolded, whitespace that merely separates two adjacent encoded-words
    is removed, every encoded-word is decoded individually through
    decode_header_part, and surrounding whitespace is trimmed.

    Returns None when message_header is None or when nothing remains after
    trimming; otherwise the decoded unicode text.
    """
    if message_header is None:
        return None
    elif isinstance(message_header, str):
        message_header = unicode(message_header, DEFAULT_MAIL_ENCODING)

    # unfold the header: replace (\r\n|\r|\n)\s+ by single space
    message_header = re.sub(UNFOLD_HEADER_PATTERN, u' ', message_header)

    # RFC 2047 section 6.2: whitespace that only separates two adjacent
    # encoded-words is not part of the text, so drop it before decoding.
    # The second word sits in a lookahead so that it is not consumed and can
    # itself be the first word of the next pair.
    encoded_word = ENCODED_HEADER_PATTERN.pattern
    adjacent_words = re.compile(
        u'(' + encoded_word + u')[ \t]+(?=' + encoded_word + u')')
    message_header = adjacent_words.sub(lambda m: m.group(1), message_header)

    # Since email.header.decode_header is too strict, call it for every separate encoded part
    # See also http://stackoverflow.com/questions/7331351/python-email-header-decoding-utf-8
    # TODO: take into account cases where two encoded-words are not separated by a space, but by
    #  other text.
    result = ENCODED_HEADER_PATTERN.sub(decode_header_part, message_header)

    # Trim any leading or trailing whitespace
    result = result.strip()

    return None if result == u'' else result

def mime_structure(msg, indent=0):
    """Return an indented outline of the content types in a message.

    Each line holds one content type, prefixed by `indent` tab characters
    and terminated by a newline; the parts of a multipart message follow
    their container, one level deeper.
    """
    pieces = ["\t" * indent + msg.get_content_type() + "\n"]
    if msg.is_multipart():
        # Recurse into every sub-part with one extra level of indentation.
        pieces.extend(mime_structure(sub_part, indent + 1)
                      for sub_part in msg.get_payload())
    return "".join(pieces)