Check-in [76b4f8baff]

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Take into account the scenario where two encoded-words are separated by a single space; in that situation, the space should be ignored.
Timelines: family | ancestors | descendants | both | python
Files: files | file ages | folders
SHA1: 76b4f8baff94d6a92d935c970017957170dea3dc
User & Date: MCO 2017-01-06 15:40:48
Context
2017-01-06
15:51
Attempt to solve the UnboundLocalError by making decode_header_part an inner function. check-in: 5eaed419c9 user: MCO tags: python
15:40
Take into account the scenario where two encoded-words are separated by a single space; in that situation, the space should be ignored. check-in: 76b4f8baff user: MCO tags: python
2017-01-04
21:20
Improved decoding of headers. check-in: c9c3f95db4 user: tinus tags: python
Changes

Changes to mailjanitor.py.

16
17
18
19
20
21
22
23
24
25


26

27



28
29
30
31
32

33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49

50
51
52
53
54
55
56
...
148
149
150
151
152
153
154




155
156
157
158
159
160
161
import json
import sqlite3
import re

# Charset used to turn byte-string headers into unicode before decoding.
DEFAULT_MAIL_ENCODING = "iso-8859-1"

# Matches one encoded-word of the form =?charset?encoding?encoded-text?=
# (groups 1, 2 and 3 respectively).
# https://tools.ietf.org/html/rfc2047#section-2
ENCODED_HEADER_PATTERN = re.compile(u'=\?([^ \t\r\n()<>@,;:"/[\]?.=]+)\?(\w+)\?([^ ?]+)\?=')
# Matches a line break followed by spaces/tabs, i.e. the fold of a folded header.
UNFOLD_HEADER_PATTERN = re.compile(u'(\r\n|\r|\n)[ \t]+')



def decode_header_part(match):
    """Decode the encoded-word captured by *match* into a unicode string.

    Meant to be used as the replacement callback of a regex substitution:
    the complete matched text is parsed with email.header.decode_header and
    the resulting parts are recombined through email.header.make_header.
    """
    parts = email.header.decode_header(match.group(0))
    return unicode(email.header.make_header(parts))

def decode_header(message_header):
    """Return *message_header* unfolded and with its encoded-words decoded.

    None is passed through unchanged.  Byte strings are first converted to
    unicode using DEFAULT_MAIL_ENCODING.  The result is stripped of leading
    and trailing whitespace; an empty result is reported as None.
    """
    if message_header is None:
        return None
    if isinstance(message_header, str):
        message_header = unicode(message_header, DEFAULT_MAIL_ENCODING)

    # Unfold the header: every line break plus following blanks becomes one space.
    unfolded = UNFOLD_HEADER_PATTERN.sub(u' ', message_header)

    # email.header.decode_header is too strict for whole headers, so it is
    # applied to each encoded-word separately.
    # See also http://stackoverflow.com/questions/7331351/python-email-header-decoding-utf-8
    # TODO: take into account the scenario where two encoded-words are separated by a single space;
    #  in that situation, the space should be ignored.
    # TODO: take into account cases where two encoded-words are not separated by a space, but by
    #  other text.
    decoded = ENCODED_HEADER_PATTERN.sub(decode_header_part, unfolded).strip()

    if decoded == u'':
        return None
    return decoded

def mime_structure(msg, indent = 0):
................................................................................
    uids = data[0].split()
    for uid in uids:
        i += 1
        reply, data = mailbox.uid("FETCH", uid, '(UID INTERNALDATE FLAGS)')
        if reply != 'OK':
            print "ERROR getting message", uid
            return





        #print data
        #print data[0]
        # TODO: email_data = FETCH_RESPONSE_PATTERN.match(data[0]).groupdict()
        uid, internaldate, flags = FETCH_RESPONSE_PATTERN.match(data[0]).groups()
        #print "UID", uid
        #print "Internal date", internaldate







|


>
>

>

>
>
>
|



|
>













<
<


>







 







>
>
>
>







16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52


53
54
55
56
57
58
59
60
61
62
...
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import json
import sqlite3
import re

# Charset used to turn byte-string headers into unicode before decoding.
DEFAULT_MAIL_ENCODING = "iso-8859-1"

# Matches one encoded-word of the form =?charset?encoding?encoded-text?=
# (groups 2, 3 and 4), optionally preceded by a single space (group 1).
# https://tools.ietf.org/html/rfc2047#section-2
ENCODED_HEADER_PATTERN = re.compile(u'( )?=\?([^ \t\r\n()<>@,;:"/[\]?.=]+)\?(\w+)\?([^ ?]+)\?=')
# Matches a line break followed by spaces/tabs, i.e. the fold of a folded header.
UNFOLD_HEADER_PATTERN = re.compile(u'(\r\n|\r|\n)[ \t]+')

# Retained only so the module-level name keeps existing; the decoding
# functions below no longer read or write it.
last_match_end = None

def decode_header_part(match):
    """Decode one encoded-word matched by ENCODED_HEADER_PATTERN.

    match -- match object of ENCODED_HEADER_PATTERN; group 1 is the optional
             single space captured in front of the encoded-word.

    Returns the decoded text as a unicode string.  The captured leading space
    is never part of the result; decode_header() decides whether it has to be
    put back.
    """
    encoded_header = match.group(0)
    if match.group(1):
        # Drop the captured separator so only the encoded-word is decoded.
        encoded_header = encoded_header[1:]
    header_parts = email.header.decode_header(encoded_header)
    return unicode(email.header.make_header(header_parts))

def decode_header(message_header):
    """Return *message_header* unfolded and with its encoded-words decoded.

    message_header -- None, a byte string (decoded with DEFAULT_MAIL_ENCODING)
                      or a unicode string.

    A single space between two adjacent encoded-words is ignored; a space
    between ordinary text and an encoded-word is kept.  The result is stripped
    of leading and trailing whitespace.  Returns None when the input is None
    or nothing is left after stripping.
    """
    if message_header is None:
        return None
    elif isinstance(message_header, str):
        message_header = unicode(message_header, DEFAULT_MAIL_ENCODING)

    # unfold the header: replace (\r\n|\r|\n)[ \t]+ by single space
    message_header = UNFOLD_HEADER_PATTERN.sub(u' ', message_header)

    # Since email.header.decode_header is too strict, call it for every separate encoded part
    # See also http://stackoverflow.com/questions/7331351/python-email-header-decoding-utf-8
    # TODO: take into account cases where two encoded-words are not separated by a space, but by
    #  other text.

    # End offset of the previous encoded-word.  Kept in a list because the
    # callback must update it and Python 2 closures cannot rebind outer locals.
    previous_end = [-1]

    def replace_encoded_word(match):
        decoded = decode_header_part(match)
        follows_encoded_word = (match.start() == previous_end[0])
        previous_end[0] = match.end()
        if match.group(1) and not follows_encoded_word:
            # The space separates ordinary text from this encoded-word, so it
            # is significant and has to be restored.
            return match.group(1) + decoded
        return decoded

    result = ENCODED_HEADER_PATTERN.sub(replace_encoded_word, message_header)

    # Trim any leading or trailing whitespace
    result = result.strip()
    return None if result == u'' else result

def mime_structure(msg, indent = 0):
................................................................................
    uids = data[0].split()
    for uid in uids:
        i += 1
        reply, data = mailbox.uid("FETCH", uid, '(UID INTERNALDATE FLAGS)')
        if reply != 'OK':
            print "ERROR getting message", uid
            return

        if len(data) == 0 or not isinstance(data[0], str):
            print "!!! Message", i, "with uid", uid, " does not seem to exist\x07"
            continue

        #print data
        #print data[0]
        # TODO: email_data = FETCH_RESPONSE_PATTERN.match(data[0]).groupdict()
        uid, internaldate, flags = FETCH_RESPONSE_PATTERN.match(data[0]).groups()
        #print "UID", uid
        #print "Internal date", internaldate