python/imapUTF7.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# This code was originally in PloneMailList, a GPL'd software.
# http://svn.plone.org/svn/collective/mxmImapClient/trunk/imapUTF7.py
# http://bugs.python.org/issue5305
#
# Port to Python 3.x
# Credit: https://github.com/MarechJ/py3_imap_utf7
#
# 2016-01-23
# Aaron LI
#

"""
Imap folder names are encoded using a special version of utf-7 as defined in RFC
2060 section 5.1.3.

5.1.3.  Mailbox International Naming Convention

   By convention, international mailbox names are specified using a
   modified version of the UTF-7 encoding described in [UTF-7].  The
   purpose of these modifications is to correct the following problems
   with UTF-7:

      1) UTF-7 uses the "+" character for shifting; this conflicts with
         the common use of "+" in mailbox names, in particular USENET
         newsgroup names.

      2) UTF-7's encoding is BASE64 which uses the "/" character; this
         conflicts with the use of "/" as a popular hierarchy delimiter.

      3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
         the use of "\" as a popular hierarchy delimiter.

      4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
         the use of "~" in some servers as a home directory indicator.

      5) UTF-7 permits multiple alternate forms to represent the same
         string; in particular, printable US-ASCII chararacters can be
         represented in encoded form.

   In modified UTF-7, printable US-ASCII characters except for "&"
   represent themselves; that is, characters with octet values 0x20-0x25
   and 0x27-0x7e.  The character "&" (0x26) is represented by the two-
   octet sequence "&-".

   All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
   Unicode 16-bit octets) are represented in modified BASE64, with a
   further modification from [UTF-7] that "," is used instead of "/".
   Modified BASE64 MUST NOT be used to represent any printing US-ASCII
   character which can represent itself.

   "&" is used to shift to modified BASE64 and "-" to shift back to US-
   ASCII.  All names start in US-ASCII, and MUST end in US-ASCII (that
   is, a name that ends with a Unicode 16-bit octet MUST end with a "-
   ").

      For example, here is a mailbox name which mixes English, Japanese,
      and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
"""


import binascii
import codecs


## encoding

def modified_base64(s:str):
    s = s.encode('utf-16be')  # UTF-16, big-endian byte order
    return binascii.b2a_base64(s).rstrip(b'\n=').replace(b'/', b',')

def doB64(_in, r):
    if _in:
        r.append(b'&' + modified_base64(''.join(_in)) + b'-')
        del _in[:]

def encoder(s:str):
    r = []
    _in = []
    for c in s:
        ordC = ord(c)
        if 0x20 <= ordC <= 0x25 or 0x27 <= ordC <= 0x7e:
            doB64(_in, r)
            r.append(c.encode())
        elif c == '&':
            doB64(_in, r)
            r.append(b'&-')
        else:
            _in.append(c)
    doB64(_in, r)
    return (b''.join(r), len(s))


## decoding

def modified_unbase64(s:bytes):
    b = binascii.a2b_base64(s.replace(b',', b'/') + b'===')
    return b.decode('utf-16be')

def decoder(s:bytes):
    r = []
    decode = bytearray()
    for c in s:
        if c == ord('&') and not decode:
            decode.append(ord('&'))
        elif c == ord('-') and decode:
            if len(decode) == 1:
                r.append('&')
            else:
                r.append(modified_unbase64(decode[1:]))
            decode = bytearray()
        elif decode:
            decode.append(c)
        else:
            r.append(chr(c))
    if decode:
        r.append(modified_unbase64(decode[1:]))
    bin_str = ''.join(r)
    return (bin_str, len(s))


class StreamReader(codecs.StreamReader):
    def decode(self, s, errors='strict'):
        return decoder(s)


class StreamWriter(codecs.StreamWriter):
    def decode(self, s, errors='strict'):
        return encoder(s)


def imap4_utf_7(name):
    if name == 'imap4-utf-7':
        return (encoder, decoder, StreamReader, StreamWriter)


codecs.register(imap4_utf_7)


## testing methods

def imapUTF7Encode(ust):
    "Returns imap utf-7 encoded version of string"
    return ust.encode('imap4-utf-7')

def imapUTF7EncodeSequence(seq):
    "Returns imap utf-7 encoded version of strings in sequence"
    return [imapUTF7Encode(itm) for itm in seq]


def imapUTF7Decode(st):
    "Returns utf7 encoded version of imap utf-7 string"
    return st.decode('imap4-utf-7')

def imapUTF7DecodeSequence(seq):
    "Returns utf7 encoded version of imap utf-7 strings in sequence"
    return [imapUTF7Decode(itm) for itm in seq]


def utf8Decode(st):
    "Returns utf7 encoded version of imap utf-7 string"
    return st.decode('utf-8')


def utf7SequenceToUTF8(seq):
    "Returns utf7 encoded version of imap utf-7 strings in sequence"
    return [itm.decode('imap4-utf-7').encode('utf-8') for itm in seq]


__all__ = [ 'imapUTF7Encode', 'imapUTF7Decode' ]


if __name__ == '__main__':
    testdata = [
            (u'foo\r\n\nbar\n', b'foo&AA0ACgAK-bar&AAo-'),
            (u'测试', b'&bUuL1Q-'),
            (u'Hello 世界', b'Hello &ThZ1TA-')
    ]
    for s, e in testdata:
        #assert s == decoder(encoder(s)[0])[0]
        assert s == imapUTF7Decode(e)
        assert e == imapUTF7Encode(s)
        assert s == imapUTF7Decode(imapUTF7Encode(s))
        assert e == imapUTF7Encode(imapUTF7Decode(e))
    print("All tests passed!")

#  vim: set ts=4 sw=4 tw=0 fenc=utf-8 ft=python: #