diff options
Diffstat (limited to 'python/imapUTF7.py')
-rw-r--r-- | python/imapUTF7.py | 189 |
1 files changed, 189 insertions, 0 deletions
diff --git a/python/imapUTF7.py b/python/imapUTF7.py new file mode 100644 index 0000000..2e4db0a --- /dev/null +++ b/python/imapUTF7.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# This code was originally in PloneMailList, a GPL'd software. +# http://svn.plone.org/svn/collective/mxmImapClient/trunk/imapUTF7.py +# http://bugs.python.org/issue5305 +# +# Port to Python 3.x +# Credit: https://github.com/MarechJ/py3_imap_utf7 +# +# 2016-01-23 +# Aaron LI +# + +""" +Imap folder names are encoded using a special version of utf-7 as defined in RFC +2060 section 5.1.3. + +5.1.3. Mailbox International Naming Convention + + By convention, international mailbox names are specified using a + modified version of the UTF-7 encoding described in [UTF-7]. The + purpose of these modifications is to correct the following problems + with UTF-7: + + 1) UTF-7 uses the "+" character for shifting; this conflicts with + the common use of "+" in mailbox names, in particular USENET + newsgroup names. + + 2) UTF-7's encoding is BASE64 which uses the "/" character; this + conflicts with the use of "/" as a popular hierarchy delimiter. + + 3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with + the use of "\" as a popular hierarchy delimiter. + + 4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with + the use of "~" in some servers as a home directory indicator. + + 5) UTF-7 permits multiple alternate forms to represent the same + string; in particular, printable US-ASCII chararacters can be + represented in encoded form. + + In modified UTF-7, printable US-ASCII characters except for "&" + represent themselves; that is, characters with octet values 0x20-0x25 + and 0x27-0x7e. The character "&" (0x26) is represented by the two- + octet sequence "&-". + + All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all + Unicode 16-bit octets) are represented in modified BASE64, with a + further modification from [UTF-7] that "," is used instead of "/". + Modified BASE64 MUST NOT be used to represent any printing US-ASCII + character which can represent itself. + + "&" is used to shift to modified BASE64 and "-" to shift back to US- + ASCII. All names start in US-ASCII, and MUST end in US-ASCII (that + is, a name that ends with a Unicode 16-bit octet MUST end with a "- + "). + + For example, here is a mailbox name which mixes English, Japanese, + and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw- +""" + + +import binascii +import codecs + + +## encoding + +def modified_base64(s:str): + s = s.encode('utf-16be') # UTF-16, big-endian byte order + return binascii.b2a_base64(s).rstrip(b'\n=').replace(b'/', b',') + +def doB64(_in, r): + if _in: + r.append(b'&' + modified_base64(''.join(_in)) + b'-') + del _in[:] + +def encoder(s:str): + r = [] + _in = [] + for c in s: + ordC = ord(c) + if 0x20 <= ordC <= 0x25 or 0x27 <= ordC <= 0x7e: + doB64(_in, r) + r.append(c.encode()) + elif c == '&': + doB64(_in, r) + r.append(b'&-') + else: + _in.append(c) + doB64(_in, r) + return (b''.join(r), len(s)) + + +## decoding + +def modified_unbase64(s:bytes): + b = binascii.a2b_base64(s.replace(b',', b'/') + b'===') + return b.decode('utf-16be') + +def decoder(s:bytes): + r = [] + decode = bytearray() + for c in s: + if c == ord('&') and not decode: + decode.append(ord('&')) + elif c == ord('-') and decode: + if len(decode) == 1: + r.append('&') + else: + r.append(modified_unbase64(decode[1:])) + decode = bytearray() + elif decode: + decode.append(c) + else: + r.append(chr(c)) + if decode: + r.append(modified_unbase64(decode[1:])) + bin_str = ''.join(r) + return (bin_str, len(s)) + + +class StreamReader(codecs.StreamReader): + def decode(self, s, errors='strict'): + return decoder(s) + + +class StreamWriter(codecs.StreamWriter): + def decode(self, s, errors='strict'): + return encoder(s) + + +def imap4_utf_7(name): + if name == 'imap4-utf-7': + return (encoder, decoder, StreamReader, StreamWriter) + + +codecs.register(imap4_utf_7) + + +## testing methods + +def imapUTF7Encode(ust): + "Returns imap utf-7 encoded version of string" + return ust.encode('imap4-utf-7') + +def imapUTF7EncodeSequence(seq): + "Returns imap utf-7 encoded version of strings in sequence" + return [imapUTF7Encode(itm) for itm in seq] + + +def imapUTF7Decode(st): + "Returns utf7 encoded version of imap utf-7 string" + return st.decode('imap4-utf-7') + +def imapUTF7DecodeSequence(seq): + "Returns utf7 encoded version of imap utf-7 strings in sequence" + return [imapUTF7Decode(itm) for itm in seq] + + +def utf8Decode(st): + "Returns utf7 encoded version of imap utf-7 string" + return st.decode('utf-8') + + +def utf7SequenceToUTF8(seq): + "Returns utf7 encoded version of imap utf-7 strings in sequence" + return [itm.decode('imap4-utf-7').encode('utf-8') for itm in seq] + + +__all__ = [ 'imapUTF7Encode', 'imapUTF7Decode' ] + + +if __name__ == '__main__': + testdata = [ + (u'foo\r\n\nbar\n', b'foo&AA0ACgAK-bar&AAo-'), + (u'测试', b'&bUuL1Q-'), + (u'Hello 世界', b'Hello &ThZ1TA-') + ] + for s, e in testdata: + #assert s == decoder(encoder(s)[0])[0] + assert s == imapUTF7Decode(e) + assert e == imapUTF7Encode(s) + assert s == imapUTF7Decode(imapUTF7Encode(s)) + assert e == imapUTF7Encode(imapUTF7Decode(e)) + print("All tests passed!") + +# vim: set ts=4 sw=4 tw=0 fenc=utf-8 ft=python: # |