1 files changed, 189 insertions, 0 deletions
diff --git a/python/imapUTF7.py b/python/imapUTF7.py
new file mode 100644
index 0000000..2e4db0a
--- /dev/null
+++ b/python/imapUTF7.py
@@ -0,0 +1,189 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# This code was originally in PloneMailList, a GPL'd software.
+# http://svn.plone.org/svn/collective/mxmImapClient/trunk/imapUTF7.py
+# http://bugs.python.org/issue5305
+#
+# Port to Python 3.x
+# Credit: https://github.com/MarechJ/py3_imap_utf7
+#
+# 2016-01-23
+# Aaron LI
+#
+
+"""
+Imap folder names are encoded using a special version of utf-7 as defined in RFC
+2060 section 5.1.3.
+
+5.1.3.  Mailbox International Naming Convention
+
+   By convention, international mailbox names are specified using a
+   modified version of the UTF-7 encoding described in [UTF-7].  The
+   purpose of these modifications is to correct the following problems
+   with UTF-7:
+
+      1) UTF-7 uses the "+" character for shifting; this conflicts with
+         the common use of "+" in mailbox names, in particular USENET
+         newsgroup names.
+
+      2) UTF-7's encoding is BASE64 which uses the "/" character; this
+         conflicts with the use of "/" as a popular hierarchy delimiter.
+
+      3) UTF-7 prohibits the unencoded usage of "\"; this conflicts with
+         the use of "\" as a popular hierarchy delimiter.
+
+      4) UTF-7 prohibits the unencoded usage of "~"; this conflicts with
+         the use of "~" in some servers as a home directory indicator.
+
+      5) UTF-7 permits multiple alternate forms to represent the same
+         string; in particular, printable US-ASCII chararacters can be
+         represented in encoded form.
+
+   In modified UTF-7, printable US-ASCII characters except for "&"
+   represent themselves; that is, characters with octet values 0x20-0x25
+   and 0x27-0x7e.  The character "&" (0x26) is represented by the two-
+   octet sequence "&-".
+
+   All other characters (octet values 0x00-0x1f, 0x7f-0xff, and all
+   Unicode 16-bit octets) are represented in modified BASE64, with a
+   further modification from [UTF-7] that "," is used instead of "/".
+   Modified BASE64 MUST NOT be used to represent any printing US-ASCII
+   character which can represent itself.
+
+   "&" is used to shift to modified BASE64 and "-" to shift back to US-
+   ASCII.  All names start in US-ASCII, and MUST end in US-ASCII (that
+   is, a name that ends with a Unicode 16-bit octet MUST end with a "-
+   ").
+
+      For example, here is a mailbox name which mixes English, Japanese,
+      and Chinese text: ~peter/mail/&ZeVnLIqe-/&U,BTFw-
+"""
+
+
+import binascii
+import codecs
+
+
+## encoding
+
+def modified_base64(s:str):
+    s = s.encode('utf-16be')  # UTF-16, big-endian byte order
+    return binascii.b2a_base64(s).rstrip(b'\n=').replace(b'/', b',')
+
+def doB64(_in, r):
+    if _in:
+        r.append(b'&' + modified_base64(''.join(_in)) + b'-')
+        del _in[:]
+
+def encoder(s:str):
+    r = []
+    _in = []
+    for c in s:
+        ordC = ord(c)
+        if 0x20 <= ordC <= 0x25 or 0x27 <= ordC <= 0x7e:
+            doB64(_in, r)
+            r.append(c.encode())
+        elif c == '&':
+            doB64(_in, r)
+            r.append(b'&-')
+        else:
+            _in.append(c)
+    doB64(_in, r)
+    return (b''.join(r), len(s))
+
+
+## decoding
+
+def modified_unbase64(s:bytes):
+    b = binascii.a2b_base64(s.replace(b',', b'/') + b'===')
+    return b.decode('utf-16be')
+
+def decoder(s:bytes):
+    r = []
+    decode = bytearray()
+    for c in s:
+        if c == ord('&') and not decode:
+            decode.append(ord('&'))
+        elif c == ord('-') and decode:
+            if len(decode) == 1:
+                r.append('&')
+            else:
+                r.append(modified_unbase64(decode[1:]))
+            decode = bytearray()
+        elif decode:
+            decode.append(c)
+        else:
+            r.append(chr(c))
+    if decode:
+        r.append(modified_unbase64(decode[1:]))
+    bin_str = ''.join(r)
+    return (bin_str, len(s))
+
+
+class StreamReader(codecs.StreamReader):
+    def decode(self, s, errors='strict'):
+        return decoder(s)
+
+
+class StreamWriter(codecs.StreamWriter):
+    def decode(self, s, errors='strict'):
+        return encoder(s)
+
+
+def imap4_utf_7(name):
+    if name == 'imap4-utf-7':
+        return (encoder, decoder, StreamReader, StreamWriter)
+
+
+codecs.register(imap4_utf_7)
+
+
+## testing methods
+
+def imapUTF7Encode(ust):
+    "Returns imap utf-7 encoded version of string"
+    return ust.encode('imap4-utf-7')
+
+def imapUTF7EncodeSequence(seq):
+    "Returns imap utf-7 encoded version of strings in sequence"
+    return [imapUTF7Encode(itm) for itm in seq]
+
+
+def imapUTF7Decode(st):
+    "Returns utf7 encoded version of imap utf-7 string"
+    return st.decode('imap4-utf-7')
+
+def imapUTF7DecodeSequence(seq):
+    "Returns utf7 encoded version of imap utf-7 strings in sequence"
+    return [imapUTF7Decode(itm) for itm in seq]
+
+
+def utf8Decode(st):
+    "Returns utf7 encoded version of imap utf-7 string"
+    return st.decode('utf-8')
+
+
+def utf7SequenceToUTF8(seq):
+    "Returns utf7 encoded version of imap utf-7 strings in sequence"
+    return [itm.decode('imap4-utf-7').encode('utf-8') for itm in seq]
+
+
+__all__ = [ 'imapUTF7Encode', 'imapUTF7Decode' ]
+
+
+if __name__ == '__main__':
+    testdata = [
+            (u'foo\r\n\nbar\n', b'foo&AA0ACgAK-bar&AAo-'),
+            (u'测试', b'&bUuL1Q-'),
+            (u'Hello 世界', b'Hello &ThZ1TA-')
+    ]
+    for s, e in testdata:
+        #assert s == decoder(encoder(s)[0])[0]
+        assert s == imapUTF7Decode(e)
+        assert e == imapUTF7Encode(s)
+        assert s == imapUTF7Decode(imapUTF7Encode(s))
+        assert e == imapUTF7Encode(imapUTF7Decode(e))
+    print("All tests passed!")
+
+#  vim: set ts=4 sw=4 tw=0 fenc=utf-8 ft=python: #