Index: HyperArch.py
===================================================================
RCS file: /cvsroot/mailman/mailman/Mailman/Archiver/HyperArch.py,v
retrieving revision 2.11
diff -u -r2.11 HyperArch.py
--- HyperArch.py	2001/12/24 21:35:09	2.11
+++ HyperArch.py	2002/01/29 23:12:35
@@ -37,6 +37,7 @@
 import urllib
 import time
 import os
+import types
 import HyperDatabase
 import pipermail
 
@@ -58,6 +59,18 @@
 
 
 
+def unicode_quote(arg):
+    if isinstance(arg, types.UnicodeType):
+        result = []
+        for c in arg:
+            c = ord(c)
+            if c < 128:
+                result.append(chr(c))
+            else:
+                result.append("&#%d;" % c)
+        arg = "".join(result)
+    return arg
+
 def html_quote(s):
     repls = ( ('&', '&amp;'),
 	      ("<", '&lt;'),
@@ -65,7 +78,7 @@
 	      ('"', '&quot;'))
     for thing, repl in repls:
 	s = s.replace(thing, repl)
-    return s
+    return unicode_quote(s)
 
 def url_quote(s):
     return urllib.quote(s)
@@ -86,9 +99,12 @@
 html_charset = '<META http-equiv="Content-Type" ' \
                'content="text/html; charset=%s">'
 
-def CGIescape(arg): 
-    s = cgi.escape(str(arg))
-    return s.replace('"', '&quot;')
+def CGIescape(arg):
+    if isinstance(arg, types.UnicodeType):
+        s = cgi.escape(arg)
+    else:
+        s = cgi.escape(str(arg))
+    return unicode_quote(s.replace('"', '&quot;'))
 
 # Parenthesized human name
 paren_name_pat = re.compile(r'([(].*[)])') 
@@ -170,11 +186,14 @@
         self.decoded = {}
         charset = message.get_param('charset')
         if charset:
-            self.check_header_charsets(charset)
-        else:
-            self.check_header_charsets()
+            charset = charset.lower().strip()
+            if charset[0]=='"' and charset[-1]=='"':
+                charset = charset[1:-1]
+            if charset[0]=="'" and charset[-1]=="'":
+                charset = charset[1:-1]
+        self.check_header_charsets(charset)
         if self.charset and self.charset in mm_cfg.VERBATIM_ENCODING:
-            self.quote = lambda x:x
+            self.quote = unicode_quote
 
     def quote(self, buf):
         return html_quote(buf)
@@ -189,36 +208,44 @@
 
         If the charsets used by these headers differ from each other
         or from the charset specified by the message's Content-Type
-        header, then an arbitrary charset is chosen.  Only those
-        values that match the chosen charset are decoded.
+        header, then an arbitrary charset is chosen. If the decoded
+        fields match that charset, they are preserved
+        literally. Otherwise, an attempt is made to decode them as
+        Unicode. If that fails, they are left undecoded.
         """
+        
         self.charset = msg_charset
         author, a_charset = self.decode_charset(self.author)
+        if self.charset is None and a_charset:
+            self.charset = a_charset
         subject, s_charset = self.decode_charset(self.subject)
-        if author is not None or subject is not None:
-            # Both charsets should be the same.  If they aren't, we
-            # can only handle one way.
-            if msg_charset is None:
-                self.charset = a_charset or s_charset
-            else:
-                self.charset = msg_charset
+        if self.charset is None and a_charset:
+            self.charset = s_charset        
+        if author:
+            self.decoded['author'] = author
+            email, e_charset = self.decode_charset(self.email)
+            if email:
+                self.decoded['email'] = email
+        if subject:
+            self.decoded['subject'] = subject
 
-            if author and self.charset == a_charset:
-                self.decoded['author'] = author
-                email, e_charset = self.decode_charset(self.email)
-                if email:
-                    self.decoded['email'] = email
-            if subject and self.charset == s_charset:
-                self.decoded['subject'] = subject
-
     def decode_charset(self, field):
         if field.find("=?") == -1:
             return None, None
         try:
             s, c = EncWord.decode(field)
         except ValueError:
+            return None, None
+        c = c.lower()
+        # If the charset of the header matches the article charset,
+        # leave it as encoded. Otherwise, try Unicode decoding
+        if c.lower() == self.charset:
+            return s, c
+        try:
+            return unicode(s, c), None
+        except UnicodeError:
+            # Unknown encoding
             return None, None
-        return s, c.lower()
 
     def as_html(self):
 	d = self.__dict__.copy()
@@ -325,7 +352,7 @@
                          ('email', 'email_html'),
                          ('subject', 'subject_html')):
             if self.decoded.has_key(src):
-                d[dst] = self.decoded[src]
+                d[dst] = self.quote(self.decoded[src])
     
     def as_text(self):
 	d = self.__dict__.copy()
@@ -823,14 +850,29 @@
         print self.html_foot()
 
     def write_index_entry(self, article):
-        if article.charset == self.charset:
-            d = article.decoded
-            subject = d.get("subject", article.subject)
-            author = d.get("author", article.author)
-        subject = CGIescape(article.subject)
-        author = CGIescape(article.author)
+        subject = self.get_header("subject", article)
+        author = self.get_header("author", article)
+        subject = CGIescape(subject)
+        author = CGIescape(author)
+            
         print index_entry_template % (urllib.quote(article.filename),
                                       subject, article.sequence, author)
+
+    def get_header(self, field, article):
+        # if we have no decoded header, return the encoded one
+        result = article.decoded.get(field)
+        if result is None:
+            return getattr(article, field)
+        # if the encodings match, use the result
+        if self.charset == article.charset:
+            return result
+        # otherwise, try to return a Unicode result
+        if isinstance(result, types.UnicodeType):
+            return result
+        try:
+            return unicode(result, article.charset)
+        except UnicodeError:
+            return getattr(article, field)
 
     def write_threadindex_entry(self, article, depth):
 	if depth < 0: