Index: 2_0_5.1/Mailman/Handlers/HandlerAPI.py --- 2_0_5.1/Mailman/Handlers/HandlerAPI.py Tue, 27 Mar 2001 14:03:02 -0800 dairiki (mailman/i/37_HandlerAPI 1.1 664) +++ 0.10/Mailman/Handlers/HandlerAPI.py Wed, 04 Apr 2001 09:11:59 -0700 dairiki (mailman/i/37_HandlerAPI 1.2 664) @@ -56,6 +56,7 @@ # this pipeline of handler modules. LIST_PIPELINE = ['SpamDetect', 'Approve', + 'PlainText', 'Replybot', 'Hold', 'Cleanse', Index: 2_0_5.1/Mailman/Version.py --- 2_0_5.1/Mailman/Version.py Mon, 07 May 2001 09:13:38 -0700 dairiki (mailman/j/27_Version.py 1.1.2.2 664) +++ 0.10/Mailman/Version.py Mon, 07 May 2001 09:15:44 -0700 dairiki (mailman/j/27_Version.py 1.1.2.3 664) @@ -36,7 +36,7 @@ (REL_LEVEL << 4) | (REL_SERIAL << 0)) # config.db schema version number -DATA_FILE_VERSION = 21 +DATA_FILE_VERSION = (21, 'plaintext_patch') # qfile/*.db schema version number QFILE_SCHEMA_VERSION = 2 Index: 2_0_5.1/Mailman/MailList.py --- 2_0_5.1/Mailman/MailList.py Tue, 27 Mar 2001 14:03:02 -0800 dairiki (mailman/j/34_MailList.p 1.1 664) +++ 0.10/Mailman/MailList.py Tue, 27 Mar 2001 19:49:41 -0800 dairiki (mailman/j/34_MailList.p 1.2 664) @@ -291,6 +291,7 @@ self.dont_respond_to_post_requests = 0 self.advertised = mm_cfg.DEFAULT_LIST_ADVERTISED self.max_num_recipients = mm_cfg.DEFAULT_MAX_NUM_RECIPIENTS + self.force_plain_text = mm_cfg.DEFAULT_FORCE_PLAIN_TEXT self.max_message_size = mm_cfg.DEFAULT_MAX_MESSAGE_SIZE self.web_page_url = mm_cfg.DEFAULT_URL self.owner = [admin] @@ -541,6 +542,24 @@ " limits except routine list moderation and spam" " filters, for which notices are not sent. This" " option overrides ever sending the notice."), + + ('force_plain_text', mm_cfg.Radio, ('No', 'Yes'), 0, + 'Convert all mail to the list into plain text format?', + + "If you enable this option, all posts to the list will be" + " converted to plain text:" + "" + "

" + "Note: Currently, there are bugs in the" + " handling of non-ASCII HTML entities when the message is" + " in a charset other than iso-8859-1."), ('max_message_size', mm_cfg.Number, 7, 0, 'Maximum length in Kb of a message body. Use 0 for no limit.'), Index: 2_0_5.1/Mailman/Defaults.py.in --- 2_0_5.1/Mailman/Defaults.py.in Tue, 27 Mar 2001 14:03:02 -0800 dairiki (mailman/j/44_Defaults.p 1.1 664) +++ 0.10/Mailman/Defaults.py.in Tue, 27 Mar 2001 19:49:41 -0800 dairiki (mailman/j/44_Defaults.p 1.2 664) @@ -253,6 +253,8 @@ # allowed? DEFAULT_LIST_ADVERTISED = 1 DEFAULT_MAX_NUM_RECIPIENTS = 10 +# Should mail posted to the list be coerced into plain text? +DEFAULT_FORCE_PLAIN_TEXT = 0 DEFAULT_MAX_MESSAGE_SIZE = 40 # KB # These format strings will be expanded w.r.t. the dictionary for the Index: 2_0_5.1/Mailman/richtext.py --- 2_0_5.1/Mailman/richtext.py Sat, 19 May 2001 16:33:35 -0700 dairiki () +++ 0.10/Mailman/richtext.py Wed, 04 Apr 2001 09:11:59 -0700 dairiki (mailman/k/3_richtext.p 1.1 664) @@ -0,0 +1,408 @@ +# Copyright (C) 20001 by Geoffrey T. Dairiki +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +"""Classes for parsing text/richtext and text/enriched content. + +This module contains two public classes, RichtextParser and EnrichedTextParser, +which can be used for parsing text. 
The API of these classes follows that +of HTMLParser. + +SEE ALSO + + HTMLParser + +""" + +import string +import re +from formatter import AS_IS + +class _ParserBase: + """Basic parser functionality.""" + + #This regexp is suitable for richtext + CMD = re.compile('(?P.*?)(?P)', re.I | re.S) + MAX_CMD_LEN = 43 + + def __init__(self): + self.reset() + + def reset(self): + self.buf = '' + self.stack = [] + + def close(self): + self.__parse('flush') + + def feed(self, str): + self.buf = self.buf + str + self.__parse() + + def __parse(self, flush = 0): + buf, i = self.buf, 0 + while i < len(buf): + m = self.CMD.match(buf, i) + if not m: + break + i = m.end() + if m.group('text'): + self.handle_text(m.group('text')) + self.handle_command(m.group('command')) + + if flush: + keep = 0 + else: + keep = self.MAX_CMD_LEN - 1 + end = len(buf) - keep + if i < end: + self.handle_text(buf[i:end]) + i = end + self.buf = buf[i:] + + def handle_command(self, command): + command = string.lower(command) + if command[:2] == ' horizontal rule + self.formatter.add_hor_rule() + self.end('np') # no end tag allowed + + def start_paragraph(self): + self.formatter.end_paragraph(1) + def end_paragraph(self): + self.formatter.end_paragraph(1) + + def start_signature(self): + self.formatter.add_hor_rule() + self.start_italic() + def end_signature(self): + self.formatter.end_paragraph(1) + self.end_italic() + + # Margin control + def start_indent(self): + self.formatter.push_margin('indent') + def end_indent(self): + self.formatter.pop_margin() + def start_indentright(self): + self.formatter.push_margin('indentright') + def end_indentright(self): + self.formatter.pop_margin() + + +class EnrichedTextParser(_ParserCommon): + """A parser for text/enriched. + + This class parses textual data in the text/enriched format defined + in RFC 1896. + + The API is the same as that of HTMLParser. Output is through a + formatter object. + + SEE ALSO + + HTMLParser. 
+ + BUGS + + The following commands from RFC 1896 are ignored by this parser: + + FontFamily, Color, Lang + + Paragraph breaking/new-line handling is not quite right. + + """ + CMD = re.compile('(?P.*?)(?P|<<)', re.I | re.S) + MAX_CMD_LEN = 63 + + def __init__(self, formatter): + _ParserCommon.__init__(self, formatter) + self.nofill = 0 + self.deferred = None + self.param = None + self.pistack = [] + + def handle_command(self, command): + if command == '<<': + self.handle_text('<') + elif self.param is not None: + if string.lower(command) == '': + self.__start_deferred() + else: + self.param = self.param + command + else: + _ParserCommon.handle_command(self, command) + + def __start_deferred(self): + if self.deferred: + _ParserCommon.start(self, self.deferred) + self.deferred = None + self.param = None + + def start(self, command): + if command == 'param': + self.param = '' + else: + self.__start_deferred() + self.deferred = command + + def end(self, command): + self.__start_deferred() + _ParserCommon.end(self, command) + + _re_NLS = re.compile('(.*?)(\n+)') + + def handle_text(self, text): + if self.param is not None: + self.param = self.param + text + return + + self.__start_deferred() + + text = string.replace(text, '\r', '') + if not self.nofill: + def replace_nls(m): + match = m.group() + if len(match) > 1: + return match[1:] + else: + return ' ' + text = re.sub('[\n]+', replace_nls, text) + + i = 0 + while i < len(text): + m = self._re_NLS.match(text, i) + if not m: + self.formatter.add_flowing_data(text[i:]) + break + i = m.end() + if m.group(1): + self.formatter.add_flowing_data(m.group(1)) + self.formatter.add_literal_data(m.group(2)) + + def start_nofill(self): + self.nofill = 1 + def end_nofill(self): + self.nofill = 'nofill' in self.stack + + def start_paraindent(self): + self.formatter.end_paragraph(0) + param = string.strip(string.lower(self.param)) + if param == 'left': + self.formatter.push_margin('indent') + 
self.pistack.append(self.formatter.pop_margin) + elif param == 'right': + self.formatter.push_margin('rightindent') + self.pistack.append(self.formatter.pop_margin) + elif param == 'in': + self.formatter.push_style('indent') + self.pistack.append(self.formatter.pop_style) + elif param == 'out': + self.formatter.push_style('hangingindent') + self.pistack.append(self.formatter.pop_style) + else: + self.pistack.append(None) + def end_paraindent(self): + endfunc = self.pistack.pop() + if endfunc: + endfunc() + self.formatter.end_paragraph(0) + +# Some tests: +if __name__ == '__main__': + import formatter + import sys + + writer = formatter.DumbWriter(sys.stdout) + formatter = formatter.AbstractFormatter(writer) + parser = EnrichedTextParser(formatter) + + while 1: + line = sys.stdin.readline() + if not line: break + parser.feed(line) + parser.close() + Index: 2_0_5.1/Mailman/FilteringMimeWriter.py --- 2_0_5.1/Mailman/FilteringMimeWriter.py Sat, 19 May 2001 16:33:35 -0700 dairiki () +++ 0.10/Mailman/FilteringMimeWriter.py Sat, 19 May 2001 16:31:26 -0700 dairiki (mailman/k/4_FilteringM 1.5 664) @@ -0,0 +1,677 @@ +# Copyright (C) 20001 by Geoffrey T. Dairiki +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +#FIXME: fix comments +#FIXME: test with no space after colon. 
+ +"""Tools for filtering MIME messages.""" + +import string +import re +import mimetools +import rfc822 +from Mailman.pythonlib import multifile +import StringIO +import cStringIO +import MimeWriter + +def is_raw_encoding(encoding): + """Check whether any transformation is required for encoding. + + Returns true iff no transformation is required for the + MIME Transfer-Encoding given by the argument. + + """ + return not encoding or string.lower(encoding) in ('7bit', '8bit', 'binary') + +def decode(infp, outfp, encoding): + """Wrapper for mimetools.decode(). + + Mimetools.decode() fails on the raw encodings (the ones which are + not really encoded. This will work on any recognized encoding. + + """ + if is_raw_encoding(encoding): + mimetools.copybinary(infp, outfp) + else: + mimetools.decode(infp, outfp, string.lower(encoding)) + +def encode(infp, outfp, encoding): + """Wrapper for mimetools.encode(). + + Mimetools.encode() fails on the raw encodings (the ones which are + not really encoded. This will work on any recognized encoding""" + + if is_raw_encoding(encoding): + mimetools.copybinary(infp, outfp) + else: + mimetools.encode(infp, outfp, encoding) + +_re_NONBINARY = re.compile(r'[\x80-\xff]') + +def is_ascii(str): + """Does string contains only ASCII characters.""" + return not _re_NONBINARY.search(str) + +_re_7BITBODY = re.compile(r'(?:[^\x00\r\n\x80-\xff]{0,200}\r?\n)*' + r'[^\x00\r\n\x80-\xff]{0,200}' + r'$') + +def is_valid_7bit_data(body): + """Check whether BODY contains data suitable for '7bit' MIME encoding. + + The body of MIME messages using the '7bit' transfer-encoding is subject + to the following limitations: + o Only ASCII characters are allowed. + o No NUL ('\x00') characters are allowed. + o CR and LF can only occur as part of CR-LF pairs. (We + extend this to allow bare LF's to separate lines.) + o Lines can be a maximum of 998 characters long (not counting + terminating CR-LF). 
We further restrict lines to 200 + characters for the hell of it. + + """ + return _re_7BITBODY.match(body) + +_re_TEXTCHARS = re.compile(r'[\r\n\t\x20-\x7e]+') + +def looks_binary(body): + """Check whether BODY looks like binary data. + + If BODY is composed predominantly of ASCII characters, we assume + that it is textual rather than binary data. + + """ + nbin = len(_re_TEXTCHARS.sub('', body)) + return nbin > len(body) / 4 + +class Plist: + def __init__(self, plist = []): + self.plist = plist + + def __str__(self): + params = [] + for key, val in self.plist: + if re.search(r'[][()<>@,;:\\"/?=\r\n]', val): + val = '"%s"' % re.sub(r'(["\n\r])', r'\\\1', val) + params.append(';\n\t%s=%s' % (key, val)) + return string.join(params, '') + + def get(self, key, default = None): + try: + return self[key] + except IndexError: + return default + + def __len__(self): + return len(self.plist) + + def __getitem__(self, key): + key = string.lower(key) + for param, val in self.plist: + if string.lower(param) == key: + return val + raise IndexError + + def __setitem__(self, key, val): + del self[key] + self.plist.insert(0, (key, val)) + + def __delitem__(self, key): + key = string.lower(key) + def out_key (pair, key = string.lower(key)): + param, val = pair + return string.lower(param) != key + self.plist = filter(out_key, self.plist) + +def message_headers(message): + """Get a list of all headers of an rfc822.Message object. + + Returns a list of pairs, (header, value), where header is the + name of the header (case preserved, not lower-cased), and + value is the value of the header (all continuation lines + joined, leading space stripped). 
+ + """ + rheaders, list, head = message.headers, [], '' + rheaders.reverse() + for line in rheaders: + head = line + head + if head[0] in ' \t': + continue + key, val = string.split(head, ':', 1) + list.insert(0, (key, string.lstrip(val))) + head = '' + return list + + +class DigestMessage(mimetools.Message): + """A Message with a default content-type of 'message/rfc822'. + + Subparts of 'multipart/digest's are messages of this type. + + """ + def __init__(self, fp, seekable = 1): + mimetools.Message.__init__(self, fp, seekable) + # A bit of a hack. + if self.getheader('content-type') is None: + self.typeheader = 'message/rfc822' + self.parsetype() + +def write_message(message, mimewriter): + """Write MESSAGE using MIMEWRITER. + + The MESSAGE (which should be a mimetools.Message) is written to + the MIMEWRITER (which should be a MimeWriter). + + """ + + def plist(message): + """Get MimeWriter style plist from mimetools.Message.""" + def split_param(param): + pname, pval = string.split(param, '=', 1) + return (string.lower(pname), rfc822.unquote(pval)) + + def valid_param(param): + return '=' in param + + # Trailing ;'s in content-type yield empty strings from getplist(). + return map(split_param, + filter(valid_param, message.getplist())) + + def discards_data(fp): + """Determine whether file object ignores data written to it. + + The startbody and startmultipartbody methods of + PlaintextMimeWriter can return dummy file-like objects, which, + in reality just discard any data written to them. + + The function detects those dummy file objects, so that + optimizations can be carried out. 
+ + """ + try: + return fp.discards_data + except AttributeError: + return 0 + + # Write the headers + for head, val in message_headers(message): + if string.lower(head) not in ('mime-version', 'content-type'): + mimewriter.addheader(head, val) + mimewriter.addheader('Mime-Version', '1.0 (Plaintext.py)') + + if message.seekable: + message.rewindbody() + + if message.getmaintype() == 'multipart': + # Multipart message + boundary = message.getparam('boundary') + if not boundary: + raise RuntimeError, 'No boundary for multipart data' + + # Coerce input file to MultiFile + if isinstance(message.fp, multifile.MultiFile): + infp = message.fp + else: + infp = multifile.MultiFile(message.fp) + infp.push(boundary) + + params = filter(lambda p: p[0] != 'boundary', plist(message)) + outfp = mimewriter.startmultipartbody(message.getsubtype(), boundary, params) + + if discards_data(outfp): + infp.pop() + else: + if message.getsubtype() == 'digest': + message_type = DigestMessage + else: + message_type = mimetools.Message + + mimetools.copybinary(infp, outfp) # copy preamble + while infp.next(): + subwriter = mimewriter.nextpart() + subpart = message_type(infp, 0) + write_message(subpart, subwriter) + mimewriter.lastpart() + infp.pop() + mimetools.copybinary(infp, outfp) # Copy trailer. + else: + # Single part message + outfp = mimewriter.startbody(message.gettype(), plist(message)) + if not discards_data(outfp): + mimetools.copybinary(message.fp, outfp) + + + mimewriter.flushheaders() + +class FilteringMimeWriter(MimeWriter.MimeWriter): + #FIXME: fix comments + """A class for filter MIME output. + + DESCRIPTION + + This is a sub-class of MimeWriter, and shares the same basic + interface. + + It is designed to be easily sub-classed to perform useful filtering + of MIME data. + + + API DIFFERENCES FROM MimeWriter + + Always call flushheaders() + + You must always call the flushheaders() method. 
You should + call it once, only after you've finished all output (body as + well as headers) to the PlaintextMimeWriter. + + WARNINGS + + For the output to be valid MIME you must add a Mime-Type: + header to the outer entity (this is true for the plain MimeWriter + as well.) A value of '1.0' will work fine. + + + """ + def __init__(self, fp): + MimeWriter.MimeWriter.__init__(self, fp) + self._reset() + + def _reset(self): + self.input_encoding = None + self.body = None + self.subpart = None + + def addheader(self, header, value): + lchead = string.lower(header) + + assert not self.body, 'addheader() after startbody()' + + # One shouldn't add a content-type to a regular MimeWriter, + # since the MimeWriter generates one itself. + assert lchead != 'content-type', "Attempt to add Content-Type: header to MimeWriter" + + if lchead == 'content-transfer-encoding': + self.input_encoding = string.strip(value) + elif self._keepheader(header): + MimeWriter.MimeWriter.addheader(self, header, value) + + # A list of headers to always accept. + # This list can also include header prefixes (with trailing '*') + # which will match any header beginning with that prefix. + INCLUDE_HEADERS = ('content-description',) + # A list of headers to ignore. (Same format as above.) 
+ EXCLUDE_HEADERS = () #('content-*',) + + def _keepheader(self, header): + def _match(header, pattern_list): + for pat in pattern_list: + if pat[-1] == '*': + if header[:len(pat)-1] == pat[:-1]: + return pat + elif header == pat: + return pat + return None + + header = string.lower(header) + if _match(header, self.INCLUDE_HEADERS): + return 1 + return not _match(header, self.EXCLUDE_HEADERS) + + def flushheaders(self): + if self.is_multipart(): + if self.subpart: + self.lastpart() + else: + assert not self.subpart + if self.body: + self.body.flush() + + self._flushheaders() + + def _flushheaders(self): + body = self.body + if body: + encoding = self._get_encoding_for_body() + self._add_mime_headers(encoding = encoding) + + have_headers = self._headers + MimeWriter.MimeWriter.flushheaders(self) + + if body: + if have_headers: + self._fp.write("\n") + body.seek(0) + encode(body, self._fp, encoding) + body.close() + + self._reset() + + def _get_encoding_for_body(self): + """Get MIME transfer-encoding suitable for encoding BODY. + + The encodings returned are only suitable for textual data (since + 'binary' or 'base64' should be used for non-textual data.) + + FIXME: should probably use some heuristic to identify binary + files and use base64 on them. 
+ """ + if self.body.is_valid_7bit_data(): + return '7bit' + if self.body.looks_binary(): + return 'base64' + return 'quoted-printable' + + def _add_mime_headers(self, encoding = 'binary'): + # Add mime headers for body + body = self.body + if body: + MimeWriter.MimeWriter.addheader(self, "Content-Type", + body.content_type + str(body.plist)) + if encoding: + MimeWriter.MimeWriter.addheader(self, "Content-Transfer-Encoding", + encoding) + + def startbody(self, ctype, plist=[], prefix=1): + assert ctype + assert not self.body, 'can only startbody once' + + plist = Plist(plist) + self.body = self._get_body(ctype, plist) + + decoder = self._get_decoder(self.input_encoding) + if decoder and not self.body.discards_data: + self.body = decoder(self.body, self.input_encoding) + + return self.body + + def _get_body(self, ctype, plist): + return MimeBody(ctype, plist, annotate = self._annotate) + + def _get_decoder(self, encoding): + if is_raw_encoding(encoding): + return None + elif string.lower(encoding) in ('uuencode', 'x-uuencode', 'uue', 'x-uue'): + return _UUDecoder + return _DefaultDecoder + + def startmultipartbody(self, subtype, boundary=None, plist=[], prefix=1): + if not subtype: + # Some faulty mail clients don't provide a subtype + subtype = 'mixed' + assert not self.body, 'can only startmultipartbody once' + plist = Plist(plist) + plist['boundary'] = boundary + self.body = self._get_multipart_body(subtype, plist) + return self.body + + def _get_multipart_body(self, subtype, plist): + return MultipartBody(subtype, plist = plist, annotate = self._annotate) + + def nextpart(self): + assert self.is_multipart() + + if self.subpart: + self.subpart.flushheaders() # finish previous part + + self.body.add_separator() + + self.subpart = self.body.SubpartWriter(self.body) + return self.subpart + + def lastpart(self): + assert self.is_multipart() + + if self.subpart: + self.subpart.flushheaders() # finish previous part + self.subpart = None + + 
self.body.add_separator(final = 1) + + def is_multipart(self): + body = self.body + if not body: + return 0 + try: + body.SubpartWriter + except AttributeError: + return 0 + return 1 + + + def _annotate(self, str): + """Add a comment to the header of the message. + + A header with name 'X-Plaintext' and value STR is added to + the output message. + + This method can be called even after output to the body + has started. + + """ + from mimify import mime_encode_header + # Use mime_encode_header to allow non-ascii + # (FIXME: assumes iso-8859-1) + MimeWriter.MimeWriter.addheader(self, 'X-Plaintext', + mime_encode_header(str)) + + +class MimeBody(StringIO.StringIO): + + discards_data = 0 + + def __init__(self, ctype, plist = Plist(), annotate = None): + StringIO.StringIO.__init__(self) + self.content_type = string.lower(ctype) + self.plist = plist + if annotate: + self.annotate = annotate + + def annotate(str): + pass + + def writelines(self, list): + self.write(string.join(list, '')) + + def getvalue(self): + self.flush() + return StringIO.StringIO.getvalue(self) + + def is_valid_7bit_data(self): + return is_valid_7bit_data(self.getvalue()) + + def looks_binary(self): + return looks_binary(self.getvalue()) + + def is_ascii(self): + return is_ascii(self.getvalue()) + + def add_linebreak(self): + body = self.getvalue() + if body and body[-1] != '\n': + self.write('\n') + + def add_comment(self, str): + self.annotate(str) + self.add_linebreak() + self.write('[ %s ]\n' % str) + + +class MultipartBody(MimeBody): + def __init__(self, subtype, plist=Plist(), annotate = None): + MimeBody.__init__(self, 'multipart/' + subtype, plist = plist, annotate = annotate) + self.boundary = plist['boundary'] + + def add_separator(self, final = 0): + if final: + self.write('\n--%s--\n' % self.boundary) + else: + self.write('\n--%s\n' % self.boundary) + + class SubpartWriter(FilteringMimeWriter): + EXCLUDE_HEADERS = ('mime-version',) + FilteringMimeWriter.EXCLUDE_HEADERS + + 
+################################################################ +class BodyFilter: + """An file-like object which writes to the body of a FilteringMimeWriter. + + This object implements output file semantics. It appends all data + written to it to the body of the underlying FilteringMimeWriter object. + + """ + + def __init__(self, body): + self.body = body + + def __getattr__(self, attr): + # "Inherit" methods from underlying body. + self.__dict__[attr] = getattr(self.body, attr) + return self.__dict__[attr] + + def output(self, str): + self.body.write(str) + + # The following methods should be overloaded + def write(self, str): + self.output(str) + + def writelines(self, lines): + self.write(string.join(lines, '')) + + def flush(self): + self.flushoutput() + + def flushoutput(self): + self.body.flush() + + +class LineFilter(BodyFilter): + def __init__(self, body): + BodyFilter.__init__(self, body) + self.buf = '' + + def write(self, str): + j = string.rfind(str, '\n') + 1 + if not j: + self.buf = self.buf + str + return + self.handle_chunk(self.buf + str[:j]) + self.buf = str[j:] + + def flushinput(self): + if self.buf: + self.handle_chunk(self.buf + '\n') + self.buf = '' + + def flush(self): + self.flushinput() + self.flushoutput() + +class UUFilter(LineFilter): + def __init__(self, body): + LineFilter.__init__(self, body) + self.handle_data = self.handle_text + + SPLIT = re.compile('^(begin[ \t]+[0-7]{3,5}.*?|end)[ \t]*[\n\z]', re.M) + + def handle_chunk(self, chunk): + hunks = self.SPLIT.split(chunk) + self.handle_data(hunks.pop(0)) + assert len(hunks) % 2 == 0 + while hunks: + begin = re.match('begin\s+[0-7]{3,5}\s*(?P.*)', hunks.pop(0)) + if begin: + self.handle_begin(begin.group('name')) + self.handle_data = self.handle_uudata + else: + self.handle_end() + self.handle_data = self.handle_text + self.handle_data(hunks.pop(0)) + + def flush(self): + self.flushinput() + if self.handle_data == self.handle_uudata: + self.handle_end() + self.handle_data = 
self.handle_text + self.flushoutput() + + def handle_begin(self, filename): pass + def handle_end(self): pass + def handle_uudata(self, line): pass + def handle_text(self, line): pass + + + +################################################################ +class _DefaultDecoder(LineFilter): + def __init__(self, body, encoding): + LineFilter.__init__(self, body) + self.encoding = string.lower(encoding) + self.done = 0 + + def handle_chunk(self, chunk): + if self.done: + return + + class OutputThrough: + def __init__(self, decoder): + self.write = decoder.output + + infp = cStringIO.StringIO(chunk) + outfp = OutputThrough(self) + try: + mimetools.decode(infp, outfp, self.encoding) + except ValueError, detail: + self.add_comment("%s, discarding content" % detail) + self.done = 1 + infp.close() + +class _UUDecoder(UUFilter): + def __init__(self, body, encoding): + UUFilter.__init__(self, body) + self.done = 0 + + def handle_end(self): + self.done = 1 + + def handle_uudata(self, chunk): + import binascii + if not self.done: + for line in string.split(chunk, '\n'): + if line: + self.output(binascii.a2b_uu(line)) + + + +################################################################ + +# Some tests: +if __name__ == '__main__': + import sys + + assert is_valid_7bit_data('abd') + assert is_valid_7bit_data('abd\r\n') + assert is_valid_7bit_data('abd\r\ndef') + assert not is_valid_7bit_data('ab\x00') + assert not is_valid_7bit_data('abd\rdef') + + pname = sys.argv.pop(0) + for file in sys.argv: + msg = mimetools.Message(open(file, 'r')) + writer = FilteringMimeWriter(sys.stdout) + write_message(msg, writer) Index: 2_0_5.1/Mailman/PlaintextMimeWriter.py --- 2_0_5.1/Mailman/PlaintextMimeWriter.py Sat, 19 May 2001 16:33:35 -0700 dairiki () +++ 0.10/Mailman/PlaintextMimeWriter.py Thu, 05 Apr 2001 08:32:32 -0700 dairiki (mailman/k/2_Plaintext. 1.4 664) @@ -0,0 +1,466 @@ +# Copyright (C) 20001 by Geoffrey T. 
Dairiki +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + +#FIXME: fix comments +#FIXME: test with no space after colon. + +"""Tools for converting MIME messages into plain text.""" + +import string +import re +import mimetools +import cStringIO +import FilteringMimeWriter + +################################################################ +class PlaintextMimeWriter(FilteringMimeWriter.FilteringMimeWriter): + """A class for converting MIME output to plain text. + + DESCRIPTION + + This is a sub-class of MimeWriter, and shares the same basic + interface. + + The output is written as a (single-part) MIME message of type + 'text/plain'. The following transformations are made: + + Any uuencoded files are removed from plain text content. + + HTML, richtext, and enriched text content is converted to + plain text. + + Multipart content is flattened. (All but (at most) one + subpart is discarded from 'multipart/alternative' + entities.) + + All other content is deleted (and replaced by a diagnostic + message.) + + In addition, the transfer-encoding's of the various subparts + are handled correctly. All parts are converted to a uniform + encoding --- either '7bit', if possible, or + 'quoted-printable'. + + If different parts use conflicting character sets, a warning + is generated. 
Currently no attempt is made to recode the + data. + + API DIFFERENCES FROM MimeWriter + + Always call flushheaders() + + You must always call the flushheaders() method. You should + call it once, only after you've finished all output (body as + well as headers) to the PlaintextMimeWriter. + + Optimization of Discarded Entity Bodies + + Startbody(), and startmultipartbody() return file-like objects + which have an extra public attribute: discards_data. If this + attribute has a true value, then you may safely omit writing + the body content (as anything you write will be ignored anyway.) + + NEW METHODS + + annotate(cmnt) + + Add a comment to the headers of the message. + + comment(cmnt) + + Add a comment to both the body and headers of the message. + + WARNINGS + + For the output to be valid MIME you must add a Mime-Type: + header to the outer entity (this is true for the plain MimeWriter + as well.) A value of '1.0' will work fine. + + BUGS + + There are some problems with the handling of character sets, + particularly those other than ISO-8859-1 (or ASCII). + Currently: + + The HTML to text converter converts entities to their + ISO-8859-1, without regard for the declared charset of the + document. + + The HTML to text converter doesn't check META tags for + charset declarations. + + PlaintextMimeWriter.annotate(str) encodes headers assuming + ISO-8859-1 data. + + I'm sure the are other problems as well. 
+ + """ + + EXCLUDE_HEADERS = ('content-*',) + + def _get_filter(self, ctype): + ctype = string.lower(ctype) + if ctype == 'text/plain': + return _PlainTextFilter + if ctype == 'text/enriched': + return _EnrichedTextFilter + if ctype == 'text/richtext': + return _RichtextFilter + if ctype == 'text/html': + return _HTMLFilter + if ctype == 'message/rfc822': + return _RFC822Filter + return _BinaryFilter + + def _get_body(self, ctype, plist): + body = FilteringMimeWriter.MimeBody(ctype, plist, annotate = self._annotate) + body = _TrailingSpaceStripper(body) + filter = self._get_filter(ctype) + if filter: + body = filter(body) + return body + + def _get_multipart_body(self, subtype, plist): + if subtype == 'alternative': + return PlaintextAlternativeBody(subtype, plist, annotate = self._annotate) + return PlaintextMultipartBody(subtype, plist, annotate = self._annotate) + + +class PlaintextMultipartBody(FilteringMimeWriter.MultipartBody): + def __init__(self, subtype, plist, annotate): + FilteringMimeWriter.MultipartBody.__init__(self, subtype, plist, annotate) + del self.plist['boundary'] + self.content_type = 'text/plain' + self.sep = None + + def write(self, str): + if self.sep: + str = self.sep + str + self.sep = None + FilteringMimeWriter.MultipartBody.write(self, str) + + def add_separator(self, final = 0): + if final: + self._lastpart() + self.add_linebreak() + self.sep = '--\n' + + def _lastpart(self): + pass + + class SubpartWriter(PlaintextMimeWriter): + EXCLUDE_HEADERS = ('mime-version',) + PlaintextMimeWriter.EXCLUDE_HEADERS + + def _get_encoding_for_body(self): + return 'binary' + def _add_mime_headers(self, **params): + pass + + def _extract_annotations(self): + """Remove any headers generated by self.annotate(). + + Returns a list of the annotations. 
+ + """ + annotations = [] + headers = [] + for line in self._headers: + head, val = string.split(line, ':', 1) + if head == 'X-Plaintext': + annotations.append(val) + else: + headers.append(line) + self._headers = headers + return annotations + + def _flushheaders(self): + + body = self.body + parent_body = self._fp + + # Move annotations from subpart to parent part + for note in self._extract_annotations(): + parent_body.annotate(note) + + if body: + charset = body.plist.get('charset') + parent_charset = parent_body.plist.get('charset') + + if charset and not body.is_ascii(): + if not parent_charset: + parent_body.plist['charset'] = charset + elif parent_charset != charset: + # FIXME: recode content? + parent_body.add_comment("Warning: charset mismatch '%s' != '%s'" % + (charset, parent_charset)) + + PlaintextMimeWriter._flushheaders(self) + +class PlaintextAlternativeBody(PlaintextMultipartBody): + def __init__(self, subtype, plist, annotate): + PlaintextMultipartBody.__init__(self, subtype, plist, annotate) + self.kept_part = None + + def _lastpart(self): + if self.kept_part: + self.add_comment('Picked %s from multipart/alternative' + % self.kept_part.body.content_type) + self.kept_part.real_flushheaders() + self.kept_part = None + else: + self.add_comment('Deleted binary multipart/alternative') + + def wants(self, subbody): + try: + plaintextness = subbody.plaintextness + except AttributeError: + return 0 + if not self.kept_part: + return 1 + return plaintextness > self.kept_part.body.plaintextness + + class SubpartWriter(PlaintextMultipartBody.SubpartWriter): + def real_flushheaders(self): + PlaintextMultipartBody.SubpartWriter.flushheaders(self) + + def flushheaders(self): + body = self.body + parent_body = self._fp + kept_part = parent_body.kept_part + if kept_part and kept_part is self: + return + + if body and parent_body.wants(self.body): + parent_body.kept_part = self + elif body: + body.close() + + def startbody(self, ctype, plist=[], prefix=1): + 
# FIXME: needs testing and cleanup
class NullMultipartBody(NullBody):
    """A multipart body built on NullBody, with a no-op add_separator().

    Its SubpartWriter hands out only NullBody / NullMultipartBody
    objects for the parts nested inside it.
    """

    def __init__(self, subtype, plist=None, annotate = None):
        """Create the body with content type 'multipart/' + subtype.

        plist defaults to a fresh FilteringMimeWriter.Plist() for each
        call.  (A Plist() instance used directly as the default value
        would be created once and shared by every body constructed
        without an explicit plist.)
        """
        if plist is None:
            plist = FilteringMimeWriter.Plist()
        NullBody.__init__(self, 'multipart/' + subtype, plist = plist, annotate = annotate)

    def add_separator(self, final = 0):
        """Do nothing when asked for a separator."""
        pass

    class SubpartWriter(PlaintextMimeWriter):
        """Writer for parts of a NullMultipartBody."""

        EXCLUDE_HEADERS = ('*',)

        def flushheaders(self):
            """Close the current body, if any, and reset the writer."""
            # NOTE(review): startbody() below does not assign self.body,
            # so it may be unset or None here -- confirm against
            # PlaintextMimeWriter.  Guarded so that flushing cannot fail
            # with AttributeError.
            body = getattr(self, 'body', None)
            if body is not None:
                body.close()
            self._reset()

        def startbody(self, ctype, plist=None, prefix=1):
            """Return a NullBody for ctype; prefix is ignored."""
            if plist is None:
                # Fresh list per call instead of one shared default.
                plist = []
            return NullBody(ctype, plist)

        def startmultipartbody(self, subtype, boundary=None, plist=None, prefix=1):
            """Return a NullMultipartBody; boundary and prefix are ignored."""
            if plist is None:
                # Fresh list per call instead of one shared default.
                plist = []
            return NullMultipartBody(subtype, plist)
uuencoded files from body.""" + plaintextness = 10 + def __init__(self, body): + FilteringMimeWriter.UUFilter.__init__(self, body) + self.content_type = 'text/plain' + def handle_begin(self, filename): + self.add_comment("Deleted uuencoded file '%s'" % filename) + def handle_text(self, chunk): + self.output(chunk) + +class _HTMLFilter(ParserFilter): + """Convert HTML to plain text. + + This filter expects to have HTML written to it. It writes plain + text to the underlying message body. + + BUGS (FIXME:) + + Currently, all entities in the HTML are converted to ISO-8859-1 characters. + + """ + plaintextness = 0 + def __init__(self, body): + import htmllib + + class MyHTMLParser(htmllib.HTMLParser): + def start_div(self, attrs): + """

forces line break.""" + self.formatter.end_paragraph(0) + + def start_script(self, attrs): + """Ignore text within