# Copyright (C) 1998-2009 by the Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
# USA.

"""HyperArch: Pipermail archiving for Mailman

     - The Dragon De Monsyne <dragondm@integral.org>

TODO:
     - Should be able to force all HTML to be regenerated next time the
       archive is run, in case a template is changed.
     - Run a command to generate tarball of html archives for downloading
       (probably in the 'update_dirty_archives' method).
"""

from __future__ import nested_scopes

import sys
import re
import errno
import urllib
import time
import os
import types
import HyperDatabase
import pipermail
import weakref
import binascii

from email.Header import decode_header, make_header
from email.Errors import HeaderParseError
from email.Charset import Charset

from Mailman import mm_cfg
from Mailman import Utils
from Mailman import Errors
from Mailman import LockFile
from Mailman import MailList
from Mailman import i18n
from Mailman.SafeDict import SafeDict
from Mailman.Logging.Syslog import syslog
from Mailman.Mailbox import ArchiverMailbox

# Set up i18n.  Assume the current language has already been set in the caller.
_ = i18n._

gzip = None
if mm_cfg.GZIP_ARCHIVE_TXT_FILES:
    try:
        import gzip
    except ImportError:
        pass

EMPTYSTRING = ''
NL = '\n'

# MacOSX has a default stack size that is too small for deeply recursive
# regular expressions.  We see this as crashes in the Python test suite when
# running test_re.py and test_sre.py.  The fix is to set the stack limit to
# 2048; the general recommendation is to do in the shell before running the
# test suite.  But that's inconvenient for a daemon like the qrunner.
#
# AFAIK, this problem only affects the archiver, so we're adding this work
# around to this file (it'll get imported by the bundled pipermail or by the
# bin/arch script.  We also only do this on darwin, a.k.a. MacOSX.
if sys.platform == 'darwin':
    try:
        import resource
    except ImportError:
        pass
    else:
        soft, hard = resource.getrlimit(resource.RLIMIT_STACK)
        newsoft = min(hard, max(soft, 1024*2048))
        resource.setrlimit(resource.RLIMIT_STACK, (newsoft, hard))

def html_quote(s, lang=None):
    repls = ( ('&', '&amp;'),
              ("<", '&lt;'),
              (">", '&gt;'),
              ('"', '&quot;'))
    for thing, repl in repls:
        s = s.replace(thing, repl)
    return Utils.uncanonstr(s, lang)


def url_quote(s):
    return urllib.quote(s)


def null_to_space(s):
    return s.replace('\000', ' ')

def sizeof(filename, lang):
    try:
        size = os.path.getsize(filename)
    except OSError as e:
        # ENOENT can happen if the .mbox file was moved away or deleted, and
        # an explicit mbox file name was given to bin/arch.
        if e.errno != errno.ENOENT: raise
        return _('size not available')
    if size < 1000:
        # Avoid i18n side-effects
        otrans = i18n.get_translation()
        try:
            i18n.set_language(lang)
            out = _(' %(size)i bytes ')
        finally:
            i18n.set_translation(otrans)
        return out
    elif size < 1000000:
        return ' %d KB ' % (size / 1000)
    # GB?? :-)
    return ' %d MB ' % (size / 1000000)

html_charset = '<META http-equiv="Content-Type" ' \
               'content="text/html; charset=%s">'


def CGIescape(arg, lang=None):
    if isinstance(arg, types.UnicodeType):
        s = Utils.websafe(arg)
    else:
        s = Utils.websafe(str(arg))
    return Utils.uncanonstr(s.replace('"', '&quot;'), lang)

# Parenthesized human name
paren_name_pat = re.compile(r'([(].*[)])')

# Subject lines preceded with 'Re:'
REpat = re.compile(r"\s*RE\s*(\[\d+\]\s*)?:\s*", re.IGNORECASE)

# E-mail addresses and URLs in text
emailpat = re.compile(r'([-+,.\w]+@[-+.\w]+)')

# Argh!  This pattern is buggy, and will choke on URLs with GET parameters.
# MAS: Given that people are not constrained in how they write URIs in plain
# text, it is not possible to have a single regexp to reliably match them.
# The regexp below is intended to match straightforward cases.  Even humans
# can't reliably tell whether various punctuation at the end of a URI is part
# of the URI or not.
urlpat = re.compile(r'([a-z]+://.*?)(?:_\s|_$|$|[]})>\'"\s])', re.IGNORECASE)
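# Editor's illustration (examples invented, not part of the original source):
# given the line 'see http://example.com/page or mail user.name@example.com',
# urlpat's group(1) is expected to capture 'http://example.com/page' (it stops
# at the following whitespace) and emailpat's group(1) captures
# 'user.name@example.com'.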
# Blank lines
blankpat = re.compile(r'^\s*$')

# Starting <html> directive
htmlpat = re.compile(r'^\s*<HTML>\s*$', re.IGNORECASE)
# Ending </html> directive
nohtmlpat = re.compile(r'^\s*</HTML>\s*$', re.IGNORECASE)
# Match quoted text
quotedpat = re.compile(r'^([>|:]|&gt;)+')

# Like Utils.maketext() but with caching to improve performance.
#
# _templatefilepathcache is used to associate a (templatefile, lang, listname)
# key with the file system path to a template file.  This path is the one that
# the Utils.findtext() function has computed is the one to match the values in
# the key tuple.
#
# _templatecache associate a file system path as key with the text
# returned after processing the contents of that file by Utils.findtext()
#
# We keep two caches to reduce the amount of template text kept in memory,
# since the _templatefilepathcache is a many->one mapping and _templatecache
# is a one->one mapping.  Imagine 1000 lists all using the same default
# English template.

_templatefilepathcache = {}
_templatecache = {}
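# Editor's illustration (hypothetical values, not part of the original
# source): after a template lookup the two caches might hold entries such as
#   _templatefilepathcache[('article.html', 'en', 'mylist')] =
#       '/usr/local/mailman/templates/en/article.html'
#   _templatecache['/usr/local/mailman/templates/en/article.html'] =
#       '...raw template text...'
# so many (template, lang, list) keys can share one cached template body.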

def quick_maketext(templatefile, dict=None, lang=None, mlist=None):
    if mlist is None:
        listname = ''
    else:
        listname = mlist._internal_name
    if lang is None:
        if mlist is None:
            lang = mm_cfg.DEFAULT_SERVER_LANGUAGE
        else:
            lang = mlist.preferred_language
    cachekey = (templatefile, lang, listname)
    filepath = _templatefilepathcache.get(cachekey)
    if filepath:
        template = _templatecache.get(filepath)
    if filepath is None or template is None:
        # Use the basic maketext, with defaults to get the raw template
        template, filepath = Utils.findtext(templatefile, lang=lang,
                                            raw=True, mlist=mlist)
        _templatefilepathcache[cachekey] = filepath
        _templatecache[filepath] = template
    # Copied from Utils.maketext()
    text = template
    if dict is not None:
        try:
            sdict = SafeDict(dict)
            try:
                text = sdict.interpolate(template)
            except UnicodeError:
                # Try again after coercing the template to unicode
                utemplate = unicode(template,
                                    Utils.GetCharSet(lang),
                                    'replace')
                text = sdict.interpolate(utemplate)
        except (TypeError, ValueError):
            # The template is really screwed up
            pass
    # Make sure the text is in the given character set, or html-ify any bogus
    # characters.
    return Utils.uncanonstr(text, lang)
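
# Editor's sketch of a call (assuming a loaded MailList instance named mlist;
# not part of the original source):
#
#   html = quick_maketext('article.html', {'body': '...'}, mlist=mlist)
#
# Repeated calls with the same (templatefile, lang, listname) triple reuse the
# cached template text instead of re-reading it from disk.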

# Note: I'm overriding most, if not all of the pipermail Article class
#       here -ddm
# The Article class encapsulates a single posting.  The attributes are:
#
#  sequence : Sequence number, unique for each article in a set of archives
#  subject  : Subject
#  datestr  : The posting date, in human-readable format
#  date     : The posting date, in purely numeric format
#  fromdate : The posting date, in `unixfrom' format
#  headers  : Any other headers of interest
#  author   : The author's name (and possibly organization)
#  email    : The author's e-mail address
#  msgid    : A unique message ID
#  in_reply_to : If !="", this is the msgid of the article being replied to
#  references: A (possibly empty) list of msgid's of earlier articles in
#              the thread
#  body     : A list of strings making up the message body
class Article(pipermail.Article):
    __super_init = pipermail.Article.__init__
    __super_set_date = pipermail.Article._set_date

    _last_article_time = time.time()

    def __init__(self, message=None, sequence=0, keepHeaders=[],
                       lang=mm_cfg.DEFAULT_SERVER_LANGUAGE, mlist=None):
        self.__super_init(message, sequence, keepHeaders)
        self.prev = None
        self.next = None
        # Trim Re: from the subject line
        i = 0
        while i != -1:
            result = REpat.match(self.subject)
            if result:
                i = result.end(0)
                self.subject = self.subject[i:]
            else:
                i = -1
        # Useful to keep around
        self._lang = lang
        self._mlist = mlist

        if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS:
            # Avoid i18n side-effects.  Note that the language for this
            # article (for this list) could be different from the site-wide
            # preferred language, so we need to ensure no side-effects will
            # occur.  Think what happens when executing bin/arch.
            otrans = i18n.get_translation()
            try:
                i18n.set_language(lang)
                if self.author == self.email:
                    self.author = self.email = re.sub('@', _(' at '),
                                                      self.email)
                else:
                    self.email = re.sub('@', _(' at '), self.email)
            finally:
                i18n.set_translation(otrans)

        # Snag the content-* headers.  RFC 1521 states that their values are
        # case insensitive.
        ctype = message.get('Content-Type', 'text/plain')
        cenc = message.get('Content-Transfer-Encoding', '')
        self.ctype = ctype.lower()
        self.cenc = cenc.lower()
        self.decoded = {}
        cset = Utils.GetCharSet(mlist.preferred_language)
        cset_out = Charset(cset).output_charset or cset
        if isinstance(cset_out, unicode):
            # email 3.0.1 (python 2.4) doesn't like unicode
            cset_out = cset_out.encode('us-ascii')
        charset = message.get_content_charset(cset_out)
        if charset:
            charset = charset.lower().strip()
            if charset[0]=='"' and charset[-1]=='"':
                charset = charset[1:-1]
            if charset[0]=="'" and charset[-1]=="'":
                charset = charset[1:-1]
            try:
                body = message.get_payload(decode=True)
            except binascii.Error:
                body = None
            if body and charset != Utils.GetCharSet(self._lang):
                # decode body
                try:
                    if charset == 'gb2312':
                        body = unicode(body, 'gbk')
                    else:
                        body = unicode(body, charset)
                except (UnicodeError, LookupError):
                    body = None
            if body:
                self.body = [l + "\n" for l in body.splitlines()]

        self.decode_headers()

    # Mapping of listnames to MailList instances as a weak value dictionary.
    # This code is copied from Runner.py but there's one important operational
    # difference.  In Runner.py, we always .Load() the MailList object for
    # each _dispose() run, otherwise the object retrieved from the cache won't
    # be up-to-date.  Since we're creating a new HyperArchive instance for
    # each message being archived, we don't need to worry about that -- but it
    # does mean there are additional opportunities for optimization.
    _listcache = weakref.WeakValueDictionary()

    def _open_list(self, listname):
        # Cache the open list so that any use of the list within this process
        # uses the same object.  We use a WeakValueDictionary so that when the
        # list is no longer necessary, its memory is freed.
        mlist = self._listcache.get(listname)
        if not mlist:
            try:
                mlist = MailList.MailList(listname, lock=0)
            except Errors.MMListError as e:
                syslog('error', 'error opening list: %s\n%s', listname, e)
                return None
            else:
                self._listcache[listname] = mlist
        return mlist
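
    # Editor's note (not part of the original source): because _listcache is
    # a WeakValueDictionary, a cached MailList stays in the mapping only while
    # some live Article still references it via _mlist; once the last
    # reference goes away the entry disappears and the next _open_list() call
    # re-opens the list from disk.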

    def __getstate__(self):
        d = self.__dict__.copy()
        # We definitely don't want to pickle the MailList instance, so just
        # pickle a reference to it.
        if d.has_key('_mlist'):
            mlist = d['_mlist']
            del d['_mlist']
        else:
            mlist = None
        if mlist:
            d['__listname'] = self._mlist.internal_name()
        else:
            d['__listname'] = None
        # Delete a few other things we don't want in the pickle
        for attr in ('prev', 'next', 'body'):
            if d.has_key(attr):
                del d[attr]
        d['body'] = []
        return d

    def __setstate__(self, d):
        # For loading older Articles via pickle.  All this stuff was added
        # when Simone Piunni and Tokio Kikuchi i18n'ified Pipermail.  See SF
        # patch #594771.
        self.__dict__ = d
        listname = d.get('__listname')
        if listname:
            del d['__listname']
            d['_mlist'] = self._open_list(listname)
        if not d.has_key('_lang'):
            if hasattr(self, '_mlist'):
                self._lang = self._mlist.preferred_language
            else:
                self._lang = mm_cfg.DEFAULT_SERVER_LANGUAGE
        if not d.has_key('cenc'):
            self.cenc = None
        if not d.has_key('decoded'):
            self.decoded = {}

    def setListIfUnset(self, mlist):
        if getattr(self, '_mlist', None) is None:
            self._mlist = mlist

    def quote(self, buf):
        return html_quote(buf, self._lang)

    def decode_headers(self):
        """MIME-decode headers.

        If the email, subject, or author attributes contain non-ASCII
        characters using the encoded-word syntax of RFC 2047, decoded versions
        of those attributes are placed in the self.decoded (a dictionary).

        If the list's charset differs from the header charset, an attempt is
        made to decode the headers as Unicode.  If that fails, they are left
        undecoded.
        """
        author = self.decode_charset(self.author)
        subject = self.decode_charset(self.subject)
        if author:
            self.decoded['author'] = author
        email = self.decode_charset(self.email)
        if email:
            self.decoded['email'] = email
        if subject:
            if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS:
                otrans = i18n.get_translation()
                try:
                    i18n.set_language(self._lang)
                    atmark = unicode(_(' at '), Utils.GetCharSet(self._lang))
                    subject = re.sub(r'([-+,.\w]+)@([-+.\w]+)',
                                     '\g<1>' + atmark + '\g<2>', subject)
                finally:
                    i18n.set_translation(otrans)
            self.decoded['subject'] = subject
        self.decoded['stripped'] = self.strip_subject(subject or self.subject)

    def strip_subject(self, subject):
        # Strip subject_prefix and Re: for subject sorting
        # This part was taken from CookHeaders.py (TK)
        prefix = self._mlist.subject_prefix.strip()
        if prefix:
            prefix_pat = re.escape(prefix)
            prefix_pat = '%'.join(prefix_pat.split(r'\%'))
            prefix_pat = re.sub(r'%\d*d', r'\s*\d+\s*', prefix_pat)
            subject = re.sub(prefix_pat, '', subject)
        subject = subject.lstrip()
        strip_pat = re.compile('^((RE|AW|SV|VS)(\[\d+\])?:\s*)+', re.I)
        stripped = strip_pat.sub('', subject)
        return stripped
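
    # Editor's illustration (hypothetical list whose subject_prefix is
    # '[Mylist %d]', not part of the original source):
    #   strip_subject('[Mylist 42] Re: AW: hello')  ->  'hello'
    # which is the form used as the sort key for the subject index.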

    def decode_charset(self, field):
        # TK: This function was rewritten for unifying to Unicode.
        # Convert 'field' into Unicode one line string.
        try:
            pairs = decode_header(field)
            ustr = make_header(pairs).__unicode__()
        except (LookupError, UnicodeError, ValueError, HeaderParseError):
            # assume list's language
            cset = Utils.GetCharSet(self._mlist.preferred_language)
            if cset == 'us-ascii':
                cset = 'iso-8859-1'  # assume this for English list
            ustr = unicode(field, cset, 'replace')
        return u''.join(ustr.splitlines())

    def as_html(self):
        d = self.__dict__.copy()
        # avoid i18n side-effects
        otrans = i18n.get_translation()
        i18n.set_language(self._lang)
        try:
            d["prev"], d["prev_wsubj"] = self._get_prev()
            d["next"], d["next_wsubj"] = self._get_next()
            d["email_html"] = self.quote(self.email)
            d["title"] = self.quote(self.subject)
            d["subject_html"] = self.quote(self.subject)
            # TK: These two _url variables are used to compose a response
            # from the archive web page.  So, ...
            d["subject_url"] = url_quote('Re: ' + self.subject)
            d["in_reply_to_url"] = url_quote(self._message_id)
            if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS:
                # Point the mailto url back to the list
                author = re.sub('@', _(' at '), self.author)
                emailurl = self._mlist.GetListEmail()
            else:
                author = self.author
                emailurl = self.email
            d["author_html"] = self.quote(author)
            d["email_url"] = url_quote(emailurl)
            d["datestr_html"] = self.quote(i18n.ctime(int(self.date)))
            d["body"] = self._get_body()
            d['listurl'] = self._mlist.GetScriptURL('listinfo', absolute=1)
            d['listname'] = self._mlist.real_name
            d['encoding'] = ''
        finally:
            i18n.set_translation(otrans)

        charset = Utils.GetCharSet(self._lang)
        d["encoding"] = html_charset % charset

        self._add_decoded(d)
        return quick_maketext(
            'article.html', d,
            lang=self._lang, mlist=self._mlist)

    def _get_prev(self):
        """Return the href and subject for the previous message"""
        if self.prev:
            subject = self._get_subject_enc(self.prev)
            prev = ('<LINK REL="Previous" HREF="%s">'
                    % (url_quote(self.prev.filename)))
            prev_wsubj = ('<LI>' + _('Previous message:') +
                          ' <A HREF="%s">%s\n</A></li>'
                          % (url_quote(self.prev.filename),
                             self.quote(subject)))
        else:
            prev = prev_wsubj = ""
        return prev, prev_wsubj

    def _get_subject_enc(self, art):
        """Return the subject of art, decoded if possible.

        If the charset of the current message and art match and the
        article's subject is encoded, decode it.
        """
        return art.decoded.get('subject', art.subject)

    def _get_next(self):
        """Return the href and subject for the next message"""
        if self.next:
            subject = self._get_subject_enc(self.next)
            next = ('<LINK REL="Next" HREF="%s">'
                    % (url_quote(self.next.filename)))
            next_wsubj = ('<LI>' + _('Next message:') +
                          ' <A HREF="%s">%s\n</A></li>'
                          % (url_quote(self.next.filename),
                             self.quote(subject)))
        else:
            next = next_wsubj = ""
        return next, next_wsubj

    _rx_quote = re.compile('=([A-F0-9][A-F0-9])')
    _rx_softline = re.compile('=[ \t]*$')

    def _get_body(self):
        """Return the message body ready for HTML, decoded if necessary"""
        try:
            body = self.html_body
        except AttributeError:
            body = self.body
        return null_to_space(EMPTYSTRING.join(body))

    def _add_decoded(self, d):
        """Add encoded-word keys to HTML output"""
        for src, dst in (('author', 'author_html'),
                         ('email', 'email_html'),
                         ('subject', 'subject_html'),
                         ('subject', 'title')):
            if self.decoded.has_key(src):
                d[dst] = self.quote(self.decoded[src])

    def as_text(self):
        d = self.__dict__.copy()
        # We need to guarantee a valid From_ line, even if there are
        # bogosities in the headers.
        if not d.get('fromdate', '').strip():
            d['fromdate'] = time.ctime(time.time())
        if not d.get('email', '').strip():
            d['email'] = 'bogus@does.not.exist.com'
        if not d.get('datestr', '').strip():
            d['datestr'] = time.ctime(time.time())
        #
        headers = ['From %(email)s %(fromdate)s',
                   'From: %(email)s (%(author)s)',
                   'Date: %(datestr)s',
                   'Subject: %(subject)s']
        if d['_in_reply_to']:
            headers.append('In-Reply-To: %(_in_reply_to)s')
        if d['_references']:
            headers.append('References: %(_references)s')
        if d['_message_id']:
            headers.append('Message-ID: %(_message_id)s')
        body = EMPTYSTRING.join(self.body)
        cset = Utils.GetCharSet(self._lang)
        # Coerce the body to Unicode and replace any invalid characters.
        if not isinstance(body, types.UnicodeType):
            body = unicode(body, cset, 'replace')
        if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS:
            otrans = i18n.get_translation()
            try:
                i18n.set_language(self._lang)
                atmark = unicode(_(' at '), cset)
                body = re.sub(r'([-+,.\w]+)@([-+.\w]+)',
                              '\g<1>' + atmark + '\g<2>', body)
            finally:
                i18n.set_translation(otrans)
        # Return body to character set of article.
        body = body.encode(cset, 'replace')
        return NL.join(headers) % d + '\n\n' + body + '\n'
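
    # Editor's sketch of the output (header values invented for illustration,
    # not part of the original source):
    #
    #   From jdoe@example.com Mon Apr  6 12:34:56 2009
    #   From: jdoe@example.com (John Doe)
    #   Date: Mon Apr  6 12:34:56 2009
    #   Subject: Re: hello
    #   Message-ID: <1234@example.com>
    #
    #   body text ...
    #
    # i.e. one mbox-style message, ready to be appended to the per-volume
    # <archive>.txt file by HyperArchive.write_article() below.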

    def _set_date(self, message):
        self.__super_set_date(message)
        self.fromdate = time.ctime(int(self.date))

    def loadbody_fromHTML(self, fileobj):
        self.body = []
        begin = 0
        while 1:
            line = fileobj.readline()
            if not line:
                break
            if not begin:
                if line.strip() == '<!--beginarticle-->':
                    begin = 1
                continue
            if line.strip() == '<!--endarticle-->':
                break
            self.body.append(line)

    def finished_update_article(self):
        self.body = []
        try:
            del self.html_body
        except AttributeError:
            pass


class HyperArchive(pipermail.T):
    __super_init = pipermail.T.__init__
    __super_update_archive = pipermail.T.update_archive
    __super_update_dirty_archives = pipermail.T.update_dirty_archives
    __super_add_article = pipermail.T.add_article

    # some defaults (the modes are octal permission bits)
    DIRMODE = 0o2775
    FILEMODE = 0o660

    VERBOSE = 0
    DEFAULTINDEX = 'thread'
    ARCHIVE_PERIOD = 'month'

    THREADLAZY = 0
    THREADLEVELS = 3

    ALLOWHTML = 1             # "Lines between <html></html>" handled as is.
    SHOWHTML = 0              # Eg, nuke leading whitespace in html manner.
    IQUOTES = 1               # Italicize quoted text.
    SHOWBR = 0                # Add <br> onto every line

    def __init__(self, maillist):
        # can't init the database while other processes are writing to it!
        # XXX TODO- implement native locking
        # with mailman's LockFile module for HyperDatabase.HyperDatabase
        #
        dir = maillist.archive_dir()
        db = HyperDatabase.HyperDatabase(dir, maillist)
        self.__super_init(dir, reload=1, database=db)

        self.maillist = maillist
        self._lock_file = None
        self.lang = maillist.preferred_language
        self.charset = Utils.GetCharSet(maillist.preferred_language)

        if hasattr(self.maillist, 'archive_volume_frequency'):
            if self.maillist.archive_volume_frequency == 0:
                self.ARCHIVE_PERIOD = 'year'
            elif self.maillist.archive_volume_frequency == 2:
                self.ARCHIVE_PERIOD = 'quarter'
            elif self.maillist.archive_volume_frequency == 3:
                self.ARCHIVE_PERIOD = 'week'
            elif self.maillist.archive_volume_frequency == 4:
                self.ARCHIVE_PERIOD = 'day'
            else:
                self.ARCHIVE_PERIOD = 'month'

        yre = r'(?P<year>[0-9]{4,4})'
        mre = r'(?P<month>[01][0-9])'
        dre = r'(?P<day>[0123][0-9])'
        self._volre = {
            'year':    '^' + yre + '$',
            'quarter': '^' + yre + r'q(?P<quarter>[1234])$',
            'month':   '^' + yre + r'-(?P<month>[a-zA-Z]+)$',
            'week':    r'^Week-of-Mon-' + yre + mre + dre,
            'day':     '^' + yre + mre + dre + '$'
            }
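
    # Editor's illustration (example volume names derived from the regexps
    # above, not part of the original source):
    #   'year'    -> '2009'
    #   'quarter' -> '2009q2'
    #   'month'   -> '2009-April'
    #   'week'    -> 'Week-of-Mon-20090406'
    #   'day'     -> '20090406'
    # dateToVolName() below generates these names and volNameToDate() parses
    # them back.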

    def _makeArticle(self, msg, sequence):
        return Article(msg, sequence,
                       lang=self.maillist.preferred_language,
                       mlist=self.maillist)

    def html_foot(self):
        # avoid i18n side-effects
        mlist = self.maillist
        otrans = i18n.get_translation()
        i18n.set_language(mlist.preferred_language)
        # Convenience
        def quotetime(s):
            return html_quote(i18n.ctime(s), self.lang)
        try:
            d = {"lastdate": quotetime(self.lastdate),
                 "archivedate": quotetime(self.archivedate),
                 "listinfo": mlist.GetScriptURL('listinfo', absolute=1),
                 "version": self.version,
                 "listname": html_quote(mlist.real_name, self.lang),
                 }
            i = {"thread": _("thread"),
                 "subject": _("subject"),
                 "author": _("author"),
                 "date": _("date")
                 }
        finally:
            i18n.set_translation(otrans)

        for t in i.keys():
            cap = t[0].upper() + t[1:]
            if self.type == cap:
                d["%s_ref" % (t)] = ""
            else:
                d["%s_ref" % (t)] = ('<a href="%s.html#start">[ %s ]</a>'
                                     % (t, i[t]))
        return quick_maketext(
            'archidxfoot.html', d,
            mlist=mlist)

    def html_head(self):
        # avoid i18n side-effects
        mlist = self.maillist
        otrans = i18n.get_translation()
        i18n.set_language(mlist.preferred_language)
        # Convenience
        def quotetime(s):
            return html_quote(i18n.ctime(s), self.lang)
        try:
            d = {"listname": html_quote(mlist.real_name, self.lang),
                 "archtype": self.type,
                 "archive": self.volNameToDesc(self.archive),
                 "listinfo": mlist.GetScriptURL('listinfo', absolute=1),
                 "firstdate": quotetime(self.firstdate),
                 "lastdate": quotetime(self.lastdate),
                 "size": self.size,
                 }
            i = {"thread": _("thread"),
                 "subject": _("subject"),
                 "author": _("author"),
                 "date": _("date"),
                 }
        finally:
            i18n.set_translation(otrans)

        for t in i.keys():
            cap = t[0].upper() + t[1:]
            if self.type == cap:
                d["%s_ref" % (t)] = ""
                d["archtype"] = i[t]
            else:
                d["%s_ref" % (t)] = ('<a href="%s.html#start">[ %s ]</a>'
                                     % (t, i[t]))
        if self.charset:
            d["encoding"] = html_charset % self.charset
        else:
            d["encoding"] = ""
        return quick_maketext(
            'archidxhead.html', d,
            mlist=mlist)

    def html_TOC(self):
        mlist = self.maillist
        listname = mlist.internal_name()
        mbox = os.path.join(mlist.archive_dir()+'.mbox', listname+'.mbox')
        d = {"listname": mlist.real_name,
             "listinfo": mlist.GetScriptURL('listinfo', absolute=1),
             "fullarch": '../%s.mbox/%s.mbox' % (listname, listname),
             "size": sizeof(mbox, mlist.preferred_language),
             'meta': '',
             }
        # Avoid i18n side-effects
        otrans = i18n.get_translation()
        i18n.set_language(mlist.preferred_language)
        try:
            if not self.archives:
                d["noarchive_msg"] = _(
                    '<P>Currently, there are no archives. </P>')
                d["archive_listing_start"] = ""
                d["archive_listing_end"] = ""
                d["archive_listing"] = ""
            else:
                d["noarchive_msg"] = ""
                d["archive_listing_start"] = quick_maketext(
                    'archliststart.html',
                    lang=mlist.preferred_language,
                    mlist=mlist)
                d["archive_listing_end"] = quick_maketext(
                    'archlistend.html',
                    mlist=mlist)
                accum = []
                for a in self.archives:
                    accum.append(self.html_TOC_entry(a))
                d["archive_listing"] = EMPTYSTRING.join(accum)
        finally:
            i18n.set_translation(otrans)
        # The TOC is always in the charset of the list's preferred language
        d['meta'] += html_charset % Utils.GetCharSet(mlist.preferred_language)
        # The site can disable public access to the mbox file.
        if mm_cfg.PUBLIC_MBOX:
            template = 'archtoc.html'
        else:
            template = 'archtocnombox.html'
        return quick_maketext(template, d, mlist=mlist)

    def html_TOC_entry(self, arch):
        # Check to see if the archive is gzip'd or not
        txtfile = os.path.join(self.maillist.archive_dir(), arch + '.txt')
        gzfile = txtfile + '.gz'
        # which exists?  .txt.gz first, then .txt
        if os.path.exists(gzfile):
            file = gzfile
            url = arch + '.txt.gz'
            templ = '<td><A href="%(url)s">[ ' + _('Gzip\'d Text%(sz)s') \
                    + ']</a></td>'
        elif os.path.exists(txtfile):
            file = txtfile
            url = arch + '.txt'
            templ = '<td><A href="%(url)s">[ ' + _('Text%(sz)s') + ']</a></td>'
        else:
            # neither found?
            file = None
        # in Python 1.5.2 we have an easy way to get the size
        if file:
            textlink = templ % {
                'url': url,
                'sz': sizeof(file, self.maillist.preferred_language)
                }
        else:
            # there's no archive file at all... hmmm.
            textlink = ''
        return quick_maketext(
            'archtocentry.html',
            {'archive': arch,
             'archivelabel': self.volNameToDesc(arch),
             'textlink': textlink
             },
            mlist=self.maillist)

    def GetArchLock(self):
        if self._lock_file:
            return 1
        self._lock_file = LockFile.LockFile(
            os.path.join(mm_cfg.LOCK_DIR,
                         self.maillist.internal_name() + '-arch.lock'))
        try:
            self._lock_file.lock(timeout=0.5)
        except LockFile.TimeOutError:
            return 0
        return 1

    def DropArchLock(self):
        if self._lock_file:
            self._lock_file.unlock(unconditionally=1)
            self._lock_file = None

    def processListArch(self):
        name = self.maillist.ArchiveFileName()
        wname = name + '.working'
        ename = name + '.err_unarchived'
        try:
            os.stat(name)
        except (IOError, os.error):
            # no archive file, nothin to do -ddm
            return
        # see if arch is locked here -ddm
        if not self.GetArchLock():
            # another archiver is running, nothing to do. -ddm
            return
        # if the working file is still here, the archiver may have
        # crashed during archiving.  Save it, log an error, and move on.
        try:
            wf = open(wname)
            syslog('error',
                   'Archive working file %s present.  '
                   'Check %s for possibly unarchived msgs',
                   wname, ename)
            omask = os.umask(7)
            try:
                ef = open(ename, 'a+')
            finally:
                os.umask(omask)
            ef.seek(1, 2)
            if ef.read(1) != '\n':
                ef.write('\n')
            ef.write(wf.read())
            ef.close()
            wf.close()
            os.unlink(wname)
        except IOError:
            pass
        os.rename(name, wname)
        archfile = open(wname)
        self.processUnixMailbox(archfile)
        archfile.close()
        os.unlink(wname)
        self.DropArchLock()

    def get_filename(self, article):
        return '%06i.html' % (article.sequence,)

    def get_archives(self, article):
        """Return a list of indexes where the article should be filed.
        A string can be returned if the list only contains one entry,
        and the empty list is legal."""
        res = self.dateToVolName(float(article.date))
        self.message(_("figuring article archives\n"))
        self.message(res + "\n")
        return res

    def volNameToDesc(self, volname):
        volname = volname.strip()
        # Don't make these module global constants since we have to runtime
        # translate them anyway.
        monthdict = [
            '',
            _('January'), _('February'), _('March'), _('April'),
            _('May'), _('June'), _('July'), _('August'),
            _('September'), _('October'), _('November'), _('December')
            ]
        for each in self._volre.keys():
            match = re.match(self._volre[each], volname)
            # Let ValueErrors percolate up
            if match:
                year = int(match.group('year'))
                if each == 'quarter':
                    d = ["", _("First"), _("Second"), _("Third"), _("Fourth")]
                    ord = d[int(match.group('quarter'))]
                    return _("%(ord)s quarter %(year)i")
                elif each == 'month':
                    monthstr = match.group('month').lower()
                    for i in range(1, 13):
                        monthname = time.strftime("%B", (1999,i,1,0,0,0,0,1,0))
                        if monthstr.lower() == monthname.lower():
                            month = monthdict[i]
                            return _("%(month)s %(year)i")
                    raise ValueError("%s is not a month!" % monthstr)
                elif each == 'week':
                    month = monthdict[int(match.group("month"))]
                    day = int(match.group("day"))
                    return _("The Week Of Monday %(day)i %(month)s %(year)i")
                elif each == 'day':
                    month = monthdict[int(match.group("month"))]
                    day = int(match.group("day"))
                    return _("%(day)i %(month)s %(year)i")
                else:
                    return match.group('year')
        raise ValueError("%s is not a valid volname" % volname)

    # The following two methods should be inverses of each other. -ddm

    def dateToVolName(self, date):
        datetuple = time.localtime(date)
        if self.ARCHIVE_PERIOD == 'year':
            return time.strftime("%Y", datetuple)
        elif self.ARCHIVE_PERIOD == 'quarter':
            if datetuple[1] in [1, 2, 3]:
                return time.strftime("%Yq1", datetuple)
            elif datetuple[1] in [4, 5, 6]:
                return time.strftime("%Yq2", datetuple)
            elif datetuple[1] in [7, 8, 9]:
                return time.strftime("%Yq3", datetuple)
            else:
                return time.strftime("%Yq4", datetuple)
        elif self.ARCHIVE_PERIOD == 'day':
            return time.strftime("%Y%m%d", datetuple)
        elif self.ARCHIVE_PERIOD == 'week':
            # Reconstruct "seconds since epoch", and subtract weekday
            # multiplied by the number of seconds in a day.
            monday = time.mktime(datetuple) - datetuple[6] * 24 * 60 * 60
            # Build a new datetuple from this "seconds since epoch" value
            datetuple = time.localtime(monday)
            return time.strftime("Week-of-Mon-%Y%m%d", datetuple)
        # month. -ddm
        else:
            return time.strftime("%Y-%B", datetuple)

    def volNameToDate(self, volname):
        volname = volname.strip()
        for each in self._volre.keys():
            match = re.match(self._volre[each], volname)
            if match:
                year = int(match.group('year'))
                month = 1
                day = 1
                if each == 'quarter':
                    q = int(match.group('quarter'))
                    month = (q * 3) - 2
                elif each == 'month':
                    monthstr = match.group('month').lower()
                    m = []
                    for i in range(1, 13):
                        m.append(
                            time.strftime("%B", (1999,i,1,0,0,0,0,1,0)).lower())
                    try:
                        month = m.index(monthstr) + 1
                    except ValueError:
                        pass
                elif each == 'week' or each == 'day':
                    month = int(match.group("month"))
                    day = int(match.group("day"))
                try:
                    return time.mktime((year, month, 1, 0, 0, 0, 0, 1, -1))
                except OverflowError:
                    return 0.0
        return 0.0
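
    # Editor's illustration of the intended round trip (month mode, values
    # invented, not part of the original source): dateToVolName(1239000000.0)
    # is expected to return '2009-April', and volNameToDate('2009-April')
    # returns the epoch seconds for the start of that month -- the pair only
    # needs to agree on the volume, not on the exact timestamp.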

    def sortarchives(self):
        def sf(a, b):
            al = self.volNameToDate(a)
            bl = self.volNameToDate(b)
            if al > bl:
                return 1
            elif al < bl:
                return -1
            else:
                return 0
        if self.ARCHIVE_PERIOD in ('month', 'year', 'quarter'):
            self.archives.sort(sf)
        else:
            self.archives.sort()
        self.archives.reverse()

    def message(self, msg):
        if self.VERBOSE:
            f = sys.stderr
            f.write(msg)
            if msg[-1:] != '\n':
                f.write('\n')
            f.flush()

    def open_new_archive(self, archive, archivedir):
        index_html = os.path.join(archivedir, 'index.html')
        try:
            os.unlink(index_html)
        except:
            pass
        os.symlink(self.DEFAULTINDEX + '.html', index_html)

    def write_index_header(self):
        self.depth = 0
        print(self.html_head())
        if not self.THREADLAZY and self.type == 'Thread':
            self.message(_("Computing threaded index\n"))
            self.updateThreadedIndex()

    def write_index_footer(self):
        for i in range(self.depth):
            print('</UL>')
        print(self.html_foot())

    def write_index_entry(self, article):
        subject = self.get_header("subject", article)
        author = self.get_header("author", article)
        if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS:
            try:
                author = re.sub('@', _(' at '), author)
            except UnicodeError:
                # Non-ASCII author contains '@' ... no valid email anyway
                pass
        subject = CGIescape(subject, self.lang)
        author = CGIescape(author, self.lang)

        d = {
            'filename': urllib.quote(article.filename),
            'subject': subject,
            'sequence': article.sequence,
            'author': author
        }
        # Emit the rendered index entry for this article.
        print(quick_maketext(
            'archidxentry.html', d,
            mlist=self.maillist))

    def get_header(self, field, article):
        # if we have no decoded header, return the encoded one
        result = article.decoded.get(field)
        if result is None:
            return getattr(article, field)
        # otherwise, the decoded one will be Unicode
        return result

    def write_threadindex_entry(self, article, depth):
        if depth < 0:
            self.message('depth<0')
            depth = 0
        if depth > self.THREADLEVELS:
            depth = self.THREADLEVELS
        self.depth = depth
        self.write_index_entry(article)

    def write_TOC(self):
        self.sortarchives()
        omask = os.umask(2)
        try:
            toc = open(os.path.join(self.basedir, 'index.html'), 'w')
        finally:
            os.umask(omask)
        toc.write(self.html_TOC())
        toc.close()

    def write_article(self, index, article, path):
        # called by add_article
        omask = os.umask(2)
        try:
            f = open(path, 'w')
        finally:
            os.umask(omask)
        f.write(article.as_html())
        f.close()

        # Write the text article to the text archive.
        path = os.path.join(self.basedir, "%s.txt" % index)
        omask = os.umask(2)
        try:
            f = open(path, 'a+')
        finally:
            os.umask(omask)
        f.write(article.as_text())
        f.close()

    def update_archive(self, archive):
        self.__super_update_archive(archive)
        # only do this if the gzip module was imported globally, and
        # gzip'ing was enabled via mm_cfg.GZIP_ARCHIVE_TXT_FILES.  See
        # above.
        if gzip:
            archz = None
            archt = None
            txtfile = os.path.join(self.basedir, '%s.txt' % archive)
            gzipfile = os.path.join(self.basedir, '%s.txt.gz' % archive)
            oldgzip = os.path.join(self.basedir, '%s.old.txt.gz' % archive)
            try:
                # open the plain text file
                archt = open(txtfile)
            except IOError:
                return
            try:
                os.rename(gzipfile, oldgzip)
                archz = gzip.open(oldgzip)
            except (IOError, RuntimeError, os.error):
                pass
            try:
                ou = os.umask(2)
                newz = gzip.open(gzipfile, 'w')
            finally:
                # XXX why is this a finally?
                os.umask(ou)
            if archz:
                newz.write(archz.read())
                archz.close()
                os.unlink(oldgzip)
            # XXX do we really need all this in a try/except?
            try:
                newz.write(archt.read())
                newz.close()
                archt.close()
            except IOError:
                pass
            os.unlink(txtfile)

    _skip_attrs = ('maillist', '_lock_file', 'charset')

    def getstate(self):
        d = {}
        for each in self.__dict__.keys():
            if not (each in self._skip_attrs
                    or each.upper() == each):
                d[each] = self.__dict__[each]
        return d

    # Add <A HREF="..."> tags around URLs and e-mail addresses.
    def __processbody_URLquote(self, lines):
        # XXX a lot to do here:
        # 1. use lines directly, rather than source and dest
        # 2. make it clearer
        # 3. make it faster
        # TK: Prepare for unicode obscure.
        atmark = _(' at ')
        if lines and isinstance(lines[0], types.UnicodeType):
            atmark = unicode(atmark, Utils.GetCharSet(self.lang), 'replace')
        source = lines[:]
        dest = lines
        last_line_was_quoted = 0
        for i in xrange(0, len(source)):
            Lorig = L = source[i]
            prefix = suffix = ""
            if L is None:
                continue
            # Italicise quoted text
            if self.IQUOTES:
                quoted = quotedpat.match(L)
                if quoted is None:
                    last_line_was_quoted = 0
                else:
                    quoted = quoted.end(0)
                    prefix = CGIescape(L[:quoted], self.lang) + '<i>'
                    suffix = '</I>'
                    if self.SHOWHTML:
                        suffix += '<BR>'
                        if not last_line_was_quoted:
                            prefix = '<BR>' + prefix
                    L = L[quoted:]
                    last_line_was_quoted = 1
            # Check for an e-mail address
            L2 = ""
            jr = emailpat.search(L)
            kr = urlpat.search(L)
            while jr is not None or kr is not None:
                if jr == None:
                    j = -1
                else:
                    j = jr.start(0)
                if kr is None:
                    k = -1
                else:
                    k = kr.start(0)
                if j != -1 and (j < k or k == -1):
                    text = jr.group(1)
                    length = len(text)
                    if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS:
                        text = re.sub('@', atmark, text)
                        URL = self.maillist.GetScriptURL(
                            'listinfo', absolute=1)
                    else:
                        URL = 'mailto:' + text
                    pos = j
                elif k != -1 and (j > k or j == -1):
                    text = URL = kr.group(1)
                    length = len(text)
                    pos = k
                else:  # j==k
                    raise ValueError("j==k: This can't happen!")
                #length = len(text)
                #self.message("URL: %s %s %s \n"
                #             % (CGIescape(L[:pos]), URL, CGIescape(text)))
                L2 += '%s<A HREF="%s">%s</A>' % (
                    CGIescape(L[:pos], self.lang),
                    html_quote(URL), CGIescape(text, self.lang))
                L = L[pos+length:]
                jr = emailpat.search(L)
                kr = urlpat.search(L)
            if jr is None and kr is None:
                L = CGIescape(L, self.lang)
            L = prefix + L2 + L + suffix
            source[i] = None
            dest[i] = L
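
    # Editor's illustration (input line invented, not part of the original
    # source): a body line such as
    #   'see http://example.com/x or mail jdoe@example.com'
    # comes out of __processbody_URLquote roughly as
    #   'see <A HREF="http://example.com/x">http://example.com/x</A> or mail
    #    <A HREF="mailto:jdoe@example.com">jdoe@example.com</A>'
    # (when ARCHIVER_OBSCURES_EMAILADDRS is on, the address is obscured and
    # the link points at the listinfo page instead of a mailto: URL).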

    # Perform Hypermail-style processing of <HTML></HTML> directives
    # in message bodies.  Lines between <HTML> and </HTML> will be written
    # out precisely as they are; other lines will be passed to func2
    # for further processing.
    def __processbody_HTML(self, lines):
        # XXX need to make this method modify in place
        source = lines[:]
        dest = lines
        l = len(source)
        i = 0
        while i < l:
            while i < l and htmlpat.match(source[i]) is None:
                i = i + 1
            if i < l:
                source[i] = None
                i = i + 1
            while i < l and nohtmlpat.match(source[i]) is None:
                dest[i], source[i] = source[i], None
                i = i + 1
            if i < l:
                source[i] = None
                i = i + 1

    def format_article(self, article):
        # called from add_article
        # TBD: Why do the HTML formatting here and keep it in the
        # pipermail database?  It makes more sense to do the html
        # formatting as the article is being written as html and toss
        # the data after it has been written to the archive file.
        lines = filter(None, article.body)
        # Handle <HTML> </HTML> directives
        if self.ALLOWHTML:
            self.__processbody_HTML(lines)
        self.__processbody_URLquote(lines)
        if not self.SHOWHTML and lines:
            lines.insert(0, '<PRE>')
            lines.append('</PRE>')
        else:
            # Do fancy formatting here
            if self.SHOWBR:
                lines = map(lambda x: x + "<BR>", lines)
            else:
                for i in range(0, len(lines)):
                    s = lines[i]
                    if s[0:1] in ' \t\n':
                        lines[i] = '<P>' + s
        article.html_body = lines
        return article

    def update_article(self, arcdir, article, prev, next):
        seq = article.sequence
        filename = os.path.join(arcdir, article.filename)
        self.message(_('Updating HTML for article %(seq)s'))
        try:
            f = open(filename)
            article.loadbody_fromHTML(f)
            f.close()
        except IOError as e:
            if e.errno != errno.ENOENT: raise
            self.message(_('article file %(filename)s is missing!'))
        article.prev = prev
        article.next = next
        omask = os.umask(2)
        try:
            f = open(filename, 'w')
        finally:
            os.umask(omask)
        f.write(article.as_html())
        f.close()