# -*- coding: utf-8 -*-
"""
MediaWiki-style markup

Copyright (C) 2008 David Cramer

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import re, random, locale
from base64 import b64encode, b64decode

# states for the tag-walking state machine in findColonNoLinks()
MW_COLON_STATE_TEXT = 0
MW_COLON_STATE_TAG = 1
MW_COLON_STATE_TAGSTART = 2
MW_COLON_STATE_CLOSETAG = 3
MW_COLON_STATE_TAGSLASH = 4
MW_COLON_STATE_COMMENT = 5
MW_COLON_STATE_COMMENTDASH = 6
MW_COLON_STATE_COMMENTDASHDASH = 7

# a few patterns we use later
_attributePat = re.compile(ur'''(?:^|\s)([A-Za-z0-9]+)(?:\s*=\s*(?:"([^<"]*)"|'([^<']*)'|([a-zA-Z0-9!#$%&()*,\-./:;<>?@[\]^_`{|}~]+)|#([0-9a-fA-F]+)))''', re.UNICODE)
_space = re.compile(ur'\s+', re.UNICODE)
_closePrePat = re.compile(u"</pre", re.UNICODE | re.IGNORECASE)
_openPrePat = re.compile(u"<pre", re.UNICODE | re.IGNORECASE)
_openMatchPat = re.compile(u"(<table|<blockquote|<h1|<h2|<h3|<h4|<h5|<h6|<pre|<tr|<p|<ul|<ol|<li|</center|</tr|</td|</th)", re.UNICODE | re.IGNORECASE)
_tagPattern = re.compile(ur'^(/?)(\w+)([^>]*?)(/?>)([^<]*)$', re.UNICODE)
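# Example of how _tagPattern is used (hypothetical values): removeHtmlTags()
# splits its input on '<' and matches each remaining chunk against this
# pattern, yielding (slash, tagname, params, brace, rest), e.g.
#
#   _tagPattern.match(u'/div>tail').groups()
#   # -> (u'/', u'div', u'', u'>', u'tail')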
_htmlpairs = ( # Tags that must be closed
    u'b', u'del', u'i', u'ins', u'u', u'font', u'big', u'small', u'sub',
    u'sup', u'h1', u'h2', u'h3', u'h4', u'h5', u'h6', u'cite', u'code',
    u'em', u's', u'strike', u'strong', u'tt', u'var', u'div', u'center',
    u'blockquote', u'ol', u'ul', u'dl', u'table', u'caption', u'pre',
    u'ruby', u'rt', u'rb', u'rp', u'p', u'span', u'u',
)
_htmlsingle = (
    u'br', u'hr', u'li', u'dt', u'dd', u'img',
)
_htmlsingleonly = ( # Elements that cannot have close tags
    u'br', u'hr', u'img',
)
_htmlnest = ( # Tags that can be nested--??
    u'table', u'tr', u'td', u'th', u'div', u'blockquote', u'ol', u'ul',
    u'dl', u'font', u'big', u'small', u'sub', u'sup', u'span', u'img',
)
_tabletags = ( # Can only appear inside table
    u'td', u'th', u'tr',
)
_htmllist = ( # Tags used by list
    u'ul', u'ol',
)
_listtags = ( # Tags that can appear in a list
    u'li',
)

_htmlsingleallowed = _htmlsingle + _tabletags
_htmlelements = _htmlsingle + _htmlpairs + _htmlnest

_htmlEntities = {
    u'Aacute': 193, u'aacute': 225, u'Acirc': 194, u'acirc': 226,
    u'acute': 180, u'AElig': 198, u'aelig': 230, u'Agrave': 192,
    u'agrave': 224, u'alefsym': 8501, u'Alpha': 913, u'alpha': 945,
    u'amp': 38, u'and': 8743, u'ang': 8736, u'Aring': 197,
    u'aring': 229, u'asymp': 8776, u'Atilde': 195, u'atilde': 227,
    u'Auml': 196, u'auml': 228, u'bdquo': 8222, u'Beta': 914,
    u'beta': 946, u'brvbar': 166, u'bull': 8226, u'cap': 8745,
    u'Ccedil': 199, u'ccedil': 231, u'cedil': 184, u'cent': 162,
    u'Chi': 935, u'chi': 967, u'circ': 710, u'clubs': 9827,
    u'cong': 8773, u'copy': 169, u'crarr': 8629, u'cup': 8746,
    u'curren': 164, u'dagger': 8224, u'Dagger': 8225, u'darr': 8595,
    u'dArr': 8659, u'deg': 176, u'Delta': 916, u'delta': 948,
    u'diams': 9830, u'divide': 247, u'Eacute': 201, u'eacute': 233,
    u'Ecirc': 202, u'ecirc': 234, u'Egrave': 200, u'egrave': 232,
    u'empty': 8709, u'emsp': 8195, u'ensp': 8194, u'Epsilon': 917,
    u'epsilon': 949, u'equiv': 8801, u'Eta': 919, u'eta': 951,
    u'ETH': 208, u'eth': 240, u'Euml': 203, u'euml': 235,
    u'euro': 8364, u'exist': 8707, u'fnof': 402, u'forall': 8704,
    u'frac12': 189, u'frac14': 188, u'frac34': 190, u'frasl': 8260,
    u'Gamma': 915, u'gamma': 947, u'ge': 8805, u'gt': 62,
    u'harr': 8596, u'hArr': 8660, u'hearts': 9829, u'hellip': 8230,
    u'Iacute': 205, u'iacute': 237, u'Icirc': 206, u'icirc': 238,
    u'iexcl': 161, u'Igrave': 204, u'igrave': 236, u'image': 8465,
    u'infin': 8734, u'int': 8747, u'Iota': 921, u'iota': 953,
    u'iquest': 191, u'isin': 8712, u'Iuml': 207, u'iuml': 239,
    u'Kappa': 922, u'kappa': 954, u'Lambda': 923, u'lambda': 955,
    u'lang': 9001, u'laquo': 171, u'larr': 8592, u'lArr': 8656,
    u'lceil': 8968, u'ldquo': 8220, u'le': 8804, u'lfloor': 8970,
    u'lowast': 8727, u'loz': 9674, u'lrm': 8206, u'lsaquo': 8249,
    u'lsquo': 8216, u'lt': 60, u'macr': 175, u'mdash': 8212,
    u'micro': 181, u'middot': 183, u'minus': 8722, u'Mu': 924,
    u'mu': 956, u'nabla': 8711, u'nbsp': 160, u'ndash': 8211,
    u'ne': 8800, u'ni': 8715, u'not': 172, u'notin': 8713,
    u'nsub': 8836, u'Ntilde': 209, u'ntilde': 241, u'Nu': 925,
    u'nu': 957, u'Oacute': 211, u'oacute': 243, u'Ocirc': 212,
    u'ocirc': 244, u'OElig': 338, u'oelig': 339, u'Ograve': 210,
    u'ograve': 242, u'oline': 8254, u'Omega': 937, u'omega': 969,
    u'Omicron': 927, u'omicron': 959, u'oplus': 8853, u'or': 8744,
    u'ordf': 170, u'ordm': 186, u'Oslash': 216, u'oslash': 248,
    u'Otilde': 213, u'otilde': 245, u'otimes': 8855, u'Ouml': 214,
    u'ouml': 246, u'para': 182, u'part': 8706, u'permil': 8240,
    u'perp': 8869, u'Phi': 934, u'phi': 966, u'Pi': 928,
    u'pi': 960, u'piv': 982, u'plusmn': 177, u'pound': 163,
    u'prime': 8242, u'Prime': 8243, u'prod': 8719, u'prop': 8733,
    u'Psi': 936, u'psi': 968, u'quot': 34, u'radic': 8730,
    u'rang': 9002, u'raquo': 187, u'rarr': 8594, u'rArr': 8658,
    u'rceil': 8969, u'rdquo': 8221, u'real': 8476, u'reg': 174,
    u'rfloor': 8971, u'Rho': 929, u'rho': 961, u'rlm': 8207,
    u'rsaquo': 8250, u'rsquo': 8217, u'sbquo': 8218, u'Scaron': 352,
    u'scaron': 353, u'sdot': 8901, u'sect': 167, u'shy': 173,
    u'Sigma': 931, u'sigma': 963, u'sigmaf': 962, u'sim': 8764,
    u'spades': 9824, u'sub': 8834, u'sube': 8838, u'sum': 8721,
    u'sup': 8835, u'sup1': 185, u'sup2': 178, u'sup3': 179,
    u'supe': 8839, u'szlig': 223, u'Tau': 932, u'tau': 964,
    u'there4': 8756, u'Theta': 920, u'theta': 952, u'thetasym': 977,
    u'thinsp': 8201, u'THORN': 222, u'thorn': 254, u'tilde': 732,
    u'times': 215, u'trade': 8482, u'Uacute': 218, u'uacute': 250,
    u'uarr': 8593, u'uArr': 8657, u'Ucirc': 219, u'ucirc': 251,
    u'Ugrave': 217, u'ugrave': 249, u'uml': 168, u'upsih': 978,
    u'Upsilon': 933, u'upsilon': 965, u'Uuml': 220, u'uuml': 252,
    u'weierp': 8472, u'Xi': 926, u'xi': 958, u'Yacute': 221,
    u'yacute': 253, u'yen': 165, u'Yuml': 376, u'yuml': 255,
    u'Zeta': 918, u'zeta': 950, u'zwj': 8205, u'zwnj': 8204,
}

_charRefsPat = re.compile(ur'''(&([A-Za-z0-9]+);|&#([0-9]+);|&#[xX]([0-9A-Za-z]+);|(&))''', re.UNICODE)
_cssCommentPat = re.compile(ur'''\*.*?\*''', re.UNICODE)
_toUTFPat = re.compile(ur'''\\([0-9A-Fa-f]{1,6})[\s]?''', re.UNICODE)
_hackPat = re.compile(ur'''(expression|tps*://|url\s*\().*''', re.UNICODE | re.IGNORECASE)
_hrPat = re.compile(u'''^-----*''', re.UNICODE | re.MULTILINE)
_h1Pat = re.compile(u'^=(.+)=\s*$', re.UNICODE | re.MULTILINE)
_h2Pat = re.compile(u'^==(.+)==\s*$', re.UNICODE | re.MULTILINE)
_h3Pat = re.compile(u'^===(.+)===\s*$', re.UNICODE | re.MULTILINE)
_h4Pat = re.compile(u'^====(.+)====\s*$', re.UNICODE | re.MULTILINE)
_h5Pat = re.compile(u'^=====(.+)=====\s*$', re.UNICODE | re.MULTILINE)
_h6Pat = re.compile(u'^======(.+)======\s*$', re.UNICODE | re.MULTILINE)
_quotePat = re.compile(u"""(''+)""", re.UNICODE)
_removePat = re.compile(ur'\b(' + ur'|'.join((
    u"a", u"an", u"as", u"at", u"before", u"but", u"by", u"for", u"from",
    u"is", u"in", u"into", u"like", u"of", u"off", u"on", u"onto", u"per",
    u"since", u"than", u"the", u"this", u"that", u"to", u"up", u"via",
    u"with")) + ur')\b', re.UNICODE | re.IGNORECASE)
_nonWordSpaceDashPat = re.compile(ur'[^\w\s\-\./]', re.UNICODE)
_multiSpacePat = re.compile(ur'[\s\-_\./]+', re.UNICODE)
_spacePat = re.compile(ur' ', re.UNICODE)
_linkPat = re.compile(ur'^(?:([A-Za-z0-9]+):)?([^\|]+)(?:\|([^\n]+?))?\]\](.*)$', re.UNICODE | re.DOTALL)
_bracketedLinkPat = re.compile(ur'(?:\[((?:mailto:|irc://|https?://|ftp://|/)[^<>\]\[' + u"\x00-\x20\x7f" + ur']*)\s*(.*?)\])', re.UNICODE)
_protocolPat = re.compile(ur'(\b(?:mailto:|irc://|https?://|ftp://))', re.UNICODE)
_specialUrlPat = re.compile(ur'^([^<>\]\[' + u"\x00-\x20\x7f" + ur']+)(.*)$', re.UNICODE)
_protocolsPat = re.compile(ur'^(mailto:|irc://|https?://|ftp://)$', re.UNICODE)
_controlCharsPat = re.compile(ur'[\]\[<>"' + u"\\x00-\\x20\\x7F" + ur']]', re.UNICODE)
_hostnamePat = re.compile(ur'^([^:]+:)(//[^/]+)?(.*)$', re.UNICODE)
_stripPat = re.compile(u'\\s|\u00ad|\u1806|\u200b|\u2060|\ufeff|\u03f4|\u034f|\u180b|\u180c|\u180d|\u200c|\u200d|[\ufe00-\ufe0f]', re.UNICODE)
_zomgPat = re.compile(ur'^(:*)\{\|(.*)$', re.UNICODE)
_headerPat = re.compile(ur"<[Hh]([1-6])(.*?)>(.*?)</[Hh][1-6] *>", re.UNICODE)
_templateSectionPat = re.compile(ur"<!--MWTEMPLATESECTION=([^&]+)&([^_]+)-->", re.UNICODE)
_tagPat = re.compile(ur"<.*?>", re.UNICODE)
_startRegexHash = {}
_endRegexHash = {}
_endCommentPat = re.compile(ur'(-->)', re.UNICODE)
_extractTagsAndParams_n = 1
_guillemetLeftPat = re.compile(ur'(.) (\?|:|;|!|\302\273)', re.UNICODE)
_guillemetRightPat = re.compile(ur'(\302\253) ', re.UNICODE)

def setupAttributeWhitelist():
    common = ( u'id', u'class', u'lang', u'dir', u'title', u'style' )
    block = common + (u'align',)
    tablealign = ( u'align', u'char', u'charoff', u'valign' )
    tablecell = (
        u'abbr', u'axis', u'headers', u'scope', u'rowspan', u'colspan',
        u'nowrap', # deprecated
        u'width',  # deprecated
        u'height', # deprecated
        u'bgcolor' # deprecated
    )
    return {
        u'div': block,
        u'center': common, # deprecated
        u'span': block, # ??
        u'h1': block,
        u'h2': block,
        u'h3': block,
        u'h4': block,
        u'h5': block,
        u'h6': block,
        u'em': common,
        u'strong': common,
        u'cite': common,
        u'code': common,
        u'var': common,
        u'img': common + (u'src', u'alt', u'width', u'height',),
        u'blockquote': common + (u'cite',),
        u'sub': common,
        u'sup': common,
        u'p': block,
        u'br': (u'id', u'class', u'title', u'style', u'clear',),
        u'pre': common + (u'width',),
        u'ins': common + (u'cite', u'datetime'),
        u'del': common + (u'cite', u'datetime'),
        u'ul': common + (u'type',),
        u'ol': common + (u'type', u'start'),
        u'li': common + (u'type', u'value'),
        u'dl': common,
        u'dd': common,
        u'dt': common,
        u'table': common + ( u'summary', u'width', u'border', u'frame',
                             u'rules', u'cellspacing', u'cellpadding',
                             u'align', u'bgcolor', ),
        u'caption': common + (u'align',),
        u'thead': common + tablealign,
        u'tfoot': common + tablealign,
        u'tbody': common + tablealign,
        u'colgroup': common + ( u'span', u'width' ) + tablealign,
        u'col': common + ( u'span', u'width' ) + tablealign,
        u'tr': common + ( u'bgcolor', ) + tablealign,
        u'td': common + tablecell + tablealign,
        u'th': common + tablecell + tablealign,
        u'tt': common,
        u'b': common,
        u'i': common,
        u'big': common,
        u'small': common,
        u'strike': common,
        u's': common,
        u'u': common,
        u'font': common + ( u'size', u'color', u'face' ),
        u'hr': common + ( u'noshade', u'size', u'width' ),
        u'ruby': common,
        u'rb': common,
        u'rt': common, #array_merge( $common, array( 'rbspan' ) ),
        u'rp': common,
    }
_whitelist = setupAttributeWhitelist()

_page_cache = {}
env = {}

def registerTagHook(tag, function):
    mTagHooks[tag] = function

class BaseParser(object):
    def __init__(self):
        self.uniq_prefix = u"\x07UNIQ" + unicode(random.randint(1, 1000000000))
        self.strip_state = {}
        self.arg_stack = []
        self.env = env
        self.keep_env = (env != {})

    def __del__(self):
        if not self.keep_env:
            global env
            env = {}

    '''
    Used to store objects in the environment;
    used to prevent recursive imports
    '''
    def store_object(self, namespace, key, value=True):
        # Store the item so we do not reprocess it
        if namespace not in self.env:
            self.env[namespace] = {}
        self.env[namespace][key] = value

    def has_object(self, namespace, key):
        if namespace not in self.env:
            self.env[namespace] = {}
        if hasattr(self, 'count'):
            data = self.env[namespace]
            test = key in data
            self.count = True
        return key in self.env[namespace]

    def retrieve_object(self, namespace, key, default=None):
        if not self.env.get(namespace):
            self.env[namespace] = {}
        return self.env[namespace].get(key, default)

    def parse(self, text):
        utf8 = isinstance(text, str)
        text = to_unicode(text)
        if text[-1:] != u'\n':
            text = text + u'\n'
            taggedNewline = True
        else:
            taggedNewline = False

        text = self.strip(text)
        text = self.removeHtmlTags(text)
        text = self.parseHorizontalRule(text)
        text = self.parseAllQuotes(text)
        text = self.replaceExternalLinks(text)
        text = self.unstrip(text)
        text = self.fixtags(text)
        text = self.doBlockLevels(text, True)
        text = self.unstripNoWiki(text)
        text = text.split(u'\n')
        text = u'\n'.join(text)
        if taggedNewline and text[-1:] == u'\n':
            text = text[:-1]
        if utf8:
            return text.encode("utf-8")
        return text
    def strip(self, text, stripcomments=False, dontstrip=[]):
        render = True

        commentState = {}

        elements = ['nowiki',] + mTagHooks.keys()
        if True: #wgRawHtml
            elements.append('html')

        # Removing $dontstrip tags from $elements list (currently only 'gallery', fixing bug 2700)
        for k in dontstrip:
            if k in elements:
                elements.remove(k)

        matches = {}
        text = self.extractTagsAndParams(elements, text, matches)

        for marker in matches:
            element, content, params, tag = matches[marker]
            if render:
                tagName = element.lower()
                if tagName == u'!--':
                    # comment
                    output = tag
                    if tag[-3:] != u'-->':
                        output += "-->"
                elif tagName == u'html':
                    output = content
                elif tagName == u'nowiki':
                    output = content.replace(u'&', u'&amp;').replace(u'<', u'&lt;').replace(u'>', u'&gt;')
                else:
                    if tagName in mTagHooks:
                        output = mTagHooks[tagName](self, content, params)
                    else:
                        output = content.replace(u'&', u'&amp;').replace(u'<', u'&lt;').replace(u'>', u'&gt;')
            else:
                # Just stripping tags; keep the source
                output = tag

            # Unstrip the output, because unstrip() is no longer recursive so
            # it won't do it itself
            output = self.unstrip(output)

            if not stripcomments and element == u'!--':
                commentState[marker] = output
            elif element == u'html' or element == u'nowiki':
                if 'nowiki' not in self.strip_state:
                    self.strip_state['nowiki'] = {}
                self.strip_state['nowiki'][marker] = output
            else:
                if 'general' not in self.strip_state:
                    self.strip_state['general'] = {}
                self.strip_state['general'][marker] = output

        # Unstrip comments unless explicitly told otherwise.
        # (The comments are always stripped prior to this point, so as to
        # not invoke any extension tags / parser hooks contained within
        # a comment.)
        if not stripcomments:
            # Put them all back and forget them
            for k in commentState:
                v = commentState[k]
                text = text.replace(k, v)

        return text

    def removeHtmlTags(self, text):
        """convert bad tags into HTML entities"""
        sb = []
        text = self.removeHtmlComments(text)
        bits = text.split(u'<')
        sb.append(bits.pop(0))
        tagstack = []
        tablestack = tagstack
        for x in bits:
            m = _tagPattern.match(x)
            if not m:
                continue
            slash, t, params, brace, rest = m.groups()
            t = t.lower()
            badtag = False
            if t in _htmlelements:
                # Check our stack
                if slash:
                    # Closing a tag...
                    if t in _htmlsingleonly or len(tagstack) == 0:
                        badtag = True
                    else:
                        ot = tagstack.pop()
                        if ot != t:
                            if ot in _htmlsingleallowed:
                                # Pop all elements with an optional close tag
                                # and see if we find a match below them
                                optstack = []
                                optstack.append(ot)
                                while True:
                                    if len(tagstack) == 0:
                                        break
                                    ot = tagstack.pop()
                                    if ot == t or ot not in _htmlsingleallowed:
                                        break
                                    optstack.append(ot)
                                if t != ot:
                                    # No match. Push the optional elements back again
                                    badtag = True
                                    tagstack += reversed(optstack)
                            else:
                                tagstack.append(ot)
                                # <li> can be nested in <ul> or <ol>, skip those cases:
                                if ot not in _htmllist and t in _listtags:
                                    badtag = True
                        elif t == u'table':
                            if len(tablestack) == 0:
                                badtag = True
                            else:
                                tagstack = tablestack.pop()
                    newparams = u''
                else:
                    # Keep track for later
                    if t in _tabletags and u'table' not in tagstack:
                        badtag = True
                    elif t in tagstack and t not in _htmlnest:
                        badtag = True
                    # Is it a self-closed htmlpair? (bug 5487)
                    elif brace == u'/>' and t in _htmlpairs:
                        badtag = True
                    elif t in _htmlsingleonly:
                        # Hack to force empty tag for uncloseable elements
                        brace = u'/>'
                    elif t in _htmlsingle:
                        # Hack to not close $htmlsingle tags
                        brace = None
                    else:
                        if t == u'table':
                            tablestack.append(tagstack)
                            tagstack = []
                        tagstack.append(t)
                    newparams = self.fixTagAttributes(params, t)
                if not badtag:
                    rest = rest.replace(u'>', u'&gt;')
                    if brace == u'/>':
                        close = u' /'
                    else:
                        close = u''
                    sb.append(u'<')
                    sb.append(slash)
                    sb.append(t)
                    sb.append(newparams)
                    sb.append(close)
                    sb.append(u'>')
                    sb.append(rest)
                    continue
            sb.append(u'&lt;')
            sb.append(x.replace(u'>', u'&gt;'))

        # Close off any remaining tags
        while tagstack:
            t = tagstack.pop()
            sb.append(u'</')
            sb.append(t)
            sb.append(u'>\n')
            if t == u'table':
                if not tablestack:
                    break
                tagstack = tablestack.pop()

        return u''.join(sb)

    def removeHtmlComments(self, text):
        """remove <!-- text --> comments from given text"""
        sb = []
        start = text.find(u'<!--')
        last = 0
        while start != -1:
            end = text.find(u'-->', start)
            if end == -1:
                break
            end += 3

            spaceStart = max(0, start-1)
            spaceEnd = end
            while text[spaceStart] == u' ' and spaceStart > 0:
                spaceStart -= 1
            while text[spaceEnd] == u' ':
                spaceEnd += 1

            if text[spaceStart] == u'\n' and text[spaceEnd] == u'\n':
                sb.append(text[last:spaceStart])
                sb.append(u'\n')
                last = spaceEnd+1
            else:
                sb.append(text[last:spaceStart+1])
                last = spaceEnd

            start = text.find(u'<!--', last)
        sb.append(text[last:])
        return u''.join(sb)

    def parseHorizontalRule(self, text):
        return _hrPat.sub(ur'<hr />', text)

    def parseHeaders(self, text):
        text = _h6Pat.sub(ur'<h6>\1</h6>', text)
        text = _h5Pat.sub(ur'<h5>\1</h5>', text)
        text = _h4Pat.sub(ur'<h4>\1</h4>', text)
        text = _h3Pat.sub(ur'<h3>\1</h3>', text)
        text = _h2Pat.sub(ur'<h2>\1</h2>', text)
        text = _h1Pat.sub(ur'<h1>\1</h1>', text)
        return text

    def unstrip(self, text):
        if 'general' not in self.strip_state:
            return text
        for k in self.strip_state['general']:
            text = text.replace(k, self.strip_state['general'][k])
        return text

    def unstripNoWiki(self, text):
        if 'nowiki' not in self.strip_state:
            return text
        for k in self.strip_state['nowiki']:
            text = text.replace(k, self.strip_state['nowiki'][k])
        return text

    def unstripForHTML(self, text):
        text = self.unstrip(text)
        text = self.unstripNoWiki(text)
        return text

    def closeParagraph(self, mLastSection):
        """Used by doBlockLevels()"""
        result = u''
        if mLastSection != u'':
            result = u'</' + mLastSection + u'>\n'
        return result

    def getCommon(self, st1, st2):
        """Returns the length of the longest common prefix of both arguments."""
        shorter = min(len(st1), len(st2))
        i = 0
        while i < shorter:
            if st1[i] != st2[i]:
                break
            i += 1
        return i

    def openList(self, char, mLastSection):
        result = self.closeParagraph(mLastSection)
        mDTopen = False
        if char == u'*':
            result += u'<ul><li>'
        elif char == u'#':
            result += u'<ol><li>'
        elif char == u':':
            result += u'<dl><dd>'
        elif char == u';':
            result += u'<dl><dt>'
            mDTopen = True
        else:
            result += u'<!-- ERR 1 -->'
        return result, mDTopen

    def nextItem(self, char, mDTopen):
        if char == u'*' or char == u'#':
            return u'</li><li>', None
        elif char == u':' or char == u';':
            close = u'</dd>'
            if mDTopen:
                close = u'</dt>'
            if char == u';':
                return close + u'<dt>', True
            else:
                return close + u'<dd>', False
        return u'<!-- ERR 2 -->'

    def closeList(self, char, mDTopen):
        if char == u'*':
            return u'</li></ul>\n'
        elif char == u'#':
            return u'</li></ol>\n'
        elif char == u':':
            if mDTopen:
                return u'</dt></dl>\n'
            else:
                return u'</dd></dl>\n'
        else:
            return u'<!-- ERR 3 -->'

    def findColonNoLinks(self, text, before, after):
        try:
            pos = text.index(':')
        except ValueError:
            return False

        lt = text.find('<')
        if lt == -1 or lt > pos:
            # Easy; no tag nesting to worry about
            before = text[0:pos]
            after = text[pos+1:]
            return before, after, pos

        # Ugly state machine to walk through avoiding tags.
        state = MW_COLON_STATE_TEXT
        stack = 0
        i = 0
        while i < len(text):
            c = text[i]
            if state == MW_COLON_STATE_TEXT:
                if c == '<':
                    # Could be either a <start> tag or an </end> tag
                    state = MW_COLON_STATE_TAGSTART
                elif c == ':':
                    if stack == 0:
                        # we found it
                        return text[0:i], text[i+1:], i
                else:
                    # Skip ahead looking for something interesting
                    try:
                        colon = text.index(':', i)
                    except ValueError:
                        return False
                    lt = text.find('<', i)
                    if stack == 0:
                        if lt == -1 or colon < lt:
                            # we found it
                            return text[0:colon], text[colon+1:], colon
                    if lt == -1:
                        break
                    # Skip ahead to next tag start
                    i = lt
                    state = MW_COLON_STATE_TAGSTART
            elif state == MW_COLON_STATE_TAG:
                # In a <tag>
                if c == '>':
                    stack += 1
                    state = MW_COLON_STATE_TEXT
                elif c == '/':
                    state = MW_COLON_STATE_TAGSLASH
            elif state == MW_COLON_STATE_TAGSTART:
                if c == '/':
                    state = MW_COLON_STATE_CLOSETAG
                elif c == '!':
                    state = MW_COLON_STATE_COMMENT
                elif c == '>':
                    # Illegal early close? This shouldn't happen D:
                    state = MW_COLON_STATE_TEXT
                else:
                    state = MW_COLON_STATE_TAG
            elif state == MW_COLON_STATE_CLOSETAG:
                # In a </tag>
                if c == '>':
                    stack -= 1
                    if stack < 0:
                        return False
                    state = MW_COLON_STATE_TEXT
            elif state == MW_COLON_STATE_TAGSLASH:
                if c == '>':
                    # Yes, a self-closed tag <blah/>
                    state = MW_COLON_STATE_TEXT
                else:
                    # Probably we're jumping the gun, and this is an attribute
                    state = MW_COLON_STATE_TAG
            elif state == MW_COLON_STATE_COMMENT:
                if c == '-':
                    state = MW_COLON_STATE_COMMENTDASH
            elif state == MW_COLON_STATE_COMMENTDASH:
                if c == '-':
                    state = MW_COLON_STATE_COMMENTDASHDASH
                else:
                    state = MW_COLON_STATE_COMMENT
            elif state == MW_COLON_STATE_COMMENTDASHDASH:
                if c == '>':
                    state = MW_COLON_STATE_TEXT
                else:
                    state = MW_COLON_STATE_COMMENT
            else:
                raise ValueError('findColonNoLinks: invalid state %r' % state)
            i += 1
        if stack > 0:
            return False
        return False
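    # Sketch of findColonNoLinks() behaviour (hypothetical values): it locates
    # the first ':' outside of any HTML tag and returns (before, after, pos),
    # e.g. roughly:
    #
    #   self.findColonNoLinks(u'term <a href="x:y">z</a>: def', u'', u'')
    #   # -> (u'term <a href="x:y">z</a>', u' def', 24)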
    def doBlockLevels(self, text, linestart):
        # Parsing through the text line by line. The main thing
        # happening here is handling of block-level elements p, pre,
        # and making lists from lines starting with * # : etc.
        lastPrefix = u''
        mDTopen = inBlockElem = False
        prefixLength = 0
        paragraphStack = False
        _closeMatchPat = re.compile(ur"(</table|</blockquote|</h1|</h2|</h3|</h4|</h5|</h6|<td|<th|<div|</div|<hr|</pre|</p|" + self.uniq_prefix + ur"-pre|</li|</ul|</ol|</center)", re.UNICODE | re.IGNORECASE)
        mInPre = False
        mLastSection = u''
        output = []

        for oLine in text.split(u'\n'):
            # Fix up linestart
            if not linestart:
                output.append(oLine)
                linestart = True
                continue

            lastPrefixLength = len(lastPrefix)
            preCloseMatch = _closePrePat.search(oLine)
            preOpenMatch = _openPrePat.search(oLine)
            if not mInPre:
                # Multiple prefixes may abut each other for nested lists.
                prefixLength = len(oLine) - len(oLine.lstrip(u'*#:;'))
                pref = oLine[:prefixLength]

                # eh?
                pref2 = pref.replace(u';', u':')
                t = oLine[prefixLength:]
                mInPre = bool(preOpenMatch)
            else:
                # Don't interpret any other prefixes in preformatted text
                prefixLength = 0
                pref = pref2 = u''
                t = oLine

            # List generation
            if prefixLength and lastPrefix == pref2:
                # Same as the last item, so no need to deal with nesting or opening stuff
                tmpOutput, tmpMDTopen = self.nextItem(pref[-1:], mDTopen)
                output.append(tmpOutput)
                if tmpMDTopen is not None:
                    mDTopen = tmpMDTopen
                paragraphStack = False

                if pref[-1:] == u';':
                    # The one nasty exception: definition lists work like this:
                    # ; title : definition text
                    # So we check for : in the remainder text to split up the
                    # title and definition, without b0rking links.
                    term = t2 = u''
                    z = self.findColonNoLinks(t, term, t2)
                    if z != False:
                        term, t2 = z[0:2]
                        t = t2
                        output.append(term)
                        tmpOutput, tmpMDTopen = self.nextItem(u':', mDTopen)
                        output.append(tmpOutput)
                        if tmpMDTopen is not None:
                            mDTopen = tmpMDTopen

            elif prefixLength or lastPrefixLength:
                # Either open or close a level...
                commonPrefixLength = self.getCommon(pref, lastPrefix)
                paragraphStack = False

                # Close all the prefixes which aren't shared.
                while commonPrefixLength < lastPrefixLength:
                    output.append(self.closeList(lastPrefix[lastPrefixLength-1], mDTopen))
                    mDTopen = False
                    lastPrefixLength -= 1

                # Continue the current prefix if appropriate.
                if prefixLength <= commonPrefixLength and commonPrefixLength > 0:
                    tmpOutput, tmpMDTopen = self.nextItem(pref[commonPrefixLength-1], mDTopen)
                    output.append(tmpOutput)
                    if tmpMDTopen is not None:
                        mDTopen = tmpMDTopen

                # Open prefixes where appropriate.
                while prefixLength > commonPrefixLength:
                    char = pref[commonPrefixLength:commonPrefixLength+1]
                    tmpOutput, tmpMDTopen = self.openList(char, mLastSection)
                    if tmpMDTopen:
                        mDTopen = True
                    output.append(tmpOutput)
                    mLastSection = u''
                    mInPre = False

                    if char == u';':
                        # FIXME: This is dupe of code above
                        term = t2 = u''
                        z = self.findColonNoLinks(t, term, t2)
                        if z != False:
                            term, t2 = z[0:2]
                            t = t2
                            output.append(term)
                            tmpOutput, tmpMDTopen = self.nextItem(u':', mDTopen)
                            output.append(tmpOutput)
                            if tmpMDTopen is not None:
                                mDTopen = tmpMDTopen

                    commonPrefixLength += 1

                lastPrefix = pref2

            if prefixLength == 0:
                # No prefix (not in list)--go to paragraph mode
                # XXX: use a stack for nestable elements like span, table and div
                openmatch = _openMatchPat.search(t)
                closematch = _closeMatchPat.search(t)
                if openmatch or closematch:
                    paragraphStack = False
                    output.append(self.closeParagraph(mLastSection))
                    mLastSection = u''
                    if preCloseMatch:
                        mInPre = False
                    if preOpenMatch:
                        mInPre = True
                    inBlockElem = bool(not closematch)
                elif not inBlockElem and not mInPre:
                    if t[0:1] == u' ' and (mLastSection == u'pre' or t.strip() != u''):
                        # pre
                        if mLastSection != u'pre':
                            paragraphStack = False
                            output.append(self.closeParagraph(u'') + u'<pre>')
                            mInPre = False
                            mLastSection = u'pre'
                        t = t[1:]
                    else:
                        # paragraph
                        if t.strip() == u'':
                            if paragraphStack:
                                output.append(paragraphStack + u'<br />')
                                paragraphStack = False
                                mLastSection = u'p'
                            else:
                                if mLastSection != u'p':
                                    output.append(self.closeParagraph(mLastSection))
                                    mLastSection = u''
                                    mInPre = False
                                    paragraphStack = u'<p>'
                                else:
                                    paragraphStack = u'</p><p>'
                        else:
                            if paragraphStack:
                                output.append(paragraphStack)
                                paragraphStack = False
                                mLastSection = u'p'
                            elif mLastSection != u'p':
                                output.append(self.closeParagraph(mLastSection) + u'<p>')
                                mLastSection = u'p'
                                mInPre = False

            # somewhere above we forget to get out of pre block (bug 785)
            if preCloseMatch and mInPre:
                mInPre = False

            if paragraphStack == False:
                output.append(t + u"\n")

        while prefixLength:
            output.append(self.closeList(pref2[prefixLength-1], mDTopen))
            mDTopen = False
            prefixLength -= 1

        if mLastSection != u'':
            output.append(u'</' + mLastSection + u'>')
            mLastSection = u''

        return ''.join(output)
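# Rough sketch of what doBlockLevels() produces (hypothetical output, modulo
# whitespace): '*' prefixes become <ul><li>, '#' becomes <ol><li>, ';' and ':'
# build definition lists, a leading space opens a <pre> block, and bare runs
# of text get wrapped in <p>...</p>. For example:
#
#   BaseParser().doBlockLevels(u'* one\n* two\n', True)
#   # -> roughly u'<ul><li> one\n</li><li> two\n</li></ul>\n'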
class Parser(BaseParser):
    def __init__(self, show_toc=True):
        super(Parser, self).__init__()
        self.show_toc = show_toc

    def parse(self, text):
        utf8 = isinstance(text, str)
        text = to_unicode(text)
        if text[-1:] != u'\n':
            text = text + u'\n'
            taggedNewline = True
        else:
            taggedNewline = False

        text = self.strip(text)
        text = self.removeHtmlTags(text)
        text = self.doTableStuff(text)
        text = self.parseHorizontalRule(text)
        text = self.checkTOC(text)
        text = self.parseHeaders(text)
        text = self.parseAllQuotes(text)
        text = self.replaceExternalLinks(text)
        if not self.show_toc and text.find(u"<!--MWTOC-->") == -1:
            self.show_toc = False
        text = self.formatHeadings(text, True)
        text = self.unstrip(text)
        text = self.fixtags(text)
        text = self.doBlockLevels(text, True)
        text = self.unstripNoWiki(text)
        text = text.split(u'\n')
        text = u'\n'.join(text)
        if taggedNewline and text[-1:] == u'\n':
            text = text[:-1]
        if utf8:
            return text.encode("utf-8")
        return text

    def checkTOC(self, text):
        if text.find(u"__NOTOC__") != -1:
            text = text.replace(u"__NOTOC__", u"")
            self.show_toc = False
        if text.find(u"__TOC__") != -1:
            text = text.replace(u"__TOC__", u"<!--MWTOC-->")
            self.show_toc = True
        return text

    def doTableStuff(self, text):
        t = text.split(u"\n")
        td = []  # Is currently a td tag open?
        ltd = [] # Was it TD or TH?
        tr = []  # Is currently a tr tag open?
        ltr = [] # tr attributes
        has_opened_tr = [] # Did this table open a <tr> element?
        indent_level = 0 # indent level of the table

        for k, x in zip(range(len(t)), t):
            x = x.strip()
            fc = x[0:1]
            matches = _zomgPat.match(x)
            if matches:
                indent_level = len(matches.group(1))

                attributes = self.unstripForHTML(matches.group(2))

                t[k] = u'<dl><dd>'*indent_level + u'<table' + self.fixTagAttributes(attributes, u'table') + u'>'
                td.append(False)
                ltd.append(u'')
                tr.append(False)
                ltr.append(u'')
                has_opened_tr.append(False)
            elif len(td) == 0:
                pass
            elif u'|}' == x[0:2]:
                z = u"</table>" + x[2:]
                l = ltd.pop()
                if not has_opened_tr.pop():
                    z = u"<tr><td></td></tr>" + z
                if tr.pop():
                    z = u"</tr>" + z
                if td.pop():
                    z = u'</' + l + u'>' + z
                ltr.pop()
                t[k] = z + u'</dd></dl>'*indent_level
            elif u'|-' == x[0:2]: # Allows for |-------------
                x = x[1:]
                while x != u'' and x[0:1] == '-':
                    x = x[1:]
                z = ''
                l = ltd.pop()
                has_opened_tr.pop()
                has_opened_tr.append(True)
                if tr.pop():
                    z = u'</tr>' + z
                if td.pop():
                    z = u'</' + l + u'>' + z
                ltr.pop()
                t[k] = z
                tr.append(False)
                td.append(False)
                ltd.append(u'')
                attributes = self.unstripForHTML(x)
                ltr.append(self.fixTagAttributes(attributes, u'tr'))
            elif u'|' == fc or u'!' == fc or u'|+' == x[0:2]: # Caption
                # x is a table row
                if u'|+' == x[0:2]:
                    fc = u'+'
                    x = x[1:]
                x = x[1:]
                if fc == u'!':
                    x = x.replace(u'!!', u'||')
                # Split up multiple cells on the same line.
                # FIXME: This can result in improper nesting of tags processed
                # by earlier parser steps, but should avoid splitting up eg
                # attribute values containing literal "||".
                x = x.split(u'||')

                t[k] = u''

                # Loop through each table cell
                for theline in x:
                    z = ''
                    if fc != u'+':
                        tra = ltr.pop()
                        if not tr.pop():
                            z = u'<tr' + tra + u'>\n'
                        tr.append(True)
                        ltr.append(u'')
                        has_opened_tr.pop()
                        has_opened_tr.append(True)
                    l = ltd.pop()
                    if td.pop():
                        z = u'</' + l + u'>' + z
                    if fc == u'|':
                        l = u'td'
                    elif fc == u'!':
                        l = u'th'
                    elif fc == u'+':
                        l = u'caption'
                    else:
                        l = u''
                    ltd.append(l)

                    # Cell parameters
                    y = theline.split(u'|', 1)
                    # Note that a '|' inside an invalid link should not
                    # be mistaken as delimiting cell parameters
                    if y[0].find(u'[[') != -1:
                        y = [theline]

                    if len(y) == 1:
                        y = z + u"<" + l + u">" + y[0]
                    else:
                        attributes = self.unstripForHTML(y[0])
                        y = z + u"<" + l + self.fixTagAttributes(attributes, l) + u">" + y[1]

                    t[k] += y
                    td.append(True)

        while len(td) > 0:
            l = ltd.pop()
            if td.pop():
                t.append(u'</td>')
            if tr.pop():
                t.append(u'</tr>')
            if not has_opened_tr.pop():
                t.append(u'<tr><td></td></tr>')
            t.append(u'</table>')

        text = u'\n'.join(t)
        # special case: don't return empty table
        if text == u"<table>\n<tr><td></td></tr>\n</table>":
            text = u''

        return text
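    # Sketch of the wiki-table syntax handled by doTableStuff() (hypothetical
    # rendering, modulo attribute handling and whitespace):
    #
    #   {| border="1"
    #   |-
    #   | a || b
    #   |}
    #
    # becomes roughly:
    #
    #   <table border="1"><tr><td> a </td><td> b</td></tr></table>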
    ": text = u'' return text def formatHeadings(self, text, isMain): """ This function accomplishes several tasks: 1) Auto-number headings if that option is enabled 2) Add an [edit] link to sections for logged in users who have enabled the option 3) Add a Table of contents on the top for users who have enabled the option 4) Auto-anchor headings It loops through all headlines, collects the necessary data, then splits up the string and re-inserts the newly formatted headlines. """ doNumberHeadings = False showEditLink = True # Can User Edit if text.find(u"__NOEDITSECTION__") != -1: showEditLink = False text = text.replace(u"__NOEDITSECTION__", u"") # Get all headlines for numbering them and adding funky stuff like [edit] # links - this is for later, but we need the number of headlines right now matches = _headerPat.findall(text) numMatches = len(matches) # if there are fewer than 4 headlines in the article, do not show TOC # unless it's been explicitly enabled. enoughToc = self.show_toc and (numMatches >= 4 or text.find(u"") != -1) # Allow user to stipulate that a page should have a "new section" # link added via __NEWSECTIONLINK__ showNewSection = False if text.find(u"__NEWSECTIONLINK__") != -1: showNewSection = True text = text.replace(u"__NEWSECTIONLINK__", u"") # if the string __FORCETOC__ (not case-sensitive) occurs in the HTML, # override above conditions and always show TOC above first header if text.find(u"__FORCETOC__") != -1: self.show_toc = True enoughToc = True text = text.replace(u"__FORCETOC__", u"") # Never ever show TOC if no headers if numMatches < 1: enoughToc = False # headline counter headlineCount = 0 sectionCount = 0 # headlineCount excluding template sections # Ugh .. the TOC should have neat indentation levels which can be # passed to the skin functions. These are determined here toc = [] head = {} sublevelCount = {} levelCount = {} toclevel = 0 level = 0 prevlevel = 0 toclevel = 0 prevtoclevel = 0 refers = {} refcount = {} wgMaxTocLevel = 5 for match in matches: headline = match[2] istemplate = False templatetitle = u'' templatesection = 0 numbering = [] m = _templateSectionPat.search(headline) if m: istemplate = True templatetitle = b64decode(m[0]) templatesection = 1 + int(b64decode(m[1])) headline = _templateSectionPat.sub(u'', headline) if toclevel: prevlevel = level prevtoclevel = toclevel level = matches[headlineCount][0] if doNumberHeadings or enoughToc: if level > prevlevel: toclevel += 1 sublevelCount[toclevel] = 0 if toclevel < wgMaxTocLevel: toc.append(u'\n
      ') elif level < prevlevel and toclevel > 1: # Decrease TOC level, find level to jump to if toclevel == 2 and level < levelCount[1]: toclevel = 1 else: for i in range(toclevel, 0, -1): if levelCount[i] == level: # Found last matching level toclevel = i break elif levelCount[i] < level: toclevel = i + 1 break if toclevel < wgMaxTocLevel: toc.append(u"\n") toc.append(u"
    \n\n" * max(prevtoclevel - toclevel, 0)) else: if toclevel < wgMaxTocLevel: toc.append(u"\n") levelCount[toclevel] = level # count number of headlines for each level sublevelCount[toclevel] += 1 for i in range(1, toclevel+1): if sublevelCount[i]: numbering.append(to_unicode(sublevelCount[i])) # The canonized header is a version of the header text safe to use for links # Avoid insertion of weird stuff like by expanding the relevant sections canonized_headline = self.unstrip(headline) canonized_headline = self.unstripNoWiki(canonized_headline) # -- don't know what to do with this yet. # Remove link placeholders by the link text. # # turns into # link text with suffix # $canonized_headline = preg_replace( '//e', # "\$this->mLinkHolders['texts'][\$1]", # $canonized_headline ); # $canonized_headline = preg_replace( '//e', # "\$this->mInterwikiLinkHolders['texts'][\$1]", # $canonized_headline ); # strip out HTML canonized_headline = _tagPat.sub(u'', canonized_headline) tocline = canonized_headline.strip() # Save headline for section edit hint before it's escaped headline_hint = tocline canonized_headline = self.escapeId(tocline) refers[headlineCount] = canonized_headline # count how many in assoc. array so we can track dupes in anchors if canonized_headline not in refers: refers[canonized_headline] = 1 else: refers[canonized_headline] += 1 refcount[headlineCount] = refers[canonized_headline] numbering = '.'.join(numbering) # Don't number the heading if it is the only one (looks silly) if doNumberHeadings and numMatches > 1: # the two are different if the line contains a link headline = numbering + u' ' + headline # Create the anchor for linking from the TOC to the section anchor = canonized_headline; if refcount[headlineCount] > 1: anchor += u'_' + unicode(refcount[headlineCount]) if enoughToc: toc.append(u'\n
  • ') toc.append(numbering) toc.append(u' ') toc.append(tocline) toc.append(u'') # if showEditLink and (not istemplate or templatetitle != u""): # if not head[headlineCount]: # head[headlineCount] = u'' # # if istemplate: # head[headlineCount] += sk.editSectionLinkForOther(templatetile, templatesection) # else: # head[headlineCount] += sk.editSectionLink(mTitle, sectionCount+1, headline_hint) # give headline the correct tag if headlineCount not in head: head[headlineCount] = [] h = head[headlineCount] h.append(u'') h.append(matches[headlineCount][1].strip()) h.append(headline.strip()) h.append(u'') headlineCount += 1 if not istemplate: sectionCount += 1 if enoughToc: if toclevel < wgMaxTocLevel: toc.append(u"
  • \n") toc.append(u"\n\n" * max(0, toclevel - 1)) #TODO: use gettext #toc.insert(0, u'

    ' + _('Table of Contents') + '

    ') toc.insert(0, u'

    Table of Contents

    ') toc.append(u'\n
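# Rough sketch of the heading handling above (hypothetical output, modulo the
# exact anchor produced by escapeId() and surrounding whitespace):
#
#   parse(u'== Section ==\ntext\n')
#   # -> roughly u'<h2 id="Section">Section</h2>\n<p>text\n</p>'
#
# With four or more headings, or an explicit __TOC__/__FORCETOC__, a
# '<div id="toc"><h2>Table of Contents</h2>...' block is inserted before
# the first heading.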
def parse(text, showToc=True):
    """Returns HTML from MediaWiki markup"""
    p = Parser(show_toc=showToc)
    return p.parse(text)

def parselite(text):
    """Returns HTML from MediaWiki markup, ignoring headings"""
    p = BaseParser()
    return p.parse(text)

def truncate_url(url, length=40):
    if len(url) <= length:
        return url
    pattern = r'(/[^/]+/?)$'
    match = re.search(pattern, url)
    if not match:
        return url
    l = len(match.group(1))
    domain = url.replace(match.group(1), '')
    firstpart = url[0:len(url)-l]
    secondpart = match.group(1)
    if firstpart == firstpart[0:length-3]:
        secondpart = secondpart[0:length-3] + '...'
    else:
        firstpart = firstpart[0:length-3]
        secondpart = '...' + secondpart
    t_url = firstpart+secondpart
    return t_url

def to_unicode(text, charset=None):
    """Convert a `str` object to an `unicode` object.

    If `charset` is given, we simply assume that encoding for the text,
    but we'll use the "replace" mode so that the decoding will always
    succeed.
    If `charset` is ''not'' specified, we'll make some guesses, first
    trying the UTF-8 encoding, then trying the locale preferred encoding,
    in "replace" mode. This differs from the `unicode` builtin, which
    by default uses the locale preferred encoding, in 'strict' mode,
    and is therefore prone to raise `UnicodeDecodeError`s.

    Because of the "replace" mode, the original content might be altered.
    If this is not what is wanted, one could map the original byte
    content by using an encoding which maps each byte of the input to an
    unicode character, e.g. by doing `unicode(text, 'iso-8859-1')`.
    """
    if not isinstance(text, str):
        if isinstance(text, Exception):
            # two possibilities for storing unicode strings in exception data:
            try:
                # custom __str__ method on the exception (e.g. PermissionError)
                return unicode(text)
            except UnicodeError:
                # unicode arguments given to the exception (e.g. parse_date)
                return ' '.join([to_unicode(arg) for arg in text.args])
        return unicode(text)
    if charset:
        return unicode(text, charset, 'replace')
    else:
        try:
            return unicode(text, 'utf-8')
        except UnicodeError:
            return unicode(text, locale.getpreferredencoding(), 'replace')
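# Hypothetical usage of the helpers above (outputs shown are approximate):
#
#   parse(u"'''bold''' and ''italic''")
#   # -> roughly u'<p><b>bold</b> and <i>italic</i>\n</p>'
#
#   truncate_url('http://example.com/some/very/long/path/here', 20)
#   # -> 'http://example.co.../here'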
# tag hooks
mTagHooks = {}

## IMPORTANT
## Make sure all hooks output CLEAN html. Escape any user input BEFORE it's returned

# Arguments passed:
# - wiki environment instance
# - tag content
# - dictionary of attributes

# quote example:

from cgi import escape

def hook_quote(env, body, attributes={}):
    text = [u'<blockquote>']
    if 'cite' in attributes:
        text.append(u"%s wrote:\n" % escape(attributes['cite']))
    text.append(body.strip())
    text.append(u'</blockquote>')
    return u'\n'.join(text)
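# Hypothetical rendering of the hook above: for source text containing
#
#   <quote cite="Alice">Hi there</quote>
#
# the tag body is replaced by
#
#   <blockquote>\nAlice wrote:\n\nHi there\n</blockquote>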
registerTagHook('quote', hook_quote)

def safe_name(name=None, remove_slashes=True):
    if name is None:
        return None
    name = str2url(name)
    if remove_slashes:
        name = re.sub(r"[^a-zA-Z0-9\-_\s\.]", "", name)
    else:
        name = re.sub(r"[^a-zA-Z0-9\-_\s\.\/]", "", name)
    name = re.sub(r"[\s\._]", "-", name)
    name = re.sub(r"[-]+", "-", name)
    return name.strip("-").lower()

def str2url(str):
    """
    Takes a UTF-8 string and replaces all characters with the equivalent in 7-bit
    ASCII. It returns a plain ASCII string usable in URLs.
    """
    try:
        str = str.encode('utf-8')
    except:
        pass
    mfrom  = "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîï"
    to     = "AAAAAAECEEEEIIIIDNOOOOOOUUUUYSaaaaaaaceeeeiiii"
    mfrom += "ñòóôõöøùúûüýÿĀāĂ㥹ĆćĈĉĊċČčĎďĐđĒēĔĕĖėĘęĚěĜĝĞğĠġĢģ"
    to    += "noooooouuuuyyaaaaaaccccccccddddeeeeeeeeeegggggggg"
    mfrom += "ĤĥĦħĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłŃńŅņŇňʼnŊŋŌōŎŏŐőŒœŔŕŖŗŘř"
    to    += "hhhhiiiiiiiiiijjkkkllllllllllnnnnnnnnnoooooooorrrrrr"
    mfrom += "ŚśŜŝŞşŠšŢţŤťŦŧŨũŪūŬŭŮůŰűŲųŴŵŶŷŸŹźŻżŽžſƀƂƃƄƅƇƈƉƊƐƑƒƓƔ"
    to    += "ssssssssttttttuuuuuuuuuuuuwwyyyzzzzzzfbbbbbccddeffgv"
    mfrom += "ƖƗƘƙƚƝƞƟƠƤƦƫƬƭƮƯưƱƲƳƴƵƶǍǎǏǐǑǒǓǔǕǖǗǘǙǚǛǜǝǞǟǠǡǢǣǤǥǦǧǨǩ"
    to    += "likklnnoopettttuuuuyyzzaaiioouuuuuuuuuueaaaaeeggggkk"
    mfrom += "ǪǫǬǭǰǴǵǷǸǹǺǻǼǽǾǿȀȁȂȃȄȅȆȇȈȉȊȋȌȍȎȏȐȑȒȓȔȕȖȗȘșȚțȞȟȤȥȦȧȨȩ"
    to    += "oooojggpnnaaeeooaaaaeeeeiiiioooorrrruuuusstthhzzaaee"
    mfrom += "ȪȫȬȭȮȯȰȱȲȳḀḁḂḃḄḅḆḇḈḉḊḋḌḍḎḏḐḑḒḓḔḕḖḗḘḙḚḛḜḝḞḟḠḡḢḣḤḥḦḧḨḩḪḫ"
    to    += "ooooooooyyaabbbbbbccddddddddddeeeeeeeeeeffgghhhhhhhhhh"
    mfrom += "ḬḭḮḯḰḱḲḳḴḵḶḷḸḹḺḻḼḽḾḿṀṁṂṃṄṅṆṇṈṉṊṋṌṍṎṏṐṑṒṓṔṕṖṗṘṙṚṛṜṝṞṟ"
    to    += "iiiikkkkkkllllllllmmmmmmnnnnnnnnoooooooopppprrrrrrrr"
    mfrom += "ṠṡṢṣṤṥṦṧṨṩṪṫṬṭṮṯṰṱṲṳṴṵṶṷṸṹṺṻṼṽṾṿẀẁẂẃẄẅẆẇẈẉẊẋẌẍẎẏẐẑẒẓẔẕ"
    to    += "ssssssssssttttttttuuuuuuuuuuvvvvwwwwwwwwwwxxxxxyzzzzzz"
    mfrom += "ẖẗẘẙẚẛẠạẢảẤấẦầẨẩẪẫẬậẮắẰằẲẳẴẵẶặẸẹẺẻẼẽẾếỀềỂểỄễỆệỈỉỊị"
    to    += "htwyafaaaaaaaaaaaaaaaaaaaaaaaaeeeeeeeeeeeeeeeeiiii"
    mfrom += "ỌọỎỏỐốỒồỔổỖỗỘộỚớỜờỞởỠỡỢợỤụỦủỨứỪừỬửỮữỰựỲỳỴỵỶỷỸỹ"
    to    += "oooooooooooooooooooooooouuuuuuuuuuuuuuyyyyyyyy"
    for i in zip(mfrom, to):
        str = str.replace(*i)
    return str

# vim:expandtab:smartindent:tabstop=4:softtabstop=4:shiftwidth=4:
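# A minimal smoke test of the self-contained helpers (hypothetical; run the
# module directly to experiment):
if __name__ == '__main__':
    print safe_name(u"Ångström & Co.")  # -> 'angstrom-co'
    print truncate_url('http://example.com/some/very/long/path/here', 20)
    # Full wiki parsing (requires the complete parser pipeline):
    # print parse(u"== Heading ==\n'''bold''' text\n")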