# coding: latin1
"""
MediaWiki-style markup

Copyright (C) 2008 David Cramer <dcramer@gmail.com>

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
|
|
|
|
import re, random, locale
|
|
from base64 import b64encode, b64decode
|
|
|
|
# a few patterns we use later
|
|
|
|
# Integer state codes named MW_COLON_STATE_*.  Nothing in this part of the
# file reads them; presumably they drive a character-level state machine
# further down (TODO confirm against the rest of the module).
MW_COLON_STATE_TEXT = 0
MW_COLON_STATE_TAG = 1
MW_COLON_STATE_TAGSTART = 2
MW_COLON_STATE_CLOSETAG = 3
MW_COLON_STATE_TAGSLASH = 4
MW_COLON_STATE_COMMENT = 5
MW_COLON_STATE_COMMENTDASH = 6
MW_COLON_STATE_COMMENTDASHDASH = 7
|
|
|
|
# One HTML attribute: group 1 is the name; groups 2-5 are the value when it is
# double-quoted, single-quoted, bare, or written as '#' followed by hex digits.
_attributePat = re.compile(ur'''(?:^|\s)([A-Za-z0-9]+)(?:\s*=\s*(?:"([^<"]*)"|'([^<']*)'|([a-zA-Z0-9!#$%&()*,\-./:;<>?@[\]^_`{|}~]+)|#([0-9a-fA-F]+)))''', re.UNICODE)
# Any run of whitespace (used to collapse attribute values to single spaces).
_space = re.compile(ur'\s+', re.UNICODE)
# Case-insensitive start of a closing / opening <pre> tag.
_closePrePat = re.compile(u"</pre", re.UNICODE | re.IGNORECASE)
_openPrePat = re.compile(u"<pre", re.UNICODE | re.IGNORECASE)
# Case-insensitive alternation of tag starts; not used in this part of the file.
_openMatchPat = re.compile(u"(<table|<blockquote|<h1|<h2|<h3|<h4|<h5|<h6|<pre|<tr|<p|<ul|<ol|<li|</center|</tr|</td|</th)", re.UNICODE | re.IGNORECASE)
# Applied to the text following a '<': optional slash, tag name, attribute
# text, closing brace ('>' or '/>'), then everything up to the next '<'.
_tagPattern = re.compile(ur'^(/?)(\w+)([^>]*?)(/?>)([^<]*)$', re.UNICODE)
|
|
|
|
# Tag-name tables consulted by BaseParser.removeHtmlTags.

_htmlpairs = ( # Tags that must be closed
    u'b', u'del', u'i', u'ins', u'u', u'font', u'big', u'small', u'sub', u'sup', u'h1',
    u'h2', u'h3', u'h4', u'h5', u'h6', u'cite', u'code', u'em', u's',
    u'strike', u'strong', u'tt', u'var', u'div', u'center',
    u'blockquote', u'ol', u'ul', u'dl', u'table', u'caption', u'pre',
    u'ruby', u'rt' , u'rb' , u'rp', u'p', u'span', u'u',
)
_htmlsingle = ( # Tags that are emitted without being pushed on the open-tag stack
    u'br', u'hr', u'li', u'dt', u'dd', u'img',
)
_htmlsingleonly = ( # Elements that cannot have close tags
    u'br', u'hr', u'img',
)
_htmlnest = ( # Tags that can be nested--??
    u'table', u'tr', u'td', u'th', u'div', u'blockquote', u'ol', u'ul',
    u'dl', u'font', u'big', u'small', u'sub', u'sup', u'span', u'img',
)
_tabletags = ( # Can only appear inside table
    u'td', u'th', u'tr',
)
_htmllist = ( # Tags used by list
    u'ul', u'ol',
)
_listtags = ( # Tags that can appear in a list
    u'li',
)
# Tags whose close tag is optional when unwinding the open-tag stack.
_htmlsingleallowed = _htmlsingle + _tabletags
# Every tag name removeHtmlTags lets through (duplicates are harmless here).
_htmlelements = _htmlsingle + _htmlpairs + _htmlnest
|
|
|
|
# Named HTML character entities mapped to their Unicode code points.
# Read by BaseParser._normalizeCallback and BaseParser._decodeCallback.
_htmlEntities = {
    u'Aacute': 193, u'aacute': 225, u'Acirc': 194, u'acirc': 226, u'acute': 180,
    u'AElig': 198, u'aelig': 230, u'Agrave': 192, u'agrave': 224, u'alefsym': 8501,
    u'Alpha': 913, u'alpha': 945, u'amp': 38, u'and': 8743, u'ang': 8736, u'Aring': 197,
    u'aring': 229,
    u'asymp': 8776,
    u'Atilde': 195,
    u'atilde': 227,
    u'Auml': 196,
    u'auml': 228,
    u'bdquo': 8222,
    u'Beta': 914,
    u'beta': 946,
    u'brvbar': 166,
    u'bull': 8226,
    u'cap': 8745,
    u'Ccedil': 199,
    u'ccedil': 231,
    u'cedil': 184,
    u'cent': 162,
    u'Chi': 935,
    u'chi': 967,
    u'circ': 710,
    u'clubs': 9827,
    u'cong': 8773,
    u'copy': 169,
    u'crarr': 8629,
    u'cup': 8746,
    u'curren': 164,
    u'dagger': 8224,
    u'Dagger': 8225,
    u'darr': 8595,
    u'dArr': 8659,
    u'deg': 176,
    u'Delta': 916,
    u'delta': 948,
    u'diams': 9830,
    u'divide': 247,
    u'Eacute': 201,
    u'eacute': 233,
    u'Ecirc': 202,
    u'ecirc': 234,
    u'Egrave': 200,
    u'egrave': 232,
    u'empty': 8709,
    u'emsp': 8195,
    u'ensp': 8194,
    u'Epsilon': 917,
    u'epsilon': 949,
    u'equiv': 8801,
    u'Eta': 919,
    u'eta': 951,
    u'ETH': 208,
    u'eth': 240,
    u'Euml': 203,
    u'euml': 235,
    u'euro': 8364,
    u'exist': 8707,
    u'fnof': 402,
    u'forall': 8704,
    u'frac12': 189,
    u'frac14': 188,
    u'frac34': 190,
    u'frasl': 8260,
    u'Gamma': 915,
    u'gamma': 947,
    u'ge': 8805,
    u'gt': 62,
    u'harr': 8596,
    u'hArr': 8660,
    u'hearts': 9829,
    u'hellip': 8230,
    u'Iacute': 205,
    u'iacute': 237,
    u'Icirc': 206,
    u'icirc': 238,
    u'iexcl': 161,
    u'Igrave': 204,
    u'igrave': 236,
    u'image': 8465,
    u'infin': 8734,
    u'int': 8747,
    u'Iota': 921,
    u'iota': 953,
    u'iquest': 191,
    u'isin': 8712,
    u'Iuml': 207,
    u'iuml': 239,
    u'Kappa': 922,
    u'kappa': 954,
    u'Lambda': 923,
    u'lambda': 955,
    u'lang': 9001,
    u'laquo': 171,
    u'larr': 8592,
    u'lArr': 8656,
    u'lceil': 8968,
    u'ldquo': 8220,
    u'le': 8804,
    u'lfloor': 8970,
    u'lowast': 8727,
    u'loz': 9674,
    u'lrm': 8206,
    u'lsaquo': 8249,
    u'lsquo': 8216,
    u'lt': 60,
    u'macr': 175,
    u'mdash': 8212,
    u'micro': 181,
    u'middot': 183,
    u'minus': 8722,
    u'Mu': 924,
    u'mu': 956,
    u'nabla': 8711,
    u'nbsp': 160,
    u'ndash': 8211,
    u'ne': 8800,
    u'ni': 8715,
    u'not': 172,
    u'notin': 8713,
    u'nsub': 8836,
    u'Ntilde': 209,
    u'ntilde': 241,
    u'Nu': 925,
    u'nu': 957,
    u'Oacute': 211,
    u'oacute': 243,
    u'Ocirc': 212,
    u'ocirc': 244,
    u'OElig': 338,
    u'oelig': 339,
    u'Ograve': 210,
    u'ograve': 242,
    u'oline': 8254,
    u'Omega': 937,
    u'omega': 969,
    u'Omicron': 927,
    u'omicron': 959,
    u'oplus': 8853,
    u'or': 8744,
    u'ordf': 170,
    u'ordm': 186,
    u'Oslash': 216,
    u'oslash': 248,
    u'Otilde': 213,
    u'otilde': 245,
    u'otimes': 8855,
    u'Ouml': 214,
    u'ouml': 246,
    u'para': 182,
    u'part': 8706,
    u'permil': 8240,
    u'perp': 8869,
    u'Phi': 934,
    u'phi': 966,
    u'Pi': 928,
    u'pi': 960,
    u'piv': 982,
    u'plusmn': 177,
    u'pound': 163,
    u'prime': 8242,
    u'Prime': 8243,
    u'prod': 8719,
    u'prop': 8733,
    u'Psi': 936,
    u'psi': 968,
    u'quot': 34,
    u'radic': 8730,
    u'rang': 9002,
    u'raquo': 187,
    u'rarr': 8594,
    u'rArr': 8658,
    u'rceil': 8969,
    u'rdquo': 8221,
    u'real': 8476,
    u'reg': 174,
    u'rfloor': 8971,
    u'Rho': 929,
    u'rho': 961,
    u'rlm': 8207,
    u'rsaquo': 8250,
    u'rsquo': 8217,
    u'sbquo': 8218,
    u'Scaron': 352,
    u'scaron': 353,
    u'sdot': 8901,
    u'sect': 167,
    u'shy': 173,
    u'Sigma': 931,
    u'sigma': 963,
    u'sigmaf': 962,
    u'sim': 8764,
    u'spades': 9824,
    u'sub': 8834,
    u'sube': 8838,
    u'sum': 8721,
    u'sup': 8835,
    u'sup1': 185,
    u'sup2': 178,
    u'sup3': 179,
    u'supe': 8839,
    u'szlig': 223,
    u'Tau': 932,
    u'tau': 964,
    u'there4': 8756,
    u'Theta': 920,
    u'theta': 952,
    u'thetasym': 977,
    u'thinsp': 8201,
    u'THORN': 222,
    u'thorn': 254,
    u'tilde': 732,
    u'times': 215,
    u'trade': 8482,
    u'Uacute': 218,
    u'uacute': 250,
    u'uarr': 8593,
    u'uArr': 8657,
    u'Ucirc': 219,
    u'ucirc': 251,
    u'Ugrave': 217,
    u'ugrave': 249,
    u'uml': 168,
    u'upsih': 978,
    u'Upsilon': 933,
    u'upsilon': 965,
    u'Uuml': 220,
    u'uuml': 252,
    u'weierp': 8472,
    u'Xi': 926,
    u'xi': 958,
    u'Yacute': 221,
    u'yacute': 253,
    u'yen': 165,
    u'Yuml': 376,
    u'yuml': 255,
    u'Zeta': 918,
    u'zeta': 950,
    u'zwj': 8205,
    u'zwnj': 8204
}
|
|
|
|
_charRefsPat = re.compile(ur'''(&([A-Za-z0-9]+);|&#([0-9]+);|&#[xX]([0-9A-Za-z]+);|(&))''', re.UNICODE)
|
|
_cssCommentPat = re.compile(ur'''\*.*?\*''', re.UNICODE)
|
|
_toUTFPat = re.compile(ur'''\\([0-9A-Fa-f]{1,6})[\s]?''', re.UNICODE)
|
|
_hackPat = re.compile(ur'''(expression|tps*://|url\s*\().*''', re.UNICODE | re.IGNORECASE)
|
|
_hrPat = re.compile(u'''^-----*''', re.UNICODE | re.MULTILINE)
|
|
_h1Pat = re.compile(u'^=(.+)=\s*$', re.UNICODE | re.MULTILINE)
|
|
_h2Pat = re.compile(u'^==(.+)==\s*$', re.UNICODE | re.MULTILINE)
|
|
_h3Pat = re.compile(u'^===(.+)===\s*$', re.UNICODE | re.MULTILINE)
|
|
_h4Pat = re.compile(u'^====(.+)====\s*$', re.UNICODE | re.MULTILINE)
|
|
_h5Pat = re.compile(u'^=====(.+)=====\s*$', re.UNICODE | re.MULTILINE)
|
|
_h6Pat = re.compile(u'^======(.+)======\s*$', re.UNICODE | re.MULTILINE)
|
|
_quotePat = re.compile(u"""(''+)""", re.UNICODE)
|
|
_removePat = re.compile(ur'\b(' + ur'|'.join((u"a", u"an", u"as", u"at", u"before", u"but", u"by", u"for", u"from",
|
|
u"is", u"in", u"into", u"like", u"of", u"off", u"on", u"onto", u"per",
|
|
u"since", u"than", u"the", u"this", u"that", u"to", u"up", u"via",
|
|
u"with")) + ur')\b', re.UNICODE | re.IGNORECASE)
|
|
_nonWordSpaceDashPat = re.compile(ur'[^\w\s\-\./]', re.UNICODE)
|
|
_multiSpacePat = re.compile(ur'[\s\-_\./]+', re.UNICODE)
|
|
_spacePat = re.compile(ur' ', re.UNICODE)
|
|
_linkPat = re.compile(ur'^(?:([A-Za-z0-9]+):)?([^\|]+)(?:\|([^\n]+?))?\]\](.*)$', re.UNICODE | re.DOTALL)
|
|
_bracketedLinkPat = re.compile(ur'(?:\[((?:mailto:|irc://|https?://|ftp://|/)[^<>\]\[' + u"\x00-\x20\x7f" + ur']*)\s*(.*?)\])', re.UNICODE)
|
|
_protocolPat = re.compile(ur'(\b(?:mailto:|irc://|https?://|ftp://))', re.UNICODE)
|
|
_specialUrlPat = re.compile(ur'^([^<>\]\[' + u"\x00-\x20\x7f" + ur']+)(.*)$', re.UNICODE)
|
|
_protocolsPat = re.compile(ur'^(mailto:|irc://|https?://|ftp://)$', re.UNICODE)
|
|
_controlCharsPat = re.compile(ur'[\]\[<>"' + u"\\x00-\\x20\\x7F" + ur']]', re.UNICODE)
|
|
_hostnamePat = re.compile(ur'^([^:]+:)(//[^/]+)?(.*)$', re.UNICODE)
|
|
_stripPat = re.compile(u'\\s|\u00ad|\u1806|\u200b|\u2060|\ufeff|\u03f4|\u034f|\u180b|\u180c|\u180d|\u200c|\u200d|[\ufe00-\ufe0f]', re.UNICODE)
|
|
_zomgPat = re.compile(ur'^(:*)\{\|(.*)$', re.UNICODE)
|
|
_headerPat = re.compile(ur"<[Hh]([1-6])(.*?)>(.*?)</[Hh][1-6] *>", re.UNICODE)
|
|
_templateSectionPat = re.compile(ur"<!--MWTEMPLATESECTION=([^&]+)&([^_]+)-->", re.UNICODE)
|
|
_tagPat = re.compile(ur"<.*?>", re.UNICODE)
|
|
_startRegexHash = {}
|
|
_endRegexHash = {}
|
|
_endCommentPat = re.compile(ur'(-->)', re.UNICODE)
|
|
_extractTagsAndParams_n = 1
|
|
_guillemetLeftPat = re.compile(ur'(.) (\?|:|;|!|\302\273)', re.UNICODE)
|
|
_guillemetRightPat = re.compile(ur'(\302\253) ', re.UNICODE)
|
|
|
|
def setupAttributeWhitelist():
    """Build the per-element attribute whitelist.

    Returns a dict mapping each supported tag name to the tuple of attribute
    names that may be kept on that tag.
    """
    common = (u'id', u'class', u'lang', u'dir', u'title', u'style')
    block = common + (u'align',)
    tablealign = (u'align', u'char', u'charoff', u'valign')
    # nowrap, width, height and bgcolor are deprecated but still accepted.
    tablecell = (u'abbr', u'axis', u'headers', u'scope', u'rowspan',
                 u'colspan', u'nowrap', u'width', u'height', u'bgcolor')
    column = common + (u'span', u'width') + tablealign
    edit_marks = common + (u'cite', u'datetime')
    cell = common + tablecell + tablealign
    section = common + tablealign

    whitelist = {}

    # Elements that accept only the common attributes.
    for tag in (u'center', u'em', u'strong', u'cite', u'code', u'var',
                u'sub', u'sup', u'dl', u'dd', u'dt', u'tt', u'b', u'i',
                u'big', u'small', u'strike', u's', u'u', u'ruby', u'rb',
                u'rt', u'rp'):
        whitelist[tag] = common

    # Block-level elements additionally accept 'align'.
    for tag in (u'div', u'span', u'p', u'h1', u'h2', u'h3', u'h4', u'h5',
                u'h6'):
        whitelist[tag] = block

    for tag in (u'thead', u'tfoot', u'tbody'):
        whitelist[tag] = section
    for tag in (u'colgroup', u'col'):
        whitelist[tag] = column
    for tag in (u'td', u'th'):
        whitelist[tag] = cell
    for tag in (u'ins', u'del'):
        whitelist[tag] = edit_marks

    # Everything else has its own attribute list.
    whitelist[u'img'] = common + (u'src', u'alt', u'width', u'height')
    whitelist[u'blockquote'] = common + (u'cite',)
    whitelist[u'br'] = (u'id', u'class', u'title', u'style', u'clear')
    whitelist[u'pre'] = common + (u'width',)
    whitelist[u'ul'] = common + (u'type',)
    whitelist[u'ol'] = common + (u'type', u'start')
    whitelist[u'li'] = common + (u'type', u'value')
    whitelist[u'table'] = common + (u'summary', u'width', u'border',
                                    u'frame', u'rules', u'cellspacing',
                                    u'cellpadding', u'align', u'bgcolor')
    whitelist[u'caption'] = common + (u'align',)
    whitelist[u'tr'] = common + (u'bgcolor',) + tablealign
    whitelist[u'font'] = common + (u'size', u'color', u'face')
    whitelist[u'hr'] = common + (u'noshade', u'size', u'width')
    return whitelist
|
|
# Per-element attribute whitelist, built once when the module is imported.
_whitelist = setupAttributeWhitelist()
# NOTE(review): not referenced in this part of the file; presumably a cache of
# rendered pages used further down -- confirm.
_page_cache = {}
# Module-wide environment dict; every BaseParser aliases it as self.env.
env = {}
|
|
|
|
def registerTagHook(tag, function):
    """Register *function* as the handler for the extension tag *tag*.

    BaseParser.strip calls the handler as ``function(parser, content, params)``
    and uses its return value as the replacement output.  ``mTagHooks`` is
    defined outside this part of the file.
    """
    mTagHooks[tag] = function
|
|
|
|
class BaseParser(object):
|
|
def __init__(self):
    """Set up per-parser state and attach the shared module environment."""
    random_suffix = unicode(random.randint(1, 1000000000))
    # Marker prefix: a BEL character, the word UNIQ and a random number.
    self.uniq_prefix = u"\x07UNIQ" + random_suffix
    self.strip_state = {}
    self.arg_stack = []
    self.env = env
    # Remember whether the environment already held data when this parser
    # was created; __del__ only resets it when it did not.
    self.keep_env = not (env == {})
|
|
|
|
def __del__(self):
    """Reset the module-level environment unless it pre-dated this parser."""
    global env
    if self.keep_env:
        return
    env = {}
|
|
|
|
''' Used to store objects in the environment
|
|
used to prevent recursive imports '''
|
|
def store_object(self, namespace, key, value=True):
|
|
# Store the item to not reprocess it
|
|
if namespace not in self.env:
|
|
self.env[namespace] = {}
|
|
self.env[namespace][key] = value
|
|
|
|
def has_object(self, namespace, key):
|
|
if namespace not in self.env:
|
|
self.env[namespace] = {}
|
|
if hasattr(self, 'count'):
|
|
data = self.env[namespace]
|
|
test = key in data
|
|
self.count = True
|
|
return key in self.env[namespace]
|
|
|
|
def retrieve_object(self, namespace, key, default=None):
|
|
if not self.env.get(namespace):
|
|
self.env[namespace] = {}
|
|
return self.env[namespace].get(key, default)
|
|
|
|
def parse(self, text):
    """Render wiki markup in *text* by running it through the parser stages.

    A byte string is converted with to_unicode() for processing and the
    result is encoded back to UTF-8; unicode input yields unicode output.
    A newline is appended for processing when the input lacks one and is
    stripped again from the result.
    """
    wants_bytes = isinstance(text, str)
    text = to_unicode(text)

    added_newline = text[-1:] != u'\n'
    if added_newline:
        text = text + u'\n'

    # Single-argument stages, applied in this order.
    for stage_name in ('strip', 'removeHtmlTags', 'parseHorizontalRule',
                       'parseAllQuotes', 'replaceExternalLinks', 'unstrip',
                       'fixtags'):
        text = getattr(self, stage_name)(text)
    text = self.doBlockLevels(text, True)
    text = self.unstripNoWiki(text)
    text = u'\n'.join(text.split(u'\n'))

    if added_newline and text[-1:] == u'\n':
        text = text[:-1]
    if wants_bytes:
        return text.encode("utf-8")
    return text
|
|
|
|
def strip(self, text, stripcomments=False, dontstrip=()):
    """Replace <nowiki>, <html>, hook tags and comments in *text* by markers.

    The rendered output for each marker is stored in self.strip_state
    ('nowiki' for <html>/<nowiki>, 'general' for everything else) so that
    unstrip() / unstripNoWiki() can put it back later.

    text          -- unicode wiki text.
    stripcomments -- when False (default) HTML comments are restored in the
                     returned text; when True they stay as markers and are
                     stored in the 'general' state.
    dontstrip     -- iterable of tag names that must not be stripped.

    Returns the text with the stripped regions replaced by markers.
    """
    commentState = {}

    # list() so this works whether keys() gives a list or a view.
    elements = ['nowiki'] + list(mTagHooks)
    # Raw HTML passthrough is unconditionally enabled.
    elements.append('html')

    # Removing dontstrip tags from the elements list (currently only
    # 'gallery', fixing bug 2700).  `elements` is a list, so the entry has
    # to be removed by value, not deleted by key.
    for k in dontstrip:
        if k in elements:
            elements.remove(k)

    matches = {}
    text = self.extractTagsAndParams(elements, text, matches)

    for marker in matches:
        element, content, params, tag = matches[marker]
        tagName = element.lower()
        if tagName == u'!--':
            # Comment: keep the source, closing it if it was left open.
            output = tag
            if tag[-3:] != u'-->':
                output += "-->"
        elif tagName == u'html':
            output = content
        elif tagName == u'nowiki' or tagName not in mTagHooks:
            # Show the content literally: escape the HTML special characters.
            output = content.replace(u'&', u'&amp;').replace(u'<', u'&lt;').replace(u'>', u'&gt;')
        else:
            output = mTagHooks[tagName](self, content, params)

        # Unstrip the output, because unstrip() is no longer recursive so
        # it won't do it itself
        output = self.unstrip(output)

        if not stripcomments and element == u'!--':
            commentState[marker] = output
        elif element == u'html' or element == u'nowiki':
            self.strip_state.setdefault('nowiki', {})[marker] = output
        else:
            self.strip_state.setdefault('general', {})[marker] = output

    # Unstrip comments unless explicitly told otherwise.
    # (The comments are always stripped prior to this point, so as to
    # not invoke any extension tags / parser hooks contained within
    # a comment.)
    if not stripcomments:
        # Put them all back and forget them
        for k in commentState:
            text = text.replace(k, commentState[k])

    return text
|
|
|
|
def removeHtmlTags(self, text):
    """Convert bad tags into HTML entities and keep the whitelisted ones.

    Comments are removed first.  The text is then split on '<'; every
    fragment that is a well-formed, allowed and correctly nested tag is
    re-emitted with sanitised attributes, anything else is emitted with its
    '<' and '>' escaped.  Tags still open at the end are closed.

    Returns the sanitised unicode text.
    """
    sb = []
    text = self.removeHtmlComments(text)
    bits = text.split(u'<')
    sb.append(bits.pop(0))
    tagstack = []
    # Stack of the tag stacks that were active when each <table> opened.
    # It must be its own list, not an alias of tagstack.
    tablestack = []
    for x in bits:
        # Anything that is not positively accepted below gets escaped,
        # including fragments that do not look like a tag at all
        # (e.g. the ' 4' in '3 < 4'), which used to be dropped.
        badtag = True
        m = _tagPattern.match(x)
        if m:
            slash, t, params, brace, rest = m.groups()
            t = t.lower()
            if t in _htmlelements:
                badtag = False
                # Check our stack
                if slash:
                    # Closing a tag...
                    if t in _htmlsingleonly or not tagstack:
                        badtag = True
                    else:
                        ot = tagstack.pop()
                        if ot != t:
                            if ot in _htmlsingleallowed:
                                # Pop all elements with an optional close tag
                                # and see if we find a match below them
                                optstack = [ot]
                                while tagstack:
                                    ot = tagstack.pop()
                                    if ot == t or ot not in _htmlsingleallowed:
                                        break
                                    optstack.append(ot)
                                if t != ot:
                                    # No match. Push the optional elements back again
                                    badtag = True
                                    tagstack += reversed(optstack)
                            else:
                                tagstack.append(ot)
                                # A mismatched closer is bad, except for
                                # </li> while a <ul> or <ol> is on top.
                                if not (ot in _htmllist and t in _listtags):
                                    badtag = True
                        elif t == u'table':
                            if not tablestack:
                                badtag = True
                            else:
                                tagstack = tablestack.pop()
                    newparams = u''
                else:
                    # Keep track for later
                    if t in _tabletags and u'table' not in tagstack:
                        badtag = True
                    elif t in tagstack and t not in _htmlnest:
                        badtag = True
                    # Is it a self-closed htmlpair? (bug 5487)
                    elif brace == u'/>' and t in _htmlpairs:
                        badtag = True
                    elif t in _htmlsingleonly:
                        # Hack to force empty tag for uncloseable elements
                        brace = u'/>'
                    elif t in _htmlsingle:
                        # Hack to not close $htmlsingle tags
                        brace = None
                    else:
                        if t == u'table':
                            tablestack.append(tagstack)
                            tagstack = []
                        tagstack.append(t)
                    # Strip non-approved attributes from the tag
                    newparams = self.fixTagAttributes(params, t)
        if not badtag:
            rest = rest.replace(u'>', u'&gt;')
            if brace == u'/>' and not slash:
                close = u' /'
            else:
                close = u''
            sb.append(u'<')
            sb.append(slash)
            sb.append(t)
            sb.append(newparams)
            sb.append(close)
            sb.append(u'>')
            sb.append(rest)
            continue
        sb.append(u'&lt;')
        sb.append(x.replace(u'>', u'&gt;'))

    # Close off any remaining tags
    while tagstack:
        t = tagstack.pop()
        sb.append(u'</')
        sb.append(t)
        sb.append(u'>\n')
        if t == u'table':
            if not tablestack:
                break
            tagstack = tablestack.pop()

    return u''.join(sb)
|
|
|
|
def removeHtmlComments(self, text):
    """Remove <!-- text --> comments from *text*.

    When a comment (plus any spaces around it) is both preceded and
    followed by a newline, the whole line is removed and a single newline
    remains.  Otherwise only the comment itself is removed and the
    surrounding text, spaces included, is left untouched.  An unterminated
    comment is left in place.

    Returns the text without comments.
    """
    pieces = []
    last = 0
    start = text.find(u'<!--')
    while start != -1:
        end = text.find(u'-->', start + 4)
        if end == -1:
            break
        end += 3

        # Widen the span over the spaces on both sides of the comment.
        # One-character slices are used on the right so that a comment at
        # the very end of the text cannot index past it.
        space_start = max(start - 1, 0)
        while space_start > 0 and text[space_start] == u' ':
            space_start -= 1
        space_end = end
        while text[space_end:space_end + 1] == u' ':
            space_end += 1

        if (space_start >= last
                and text[space_start:space_start + 1] == u'\n'
                and text[space_end:space_end + 1] == u'\n'):
            # Drop the leading newline, the spaces and the comment; the
            # trailing newline stays in the text as the single separator.
            pieces.append(text[last:space_start])
            last = space_end
        else:
            # Remove just the comment.
            pieces.append(text[last:start])
            last = end

        start = text.find(u'<!--', end)
    pieces.append(text[last:])
    return u''.join(pieces)
|
|
|
|
def decodeTagAttributes(self, text):
    """Parse the attribute text of a tag into a dict.

    Each attribute value has its whitespace runs collapsed to one space, is
    trimmed, and has its character references decoded.  A later occurrence
    of the same attribute name overrides an earlier one.

    Returns a dict mapping attribute name to decoded value.
    """
    if text.strip() == u'':
        return {}

    attribs = {}
    for found in _attributePat.finditer(text):
        groups = found.groups()
        key = groups[0]
        # The value is whichever of the four value groups matched non-empty.
        value = ''
        for candidate in groups[1:]:
            if candidate:
                value = _space.sub(u' ', candidate).strip()
                break
        attribs[key] = self.decodeCharReferences(value)
    return attribs
|
|
|
|
def validateTagAttributes(self, attribs, element):
    """Filter *attribs* down to the attributes allowed on *element*.

    attribs -- dict of attribute name to value.
    element -- tag name used to look up the whitelist.

    'style' values go through checkCss() and are dropped when it rejects
    them; 'id' values go through escapeId().  Returns a new dict; it is
    empty when the element has no whitelist entry.
    """
    try:
        allowed = _whitelist[element]
    except KeyError:
        return {}

    cleaned = {}
    for name, value in attribs.items():
        if name not in allowed:
            continue
        if name == u'style':
            # Strip javascript "expression" from stylesheets.
            # http://msdn.microsoft.com/workshop/author/dhtml/overview/recalc.asp
            value = self.checkCss(value)
            if value is False:
                continue
        elif name == u'id':
            value = self.escapeId(value)
        # Output should only have one attribute of each name.
        cleaned[name] = value
    return cleaned
|
|
|
|
def safeEncodeAttribute(self, encValue):
    """Encode an attribute value so it is inert in both HTML and wiki text.

    HTML special characters become entities, and sequences that later wiki
    parsing could expand (templates, links, quote markup, magic words,
    table pipes, double underscores) are broken up with numeric character
    references.  Newlines, carriage returns and tabs are encoded as well.

    Returns the encoded unicode string.
    """
    # Order matters: '&' must be escaped first so the entities introduced
    # by the later replacements are not escaped a second time.
    replacements = (
        (u'&', u'&amp;'),
        (u'<', u'&lt;'),
        (u'>', u'&gt;'),
        (u'"', u'&quot;'),
        (u'{', u'&#123;'),
        (u'[', u'&#91;'),
        (u"''", u'&#39;&#39;'),
        (u'ISBN', u'&#73;SBN'),
        (u'RFC', u'&#82;FC'),
        (u'PMID', u'&#80;MID'),
        (u'|', u'&#124;'),
        (u'__', u'&#95;_'),
        (u'\n', u'&#10;'),
        (u'\r', u'&#13;'),
        (u'\t', u'&#9;'),
    )
    for needle, entity in replacements:
        encValue = encValue.replace(needle, entity)
    return encValue
|
|
|
|
def fixTagAttributes(self, text, element):
    """Return the sanitised attribute string for a tag.

    text    -- raw attribute text taken from the tag.
    element -- tag name, used to select the attribute whitelist.

    The attributes are decoded, filtered through validateTagAttributes()
    and re-encoded.  The result is '' when there is nothing to keep,
    otherwise ' name="value"' repeated for each attribute.
    """
    if text.strip() == u'':
        return u''

    stripped = self.validateTagAttributes(self.decodeTagAttributes(text), element)

    sb = []
    for attribute in stripped:
        value = stripped[attribute]
        # Escape the HTML special characters in the attribute name.
        encAttribute = attribute.replace(u'&', u'&amp;').replace(u'<', u'&lt;').replace(u'>', u'&gt;')
        encValue = self.safeEncodeAttribute(value)

        sb.append(u' ')
        sb.append(encAttribute)
        sb.append(u'="')
        sb.append(encValue)
        sb.append(u'"')

    return u''.join(sb)
|
|
|
|
def validateCodepoint(self, codepoint):
    """Return True if *codepoint* is tab, LF, CR or lies in one of the
    ranges 0x20-0xD7FF, 0xE000-0xFFFD or 0x10000-0x10FFFF."""
    if codepoint in (0x09, 0x0a, 0x0d):
        return True
    return (0x20 <= codepoint <= 0xd7ff
            or 0xe000 <= codepoint <= 0xfffd
            or 0x10000 <= codepoint <= 0x10ffff)
|
|
|
|
def _normalizeCallback(self, match):
|
|
text, norm, dec, hexval, _ = match.groups()
|
|
if norm:
|
|
sb = []
|
|
sb.append(u'&')
|
|
if norm not in _htmlEntities:
|
|
sb.append(u'amp;')
|
|
sb.append(norm)
|
|
sb.append(u';')
|
|
return u''.join(sb)
|
|
elif dec:
|
|
dec = int(dec)
|
|
if self.validateCodepoint(dec):
|
|
sb = []
|
|
sb.append(u'&#')
|
|
sb.append(dec)
|
|
sb.append(u';')
|
|
return u''.join(sb)
|
|
elif hexval:
|
|
hexval = int(hexval, 16)
|
|
if self.validateCodepoint(hexval):
|
|
sb = []
|
|
sb.append(u'&#x')
|
|
sb.append(hex(hexval))
|
|
sb.append(u';')
|
|
return u''.join(sb)
|
|
return text.replace(u'&', u'&').replace(u'<', u'<').replace(u'>', u'>')
|
|
|
|
def normalizeCharReferences(self, text):
    """Return *text* with every match of _charRefsPat (character references
    and bare '&') rewritten by _normalizeCallback()."""
    return _charRefsPat.sub(self._normalizeCallback, text)
|
|
|
|
def _decodeCallback(self, match):
|
|
text, norm, dec, hexval, _ = match.groups()
|
|
if norm:
|
|
if norm in _htmlEntities:
|
|
return unichr(_htmlEntities[norm])
|
|
else:
|
|
sb = []
|
|
sb.append(u'&')
|
|
sb.append(norm)
|
|
sb.append(u';')
|
|
return u''.join(sb)
|
|
elif dec:
|
|
dec = int(dec)
|
|
if self.validateCodepoint(dec):
|
|
return unichr(dec)
|
|
return u'?'
|
|
elif hexval:
|
|
hexval = int(hexval, 16)
|
|
if self.validateCodepoint(dec):
|
|
return unichr(dec)
|
|
return u'?'
|
|
return text
|
|
|
|
def decodeCharReferences(self, text):
    """Return *text* with its character references decoded.

    Empty (or otherwise falsy) input yields the empty string ''.
    """
    if not text:
        return ''
    return _charRefsPat.sub(self._decodeCallback, text)
|
|
|
|
def _convertToUtf8(self, s):
|
|
return unichr(int(s.group(1), 16))
|
|
|
|
def checkCss(self, value):
    """Check an inline style value for script-injection tricks.

    Character references are decoded and CSS comments removed; that cleaned
    text is what gets returned.  A further-normalised copy (hex escapes
    resolved, backslashes removed) is tested against _hackPat.

    Returns the cleaned style string, or False when the value looks like an
    attempt to smuggle in expression(), url( or a protocol.
    """
    stripped = self.decodeCharReferences(value)

    stripped = _cssCommentPat.sub(u'', stripped)
    value = stripped

    stripped = _toUTFPat.sub(self._convertToUtf8, stripped)
    # str.replace returns a new string; the result has to be kept or the
    # backslashes survive and can split a keyword such as 'ex\pression'.
    stripped = stripped.replace(u'\\', u'')
    if _hackPat.search(stripped):
        # someone is haxx0ring
        return False

    return value
|
|
|
|
def escapeId(self, value):
    """Return *value* made safe for use as an ``id`` attribute.

    Delegates to safe_name(), which is defined outside this part of the
    file.
    """
    # TODO
    return safe_name(value)
|
|
|
|
def parseHorizontalRule(self, text):
    """Replace every match of _hrPat in *text* with '<hr />'."""
    rule_markup = u'<hr />'
    return _hrPat.sub(rule_markup, text)
|
|
|
|
def parseHeaders(self, text):
    """Convert '= Heading =' style lines into <h1>..<h6> elements.

    The deepest level is handled first so that a six-equals heading is not
    picked up by the patterns for the shallower levels.
    """
    levels = (
        (6, _h6Pat),
        (5, _h5Pat),
        (4, _h4Pat),
        (3, _h3Pat),
        (2, _h2Pat),
        (1, _h1Pat),
    )
    for level, pattern in levels:
        # '\\1' is the back-reference to the heading text.
        replacement = u'<h%d>\\1</h%d>' % (level, level)
        text = pattern.sub(replacement, text)
    return text
|
|
|
|
def parseQuotes(self, text):
    """Convert apostrophe markup on a single line into <i>/<b> tags.

    Two apostrophes toggle italics, three toggle bold and five toggle both.
    Runs of four, or of more than five, are reduced to three or five and
    the surplus apostrophes become literal text.  Markup still open at the
    end of the line is closed.

    Returns the converted line (the input itself when it has no markup).
    """
    arr = _quotePat.split(text)
    if len(arr) == 1:
        return text

    # Odd indexes of `arr` hold the apostrophe runs, even indexes the text.
    # First pass: normalise run lengths and count the bold/italic toggles.
    numBold = 0
    numItalics = 0
    for pos in range(1, len(arr), 2):
        run = len(arr[pos])
        if run == 4:
            arr[pos - 1] += u"'"
            arr[pos] = u"'''"
        elif run > 5:
            arr[pos - 1] += u"'" * (run - 5)
            arr[pos] = u"'''''"
        if run == 2:
            numItalics += 1
        elif run >= 5:
            numItalics += 1
            numBold += 1
        else:
            numBold += 1

    # If there is an odd number of both bold and italics, it is likely
    # that one of the bold ones was meant to be an apostrophe followed
    # by italics. Which one we cannot know for certain, but it is more
    # likely to be one that has a single-letter word before it.
    if numBold % 2 == 1 and numItalics % 2 == 1:
        firstSingleLetterWord = -1
        firstMultiLetterWord = -1
        firstSpace = -1
        for pos in range(1, len(arr), 2):
            if len(arr[pos]) != 3:
                continue
            before = arr[pos - 1]
            if before[-1:] == u' ':
                if firstSpace == -1:
                    firstSpace = pos
            elif before[-2:-1] == u' ':
                if firstSingleLetterWord == -1:
                    firstSingleLetterWord = pos
            elif firstMultiLetterWord == -1:
                firstMultiLetterWord = pos

        # Preference order: single-letter word, multi-letter word, space.
        # (All three can be -1, e.g. when the line only has one
        # five-apostrophe run.)
        if firstSingleLetterWord > -1:
            demote = firstSingleLetterWord
        elif firstMultiLetterWord > -1:
            demote = firstMultiLetterWord
        else:
            demote = firstSpace
        if demote > -1:
            arr[demote] = u"''"
            arr[demote - 1] += u"'"

    # Now let's actually convert our apostrophic mush to HTML!
    # transitions[run length][state] -> (markup to emit, next state)
    transitions = {
        2: {
            'i': (u"</i>", ''),
            'bi': (u"</i>", 'b'),
            'ib': (u"</b></i><b>", 'b'),
            'b': (u"<i>", 'bi'),
            '': (u"<i>", 'i'),
        },
        3: {
            'b': (u"</b>", ''),
            'bi': (u"</i></b><i>", 'i'),
            'ib': (u"</b>", 'i'),
            'i': (u"<b>", 'ib'),
            '': (u"<b>", 'b'),
        },
        5: {
            'b': (u"</b><i>", 'i'),
            'i': (u"</i><b>", 'b'),
            'bi': (u"</i></b>", ''),
            'ib': (u"</b></i>", ''),
        },
    }
    # After a five-apostrophe opener the text is buffered ('both' state)
    # until the next run decides the nesting order:
    # run length -> (opening markup, closing markup, next state)
    resolve_both = {
        2: (u"<b><i>", u"</i>", 'b'),
        3: (u"<i><b>", u"</b>", 'i'),
        5: (u"<i><b>", u"</b></i>", ''),
    }

    output = []
    pending = None
    state = ''
    for pos, piece in enumerate(arr):
        if pos % 2 == 0:
            if state == 'both':
                pending.append(piece)
            else:
                output.append(piece)
            continue

        run = len(piece)
        if run not in transitions:
            continue
        if state == 'both':
            opening, closing, state = resolve_both[run]
            output.append(opening)
            output.append(u''.join(pending))
            pending = None
            output.append(closing)
        elif run == 5 and state == '':
            pending = []
            state = 'both'
        else:
            markup, state = transitions[run][state]
            output.append(markup)

    # Close whatever is still open at the end of the line.
    if state == 'both':
        output.append(u"<i><b>")
        output.append(u''.join(pending))
        pending = None
        output.append(u"</b></i>")
    elif state != '':
        if state in ('b', 'ib'):
            output.append(u"</b>")
        if state in ('i', 'bi', 'ib'):
            output.append(u"</i>")
        if state == 'bi':
            output.append(u"</b>")
    return u''.join(output)
|
|
|
|
def parseAllQuotes(self, text):
    """Apply parseQuotes() to every line of *text* and rejoin the lines."""
    converted = [self.parseQuotes(line) for line in text.split(u'\n')]
    return u''.join([u'\n'.join(converted)])
|
|
|
|
def replaceExternalLinks(self, text):
    """Turn bracketed external links ('[url label]') into <a> elements.

    When the label is empty the link text is the URL passed through
    truncate_url().  Text outside the brackets is copied unchanged.

    Returns the converted unicode text.
    """
    sb = []
    # split() with two capture groups yields: text, url, label, text, ...
    bits = _bracketedLinkPat.split(text)
    l = len(bits)
    i = 0
    while i < l:
        if i % 3 == 0:
            #sb.append(self.replaceFreeExternalLinks(bits[i]))
            sb.append(bits[i])
            i += 1
        else:
            url = bits[i]
            label = bits[i + 1]
            sb.append(u'<a href="')
            # The URL pattern lets a double quote through; percent-encode
            # it so the URL cannot terminate the href attribute and inject
            # further attributes.
            sb.append(url.replace(u'"', u'%22'))
            sb.append(u'">')
            if not label:
                sb.append(to_unicode(truncate_url(url)))
            else:
                sb.append(label)
            sb.append(u'</a>')
            i += 2
    return u''.join(sb)
|
|
|
|
# NOTE: the only call to this method visible in this part of the file is
# commented out (see replaceExternalLinks).
def replaceFreeExternalLinks(self, text):
    """Turn bare URLs in *text* into <a> elements.

    The text is split on the known protocols.  For each protocol the
    following run of URL characters becomes the link target; trailing
    punctuation and anything from an escaped angle bracket onwards is moved
    out of the URL and emitted after the link.

    Returns the converted unicode text.
    """
    # After the pop, bits holds (protocol, remainder) pairs.
    bits = _protocolPat.split(text)
    sb = [bits.pop(0)]
    i = 0
    l = len(bits)
    while i < l:
        protocol = bits[i]
        remainder = bits[i + 1]
        i += 2
        match = _specialUrlPat.match(remainder)
        if not match:
            sb.append(protocol)
            sb.append(remainder)
            continue

        # Found some characters after the protocol that look promising
        url = protocol + match.group(1)
        trail = match.group(2)

        # special case: handle urls as url args:
        # http://www.example.com/foo?=http://www.example.com/bar
        if len(trail) == 0 and i < l and _protocolsPat.match(bits[i]):
            # Match the fragment that follows the *next* protocol.
            match = _specialUrlPat.match(bits[i + 1])
            if match:
                url += bits[i] + match.group(1)
                i += 2
                trail = match.group(2)

        # The characters '<' and '>' (which were escaped by
        # removeHtmlTags()) should not be included in
        # URLs, per RFC 2396.  Cut at the first one found.
        cuts = [p for p in (url.find(u'&lt;'), url.find(u'&gt;')) if p != -1]
        if cuts:
            pos = min(cuts)
            trail = url[pos:] + trail
            url = url[:pos]

        # Move trailing punctuation out of the URL.  A ')' is only treated
        # as punctuation when the URL has no '('.  rstrip() is used so the
        # outer loop index is left alone.
        sep = u',;.:!?'
        if u'(' not in url:
            sep += u')'
        trimmed = url.rstrip(sep)
        if trimmed != url:
            trail = url[len(trimmed):] + trail
            url = trimmed

        url = self.cleanURL(url)

        sb.append(u'<a href="')
        sb.append(url)
        sb.append(u'">')
        sb.append(truncate_url(url))
        sb.append(u'</a>')
        sb.append(trail)
    return u''.join(sb)
|
|
|
|
def urlencode(self, char):
    """Percent-encode the single character *char*.

    A space becomes '+'; any other character becomes '%' followed by its
    code as lower-case hex, zero-padded to at least two digits.
    """
    code = ord(char)
    return '+' if code == 32 else '%' + format(code, '02x')
|
|
|
|
def cleanURL(self, url):
    """
    Normalise ``url`` before it is placed in a link.

    Character references are decoded, characters matched by
    ``_controlCharsPat`` are percent-encoded with ``urlencode`` and, when
    the URL matches ``_hostnamePat``, characters matched by ``_stripPat``
    are removed from the host part.

    Returns the cleaned URL.
    """
    # Normalize any HTML entities in input. They will be
    # re-escaped by makeExternalLink().
    url = self.decodeCharReferences(url)

    def encode_match(m):
        # re.sub() passes a match object; encode its text char by char
        return ''.join([self.urlencode(c) for c in m.group(0)])

    # Escape any control characters introduced by the above step
    url = _controlCharsPat.sub(encode_match, url)

    # Validate hostname portion
    match = _hostnamePat.match(url)
    if not match:
        return url

    protocol, host, rest = match.groups()

    # Characters that will be ignored in IDNs.
    # http://tools.ietf.org/html/3454#section-3.1
    # Strip them before further processing so blacklists and such work.
    # (sub() returns a new string, so the result has to be kept.)
    host = _stripPat.sub('', host)

    # @fixme: validate hostnames here

    return protocol + host + rest
|
|
|
|
def unstripForHTML(self, text):
    """Run ``unstrip`` and then ``unstripNoWiki`` over ``text`` and return the result."""
    return self.unstripNoWiki(self.unstrip(text))
|
|
|
|
def unstrip(self, text):
    """
    Replace every key of ``self.strip_state['general']`` found in ``text``
    with its stored value.  ``text`` is returned unchanged when there is
    no ``'general'`` entry.
    """
    if 'general' in self.strip_state:
        for marker, original in self.strip_state['general'].items():
            text = text.replace(marker, original)
    return text
|
|
|
|
def unstripNoWiki(self, text):
    """
    Replace every key of ``self.strip_state['nowiki']`` found in ``text``
    with its stored value.  ``text`` is returned unchanged when there is
    no ``'nowiki'`` entry.
    """
    if 'nowiki' in self.strip_state:
        for marker, original in self.strip_state['nowiki'].items():
            text = text.replace(marker, original)
    return text
|
|
|
|
def extractTagsAndParams(self, elements, text, matches):
|
|
"""
|
|
Replaces all occurrences of HTML-style comments and the given tags
|
|
in the text with a random marker and returns teh next text. The output
|
|
parameter $matches will be an associative array filled with data in
|
|
the form:
|
|
'UNIQ-xxxxx' => array(
|
|
'element',
|
|
'tag content',
|
|
array( 'param' => 'x' ),
|
|
'<element param="x">tag content</element>' ) )
|
|
"""
|
|
stripped = u''
|
|
|
|
taglist = u'|'.join(elements)
|
|
if taglist not in _startRegexHash:
|
|
_startRegexHash[taglist] = re.compile(ur"<(" + taglist + ur")(\s+[^>]*?|\s*?)(/?>)|<(!--)", re.UNICODE | re.IGNORECASE)
|
|
start = _startRegexHash[taglist]
|
|
|
|
while text != u'':
|
|
p = start.split(text, 1)
|
|
stripped += p[0]
|
|
if len(p) == 1:
|
|
break
|
|
elif p[4]:
|
|
# comment
|
|
element = p[4]
|
|
attributes = u''
|
|
close = u''
|
|
else:
|
|
element = p[1]
|
|
attributes = p[2]
|
|
close = p[3]
|
|
inside = p[5]
|
|
|
|
global _extractTagsAndParams_n
|
|
marker = self.uniq_prefix + u'-' + element + u'-' + (u"%08X" % _extractTagsAndParams_n) + u'-QINU'
|
|
_extractTagsAndParams_n += 1
|
|
stripped += marker
|
|
|
|
if close == u'/>':
|
|
# empty element tag, <tag />
|
|
content = None
|
|
text = inside
|
|
tail = None
|
|
else:
|
|
if element == u'!--':
|
|
end = _endCommentPat
|
|
else:
|
|
if element not in _endRegexHash:
|
|
_endRegexHash[element] = re.compile(ur'(</' + element + ur'\s*>)', re.UNICODE | re.IGNORECASE)
|
|
end = _endRegexHash[element]
|
|
q = end.split(inside, 1)
|
|
content = q[0]
|
|
if len(q) < 3:
|
|
# no end tag
|
|
tail = ''
|
|
text = ''
|
|
else:
|
|
tail = q[1]
|
|
text = q[2]
|
|
|
|
matches[marker] = (
|
|
element,
|
|
content,
|
|
self.decodeTagAttributes(attributes),
|
|
u"<" + element + attributes + close + content + tail
|
|
)
|
|
return stripped
|
|
|
|
def fixtags(self, text):
|
|
"""Clean up special characters, only run once, next-to-last before doBlockLevels"""
|
|
# french spaces, last one Guillemet-left
|
|
# only if there is something before the space
|
|
text = _guillemetLeftPat.sub(ur'\1 \2', text)
|
|
# french spaces, Guillemet-right
|
|
text = _guillemetRightPat.sub(ur'\1 ', text)
|
|
return text
|
|
|
|
def closeParagraph(self, mLastSection):
    """
    Return the closing tag (plus newline) for the section named by
    ``mLastSection``, or an empty string when no section is open.
    Used by doBlockLevels().
    """
    if mLastSection == u'':
        return u''
    return u'</' + mLastSection + u'>\n'
|
|
|
|
def getCommon(self, st1, st2):
    """
    Return the length of the common prefix of ``st1`` and ``st2``, i.e.
    the number of leading positions at which both hold the same character.
    """
    count = 0
    for a, b in zip(st1, st2):
        if a != b:
            break
        count += 1
    return count
|
|
|
|
def openList(self, char, mLastSection):
    """
    Open the list element that belongs to the prefix character ``char``.

    openList(), nextItem() and closeList() respectively open, continue
    and close the list element appropriate to the prefix character they
    are given.

    Returns ``(html, mDTopen)``: the markup closing the current paragraph
    followed by the list opener, and a flag that is True only when a
    ``<dt>`` was opened (prefix ``;``).
    """
    openers = {
        u'*': u'<ul><li>',
        u'#': u'<ol><li>',
        u':': u'<dl><dd>',
        u';': u'<dl><dt>',
    }
    html = self.closeParagraph(mLastSection) + openers.get(char, u'<!-- ERR 1 -->')
    return html, char == u';'
|
|
|
|
def nextItem(self, char, mDTopen):
    """
    Return the markup that ends the current list item and starts the next
    one for prefix character ``char``.

    ``mDTopen`` tells whether a ``<dt>`` is currently open.

    Always returns a pair ``(html, new_mDTopen)``.  ``new_mDTopen`` is
    None when the flag should be left as it is (``*`` and ``#`` items,
    and unknown prefixes), True after opening a ``<dt>`` and False after
    opening a ``<dd>``.
    """
    if char == u'*' or char == u'#':
        return u'</li><li>', None
    if char == u':' or char == u';':
        close = u'</dd>'
        if mDTopen:
            close = u'</dt>'
        if char == u';':
            return close + u'<dt>', True
        return close + u'<dd>', False
    # Unknown prefix: still hand back a pair, because callers unpack it
    return u'<!-- ERR 2 -->', None
|
|
|
|
def closeList(self, char, mDTopen):
    """
    Return the markup that closes the list opened for prefix character
    ``char``.  For ``:`` the item closed is ``</dt>`` when ``mDTopen`` is
    true and ``</dd>`` otherwise.  Any other character than ``*``, ``#``
    or ``:`` yields an error comment.
    """
    if char == u':':
        if mDTopen:
            return u'</dt></dl>\n'
        return u'</dd></dl>\n'
    closers = {
        u'*': u'</li></ul>\n',
        u'#': u'</li></ol>\n',
    }
    return closers.get(char, u'<!-- ERR 3 -->')
|
|
|
|
def findColonNoLinks(self, text, before, after):
    """
    Find the first ':' in ``text`` that is not inside an HTML tag, an HTML
    comment or an element that is still open.

    ``before`` and ``after`` are ignored; they are kept only so existing
    call sites keep working.

    Returns ``(text_before_colon, text_after_colon, colon_index)`` when
    such a colon exists, otherwise ``False``.
    """
    pos = text.find(u':')
    if pos == -1:
        # Nothing to find
        return False

    lt = text.find(u'<')
    if lt == -1 or lt > pos:
        # Easy; no tag nesting to worry about
        return text[:pos], text[pos + 1:], pos

    # Ugly state machine to walk through avoiding tags.
    state = MW_COLON_STATE_TEXT
    stack = 0  # number of elements opened and not yet closed
    length = len(text)
    i = 0
    while i < length:
        c = text[i]

        if state == MW_COLON_STATE_TEXT:
            if c == u'<':
                # Could be either a <start> tag or an </end> tag
                state = MW_COLON_STATE_TAGSTART
            elif c == u':':
                if stack == 0:
                    # we found it
                    return text[:i], text[i + 1:], i
                # otherwise the colon is nested in an element: keep going
            else:
                # Skip ahead looking for something interesting
                colon = text.find(u':', i)
                if colon == -1:
                    # No colon left at all
                    return False
                lt = text.find(u'<', i)
                if stack == 0 and (lt == -1 or colon < lt):
                    # we found it
                    return text[:colon], text[colon + 1:], colon
                if lt == -1:
                    # Nested, but no tags left that could close anything
                    break
                # Skip ahead to next tag start
                i = lt
                state = MW_COLON_STATE_TAGSTART
        elif state == MW_COLON_STATE_TAG:
            # In a <tag>
            if c == u'>':
                stack += 1
                state = MW_COLON_STATE_TEXT
            elif c == u'/':
                # Slash may be followed by '>'
                state = MW_COLON_STATE_TAGSLASH
        elif state == MW_COLON_STATE_TAGSTART:
            if c == u'/':
                state = MW_COLON_STATE_CLOSETAG
            elif c == u'!':
                state = MW_COLON_STATE_COMMENT
            elif c == u'>':
                # Illegal early close? This shouldn't happen D:
                state = MW_COLON_STATE_TEXT
            else:
                state = MW_COLON_STATE_TAG
        elif state == MW_COLON_STATE_CLOSETAG:
            # In a </tag>
            if c == u'>':
                stack -= 1
                if stack < 0:
                    # More close tags than open tags
                    return False
                state = MW_COLON_STATE_TEXT
        elif state == MW_COLON_STATE_TAGSLASH:
            if c == u'>':
                # Yes, a self-closed tag <blah/>
                state = MW_COLON_STATE_TEXT
            else:
                # Probably we're jumping the gun, and this is an attribute
                state = MW_COLON_STATE_TAG
        elif state == MW_COLON_STATE_COMMENT:
            if c == u'-':
                state = MW_COLON_STATE_COMMENTDASH
        elif state == MW_COLON_STATE_COMMENTDASH:
            if c == u'-':
                state = MW_COLON_STATE_COMMENTDASHDASH
            else:
                state = MW_COLON_STATE_COMMENT
        elif state == MW_COLON_STATE_COMMENTDASHDASH:
            if c == u'>':
                state = MW_COLON_STATE_TEXT
            else:
                state = MW_COLON_STATE_COMMENT
        else:
            raise RuntimeError("State machine error in findColonNoLinks")

        i += 1

    # Ran off the end without finding a usable colon
    return False

def _splitDefinitionTerm(self, t, mDTopen, output):
    """
    Handle the ``; term : definition`` shorthand for a line whose ``<dt>``
    has just been opened.

    When ``t`` contains a usable colon (see findColonNoLinks), the term
    and the markup switching from ``<dt>`` to ``<dd>`` are appended to
    ``output``.

    Returns ``(remaining_text, mDTopen)``; both are the values passed in
    when there is nothing to split.
    """
    z = self.findColonNoLinks(t, u'', u'')
    if z is False:
        return t, mDTopen
    term, definition = z[0:2]
    output.append(term)
    tmpOutput, tmpMDTopen = self.nextItem(u':', mDTopen)
    output.append(tmpOutput)
    if tmpMDTopen is not None:
        mDTopen = tmpMDTopen
    return definition, mDTopen

def doBlockLevels(self, text, linestart):
    """
    Go through ``text`` line by line and produce the block-level HTML:
    paragraphs, ``<pre>`` blocks for lines starting with a space, and
    lists for lines starting with ``*``, ``#``, ``:`` or ``;``.

    When ``linestart`` is false the first line of ``text`` is skipped.

    Returns the resulting text.
    """
    lastPrefix = u''
    mDTopen = inBlockElem = mInPre = False
    prefixLength = 0
    pref2 = u''
    # False, or the paragraph markup held back until a non-blank line
    paragraphStack = False
    mLastSection = u''
    # Compiled per call because the pattern embeds this parser's uniq_prefix
    _closeMatchPat = re.compile(u"(</table|</blockquote|</h1|</h2|</h3|</h4|</h5|</h6|<td|<th|<div|</div|<hr|</pre|</p|" + self.uniq_prefix + u"-pre|</li|</ul|</ol|<center)", re.UNICODE | re.IGNORECASE)
    output = []

    lines = text.split('\n')
    if not linestart:
        lines = lines[1:]

    for oLine in lines:
        lastPrefixLength = len(lastPrefix)
        preCloseMatch = _closePrePat.search(oLine)
        preOpenMatch = _openPrePat.search(oLine)
        if not mInPre:
            # Length of the leading run of list prefix characters
            prefixLength = len(oLine) - len(oLine.lstrip(u'*#:;'))
            pref = oLine[0:prefixLength]

            # ';' and ':' open the same kind of list, so compare prefixes
            # with ';' folded into ':'
            pref2 = pref.replace(u';', u':')
            t = oLine[prefixLength:]
            mInPre = bool(preOpenMatch)
        else:
            # Don't interpret any other prefixes in preformatted text
            prefixLength = 0
            pref = pref2 = u''
            t = oLine

        # List generation
        if prefixLength and lastPrefix == pref2:
            # Same as the last item, so no need to deal with nesting or opening stuff
            tmpOutput, tmpMDTopen = self.nextItem(pref[-1:], mDTopen)
            output.append(tmpOutput)
            if tmpMDTopen is not None:
                mDTopen = tmpMDTopen
            paragraphStack = False

            if pref[-1:] == u';':
                # The one nasty exception: definition lists work like this:
                # ; title : definition text
                # So we check for : in the remainder text to split up the
                # title and definition, without b0rking links.
                t, mDTopen = self._splitDefinitionTerm(t, mDTopen, output)

        elif prefixLength or lastPrefixLength:
            # Either open or close a level...
            commonPrefixLength = self.getCommon(pref, lastPrefix)
            paragraphStack = False
            while commonPrefixLength < lastPrefixLength:
                output.append(self.closeList(lastPrefix[lastPrefixLength - 1], mDTopen))
                mDTopen = False
                lastPrefixLength -= 1
            if prefixLength <= commonPrefixLength and commonPrefixLength > 0:
                tmpOutput, tmpMDTopen = self.nextItem(pref[commonPrefixLength - 1], mDTopen)
                output.append(tmpOutput)
                if tmpMDTopen is not None:
                    mDTopen = tmpMDTopen

            while prefixLength > commonPrefixLength:
                char = pref[commonPrefixLength:commonPrefixLength + 1]
                tmpOutput, tmpMDTOpen = self.openList(char, mLastSection)
                if tmpMDTOpen:
                    mDTopen = True
                output.append(tmpOutput)
                mLastSection = u''
                mInPre = False

                if char == u';':
                    t, mDTopen = self._splitDefinitionTerm(t, mDTopen, output)

                commonPrefixLength += 1

            lastPrefix = pref2

        if prefixLength == 0:
            # No prefix (not in list)--go to paragraph mode
            # XXX: use a stack for nestable elements like span, table and div
            openmatch = _openMatchPat.search(t)
            closematch = _closeMatchPat.search(t)
            if openmatch or closematch:
                paragraphStack = False
                output.append(self.closeParagraph(mLastSection))
                mLastSection = u''
                if preCloseMatch:
                    mInPre = False
                if preOpenMatch:
                    mInPre = True
                inBlockElem = bool(not closematch)
            elif not inBlockElem and not mInPre:
                if t[0:1] == u' ' and (mLastSection == u'pre' or t.strip() != u''):
                    # pre
                    if mLastSection != u'pre':
                        paragraphStack = False
                        output.append(self.closeParagraph(u'') + u'<pre>')
                        mInPre = False
                        mLastSection = u'pre'
                    t = t[1:]
                else:
                    # paragraph
                    if t.strip() == u'':
                        if paragraphStack:
                            output.append(paragraphStack + u'<br />')
                            paragraphStack = False
                            mLastSection = u'p'
                        else:
                            if mLastSection != u'p':
                                output.append(self.closeParagraph(mLastSection))
                                mLastSection = u''
                                mInPre = False
                                paragraphStack = u'<p>'
                            else:
                                paragraphStack = u'</p><p>'
                    else:
                        if paragraphStack:
                            output.append(paragraphStack)
                            paragraphStack = False
                            mLastSection = u'p'
                        elif mLastSection != u'p':
                            output.append(self.closeParagraph(mLastSection) + u'<p>')
                            mLastSection = u'p'
                            mInPre = False

        # somewhere above we forget to get out of pre block (bug 785)
        if preCloseMatch and mInPre:
            mInPre = False

        # A blank line that only queued paragraph markup emits nothing yet
        if paragraphStack is False:
            output.append(t + u"\n")

    # Close whatever lists are still open after the last line
    while prefixLength:
        output.append(self.closeList(pref2[prefixLength - 1], mDTopen))
        mDTopen = False
        prefixLength -= 1

    if mLastSection != u'':
        output.append(u'</' + mLastSection + u'>')
        mLastSection = u''

    return ''.join(output)
|
|
|
|
class Parser(BaseParser):
    """
    Full parser: on top of BaseParser it handles wiki tables, headings and
    the table of contents.
    """

    def __init__(self, show_toc=True):
        """``show_toc`` says whether a table of contents may be generated."""
        super(Parser, self).__init__()
        self.show_toc = show_toc

    def parse(self, text):
        """
        Convert wiki markup ``text`` to HTML.

        If ``text`` is a byte string the result is returned UTF-8 encoded,
        otherwise it is returned as unicode.
        """
        utf8 = isinstance(text, str)
        text = to_unicode(text)

        # The passes below work on newline-terminated text; remember
        # whether the terminator was added here so it can be removed again.
        taggedNewline = text[-1:] != u'\n'
        if taggedNewline:
            text = text + u'\n'

        text = self.strip(text)
        text = self.removeHtmlTags(text)
        text = self.doTableStuff(text)
        text = self.parseHorizontalRule(text)
        text = self.checkTOC(text)
        text = self.parseHeaders(text)
        text = self.parseAllQuotes(text)
        text = self.replaceExternalLinks(text)
        text = self.formatHeadings(text, True)
        text = self.unstrip(text)
        text = self.fixtags(text)
        text = self.doBlockLevels(text, True)
        text = self.unstripNoWiki(text)

        if taggedNewline and text[-1:] == u'\n':
            text = text[:-1]
        if utf8:
            return text.encode("utf-8")
        return text

    def checkTOC(self, text):
        """
        Process the ``__NOTOC__`` and ``__TOC__`` magic words.

        ``__NOTOC__`` is removed and switches the table of contents off;
        ``__TOC__`` is replaced by the ``<!--MWTOC-->`` placeholder and
        switches it on.  ``__TOC__`` is handled last, so it wins when both
        are present.  Returns the modified text.
        """
        if u"__NOTOC__" in text:
            text = text.replace(u"__NOTOC__", u"")
            self.show_toc = False
        if u"__TOC__" in text:
            text = text.replace(u"__TOC__", u"<!--MWTOC-->")
            self.show_toc = True
        return text

    def doTableStuff(self, text):
        """
        Translate wiki table markup (``{|``, ``|-``, ``|``, ``!``, ``|+``,
        ``|}``) in ``text`` into HTML table markup, line by line.

        Lines outside a table are left as they are.  Tables still open at
        the end of the text are closed.  Returns the converted text.
        """
        t = text.split(u"\n")
        # One entry per currently open (possibly nested) table:
        td = [] # Is currently a td tag open?
        ltd = [] # Was it TD or TH?
        tr = [] # Is currently a tr tag open?
        ltr = [] # tr attributes
        has_opened_tr = [] # Did this table open a <tr> element?
        indent_level = 0 # indent level of the table

        for k, x in enumerate(t):
            x = x.strip()
            fc = x[0:1]
            matches = _zomgPat.match(x)
            if matches:
                # Table start
                indent_level = len(matches.group(1))

                attributes = self.unstripForHTML(matches.group(2))

                t[k] = u'<dl><dd>'*indent_level + u'<table' + self.fixTagAttributes(attributes, u'table') + u'>'
                td.append(False)
                ltd.append(u'')
                tr.append(False)
                ltr.append(u'')
                has_opened_tr.append(False)
            elif len(td) == 0:
                # Not inside a table: nothing to do
                pass
            elif u'|}' == x[0:2]:
                # Table end
                z = u"</table>" + x[2:]
                l = ltd.pop()
                if not has_opened_tr.pop():
                    # A table needs at least one row with one cell
                    z = u"<tr><td></td></tr>" + z
                if tr.pop():
                    z = u"</tr>" + z
                if td.pop():
                    z = u'</' + l + u'>' + z
                ltr.pop()
                t[k] = z + u'</dd></dl>'*indent_level
            elif u'|-' == x[0:2]: # Allows for |-------------
                # Row separator; whatever follows the dashes are the
                # attributes of the next row
                x = x[1:]
                while x != u'' and x[0:1] == '-':
                    x = x[1:]
                z = ''
                l = ltd.pop()
                has_opened_tr.pop()
                has_opened_tr.append(True)
                if tr.pop():
                    z = u'</tr>' + z
                if td.pop():
                    z = u'</' + l + u'>' + z
                ltr.pop()
                t[k] = z
                tr.append(False)
                td.append(False)
                ltd.append(u'')
                attributes = self.unstripForHTML(x)
                ltr.append(self.fixTagAttributes(attributes, u'tr'))
            elif u'|' == fc or u'!' == fc or u'|+' == x[0:2]: # Caption
                # x is a table row
                if u'|+' == x[0:2]:
                    fc = u'+'
                    x = x[1:]
                x = x[1:]
                if fc == u'!':
                    x = x.replace(u'!!', u'||')
                # Split up multiple cells on the same line.
                # FIXME: This can result in improper nesting of tags processed
                # by earlier parser steps, but should avoid splitting up eg
                # attribute values containing literal "||".
                x = x.split(u'||')

                t[k] = u''

                # Loop through each table cell
                for theline in x:
                    z = ''
                    if fc != u'+':
                        tra = ltr.pop()
                        if not tr.pop():
                            z = u'<tr' + tra + u'>\n'
                        tr.append(True)
                        ltr.append(u'')
                        has_opened_tr.pop()
                        has_opened_tr.append(True)
                    l = ltd.pop()
                    if td.pop():
                        z = u'</' + l + u'>' + z
                    if fc == u'|':
                        l = u'td'
                    elif fc == u'!':
                        l = u'th'
                    elif fc == u'+':
                        l = u'caption'
                    else:
                        l = u''
                    ltd.append(l)

                    #Cell parameters
                    y = theline.split(u'|', 1)
                    # Note that a '|' inside an invalid link should not
                    # be mistaken as delimiting cell parameters
                    if y[0].find(u'[[') != -1:
                        y = [theline]

                    if len(y) == 1:
                        y = z + u"<" + l + u">" + y[0]
                    else:
                        attributes = self.unstripForHTML(y[0])
                        y = z + u"<" + l + self.fixTagAttributes(attributes, l) + u">" + y[1]

                    t[k] += y
                    td.append(True)

        # Closing open td, tr && table
        while len(td) > 0:
            l = ltd.pop()
            if td.pop():
                # Close the cell with the tag that opened it (td/th/caption)
                t.append(u'</' + l + u'>')
            if tr.pop():
                t.append(u'</tr>')
            if not has_opened_tr.pop():
                t.append(u'<tr><td></td></tr>')
            t.append(u'</table>')

        text = u'\n'.join(t)
        # special case: don't return empty table
        if text == u"<table>\n<tr><td></td></tr>\n</table>":
            text = u''

        return text

    def formatHeadings(self, text, isMain):
        """
        Give every heading an anchor and build the table of contents.

        All headings matched by ``_headerPat`` are collected, rebuilt as
        ``<hN id="w_ANCHOR">`` elements and re-inserted.  When a TOC is
        wanted it replaces the ``<!--MWTOC-->`` placeholder if there is
        one; otherwise, and only when ``isMain`` is true, it is inserted
        before the first heading.

        The magic words ``__NOEDITSECTION__``, ``__NEWSECTIONLINK__`` and
        ``__FORCETOC__`` are removed from the text; only ``__FORCETOC__``
        has an effect (it forces the TOC on).

        NOTE: heading numbering is switched off (``doNumberHeadings``);
        section edit links and link-placeholder expansion are not
        implemented.

        Returns the text with formatted headings.
        """
        doNumberHeadings = False

        text = text.replace(u"__NOEDITSECTION__", u"")

        # Get all headlines for numbering them and adding anchors - this
        # is for later, but we need the number of headlines right now
        matches = _headerPat.findall(text)
        numMatches = len(matches)

        # if there are fewer than 4 headlines in the article, do not show TOC
        # unless it's been explicitly enabled.
        enoughToc = self.show_toc and (numMatches >= 4 or text.find(u"<!--MWTOC-->") != -1)

        text = text.replace(u"__NEWSECTIONLINK__", u"")

        # if the string __FORCETOC__ occurs in the HTML,
        # override above conditions and always show TOC above first header
        if text.find(u"__FORCETOC__") != -1:
            self.show_toc = True
            enoughToc = True
            text = text.replace(u"__FORCETOC__", u"")
        # Never ever show TOC if no headers
        if numMatches < 1:
            enoughToc = False

        # headline counter
        headlineCount = 0

        # The TOC should have neat indentation levels; these are
        # determined here
        toc = []
        head = {}           # headline index -> list of markup pieces
        sublevelCount = {}  # toc level -> number of headlines at that level
        levelCount = {}     # toc level -> heading level last seen there
        toclevel = 0
        level = 0
        prevlevel = 0
        prevtoclevel = 0
        refers = {}         # canonized headline -> number of occurrences
        wgMaxTocLevel = 5

        for match in matches:
            headline = match[2]
            numbering = []

            # Drop any template-section marker from the headline.  The
            # values encoded in the marker are only needed for section
            # edit links, which are not implemented here.
            headline = _templateSectionPat.sub(u'', headline)

            if toclevel:
                prevlevel = level
                prevtoclevel = toclevel

            # NOTE(review): this is the text of the first group of
            # _headerPat, presumably a single digit - the comparisons
            # below rely on that; confirm against the pattern.
            level = match[0]

            if doNumberHeadings or enoughToc:
                if level > prevlevel:
                    # Increase TOC level
                    toclevel += 1
                    sublevelCount[toclevel] = 0
                    if toclevel < wgMaxTocLevel:
                        toc.append(u'\n<ul>')
                elif level < prevlevel and toclevel > 1:
                    # Decrease TOC level, find level to jump to

                    if toclevel == 2 and level < levelCount[1]:
                        toclevel = 1
                    else:
                        for i in range(toclevel, 0, -1):
                            if levelCount[i] == level:
                                # Found last matching level
                                toclevel = i
                                break
                            elif levelCount[i] < level:
                                toclevel = i + 1
                                break
                    if toclevel < wgMaxTocLevel:
                        toc.append(u"</li>\n")
                        toc.append(u"</ul>\n</li>\n" * max(prevtoclevel - toclevel, 0))
                else:
                    # No change in level, end TOC line
                    if toclevel < wgMaxTocLevel:
                        toc.append(u"</li>\n")

                levelCount[toclevel] = level

                # count number of headlines for each level
                sublevelCount[toclevel] += 1
                for i in range(1, toclevel+1):
                    if sublevelCount[i]:
                        numbering.append(to_unicode(sublevelCount[i]))

            # The canonized header is a version of the header text safe to use for links
            # Avoid insertion of weird stuff like <math> by expanding the relevant sections
            canonized_headline = self.unstrip(headline)
            canonized_headline = self.unstripNoWiki(canonized_headline)

            # strip out HTML
            canonized_headline = _tagPat.sub(u'', canonized_headline)
            tocline = canonized_headline.strip()
            canonized_headline = self.escapeId(tocline)

            # count occurrences so we can track dupes in anchors
            refers[canonized_headline] = refers.get(canonized_headline, 0) + 1
            occurrence = refers[canonized_headline]

            numbering = '.'.join(numbering)

            # Don't number the heading if it is the only one (looks silly)
            if doNumberHeadings and numMatches > 1:
                # the two are different if the line contains a link
                headline = numbering + u' ' + headline

            # Create the anchor for linking from the TOC to the section
            anchor = canonized_headline
            if occurrence > 1:
                anchor += u'_' + to_unicode(occurrence)

            if enoughToc:
                toc.append(u'\n<li class="toclevel-')
                toc.append(to_unicode(toclevel))
                toc.append(u'"><a href="#w_')
                toc.append(anchor)
                toc.append(u'"><span class="tocnumber">')
                toc.append(numbering)
                toc.append(u'</span> <span class="toctext">')
                toc.append(tocline)
                toc.append(u'</span></a>')

            # give headline the correct <h#> tag
            h = head.setdefault(headlineCount, [])
            h.append(u'<h')
            h.append(to_unicode(level))
            h.append(u' id="w_')
            h.append(anchor)
            h.append('">')
            h.append(match[1].strip())
            h.append(headline.strip())
            h.append(u'</h')
            h.append(to_unicode(level))
            h.append(u'>')

            headlineCount += 1

        if enoughToc:
            if toclevel < wgMaxTocLevel:
                toc.append(u"</li>\n")
                toc.append(u"</ul>\n</li>\n" * max(0, toclevel - 1))
            #TODO: use gettext
            toc.insert(0, u'<div id="toc"><h2>Table of Contents</h2>')
            toc.append(u'</ul>\n</div>')

        # split up and insert constructed headlines

        # The blocks are consumed in steps of four: one piece of text
        # between headings followed by three pieces belonging to the
        # heading, which are replaced by head[j].
        blocks = _headerPat.split(text)

        forceTocPosition = text.find(u"<!--MWTOC-->")
        full = []
        for j, i in enumerate(range(0, len(blocks), 4)):
            full.append(blocks[i])
            if enoughToc and i == 0 and isMain and forceTocPosition == -1:
                # No explicit position: TOC goes before the first heading
                full += toc
            if head.get(j):
                full += head[j]
        full = u''.join(full)
        if forceTocPosition != -1:
            return full.replace(u"<!--MWTOC-->", u''.join(toc), 1)
        else:
            return full
|
|
|
|
def parse(text, showToc=True):
    """
    Return the HTML for the MediaWiki markup in ``text``.

    A new ``Parser`` is created for the call; ``showToc`` is passed to it
    as ``show_toc``.
    """
    return Parser(show_toc=showToc).parse(text)
|
|
|
|
def parselite(text):
    """
    Return the HTML for the MediaWiki markup in ``text`` using a plain
    ``BaseParser`` instead of the full ``Parser`` (no heading handling).
    """
    return BaseParser().parse(text)
|
|
|
|
def truncate_url(url, length=40):
    """
    Shorten ``url`` for display by inserting an ellipsis.

    URLs of at most ``length`` characters, and URLs without a final
    ``/segment`` part, are returned unchanged.  Otherwise the URL is split
    into the part before the last path segment and that segment:

    * if the leading part already fits in ``length - 3`` characters, the
      last segment is cut to ``length - 3`` characters and followed by
      ``...``;
    * otherwise the leading part is cut to ``length - 3`` characters and
      ``...`` is put between it and the complete last segment.

    The result can therefore still be longer than ``length``.
    """
    if len(url) <= length:
        return url

    found = re.search(r'(/[^/]+/?)$', url)
    if not found:
        return url

    last_segment = found.group(1)
    leading = url[0:len(url) - len(last_segment)]
    keep = length - 3

    if leading == leading[0:keep]:
        return leading + last_segment[0:keep] + '...'
    return leading[0:keep] + '...' + last_segment
|
|
|
|
def to_unicode(text, charset=None):
    """Convert ``text`` to a `unicode` object.

    For a `str`:

    * if ``charset`` is given, the text is decoded with that encoding in
      "replace" mode, so decoding always succeeds;
    * otherwise UTF-8 is tried first and, if that fails, the locale's
      preferred encoding is used in "replace" mode.

    This differs from the `unicode` builtin, which by default decodes in
    'strict' mode and is therefore prone to raise `UnicodeDecodeError`.
    Because of the "replace" mode the original content might be altered;
    if that is not wanted, decode with an encoding that maps every byte
    to a character instead, e.g. `unicode(text, 'iso-8859-1')`.

    For an exception, `unicode(text)` is tried first; if that raises
    `UnicodeError` the exception's arguments are converted one by one and
    joined with spaces.  Any other object is passed to `unicode()`.
    """
    if isinstance(text, str):
        if charset:
            return unicode(text, charset, 'replace')
        try:
            return unicode(text, 'utf-8')
        except UnicodeError:
            return unicode(text, locale.getpreferredencoding(), 'replace')

    if isinstance(text, Exception):
        # two possibilities for storing unicode strings in exception data:
        try:
            # custom __str__ method on the exception (e.g. PermissionError)
            return unicode(text)
        except UnicodeError:
            # unicode arguments given to the exception (e.g. parse_date)
            return ' '.join([to_unicode(arg) for arg in text.args])

    return unicode(text)
|
|
|
|
# tag hooks
# Module-level dictionary for the tag hooks; it starts out empty.
# NOTE(review): presumably filled through registerTagHook() - confirm.
mTagHooks = {}
|
|
|
|
## IMPORTANT
|
|
## Make sure all hooks output CLEAN html. Escape any user input BEFORE it's returned
|
|
|
|
# Arguments passed:
|
|
# - wiki environment instance
|
|
# - tag content
|
|
# - dictionary of attributes
|
|
|
|
# quote example:
|
|
# <quote cite="person">quote</quote>
|
|
from cgi import escape
|
|
|
|
def hook_quote(env, body, attributes=None):
    """
    Tag hook rendering ``<quote cite="person">quote</quote>``.

    ``env`` is not used.  ``body`` is the tag content; it is stripped of
    surrounding whitespace and inserted as it is.  ``attributes`` is the
    dictionary of tag attributes; when it has a ``cite`` entry, an escaped
    "... wrote:" line is put before the body.

    Returns the HTML as a unicode string.
    """
    if attributes is None:
        # Avoid a shared mutable default argument
        attributes = {}
    text = [u'<div class="blockquote">']
    if 'cite' in attributes:
        text.append(u"<strong class=\"cite\">%s wrote:</strong>\n" % escape(attributes['cite']))
    text.append(body.strip())
    text.append(u'</div>')
    return u'\n'.join(text)
|
|
# Register hook_quote as the hook for the 'quote' tag.
registerTagHook('quote', hook_quote)
|
|
|
|
def safe_name(name=None, remove_slashes=True):
    """
    Build a lower-case, dash-separated identifier from ``name``.

    ``name`` is first passed through ``str2url``.  Then every character
    that is not an ASCII letter, digit, ``-``, ``_``, ``.`` or whitespace
    is removed (``/`` is kept as well when ``remove_slashes`` is false),
    whitespace, dots and underscores become dashes, runs of dashes are
    collapsed, and leading/trailing dashes are stripped.

    Returns None when ``name`` is None.
    """
    if name is None:
        return None

    if remove_slashes:
        disallowed = r"[^a-zA-Z0-9\-_\s\.]"
    else:
        disallowed = r"[^a-zA-Z0-9\-_\s\.\/]"

    slug = re.sub(disallowed, "", str2url(name))
    slug = re.sub(r"[\s\._]", "-", slug)
    slug = re.sub(r"[-]+", "-", slug)
    return slug.strip("-").lower()
|
|
|
|
def str2url(str):
    """
    Takes a UTF-8 string and replaces all characters with the equivalent in 7-bit
    ASCII. It returns a plain ASCII string usable in URLs.

    The replacement table is given as two parallel string literals:
    element N of ``mfrom`` is replaced by element N of ``to``.

    NOTE(review): the literals below are plain (non-``u``) strings, so
    what counts as one "element" depends on how the file's declared
    source encoding turns them into bytes - confirm that ``mfrom`` and
    ``to`` really line up element by element.
    """
    # Try to get a UTF-8 encoded version of the input; if encoding fails
    # for any reason the input is used as it is.
    # NOTE(review): the bare ``except`` hides every kind of error here.
    try:
        str = str.encode('utf-8')
    except:
        pass
    # Accented/extended letters and, position by position, their ASCII
    # replacements.
    mfrom = "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîï"
    to = "AAAAAAECEEEEIIIIDNOOOOOOUUUUYSaaaaaaaceeeeiiii"
    mfrom += "ñòóôõöøùúûüýÿĀāĂ㥹ĆćĈĉĊċČčĎďĐđĒēĔĕĖėĘęĚěĜĝĞğĠġĢģ"
    to += "noooooouuuuyyaaaaaaccccccccddddeeeeeeeeeegggggggg"
    mfrom += "ĤĥĦħĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłŃńŅņŇňʼnŊŋŌōŎŏŐőŒœŔŕŖŗŘř"
    to += "hhhhiiiiiiiiiijjkkkllllllllllnnnnnnnnnoooooooorrrrrr"
    mfrom += "ŚśŜŝŞşŠšŢţŤťŦŧŨũŪūŬŭŮůŰűŲųŴŵŶŷŸŹźŻżŽžſƀƂƃƄƅƇƈƉƊƐƑƒƓƔ"
    to += "ssssssssttttttuuuuuuuuuuuuwwyyyzzzzzzfbbbbbccddeffgv"
    mfrom += "ƖƗƘƙƚƝƞƟƠƤƦƫƬƭƮƯưƱƲƳƴƵƶǍǎǏǐǑǒǓǔǕǖǗǘǙǚǛǜǝǞǟǠǡǢǣǤǥǦǧǨǩ"
    to += "likklnnoopettttuuuuyyzzaaiioouuuuuuuuuueaaaaeeggggkk"
    mfrom += "ǪǫǬǭǰǴǵǷǸǹǺǻǼǽǾǿȀȁȂȃȄȅȆȇȈȉȊȋȌȍȎȏȐȑȒȓȔȕȖȗȘșȚțȞȟȤȥȦȧȨȩ"
    to += "oooojggpnnaaeeooaaaaeeeeiiiioooorrrruuuusstthhzzaaee"
    mfrom += "ȪȫȬȭȮȯȰȱȲȳḀḁḂḃḄḅḆḇḈḉḊḋḌḍḎḏḐḑḒḓḔḕḖḗḘḙḚḛḜḝḞḟḠḡḢḣḤḥḦḧḨḩḪḫ"
    to += "ooooooooyyaabbbbbbccddddddddddeeeeeeeeeeffgghhhhhhhhhh"
    mfrom += "ḬḭḮḯḰḱḲḳḴḵḶḷḸḹḺḻḼḽḾḿṀṁṂṃṄṅṆṇṈṉṊṋṌṍṎṏṐṑṒṓṔṕṖṗṘṙṚṛṜṝṞṟ"
    to += "iiiikkkkkkllllllllmmmmmmnnnnnnnnoooooooopppprrrrrrrr"
    mfrom += "ṠṡṢṣṤṥṦṧṨṩṪṫṬṭṮṯṰṱṲṳṴṵṶṷṸṹṺṻṼṽṾṿẀẁẂẃẄẅẆẇẈẉẊẋẌẍẎẏẐẑẒẓẔẕ"
    to += "ssssssssssttttttttuuuuuuuuuuvvvvwwwwwwwwwwxxxxxyzzzzzz"
    mfrom += "ẖẗẘẙẚẛẠạẢảẤấẦầẨẩẪẫẬậẮắẰằẲẳẴẵẶặẸẹẺẻẼẽẾếỀềỂểỄễỆệỈỉỊị"
    to += "htwyafaaaaaaaaaaaaaaaaaaaaaaaaeeeeeeeeeeeeeeeeiiii"
    mfrom += "ỌọỎỏỐốỒồỔổỖỗỘộỚớỜờỞởỠỡỢợỤụỦủỨứỪừỬửỮữỰựỲỳỴỵỶỷỸỹ"
    to += "oooooooooooooooooooooooouuuuuuuuuuuuuuyyyyyyyy"
    # zip() stops at the shorter of the two tables; each pair is applied
    # to the whole string in turn.
    for i in zip(mfrom, to):
        str = str.replace(*i)
    return str
|
|
|
|
# vim:expandtab:smartindent:tabstop=4:softtabstop=4:shiftwidth=4:
|