""" MediaWiki-style markup parse(text) -- returns safe-html from wiki markup code based off of mediawiki """ import re, random, math, locale from base64 import b64encode, b64decode mTagHooks = {} def registerTagHook(tag, function): mTagHooks[tag] = function def removeHtmlComments(text): """remove comments from given text""" sb = [] start = text.find(u'', start) if end == -1: break end += 3 spaceStart = max(0, start-1) spaceEnd = end while text[spaceStart] == u' ' and spaceStart > 0: spaceStart -= 1 while text[spaceEnd] == u' ': spaceEnd += 1 if text[spaceStart] == u'\n' and text[spaceEnd] == u'\n': sb.append(text[last:spaceStart]) sb.append(u'\n') last = spaceEnd+1 else: sb.append(text[last:spaceStart+1]) last = spaceEnd start = text.find(u'") showToc = True return text, showToc _bracketedLinkPat = re.compile(ur'(?:\[((?:https?://|ftp://|/)[^<>\]\[' + u"\x00-\x20\x7f" + ur']+)\s*(.*?)\])', re.UNICODE) def replaceExternalLinks(text): sb = [] bits = _bracketedLinkPat.split(text) l = len(bits) i = 0 num_links = 0 while i < l: if i%3 == 0: sb.append(replaceFreeExternalLinks(bits[i])) i += 1 else: sb.append(u'') if not bits[i+1]: sb.append(u'[') num_links += 1 sb.append(to_unicode(num_links)) sb.append(u']') else: sb.append(bits[i+1]) sb.append(u'') i += 2 return ''.join(sb) _protocolPat = re.compile(ur'(\b(?:https?://|ftp://))', re.UNICODE) _specialUrlPat = re.compile(ur'^([^<>\]\[' + u"\x00-\x20\x7f" + ur']+)(.*)$', re.UNICODE) _protocolsPat = re.compile(ur'^(https?://|ftp://)$', re.UNICODE) def replaceFreeExternalLinks(text): bits = _protocolPat.split(text) sb = [bits.pop(0)] i = 0 l = len(bits) while i < l: protocol = bits[i] remainder = bits[i+1] i += 2 match = _specialUrlPat.match(remainder) if match: # Found some characters after the protocol that look promising url = protocol + match.group(1) trail = match.group(2) # special case: handle urls as url args: # http://www.example.com/foo?=http://www.example.com/bar if len(trail) == 0 and len(bits) > i and _protocolsPat.match(bits[i]): match = _specialUrlPat.match(remainder) if match: url += bits[i] + match.groups(1) i += 2 trail = m[2] # The characters '<' and '>' (which were escaped by # removeHTMLtags()) should not be included in # URLs, per RFC 2396. pos = max(url.find('<'), url.find('>')) if pos != -1: trail = url[pos:] + trail url = url[0:pos] sep = ',;.:!?' if '(' not in url: sep += ')' i = len(url)-1 while i >= 0: char = url[i] if char not in sep: break i -= 1 i += 1 if i != len(url): trail = url[i:] + trail url = url[0:i] url = cleanURL(url) sb.append(u'') sb.append(url) sb.append(u'') sb.append(text) sb.append(trail) else: sb.append(protocol) sb.append(remainder) return ''.join(sb) def urlencode(char): num = ord(char) if num == 32: return '+' return "%%%02x" % num _controlCharsPat = re.compile(ur'[\]\[<>"' + u"\\x00-\\x20\\x7F" + ur']]', re.UNICODE) _hostnamePat = re.compile(ur'^([^:]+:)(//[^/]+)?(.*)$', re.UNICODE) _stripPat = re.compile(u'\\s|\u00ad|\u1806|\u200b|\u2060|\ufeff|\u03f4|\u034f|\u180b|\u180c|\u180d|\u200c|\u200d|[\ufe00-\ufe0f]', re.UNICODE) def cleanURL(url): # Normalize any HTML entities in input. They will be # re-escaped by makeExternalLink(). url = decodeCharReferences(url) # Escape any control characters introduced by the above step url = _controlCharsPat.sub(urlencode, url) # Validate hostname portion match = _hostnamePat.match(url) if match: protocol, host, rest = match.groups() # Characters that will be ignored in IDNs. 
def urlencode(match):
	"""re.sub() callback: percent-encode a single matched character"""
	num = ord(match.group(0))
	if num == 32:
		return '+'
	return "%%%02x" % num

_controlCharsPat = re.compile(ur'[\]\[<>"' + u"\\x00-\\x20\\x7F" + ur']', re.UNICODE)
_hostnamePat = re.compile(ur'^([^:]+:)(//[^/]+)?(.*)$', re.UNICODE)
_stripPat = re.compile(u'\\s|\u00ad|\u1806|\u200b|\u2060|\ufeff|\u03f4|\u034f|\u180b|\u180c|\u180d|\u200c|\u200d|[\ufe00-\ufe0f]', re.UNICODE)

def cleanURL(url):
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	url = decodeCharReferences(url)

	# Escape any control characters introduced by the above step
	url = _controlCharsPat.sub(urlencode, url)

	# Validate hostname portion
	match = _hostnamePat.match(url)
	if match:
		protocol, host, rest = match.groups()
		host = host or u''

		# Characters that will be ignored in IDNs.
		# http://tools.ietf.org/html/3454#section-3.1
		# Strip them before further processing so blacklists and such work.
		host = _stripPat.sub('', host)

		# @fixme: validate hostnames here

		return protocol + host + rest
	else:
		return url
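# Example (illustrative, not in the original module): cleanURL()
# percent-encodes control characters and strips invisible IDN characters
# from the hostname, so a zero-width space smuggled into a host does not
# survive.  decodeCharReferences() is assumed available (it is defined
# elsewhere in the full module) and to be a no-op on this input.
#
#   >>> cleanURL(u'http://exam\u200bple.com/a b')
#   u'http://example.com/a+b'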
_zomgPat = re.compile(ur'^(:*)\{\|(.*)$', re.UNICODE)

def doTableStuff(text, state):
	t = text.split(u"\n")
	td = [] # Is currently a td tag open?
	ltd = [] # Was it TD or TH?
	tr = [] # Is currently a tr tag open?
	ltr = [] # tr attributes
	has_opened_tr = [] # Did this table open a <tr> element?
	indent_level = 0 # indent level of the table

	for k, x in zip(range(len(t)), t):
		x = x.strip()
		fc = x[0:1]
		matches = _zomgPat.match(x)
		if matches:
			indent_level = len(matches.group(1))

			attributes = unstripForHTML(matches.group(2), state)

			t[k] = u'<dl><dd>'*indent_level + u'<table' + fixTagAttributes(attributes, u'table') + u'>'
			td.append(False)
			ltd.append(u'')
			tr.append(False)
			ltr.append(u'')
			has_opened_tr.append(False)
		elif len(td) == 0:
			pass
		elif u'|}' == x[0:2]:
			z = u"</table>" + x[2:]
			l = ltd.pop()
			if not has_opened_tr.pop():
				z = u"<tr><td></td></tr>" + z
			if tr.pop():
				z = u"</tr>" + z
			if td.pop():
				z = u'</' + l + u'>' + z
			ltr.pop()
			t[k] = z + u'</dd></dl>'*indent_level
		elif u'|-' == x[0:2]: # Allows for |-------------
			x = x[1:]
			while x != u'' and x[0:1] == '-':
				x = x[1:]
			z = ''
			l = ltd.pop()
			has_opened_tr.pop()
			has_opened_tr.append(True)
			if tr.pop():
				z = u'</tr>' + z
			if td.pop():
				z = u'</' + l + u'>' + z
			ltr.pop()
			t[k] = z
			tr.append(False)
			td.append(False)
			ltd.append(u'')
			attributes = unstripForHTML(x, state)
			ltr.append(fixTagAttributes(attributes, u'tr'))
		elif u'|' == fc or u'!' == fc or u'|+' == x[0:2]: # Caption
			# x is a table row
			if u'|+' == x[0:2]:
				fc = u'+'
				x = x[1:]
			x = x[1:]
			if fc == u'!':
				x = x.replace(u'!!', u'||')
			# Split up multiple cells on the same line.
			# FIXME: This can result in improper nesting of tags processed
			# by earlier parser steps, but should avoid splitting up eg
			# attribute values containing literal "||".
			x = x.split(u'||')

			t[k] = u''

			# Loop through each table cell
			for theline in x:
				z = ''
				if fc != u'+':
					tra = ltr.pop()
					if not tr.pop():
						z = u'<tr' + tra + u'>\n'
					tr.append(True)
					ltr.append(u'')
					has_opened_tr.pop()
					has_opened_tr.append(True)
				l = ltd.pop()
				if td.pop():
					z = u'</' + l + u'>' + z
				if fc == u'|':
					l = u'td'
				elif fc == u'!':
					l = u'th'
				elif fc == u'+':
					l = u'caption'
				else:
					l = u''
				ltd.append(l)

				# Cell parameters
				y = theline.split(u'|', 1)
				# Note that a '|' inside an invalid link should not
				# be mistaken as delimiting cell parameters
				if y[0].find(u'[[') != -1:
					y = [theline]

				if len(y) == 1:
					y = z + u"<" + l + u">" + y[0]
				else:
					attributes = unstripForHTML(y[0], state)
					y = z + u"<" + l + fixTagAttributes(attributes, l) + u">" + y[1]

				t[k] += y
				td.append(True)

	while len(td) > 0:
		l = ltd.pop()
		if td.pop():
			t.append(u'</td>')
		if tr.pop():
			t.append(u'</tr>')
		if not has_opened_tr.pop():
			t.append(u'<tr><td></td></tr>')
		t.append(u'</table>')

	text = u'\n'.join(t)
	# special case: don't return empty table
	if text == u"<table>\n<tr><td></td></tr>\n</table>":
		text = u''

	return text

def unstripForHTML(text, state):
	text = unstrip(text, state)
	text = unstripNoWiki(text, state)
	return text

def unstrip(text, state):
	if 'general' not in state:
		return text

	general = state['general']
	for k in general:
		v = general[k]
		text = text.replace(k, v)
	return text

def unstripNoWiki(text, state):
	if 'nowiki' not in state:
		return text

	nowiki = state['nowiki']
	for k in nowiki:
		v = nowiki[k]
		text = text.replace(k, v)
	return text
_headerPat = re.compile(ur"<[Hh]([1-6])(.*?)>(.*?)</[Hh][1-6] *>", re.UNICODE)
_templateSectionPat = re.compile(ur"<!--MWTEMPLATESECTION=([^&]+)&([^_]+)-->", re.UNICODE)
_tagPat = re.compile(ur"<.*?>", re.UNICODE)

def formatHeadings(text, isMain, showToc, state):
	"""
	This function accomplishes several tasks:
	1) Auto-number headings if that option is enabled
	2) Add an [edit] link to sections for logged in users who have enabled the option
	3) Add a Table of contents on the top for users who have enabled the option
	4) Auto-anchor headings

	It loops through all headlines, collects the necessary data, then splits up the
	string and re-inserts the newly formatted headlines.
	"""
	doNumberHeadings = False
	showEditLink = True # Can User Edit

	if text.find(u"__NOEDITSECTION__") != -1:
		showEditLink = False
		text = text.replace(u"__NOEDITSECTION__", u"")

	# Get all headlines for numbering them and adding funky stuff like [edit]
	# links - this is for later, but we need the number of headlines right now
	matches = _headerPat.findall(text)
	numMatches = len(matches)

	# if there are fewer than 4 headlines in the article, do not show TOC
	# unless it's been explicitly enabled.
	enoughToc = showToc and (numMatches >= 4 or text.find(u"<!--MWTOC-->") != -1)

	# Allow user to stipulate that a page should have a "new section"
	# link added via __NEWSECTIONLINK__
	showNewSection = False
	if text.find(u"__NEWSECTIONLINK__") != -1:
		showNewSection = True
		text = text.replace(u"__NEWSECTIONLINK__", u"")

	# if the string __FORCETOC__ (not case-sensitive) occurs in the HTML,
	# override above conditions and always show TOC above first header
	if text.find(u"__FORCETOC__") != -1:
		showToc = True
		enoughToc = True
		text = text.replace(u"__FORCETOC__", u"")

	# Never ever show TOC if no headers
	if numMatches < 1:
		enoughToc = False

	# headline counter
	headlineCount = 0
	sectionCount = 0 # headlineCount excluding template sections

	# Ugh .. the TOC should have neat indentation levels which can be
	# passed to the skin functions. These are determined here
	toc = []
	head = {}
	sublevelCount = {}
	levelCount = {}
	toclevel = 0
	level = 0
	prevlevel = 0
	prevtoclevel = 0
	refers = {}
	refcount = {}
	wgMaxTocLevel = 5

	for match in matches:
		headline = match[2]
		istemplate = False
		templatetitle = u''
		templatesection = 0
		numbering = []

		m = _templateSectionPat.search(headline)
		if m:
			istemplate = True
			templatetitle = b64decode(m.group(1))
			templatesection = 1 + int(b64decode(m.group(2)))
			headline = _templateSectionPat.sub(u'', headline)

		if toclevel:
			prevlevel = level
			prevtoclevel = toclevel

		level = int(matches[headlineCount][0])

		if doNumberHeadings or enoughToc:
			if level > prevlevel:
				# increase TOC level
				toclevel += 1
				sublevelCount[toclevel] = 0
				if toclevel < wgMaxTocLevel:
					toc.append(u'\n<ul>')
			elif level < prevlevel and toclevel > 1:
				# decrease TOC level, find level to jump to
				if toclevel == 2 and level <= levelCount[1]:
					# can only go down to level 1
					toclevel = 1
				else:
					for i in range(toclevel, 0, -1):
						if levelCount[i] == level:
							# found last matching level
							toclevel = i
							break
						elif levelCount[i] < level:
							# found first matching level below current level
							toclevel = i + 1
							break
				if toclevel < wgMaxTocLevel:
					toc.append(u"</li>\n")
					toc.append(u"</ul>\n</li>\n" * max(prevtoclevel - toclevel, 0))
			else:
				# no change in level, end TOC line
				if toclevel < wgMaxTocLevel:
					toc.append(u"</li>\n")

			levelCount[toclevel] = level

			# count number of headlines for each level
			sublevelCount[toclevel] += 1
			for i in range(1, toclevel+1):
				if sublevelCount[i]:
					numbering.append(to_unicode(sublevelCount[i]))

		# The canonized header is a version of the header text safe to use for links
		# Avoid insertion of weird stuff like <math> by expanding the relevant sections
		canonized_headline = unstrip(headline, state)
		canonized_headline = unstripNoWiki(canonized_headline, state)

		# -- don't know what to do with this yet.
		# Remove link placeholders by the link text.
		#	 <!--LINK number-->
		# turns into
		#	 link text with suffix
		#	 $canonized_headline = preg_replace( '/<!--LINK ([0-9]*)-->/e',
		#		 "\$this->mLinkHolders['texts'][\$1]",
		#		 $canonized_headline );
		#	 $canonized_headline = preg_replace( '/<!--IWLINK ([0-9]*)-->/e',
		#		 "\$this->mInterwikiLinkHolders['texts'][\$1]",
		#		 $canonized_headline );

		# strip out HTML
		canonized_headline = _tagPat.sub(u'', canonized_headline)
		tocline = canonized_headline.strip()
		# Save headline for section edit hint before it's escaped
		headline_hint = tocline
		canonized_headline = escapeId(tocline)

		# count how many in assoc. array so we can track dupes in anchors
		if canonized_headline not in refers:
			refers[canonized_headline] = 1
		else:
			refers[canonized_headline] += 1
		refcount[headlineCount] = refers[canonized_headline]

		numbering = '.'.join(numbering)

		# Don't number the heading if it is the only one (looks silly)
		if doNumberHeadings and numMatches > 1:
			# the two are different if the line contains a link
			headline = numbering + u' ' + headline

		# Create the anchor for linking from the TOC to the section
		anchor = canonized_headline
		if refcount[headlineCount] > 1:
			anchor += u'_' + unicode(refcount[headlineCount])

		if enoughToc:
			toc.append(u'\n<li class="toclevel-%d"><a href="#%s"><span class="tocnumber">' % (toclevel, anchor))
			toc.append(numbering)
			toc.append(u'</span> <span class="toctext">')
			toc.append(tocline)
			toc.append(u'</span></a>')

	#	 if showEditLink and (not istemplate or templatetitle != u""):
	#		 if not head[headlineCount]:
	#			 head[headlineCount] = u''
	#
	#		 if istemplate:
	#			 head[headlineCount] += sk.editSectionLinkForOther(templatetitle, templatesection)
	#		 else:
	#			 head[headlineCount] += sk.editSectionLink(mTitle, sectionCount+1, headline_hint)

		# give headline the correct <h#> tag
		if headlineCount not in head:
			head[headlineCount] = []
		h = head[headlineCount]
		attrs = matches[headlineCount][1].strip()
		if attrs:
			attrs = u' ' + attrs
		h.append(u'<h%d id="%s"%s>' % (level, anchor, attrs))
		h.append(headline.strip())
		h.append(u'</h%d>' % level)

		headlineCount += 1
		if not istemplate:
			sectionCount += 1

	if enoughToc:
		if toclevel < wgMaxTocLevel:
			toc.append(u"</li>\n")
		toc.append(u"</ul>\n</li>\n" * max(0, toclevel - 1))
		toc.insert(0, u'<div id="toc"><h2>Contents</h2>\n<ul>')
		toc.append(u'\n</ul>\n</div>')

	# split up and insert constructed headlines
	blocks = _headerPat.split(text)

	i = 0
	len_blocks = len(blocks)
	forceTocPosition = text.find(u"<!--MWTOC-->")
	full = []
	while i < len_blocks:
		j = i/4
		full.append(blocks[i])
		if enoughToc and not i and isMain and forceTocPosition == -1:
			full += toc
			toc = None
		if j in head and head[j]:
			full += head[j]
			head[j] = None
		i += 4
	full = u''.join(full)
	if forceTocPosition != -1:
		return full.replace(u"<!--MWTOC-->", u''.join(toc), 1)
	else:
		return full

_startRegexHash = {}
_endRegexHash = {}
_endCommentPat = re.compile(ur'(-->)', re.UNICODE)
_extractTagsAndParams_n = 1

def extractTagsAndParams(elements, text, matches, uniq_prefix = u''):
	"""
	Replaces all occurrences of HTML-style comments and the given tags
	in the text with a random marker and returns the stripped text. The
	output parameter matches will be an associative array filled with
	data in the form:

	'UNIQ-xxxxx' => ( 'element', 'tag content', { 'param': 'x' },
		'<element param="x">tag content</element>' )
	"""
	global _extractTagsAndParams_n
	stripped = u''

	taglist = u'|'.join(elements)
	if taglist not in _startRegexHash:
		_startRegexHash[taglist] = re.compile(ur"<(" + taglist + ur")(\s+[^>]*?|\s*?)(/?>)|<(!--)", re.UNICODE | re.IGNORECASE)
	start = _startRegexHash[taglist]

	while text != u'':
		p = start.split(text, 1)
		stripped += p[0]
		if len(p) == 1:
			break
		elif p[4]:
			# comment
			element = p[4]
			attributes = u''
			close = u''
		else:
			element = p[1]
			attributes = p[2]
			close = p[3]
		inside = p[5]

		marker = uniq_prefix + u'-' + element + u'-' + (u"%08X" % _extractTagsAndParams_n) + u'-QINU'
		_extractTagsAndParams_n += 1
		stripped += marker

		if close == u'/>':
			# empty element tag, <tag />
			content = None
			text = inside
			tail = None
		else:
			if element == u'!--':
				end = _endCommentPat
			else:
				if element not in _endRegexHash:
					_endRegexHash[element] = re.compile(ur'(</' + element + ur'\s*>)', re.UNICODE | re.IGNORECASE)
				end = _endRegexHash[element]
			q = end.split(inside, 1)
			content = q[0]
			if len(q) < 3:
				# no end tag
				tail = ''
				text = ''
			else:
				tail = q[1]
				text = q[2]

		matches[marker] = (
			element,
			content,
			decodeTagAttributes(attributes),
			u"<" + element + attributes + close + (content or u'') + (tail or u'')
		)
	return stripped
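# Example (illustrative, not in the original module): each stripped tag is
# swapped for a unique marker, and the matches dict maps the marker back
# to (element, content, attribute-dict, original source).  Marker serial
# numbers depend on module state, so the values below are indicative only;
# decodeTagAttributes() is assumed available from the full module.
#
#   >>> found = {}
#   >>> extractTagsAndParams([u'nowiki'], u"a <nowiki>''b''</nowiki> c", found, u'\x07UNIQ')
#   u'a \x07UNIQ-nowiki-00000001-QINU c'
#   >>> found[u'\x07UNIQ-nowiki-00000001-QINU'][:2]
#   ('nowiki', u"''b''")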
def strip(text, state, uniq_prefix, stripcomments = False, dontstrip = []):
	render = True

	commentState = {}

	elements = ['nowiki', 'gallery'] + mTagHooks.keys()
	if True: #wgRawHtml
		elements.append('html')
	# if( $this->mOptions->getUseTeX() ) {
	#	 $elements[] = 'math';
	# }

	# Removing $dontstrip tags from $elements list (currently only 'gallery', fixing bug 2700)
	for k in dontstrip:
		if k in elements:
			elements.remove(k)

	matches = {}
	text = extractTagsAndParams(elements, text, matches, uniq_prefix)

	for marker in matches:
		element, content, params, tag = matches[marker]
		if render:
			tagName = element.lower()
			if tagName == u'!--':
				# comment
				output = tag
				if tag[-3:] != u'-->':
					output += u"-->"
			elif tagName == u'html':
				output = content
			elif tagName == u'nowiki':
				output = content.replace(u'&', u'&amp;').replace(u'<', u'&lt;').replace(u'>', u'&gt;')
			elif tagName == u'math':
				output = content # do math here
			elif tagName == u'gallery':
				output = renderImageGallery(content, params)
			else:
				if tagName in mTagHooks:
					output = mTagHooks[tagName](content, params)
				else:
					output = content.replace(u'&', u'&amp;').replace(u'<', u'&lt;').replace(u'>', u'&gt;')
		else:
			# Just stripping tags; keep the source
			output = tag

		# Unstrip the output, because unstrip() is no longer recursive so
		# it won't do it itself
		output = unstrip(output, state)

		if not stripcomments and element == u'!--':
			commentState[marker] = output
		elif element == u'html' or element == u'nowiki':
			if 'nowiki' not in state:
				state['nowiki'] = {}
			state['nowiki'][marker] = output
		else:
			if 'general' not in state:
				state['general'] = {}
			state['general'][marker] = output

	# Unstrip comments unless explicitly told otherwise.
	# (The comments are always stripped prior to this point, so as to
	# not invoke any extension tags / parser hooks contained within
	# a comment.)
	if not stripcomments:
		# Put them all back and forget them
		for k in commentState:
			v = commentState[k]
			text = text.replace(k, v)

	return text

mArgStack = []

def replaceVariables(text, args = {}, argsOnly = False):
	"""
	Replace magic variables, templates, and template arguments
	with the appropriate text. Templates are substituted recursively,
	taking care to avoid infinite loops.
	"""
	# Template expansion has not been ported yet, so this is currently a
	# no-op; everything below the early return is kept for reference.
	return text

	# Prevent too big inclusions
	# if( strlen( $text ) > $this->mOptions->getMaxIncludeSize() ) {
	#	 return $text;
	# }

	# This function is called recursively. To keep track of arguments we need a stack:
	mArgStack.append(args)

	braceCallbacks = {}
	if not argsOnly:
		braceCallbacks[2] = [None, braceSubstitution]
	braceCallbacks[3] = [None, argSubstitution]

	callbacks = {
		u'{': {
			'end': u'}',
			'cb': braceCallbacks,
			'min': argsOnly and 3 or 2,
			'max': 3
		},
		u'[': {
			'end': u']',
			'cb': {2: None},
			'min': 2,
			'max': 2
		}
	}
	text = replace_callback(text, callbacks)
	mArgStack.pop()

	return text
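# Example (illustrative): strip() hands the body and attribute dict of a
# registered tag to its hook; the tag name 'smiley' is hypothetical, and
# decodeTagAttributes() is assumed available from the full module.
#
#   >>> def smiley(content, params):
#   ...     return u'<img class="smiley" alt="%s"/>' % content
#   >>> registerTagHook(u'smiley', smiley)
#   >>> state = {}
#   >>> marked = strip(u'Hi <smiley>:-)</smiley>', state, u'\x07UNIQ')
#
# The returned text contains a unique marker where the tag stood; the
# rendered replacement sits in the state dict until unstrip() restores it:
#
#   >>> unstrip(marked, state)
#   u'Hi <img class="smiley" alt=":-)"/>'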
def replace_callback(text, callbacks):
	"""
	parse any parentheses in format ((title|part|part))
	and call callbacks to get a replacement text for any found piece
	"""
	openingBraceStack = [] # this array will hold a stack of parentheses which are not closed yet
	lastOpeningBrace = -1  # last not closed parentheses

	validOpeningBraces = u''.join(callbacks.keys())

	i = 0
	while i < len(text):
		if lastOpeningBrace == -1:
			currentClosing = u''
			search = validOpeningBraces
		else:
			currentClosing = openingBraceStack[lastOpeningBrace]['braceEnd']
			search = validOpeningBraces + u'|' + currentClosing
		rule = None

		# Find the earliest occurrence of any character we are looking for
		# (emulates PHP's strcspn())
		pos = len(text)
		for c in search:
			q = text.find(c, i)
			if q != -1 and q < pos:
				pos = q
		i = pos

		if i < len(text):
			if text[i] == u'|':
				found = 'pipe'
			elif text[i] == currentClosing:
				found = 'close'
			elif text[i] in callbacks:
				found = 'open'
				rule = callbacks[text[i]]
			else:
				# some unexpected character; skip it
				i += 1
				continue
		else:
			break

		if found == 'open':
			# found opening brace, let's add it to parentheses stack
			piece = {
				'brace': text[i],
				'braceEnd': rule['end'],
				'title': u'',
				'parts': None
			}

			# count opening brace characters
			count = 0
			while True:
				if text[i+count:i+1+count] == piece['brace']:
					count += 1
				else:
					break
			piece['count'] = count
			i += piece['count']
			piece['startAt'] = piece['partStart'] = i

			# we need to add to stack only if opening brace count is enough for one of the rules
			if piece['count'] >= rule['min']:
				lastOpeningBrace += 1
				if lastOpeningBrace == len(openingBraceStack):
					openingBraceStack.append(piece)
				else:
					openingBraceStack[lastOpeningBrace] = piece
		elif found == 'close':
			maxCount = openingBraceStack[lastOpeningBrace]['count']
			count = 0
			while count < maxCount:
				if text[i+count:i+1+count] == text[i]:
					count += 1
				else:
					break

			# check for maximum matching characters (if there are 5 closing
			# characters, we will probably need only 3 - depending on the rules)
			matchingCount = 0
			matchingCallback = None
			cbType = callbacks[openingBraceStack[lastOpeningBrace]['brace']]
			if count > cbType['max']:
				# The specified maximum exists in the callback array, unless the caller
				# has made an error
				matchingCount = cbType['max']
			else:
				# Count is less than the maximum
				# Skip any gaps in the callback array to find the true largest match
				# Need to use array_key_exists not isset because the callback can be null
				matchingCount = count
				while matchingCount > 0 and matchingCount not in cbType['cb']:
					matchingCount -= 1

			if matchingCount <= 0:
				i += count
				continue
			matchingCallback = cbType['cb'][matchingCount]

			# let's set a title or last part (if '|' was found)
			if openingBraceStack[lastOpeningBrace]['parts'] is None:
				openingBraceStack[lastOpeningBrace]['title'] = \
					text[openingBraceStack[lastOpeningBrace]['partStart']:i]
			else:
				openingBraceStack[lastOpeningBrace]['parts'].append(
					text[openingBraceStack[lastOpeningBrace]['partStart']:i]
				)

			pieceStart = openingBraceStack[lastOpeningBrace]['startAt'] - matchingCount
			pieceEnd = i + matchingCount

			if callable(matchingCallback):
				cbArgs = {
					'text': text[pieceStart:pieceEnd],
					'title': openingBraceStack[lastOpeningBrace]['title'].strip(),
					'parts': openingBraceStack[lastOpeningBrace]['parts'],
					'lineStart': pieceStart > 0 and text[pieceStart-1] == u"\n"
				}
				# finally we can call a user callback and replace piece of text
				replaceWith = matchingCallback(cbArgs)
				text = text[:pieceStart] + replaceWith + text[pieceEnd:]
				i = pieceStart + len(replaceWith)
			else:
				# null value for callback means that parentheses should be parsed, but not replaced
				i += matchingCount

			# reset last opening parentheses, but keep it in case there are unused characters
			piece = {
				'brace': openingBraceStack[lastOpeningBrace]['brace'],
				'braceEnd': openingBraceStack[lastOpeningBrace]['braceEnd'],
				'count': openingBraceStack[lastOpeningBrace]['count'],
				'title': u'',
				'parts': None,
				'startAt': openingBraceStack[lastOpeningBrace]['startAt']
			}
			openingBraceStack[lastOpeningBrace] = None
			lastOpeningBrace -= 1

			if matchingCount < piece['count']:
				piece['count'] -= matchingCount
				piece['startAt'] -= matchingCount
				piece['partStart'] = piece['startAt']
				# do we still qualify for any callback with remaining count?
				currentCbList = callbacks[piece['brace']]['cb']
				while piece['count']:
					if piece['count'] in currentCbList:
						lastOpeningBrace += 1
						openingBraceStack[lastOpeningBrace] = piece
						break
					piece['count'] -= 1
		elif found == 'pipe':
			# lets set a title if it is a first separator, or next part otherwise
			if openingBraceStack[lastOpeningBrace]['parts'] is None:
				openingBraceStack[lastOpeningBrace]['title'] = \
					text[openingBraceStack[lastOpeningBrace]['partStart']:i]
				openingBraceStack[lastOpeningBrace]['parts'] = []
			else:
				openingBraceStack[lastOpeningBrace]['parts'].append(
					text[openingBraceStack[lastOpeningBrace]['partStart']:i]
				)
			i += 1
			openingBraceStack[lastOpeningBrace]['partStart'] = i

	return text
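# Example (illustrative, not in the original module): replace_callback()
# is generic.  The sketch below wires a hypothetical callback that
# upper-cases the title of every {{...}} piece; the 'cb' map is keyed by
# brace count, and a None callback means "parse but leave alone".
#
#   >>> def _shout(args):
#   ...     return args['title'].upper()
#   >>> replace_callback(u'x {{hello|y}} z', {
#   ...     u'{': {'end': u'}', 'cb': {2: _shout}, 'min': 2, 'max': 2},
#   ... })
#   u'x HELLO z'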
def braceSubstitution(piece):
	"""
	Return the text of a template, after recursively replacing any
	variables or templates within the template.
	"""
	# global $wgContLang, $wgLang, $wgAllowDisplayTitle, $action;

	# Flags
	found = False             # $text has been filled
	nowiki = False            # wiki markup in $text should be escaped
	noparse = False           # Unsafe HTML tags should not be stripped, etc.
	noargs = False            # Don't replace triple-brace arguments in $text
	replaceHeadings = False   # Make the edit section links go to the template not the article
	isHTML = False            # text is HTML, armour it against wikitext transformation
	forceRawInterwiki = False # Force interwiki transclusion to be done in raw mode not rendered

	# Title object, where $text came from
	title = None

	linestart = u''

	# part1 is the bit before the first |, and must contain only title characters
	# args is a list of arguments, starting from index 0, not including $part1
	titleText = part1 = piece['title']

	# If the third subpattern matched anything, it will start with |
	if piece['parts'] is None:
		replaceWith = variableSubstitution([piece['text'], piece['title']])
		if replaceWith != piece['text']:
			text = replaceWith
			found = True
			noparse = True
			noargs = True

	args = piece['parts'] is None and [] or piece['parts']
	argc = len(args)

	# SUBST
	if not found:
		mwSubst = u"SUBST"
		if part1.find(mwSubst) != -1:
			# One of two possibilities is true:
			# 1) Found SUBST but not in the PST phase
			# 2) Didn't find SUBST and in the PST phase
			# In either case, return without further processing
			part1 = part1.replace(mwSubst, u'', 1)
			text = piece['text']
			found = True
			noparse = True
			noargs = True

	# MSG, MSGNW and RAW
	if not found:
		# Check for MSGNW:
		mwMsgnw = u"MSGNW"
		if part1.find(mwMsgnw) != -1:
			part1 = part1.replace(mwMsgnw, u'', 1)
			nowiki = True
		else:
			# Remove obsolete MSG:
			mwMsg = u"MSG"
			part1 = part1.replace(mwMsg, u'', 1)

		# Check for RAW:
		mwRaw = u"RAW"
		if part1.find(mwRaw) != -1:
			part1 = part1.replace(mwRaw, u'', 1)
			forceRawInterwiki = True

	# Parser functions
	if not found:
		colonPos = part1.find(u':')
		if colonPos != -1:
			# Case sensitive functions
			function = part1[0:colonPos]
			if function in mFunctionSynonyms[1]:
				function = mFunctionSynonyms[1][function]
			else:
				# Case insensitive functions
				function = function.lower()
				if function in mFunctionSynonyms[0]:
					function = mFunctionSynonyms[0][function]
				else:
					function = False

			if function:
				funcArgs = [x.strip() for x in args]
				funcArgs += [None, part1[colonPos+1:].strip()]
				result = mFunctionHooks[function](*funcArgs)
				found = True

				# The text is usually already parsed, doesn't need triple-brace tags expanded, etc.
				#$noargs = true;
				#$noparse = true;

				if isinstance(result, dict):
					if 0 in result:
						text = linestart + result[0]
						del result[0]

					# Extract flags into the local scope
					# This allows callers to set flags such as nowiki, noparse, found, etc.
					if 'nowiki' in result:
						nowiki = result['nowiki']
					if 'noparse' in result:
						noparse = result['noparse']
					if 'found' in result:
						found = result['found']
				else:
					text = linestart + result

	# Template table test

	# Did we encounter this template already? If yes, it is in the cache
	# and we need to check for loops.
	if not found and piece['title'] in mTemplates:
		found = True

		# Infinite loop test
		if part1 in mTemplatePath:
			noparse = True
			noargs = True
			found = True
			text = linestart + u"[[" + part1 + u"]]<!-- WARNING: template loop detected -->"
		else:
			text = linestart + mTemplates[piece['title']]

	# Load from database
	lastPathLevel = mTemplatePath
	if not found:
		ns = NS_TEMPLATE
		# declaring $subpage directly in the function call
		# does not work correctly with references and breaks
		# {{/subpage}}-style inclusions
		subpage = u''
		part1 = maybeDoSubpageLink(part1, subpage)
		if subpage != u'':
			ns = mTitle.getNamespace()
		title = Title.newFromText(part1, ns)

		if title is not None:
			titleText = title.getPrefixedText()
			checkVariantLink = len(wgContLang.getVariants()) > 1
			# Check for language variants if the template is not found
			if checkVariantLink and title.getArticleID() == 0:
				wgContLang.findVariantLink(part1, title)

			if title.isExternal():
				if title.getNamespace() == u"Special" and mOptions.getAllowSpecialInclusion():
					text = SpecialPage.capturePath(title)
					if isinstance(text, basestring):
						found = True
						noparse = True
						noargs = True
						isHTML = True
						# $this->disableCache() in the PHP original; there is
						# no parser cache to disable in this port.
				else:
					articleContent = fetchTemplate(title)
					if articleContent != False:
						found = True
						text = articleContent
						replaceHeadings = True

				# If the title is valid but undisplayable, make a link to it
				if not found:
					text = u"[[:" + titleText + u"]]"
					found = True
			elif title.isTrans():
				pass
	#			 # Interwiki transclusion
	#			 if ( $this->ot['html'] && !$forceRawInterwiki ) {
	#				 $text = $this->interwikiTransclude( $title, 'render' );
	#				 $isHTML = true;
	#				 $noparse = true;
	#			 } else {
	#				 $text = $this->interwikiTransclude( $title, 'raw' );
	#				 $replaceHeadings = true;
	#			 }
	#			 $found = true;
	#		 }

	# Template cache array insertion
	# Use the original $piece['title'] not the mangled $part1, so that
	# modifiers such as RAW: produce separate cache entries
	if found:
		if isHTML:
			pass # A special page; don't store it in the template cache.
		else:
			mTemplates[piece['title']] = text
		text = linestart + text

	#	 if ( $found && !$this->incrementIncludeSize( 'pre-expand', strlen( $text ) ) ) {
	#		 # Error, oversize inclusion
	#		 $text = $linestart .
	#			 "[[$titleText]]";
	#		 $noparse = true;
	#		 $noargs = true;
	#	 }
	#
	#	 # Recursive parsing, escaping and link table handling
	#	 # Only for HTML output
	#	 if ( $nowiki && $found && ( $this->ot['html'] || $this->ot['pre'] ) ) {
	#		 $text = wfEscapeWikiText( $text );
	#	 } elseif ( !$this->ot['msg'] && $found ) {
	#		 if ( $noargs ) {
	#			 $assocArgs = array();
	#		 } else {
	#			 # Clean up argument array
	#			 $assocArgs = array();
	#			 $index = 1;
	#			 foreach( $args as $arg ) {
	#				 $eqpos = strpos( $arg, '=' );
	#				 if ( $eqpos === false ) {
	#					 $assocArgs[$index++] = $arg;
	#				 } else {
	#					 $name = trim( substr( $arg, 0, $eqpos ) );
	#					 $value = trim( substr( $arg, $eqpos+1 ) );
	#					 if ( $value === false ) {
	#						 $value = '';
	#					 }
	#					 if ( $name !== false ) {
	#						 $assocArgs[$name] = $value;
	#					 }
	#				 }
	#			 }
	#
	#			 # Add a new element to the template recursion path
	#			 $this->mTemplatePath[$part1] = 1;
	#		 }
	#
	#		 if ( !$noparse ) {
	#			 # If there are any <onlyinclude> tags, only include them
	#			 if ( in_string( '<onlyinclude>', $text ) && in_string( '</onlyinclude>', $text ) ) {
	#				 preg_match_all( '/<onlyinclude>(.*?)\n?<\/onlyinclude>/s', $text, $m );
	#				 $text = '';
	#				 foreach ($m[1] as $piece)
	#					 $text .= $piece;
	#			 }
	#			 # Remove <noinclude> sections and <includeonly> tags
	#			 $text = preg_replace( '/<noinclude>.*?<\/noinclude>/s', '', $text );
	#			 $text = strtr( $text, array( '<includeonly>' => '' , '</includeonly>' => '' ) );
	#
	#			 if( $this->ot['html'] || $this->ot['pre'] ) {
	#				 # Strip <nowiki>, <pre>, etc.
    #				 $text = $this->strip( $text, $this->mStripState );
    #				 if ( $this->ot['html'] ) {
    #					 $text = Sanitizer::removeHTMLtags( $text, array( &$this, 'replaceVariables' ), $assocArgs );
    #				 } elseif ( $this->ot['pre'] && $this->mOptions->getRemoveComments() ) {
    #					 $text = Sanitizer::removeHTMLcomments( $text );
    #				 }
    #			 }
    #			 $text = $this->replaceVariables( $text, $assocArgs );
    # 
    #			 # If the template begins with a table or block-level
    #			 # element, it should be treated as beginning a new line.
    #			 if (!$piece['lineStart'] && preg_match('/^({\\||:|;|#|\*)/', $text)) /*}*/{ 
    #				 $text = "\n" . $text;
    #			 }
    #		 } elseif ( !$noargs ) {
    #			 # $noparse and !$noargs
    #			 # Just replace the arguments, not any double-brace items
    #			 # This is used for rendered interwiki transclusion
    #			 $text = $this->replaceVariables( $text, $assocArgs, true );
    #		 }
    #	 }
    #	 # Prune lower levels off the recursion check path
    #	 $this->mTemplatePath = $lastPathLevel;
    # 
    #	 if ( $found && !$this->incrementIncludeSize( 'post-expand', strlen( $text ) ) ) {
    #		 # Error, oversize inclusion
    #		 $text = $linestart .
    #			 "[[$titleText]]";
    #		 $noparse = true;
    #		 $noargs = true;
    #	 }
    # 
    #	 if ( !$found ) {
    #		 wfProfileOut( $fname );
    #		 return $piece['text'];
    #	 } else {
    #		 wfProfileIn( __METHOD__ . '-placeholders' );
    #		 if ( $isHTML ) {
    #			 # Replace raw HTML by a placeholder
    #			 # Add a blank line preceding, to prevent it from mucking up
    #			 # immediately preceding headings
    #			 $text = "\n\n" . $this->insertStripItem( $text, $this->mStripState );
    #		 } else {
    #			 # replace ==section headers==
    #			 # XXX this needs to go away once we have a better parser.
    #			 if ( !$this->ot['wiki'] && !$this->ot['pre'] && $replaceHeadings ) {
    #				 if( !is_null( $title ) )
    #					 $encodedname = base64_encode($title->getPrefixedDBkey());
    #				 else
    #					 $encodedname = base64_encode("");
    #				 $m = preg_split('/(^={1,6}.*?={1,6}\s*?$)/m', $text, -1,
    #					 PREG_SPLIT_DELIM_CAPTURE);
    #				 $text = '';
    #				 $nsec = 0;
    #				 for( $i = 0; $i < count($m); $i += 2 ) {
    #					 $text .= $m[$i];
    #					 if (!isset($m[$i + 1]) || $m[$i + 1] == "") continue;
    #					 $hl = $m[$i + 1];
	#					 if( strstr($hl, "<!--MWTEMPLATESECTION") ) {
	#						 $text .= $hl;
	#						 continue;
	#					 }
	#					 preg_match('/^(={1,6})(.*?)(={1,6}\s*?)$/m', $hl, $m2);
	#					 $text .= $m2[1] . $m2[2] . "<!--MWTEMPLATESECTION="
	#						 . $encodedname . "&" . base64_encode("$nsec") . "-->" . $m2[3];
    # 
    #					 $nsec++;
    #				 }
    #			 }
    #		 }
    #		 wfProfileOut( __METHOD__ . '-placeholders' );
    #	 }
    # 
    #	 # Prune lower levels off the recursion check path
    #	 $this->mTemplatePath = $lastPathLevel;
    # 
    #	 if ( !$found ) {
    #		 wfProfileOut( $fname );
    #		 return $piece['text'];
    #	 } else {
    #		 wfProfileOut( $fname );
    #		 return $text;
    #	 }
	# }

	if not found:
		return piece['text']
	return text
    
    _guillemetLeftPat = re.compile(ur'(.) (\?|:|;|!|\302\273)', re.UNICODE)
    _guillemetRightPat = re.compile(ur'(\302\253) ', re.UNICODE)
    def fixtags(text):
    	"""Clean up special characters, only run once, next-to-last before doBlockLevels"""
    	# french spaces, last one Guillemet-left
    	# only if there is something before the space
	text = _guillemetLeftPat.sub(ur'\1&nbsp;\2', text)
	# french spaces, Guillemet-right
	text = _guillemetRightPat.sub(ur'\1&nbsp;', text)
    	return text
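# Example (illustrative, not in the original module): fixtags() binds
# French punctuation to the preceding word with a non-breaking space.
#
#   >>> fixtags(u'Bonjour !')
#   u'Bonjour&nbsp;!'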
    
    def closeParagraph(mLastSection):
    	"""Used by doBlockLevels()"""
    	result = u''
    	if mLastSection != u'':
		result = u'</' + mLastSection + u'>\n'
    	
    	return result
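# Example (illustrative): closeParagraph() emits the closing tag for
# whatever block element doBlockLevels() last opened.
#
#   >>> closeParagraph(u'p')
#   u'</p>\n'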
    
    def getCommon(st1, st2):
    	"""
    	getCommon() returns the length of the longest common substring
    	of both arguments, starting at the beginning of both.
    	"""
    	fl = len(st1)
    	shorter = len(st2)
    	if fl < shorter:
    		shorter = fl
    	
    	i = 0
    	while i < shorter:
    		if st1[i] != st2[i]:
    			break
    		i += 1
    	return i
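# Example (illustrative): the common-prefix length drives how many list
# levels doBlockLevels() keeps open between consecutive lines.
#
#   >>> getCommon(u'**#', u'**:')
#   2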
    
    def openList(char, mLastSection):
    	"""
    	These next three functions open, continue, and close the list
    	element appropriate to the prefix character passed into them.
    	"""
    	result = closeParagraph(mLastSection)
    	
    	mDTopen = False
	if char == u'*':
		result += u'<ul><li>'
	elif char == u'#':
		result += u'<ol><li>'
	elif char == u':':
		result += u'<dl><dd>'
	elif char == u';':
		result += u'<dl><dt>'
		mDTopen = True
	else:
		result += u'<!-- ERR 1 -->'

	return result, mDTopen

def nextItem(char, mDTopen):
	if char == u'*' or char == u'#':
		return u'</li><li>', None
	elif char == u':' or char == u';':
		close = u'</dd>'
		if mDTopen:
			close = u'</dt>'
		if char == u';':
			return close + u'<dt>', True
		else:
			return close + u'<dd>', False
	return u'<!-- ERR 2 -->', None

def closeList(char, mDTopen):
	if char == u'*':
		return u'</li></ul>\n'
	elif char == u'#':
		return u'</li></ol>\n'
	elif char == u':':
		if mDTopen:
			return u'</dt></dl>\n'
		else:
			return u'</dd></dl>\n'
	else:
		return u'<!-- ERR 3 -->'

_closePrePat = re.compile(u"</pre", re.UNICODE | re.IGNORECASE)
_openPrePat = re.compile(u"<pre", re.UNICODE | re.IGNORECASE)
_openMatchPat = re.compile(u"(<table|<blockquote|<h1|<h2|<h3|<h4|<h5|<h6|<pre|<tr|<p|<ul|<ol|<li|</center|</tr|</td|</th)", re.UNICODE | re.IGNORECASE)

def doBlockLevels(text, linestart, uniq_prefix):
	"""
	Make lists from lines starting with ':', '*', '#', etc. and handle
	the block-level elements p and pre.
	"""
	# The uniq-prefixed pre marker is included so strip()-ed <pre> blocks
	# also suppress paragraph handling.
	closeMatchPat = re.compile(ur"(</table|</blockquote|</h1|</h2|</h3|</h4|</h5|</h6|<td|<th|<div|</div|<hr|</pre|</p|" + uniq_prefix + ur"-pre|</li|</ul|</ol|<center)", re.UNICODE | re.IGNORECASE)

	lastPrefix = u''
	mDTopen = False
	inBlockElem = False
	prefixLength = 0
	paragraphStack = False
	mLastSection = u''
	mInPre = False
	pref2 = u''
	output = u''

	lines = text.split(u'\n')
	if not linestart:
		# the text does not begin on a fresh line; pass the first chunk through
		output += lines.pop(0)

	# Parsing through the text line by line. The main thing
	# happening here is handling of block-level elements p, pre,
	# and making lists from lines starting with * # : etc.
	for oLine in lines:
		lastPrefixLength = len(lastPrefix)
		preCloseMatch = _closePrePat.search(oLine)
		preOpenMatch = _openPrePat.search(oLine)
		if not mInPre:
			# Multiple prefixes may abut each other for nested lists
			prefixLength = len(oLine) - len(oLine.lstrip(u'*#:;'))
			pref = oLine[0:prefixLength]

			# eh?
			pref2 = pref.replace(u';', u':')
			t = oLine[prefixLength:]
			mInPre = bool(preOpenMatch)
		else:
			# Don't interpret any other prefixes in preformatted text
			prefixLength = 0
			pref = pref2 = u''
			t = oLine

		# List generation
		if prefixLength and lastPrefix == pref2:
			# Same as the last item, so no need to deal with nesting or opening stuff
			tmpOutput, tmpMDTopen = nextItem(pref[-1], mDTopen)
			output += tmpOutput
			if tmpMDTopen is not None:
				mDTopen = tmpMDTopen
			paragraphStack = False

			if pref[-1] == u';':
				# The one nasty exception: definition lists work like this:
				# ; title : definition text
				# findColonNoLinks() is defined elsewhere in the full module;
				# it is assumed here to return a (term, rest) pair, or False
				# when the line contains no usable colon.
				split = findColonNoLinks(t)
				if split != False:
					term, t = split
					output += term
					tmpOutput, tmpMDTopen = nextItem(u':', mDTopen)
					output += tmpOutput
					if tmpMDTopen is not None:
						mDTopen = tmpMDTopen
		elif prefixLength or lastPrefixLength:
			# Either open or close a level...
			commonPrefixLength = getCommon(pref, lastPrefix)
			paragraphStack = False

			while commonPrefixLength < lastPrefixLength:
				output += closeList(lastPrefix[lastPrefixLength-1], mDTopen)
				mDTopen = False
				lastPrefixLength -= 1
			if prefixLength <= commonPrefixLength and commonPrefixLength > 0:
				tmpOutput, tmpMDTopen = nextItem(pref[commonPrefixLength-1], mDTopen)
				output += tmpOutput
				if tmpMDTopen is not None:
					mDTopen = tmpMDTopen
			while prefixLength > commonPrefixLength:
				char = pref[commonPrefixLength:commonPrefixLength+1]
				tmpOutput, tmpMDTOpen = openList(char, mLastSection)
				if tmpMDTOpen:
					mDTopen = True
				output += tmpOutput
				mLastSection = u''
				mInPre = False

				if char == u';':
					# FIXME: This is dupe of code above
					split = findColonNoLinks(t)
					if split != False:
						term, t = split
						output += term
						tmpOutput, tmpMDTopen = nextItem(u':', mDTopen)
						output += tmpOutput
						if tmpMDTopen is not None:
							mDTopen = tmpMDTopen

				commonPrefixLength += 1
			lastPrefix = pref2
		if prefixLength == 0:
			# No prefix (not in list)--go to paragraph mode
			# XXX: use a stack for nestable elements like span, table and div
			openmatch = _openMatchPat.search(t)
			closematch = closeMatchPat.search(t)
			if openmatch or closematch:
				paragraphStack = False
				output += closeParagraph(mLastSection)
				mLastSection = u''
				if preOpenMatch and not preCloseMatch:
					mInPre = True
				inBlockElem = not closematch
			elif not inBlockElem and not mInPre:
				if t[0:1] == u' ' and (mLastSection == u'pre' or t.strip() != u''):
					# pre
					if mLastSection != u'pre':
						paragraphStack = False
						output += closeParagraph(mLastSection) + u'<pre>'
						mInPre = False
						mLastSection = u'pre'
					t = t[1:]
				else:
					# paragraph
					if t.strip() == u'':
						if paragraphStack:
							output += paragraphStack + u'<br />'
							paragraphStack = False
							mLastSection = u'p'
						else:
							if mLastSection != u'p':
								output += closeParagraph(mLastSection)
								mLastSection = u''
								mInPre = False
								paragraphStack = u'<p>'
							else:
								paragraphStack = u'</p><p>'
					else:
						if paragraphStack:
							output += paragraphStack
							paragraphStack = False
							mLastSection = u'p'
						elif mLastSection != u'p':
							output += closeParagraph(mLastSection) + u'<p>'
							mLastSection = u'p'
							mInPre = False

		# somewhere above we forget to get out of pre block (bug 785)
		if preCloseMatch and mInPre:
			mInPre = False
		if paragraphStack == False:
			output += t + u"\n"

	while prefixLength:
		output += closeList(pref2[prefixLength-1], mDTopen)
		mDTopen = False
		prefixLength -= 1
	if mLastSection != u'':
		output += u'</' + mLastSection + u'>'
		mLastSection = u''

	return output

def parse(text, showToc=True):
	"""Parse wiki markup and return safe HTML."""
	utf8 = isinstance(text, str)
	text = to_unicode(text)
	if text[-1:] != u'\n':
		text = text + u'\n'
		taggedNewline = True
	else:
		taggedNewline = False

	mStripState = {}
	mUniqPrefix = u"\x07UNIQ" + unicode(random.randint(1, 1000000000))

	text = strip(text, mStripState, mUniqPrefix)
	text = removeHtmlTags(text)
	text = replaceVariables(text)
	text = doTableStuff(text, mStripState)
	text = parseHorizontalRule(text)
	text, toc = checkTOC(text)
	text = parseHeaders(text)
	text = parseAllQuotes(text)
	text = replaceInternalLinks(text)
	text = replaceExternalLinks(text)
	if not toc and text.find(u"<!--MWTOC-->") == -1:
		showToc = False
	text = formatHeadings(text, True, showToc, mStripState)
	text = unstrip(text, mStripState)
	text = fixtags(text)
	text = doBlockLevels(text, True, mUniqPrefix)
	text = unstripNoWiki(text, mStripState)
	if taggedNewline and text[-1:] == u'\n':
		text = text[:-1]

	if utf8:
		return text.encode("utf-8")
	return text

def truncate_url(url, length=40):
	if len(url) <= length:
		return url
	pattern = r'(/[^/]+/?)$'
	match = re.search(pattern, url)
	if not match:
		return url
	l = len(match.group(1))
	firstpart = url[0:len(url)-l]
	secondpart = match.group(1)
	if firstpart == firstpart[0:length-3]:
		secondpart = secondpart[0:length-3] + '...'
	else:
		firstpart = firstpart[0:length-3]
		secondpart = '...' + secondpart
	t_url = firstpart+secondpart
	return t_url

def to_unicode(text, charset=None):
	"""Convert a `str` object to a `unicode` object.

	If `charset` is given, we simply assume that encoding for the text,
	but we'll use the "replace" mode so that the decoding will always
	succeed.
	If `charset` is ''not'' specified, we'll make some guesses, first
	trying the UTF-8 encoding, then trying the locale preferred encoding,
	in "replace" mode. This differs from the `unicode` builtin, which
	by default uses the locale preferred encoding, in 'strict' mode,
	and is therefore prone to raise `UnicodeDecodeError`s.

	Because of the "replace" mode, the original content might be altered.
	If this is not what is wanted, one could map the original byte content
	by using an encoding which maps each byte of the input to an unicode
	character, e.g. by doing `unicode(text, 'iso-8859-1')`.
	"""
	if not isinstance(text, str):
		if isinstance(text, Exception):
			# two possibilities for storing unicode strings in exception data:
			try:
				# custom __str__ method on the exception (e.g. PermissionError)
				return unicode(text)
			except UnicodeError:
				# unicode arguments given to the exception (e.g. parse_date)
				return ' '.join([to_unicode(arg) for arg in text.args])
		return unicode(text)
	if charset:
		return unicode(text, charset, 'replace')
	else:
		try:
			return unicode(text, 'utf-8')
		except UnicodeError:
			return unicode(text, locale.getpreferredencoding(), 'replace')
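# Minimal usage sketch (illustrative, not part of the original module).
# It assumes the helpers elided from this excerpt (removeHtmlTags,
# parseHeaders, replaceInternalLinks, ...) are present, as they are in
# the full file.  The tag name 'example' is hypothetical.
if __name__ == '__main__':
	def example_hook(content, params):
		# hypothetical tag hook: wrap the tag body in a span
		return u'<span class="example">%s</span>' % content
	registerTagHook(u'example', example_hook)

	sample = u"== Heading ==\n* item one\n* item two\n\nSee [http://example.com the site]."
	print parse(sample)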