''' Wikicreole style parser. Mostly complient. Written 2010-12-19. danomagnum.com ''' __version__ = 1.0 import re import math USE_TOC = True #should we even make TOCs? TOC_LENGTH = 1000 #Length an article needs to be before we worry about TOCing it. def tocmaker(contents): '''Creates a toc from a list of ids''' ret = '' if len(contents) > 3: ret = "
Contents
' return ret def parse(text): '''Parses text and returns html according to (almost) wikicreole \'\'italic\'\' \'\'\'bold\'\'\' __underline__ [[link]] [[link|linktext]] {{image}} {{image|imagetext}} *list **list2 #numbered list ##numbered list 2 ==Heading 1 ===Heading 2 \\\\ ->
---- ->
''' in_pre = False in_ulist = 0 in_olist = 0 in_table = False formattings = [['**','b',False],['//','i',False],['__','u',False]] outstring = '' contents = [] headings = [('=====','h5', 5),('====','h4', 4),('===','h3', 3),('==','h2', 2),('=','h1', 1)] re_image_1 = re.compile(r'\{\{([^\|^\}]+?)\|(.+?)\}\}') re_image_1s = r'' re_image_2 = re.compile(r'\{\{([^\|^\}]+?)\}\}') re_image_2s = r'' re_outlink_1 = re.compile(r'\[\[([^\|^\]]+?)\|(.+?)\]\]') re_outlink_1s = r'\2' re_outlink_2 = re.compile(r'\[\[([^\|^\]]+?)\]\]') re_outlink_2s = r'\1' re_pre_inline = re.compile(r"\{\{\{(.+?)\}\}\}") re_pre_inline_s = r'\1' for line in text.split('\n'): if in_pre: if line.startswith('}}}'): outstring += '' in_pre = False else: outstring += line + "\n" continue line = line.lstrip() if line.startswith('#'): #check for ordered lists if not in_olist: outstring += "
    " in_olist = 1 else: level = len(line[:in_olist+1].split('#')) - 1 delta = math.fabs(level - in_olist) while delta: level = len(line[:in_olist+1].split('#')) - 1 if level > in_olist: outstring += "
      " in_olist += 1 else: outstring += "
    " in_olist -= 1 delta = math.fabs(level - in_olist) line = "
  1. " + line[in_olist:] + "
  2. " elif in_olist: while in_olist: outstring += "
" in_olist -= 1 if line.startswith('*'): #check for unordered lists if not in_ulist: outstring += "" in_ulist -= 1 if line.startswith('|'): if not in_table: in_table = True outstring += "" parts = line.split('|') output = "" for p in parts[1:-1]: if p.startswith("="): output += "" else: output += "" output += "" line = output elif in_table: outstring += "
" + p[1:] + "" + p + "
" in_table = False if line == '': for fmt in formattings: if fmt[2]: outstring += '' fmt[2] = False outstring += "

" continue if line.startswith('%'): #comments start with %, so just ignore it continue if line.startswith('----'): outstring += "


" continue if line.startswith ('{{{'): outstring += "
"
			#if you start a line with {{{format, the pre gets its class set to that format
			in_pre = True
			continue


		for h in headings:
			if line.startswith(h[0]):
				line = line.strip(h[0])
				outstring += "<" + h[1] + " id='" + line + "'>"
				contents.append((line, h[2]))
				line = line + ""


		if line.count(r'\\'):
			line = line.replace(r'\\','
') line = re_pre_inline.sub(re_pre_inline_s,line) line = re_outlink_1.sub(re_outlink_1s,line) line = re_outlink_2.sub(re_outlink_2s,line) line = re_image_1.sub(re_image_1s,line) line = re_image_2.sub(re_image_2s,line) #these lines protect https and ftps from getting clobbered by the italics line = line.replace('http://','!http:~~!') line = line.replace('https://','!https:~~!') line = line.replace('ftp://','!ftp:~~!') for fmt in formattings: while line.count(fmt[0]): if fmt[2]: line = line.replace(fmt[0],'',1) fmt[2] = False else: line = line.replace(fmt[0],'<' + fmt[1] + '>',1) fmt[2] = True line = line.replace('!http:~~!','http://') line = line.replace('!https:~~!','https://') line = line.replace('!ftp:~~!','ftp://') #ine = re_italic.sub(re_italic_s,line) #ine = re_underline.sub(re_underline_s,line) outstring += line + "\n" if USE_TOC: if len(outstring) > TOC_LENGTH: outstring = tocmaker(contents) + outstring return outstring if __name__ == '__main__': string = "[[test]]" #print parse(string)