"""This is Textile
A Humane Web Text Generator

USING TEXTILE

Block modifier syntax:

Header: hn.
Paragraphs beginning with 'hn. ' (where n is 1-6) are wrapped in header tags.
Example: <h1>Text</h1>

Paragraph with CSS class: p(class).
Paragraphs beginning with 'p(class). ' receive a CSS class attribute.
Example: <p class="class">Text</p>

Blockquote: bq.
Paragraphs beginning with 'bq. ' are wrapped in block quote tags.
Example: <blockquote>Text</blockquote>

Blockquote with citation: bq(citeurl).
Paragraphs beginning with 'bq(citeurl). ' receive a citation attribute.
Example: <blockquote cite="citeurl">Text</blockquote>

Numeric list: #
Consecutive paragraphs beginning with # are wrapped in ordered list tags.
Example: <ol><li>ordered list</li></ol>
@code@ <code>computer code</code>
==notextile== leave text alone (do not format)
"linktext":url <a href="url">linktext</a>
"linktext(title)":url <a href="url" title="title">linktext</a>
!imageurl! tags;
map high-bit ASCII to HTML numeric entities
1.02 - 2003/03/19 - MAP - changed hyperlink qtag expression to only
match valid URL characters (per RFC 2396); fixed preg_replace to
not match across line breaks (solves lots of problems with
mistakenly matching overlapping inline markup); fixed whitespace
stripping to only strip whitespace from beginning and end of lines,
not immediately before and after HTML tags.
1.03 - 2003/03/20 - MAP - changed hyperlink qtag again to more
closely match original Textile (fixes problems with links
immediately followed by punctuation -- somewhere Dean is
grinning right now); handle curly apostrophe with "ve"
contraction; clean up empty titles at end.
1.04 - 2003/03/23 - MAP - lstrip input to deal with extra spaces at
beginning of first line; tweaked list loop to handle consecutive lists
1.1 - 2003/06/06 - MAP - created initial test suite for links and images,
and fixed a bunch of related bugs to pass them
"""
import re
# Debug verbosity threshold: _debug(s, level) prints s only when
# DEBUGLEVEL >= level, so the default of 0 silences all debug output.
DEBUGLEVEL = 0
def _debug(s, level=1):
    """Print *s* when the module-level DEBUGLEVEL is at least *level*.

    s     -- the value to print (callers in this file pass strings and
             tuples of regex groups)
    level -- minimum DEBUGLEVEL at which the message is shown (default 1)
    """
    if DEBUGLEVEL >= level:
        # Parenthesised single-argument form: produces the same output under
        # the Python 2 print statement and the Python 3 print function.
        print(s)
# map 8-bit ASCII codes to HTML numerical entity equivalents;
# a target of 0 means "no equivalent": the character is simply removed
_demoroniserMap = [
    (128, 8364), (129, 0), (130, 8218), (131, 402),
    (132, 8222), (133, 8230), (134, 8224), (135, 8225),
    (136, 710), (137, 8240), (138, 352), (139, 8249),
    (140, 338), (141, 0), (142, 0), (143, 0),
    (144, 0), (145, 8216), (146, 8217), (147, 8220),
    (148, 8221), (149, 8226), (150, 8211), (151, 8212),
    (152, 732), (153, 8482), (154, 353), (155, 8250),
    (156, 339), (157, 0), (158, 0), (159, 376),
]
# Turn the numeric table into (character, replacement) pairs.  The
# replacement must be a complete numeric character reference ("&#NNNN;");
# emitting only "NNNN;" would leave stray digits in the output.
_demoroniserMap = [(chr(a), ('&#%s;' % b) if b else '')
                   for (a, b) in _demoroniserMap]


def demoroniser(text):
    """Replace characters 128-159 in *text* with HTML numeric entities.

    Each character listed in _demoroniserMap is replaced by its "&#NNNN;"
    reference; characters whose table entry is 0 are deleted.  All other
    characters are left untouched.  Returns the converted string.
    """
    for a, b in _demoroniserMap:
        text = text.replace(a, b)
    return text
def preg_replace(pattern, replacement, text):
    """Replace every match of regex *pattern* in *text* using *replacement*.

    This acts like re.sub, except that a backreference to a group that did
    not take part in the match is replaced with '' instead of raising an
    exception, and no other escape sequence in *replacement* is interpreted.

    pattern     -- regular expression source string
    replacement -- template; "\\N" (N = 1, 2, ...) is replaced by the text
                   of group N.  References to groups that do not exist in
                   *pattern* (and "\\0") are left in the output unchanged.
    text        -- the string to search

    Returns the string with all substitutions applied.
    """
    # A backslash followed by a group number without a leading zero.
    backref = re.compile(r'\\([1-9][0-9]*)')

    def replacement_func(matchobj):
        groups = matchobj.groups()
        _debug(groups)

        def expand(ref):
            digits = ref.group(1)
            # Use the longest run of digits that names an existing group, so
            # "\10" means group 10 when the pattern has one, and otherwise
            # group 1 followed by a literal "0".
            for end in range(len(digits), 0, -1):
                number = int(digits[:end])
                if number <= len(groups):
                    return (groups[number - 1] or '') + digits[end:]
            return ref.group(0)

        # Expand the template in a single pass: text inserted from a group is
        # never rescanned, so a captured "\2" stays literal.
        return backref.sub(expand, replacement)

    p = re.compile(pattern)
    _debug(pattern)
    return p.sub(replacement_func, text)
# Quote-handling modes for htmlspecialchars().  The names follow PHP's
# flags; the numeric values are local to this module.
ENT_COMPAT = 0    # escape double quotes, leave single quotes alone
ENT_NOQUOTES = 1  # leave both kinds of quote alone
ENT_QUOTES = 2    # escape both double and single quotes


def htmlspecialchars(text, mode=ENT_COMPAT):
    """Escape the characters that are special in HTML.

    text -- the string to escape
    mode -- ENT_COMPAT (default), ENT_NOQUOTES or ENT_QUOTES; controls
            whether double and single quotes are escaped

    '&', '<' and '>' are always converted to '&amp;', '&lt;' and '&gt;'.
    Returns the escaped string.
    """
    # Ampersands first, so the entities produced below are not escaped twice.
    text = text.replace('&', '&amp;')
    if mode != ENT_NOQUOTES:
        text = text.replace('"', '&quot;')
    if mode == ENT_QUOTES:
        text = text.replace("'", '&#039;')
    text = text.replace('<', '&lt;')
    text = text.replace('>', '&gt;')
    return text
def textile(text):
# Convert Textile-style plain-text markup in `text` and return the result
# as a string.  The work is done in the stages marked by the "###" headings
# below: global clean-up, inline quick tags, typographic glyphs, block-level
# formatting, and a final clean-up pass.
#
# NOTE(review): this copy of the function has lost its indentation, and many
# of the string literals look as if HTML tags/entities were stripped out of
# them (empty patterns, replace() calls that map a string to itself,
# replacement templates with no markup).  Several statements near the end of
# the block loop are not valid Python as written.  Restore the literals from
# a pristine copy before relying on this code -- TODO confirm.
### Basic global changes
text = text.lstrip()
# turn any incoming ampersands into a dummy character for now.
# This uses a negative lookahead for alphanumerics followed by a semicolon,
# implying an incoming html entity, to be skipped
text = preg_replace(r"&(?![#a-zA-Z0-9]+;)", r"x%x%", text);
# entify illegal high-bit ASCII
text = demoroniser(text)
# unentify angle brackets and ampersands
# NOTE(review): as written, each replace() below maps a string to itself and
# therefore changes nothing; the entity names were presumably lost.
text = text.replace(">", ">").replace("<", "<").replace("&", "&")
# zap carriage returns
text = text.replace("\r\n", "\n")
text = text.replace("\r", "\n")
# if there is no html, trim each line unequivocally
if not re.search(r'''<.*>''', text):
# trim each line
text = "\n".join([l.strip() for l in text.split("\n")])
else:
# else split the text into an array at <> and only trim lines
# that are not within a tag
lines = []
pre = 0
for line in re.split('(<.*?>)', text):
# NOTE(review): re.match('') succeeds for every string, so as written the
# first branch always runs and `pre` stays 1; the tag names that both
# patterns should test for appear to be missing.
if re.match('', line.lower()):
pre = 1
elif re.match('', line.lower()):
pre = 0
if not pre:
line = preg_replace('''(\\s*?)\n(\\s*?)''', '\n', line)
lines.append(line)
text = ''.join(lines)
### Find and replace quick tags
# double equal signs mean "leave this text alone" (see ==notextile== in the
# module docstring)
text = preg_replace(r"""(^|\s)==(.*?)==([^\w]{0,2})""", r"""\1\2 \3""", text);
# image qtag
text = preg_replace(r"""!([^\s\(=!]+?)\s?(\(([^\)]+?)\))?!""", r"""
""", text);
# image with hyperlink
hyperlink = r"""(\S+?)([^\w\s\/;=\?]*?)(\s|$)"""
text = preg_replace(r"""( ):""" + hyperlink, r"""\1\3\4""", text);
# hyperlink qtag
text = preg_replace(r'''"([^"\(]+)\s?(\(([^\)]+)\))?":''' + hyperlink, r'''\1\5\6''', text)
# arrange qtag delineators and replacements in an array
qtags = [(r'\*\*', 'b'),
(r'\*', 'strong'),
(r'\?\?', 'cite'),
(r'-', 'del'),
(r'\+', 'ins'),
(r'~', 'sub'),
(r'@', 'code')]
# loop through the array, replacing qtags with html
for texttag, htmltag in qtags:
text = preg_replace(r'''(^|\s|>)''' + texttag + r'''\b(.+?)\b([^\w\s]*?)''' + texttag + r'''([^\w\s]{0,2})''',
r'''\1<''' + htmltag + r'''>\2\3''' + htmltag + r'''>\4''',
text);
# some weird bs with underscores and \b word boundaries,
# so we'll do those on their own
text = preg_replace(r'''(^|\s)__(.*?)__([^\w\s]{0,2})''', r'''\1\2\3''', text)
text = preg_replace(r'''(^|\s)_(.*?)_([^\w\s]{0,2})''', r'''\1\2\3''', text)
text = preg_replace(r'''\^(.*?)\^''', r'''\1''', text)
### Find and replace typographic chars and special tags
# small problem with double quotes at the end of a string, so add a dummy space
text = preg_replace(r'''"$''', r'''" ''', text);
# NB: all these will wreak havoc inside tags
glyphs = [(r'''([^\s[{(>])?\'([dmst]\b|ll\b|ve\b|\s|:|$)''', r'''\1’\2'''), # single closing
(r'''\'''', r'''‘'''), # single opening
(r'''([^\s[{(])?"(\s|:|$)''', r'''\1”\2'''), # double closing
(r'''"''', r'''“'''), # double opening
(r'''\b( )?\.{3}''', r'''\1…'''), # ellipsis
(r'''\b([A-Z][A-Z0-9]{2,})\b(\(([^\)]+)\))''', r'''\1'''), # 3+ uppercase acronym
# NOTE(review): the next entry is a single bare string (no replacement
# element) and its pattern ends inside an unclosed character class, so it
# cannot be unpacked as (search, replace) by the loops below -- it looks
# truncated.
(r'''(^|[^"][>\s])([A-Z][A-Z0-9 ]{2,})([^\2\3'''), # 3+ uppercase caps
(r'''\s?--\s?''', r'''—'''), # em dash
(r'''\s-\s''', r''' – '''), # en dash
(r'''(\d+) ?x ?(\d+)''', r'''\1×\2'''), # dimension sign
(r'''\b ?(\((tm|TM)\))''', r'''™'''), # trademark
(r'''\b ?(\([rR]\))''', r'''®'''), # registered
(r'''\b ?(\([cC]\))''', r'''©'''), # copyright
]
# set toggle for turning off glyph replacements inside code/pre/kbd/notextile
# sections (see the tag test in the loop below)
codepre = 0
# if there is no html, do a simple search and replace
if not re.search(r'''<.*>''', text):
for glyph_search, glyph_replace in glyphs:
_debug(text)
_debug("applying %s" % glyph_search)
text = preg_replace(glyph_search, glyph_replace, text)
_debug(text)
else:
lines = []
# else split the text into an array at <>
for line in re.split('(<.*?>)', text):
if re.match('<(code|pre|kbd|notextile)>', line.lower()):
codepre = 1
elif re.match('(code|pre|kbd|notextile)>', line.lower()):
codepre = 0
# only plain text segments (not tags) outside a protected section get the
# glyph substitutions
if (not re.match('<.*?>', line)) and (not codepre):
for glyph_search, glyph_replace in glyphs:
line = preg_replace(glyph_search, glyph_replace, line)
if codepre:
# escape <>& if between
line = htmlspecialchars(line, ENT_NOQUOTES)
line = line.replace('<pre>', '')
line = line.replace('<code>', '')
lines.append(line)
text = ''.join(lines)
### Block level formatting
# deal with forced breaks; this is going to be a problem between
# tags, but we'll clean them later
text = preg_replace('''(\\S)(_*?)([^\\w\\s]*?) *?\n([^#*\\s])''', r'''\1\2\3
\4''', text)
# might be a problem with lists
text = text.replace(r'''l>
''', '''l>\n''')
blocks = [(r'''^\s?\*\s(.*)''', '''\t\\1 '''), # bulleted list *
(r'''^\s?#\s(.*)''', '''\t\\1 '''), # numeric list #
(r'''^bq\. (.*)''', '''\t\\1
'''), # blockquote bq.
(r'''^h(\d)\(([\w]+)\)\.\s(.*)''', '''\t\\3 '''), # header hn(class). w/ css class
(r'''^h(\d)\. (.*)''', '''\t\\2 '''), # plain header hn.
(r'''^p\(([\w]+)\)\.\s(.*)''', '''\t\\2
'''), # para p(class). w/ css class
(r'''^p\. (.*)''', '''\t\\1
'''), # plain paragraph
('''^([^\t ]+.*)''', '''\t\\1
''') # remaining plain paragraph
]
# `list` holds the kind of list currently open ('' when none); note that the
# name shadows the builtin inside this function
list = ''
pre = 0
rc = []
# the extra ' ' element gives the loop one more pass after the last real line
for line in text.split('\n') + [' ']:
# make sure line isn't blank
if line:
# matches are off if we're between or tags
if re.search('', line.lower()):
pre = 1
# deal with block replacements first, then see if we're in a list
if not pre:
for block_find, block_replace in blocks:
line = preg_replace(block_find, block_replace, line)
# kill any br tags that slipped in earlier
if pre:
line = line.replace(r'''
''', '''\n''')
# matches back on after
# NOTE(review): from here to the end of this loop several statements are
# damaged (a quoted pattern split across lines, `if`/`elif` headers without
# a colon or body); they cannot run as written.
if re.search('
', line.lower()):
pre = 0
# at the beginning of a list, $line switches to a value
if (not list) and re.match('\t\n\\1\\2''', line)
list = line[2] # "u" or "o", presumably
elif list and (not re.match('''\t \n\\1''', line)
list = ''
rc.append(line)
text = '\n'.join(rc)
#clean up
text = preg_replace(r'''<\/?notextile>''', "", text)
# clean up liu and lio
text = preg_replace(r'''<(\/?)li(u|o)>''', r'''<\1li>''', text)
# clean up empty titles
text = text.replace(' title=""', '')
# turn the temp char back to an ampersand entity
text = text.replace("x%x%", "&")
# Newline linebreaks, just for markup tidiness
text = text.replace('''
''', '''
\n''')
return text
if __name__ == '__main__':
    # Command-line filter: read Textile markup from standard input and write
    # the converted text to standard output.
    import sys
    # Parenthesised single-argument print behaves identically as a Python 2
    # statement and as a Python 3 function call.
    print(textile(sys.stdin.read()))