#!/usr/bin/python
"""markdown.py: A Markdown-styled-text to HTML converter in Python.

Usage:
  ./markdown.py textfile.markdown

Calling:
  import markdown
  somehtml = markdown.markdown(sometext)

For other versions of markdown, see:
  http://www.freewisdom.org/projects/python-markdown/
  http://en.wikipedia.org/wiki/Markdown
"""
| 15 | |
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import hashlib
import re
import sys

try:
    import md5  # Python 2 only; kept for code that still calls md5.new()
except ImportError:  # the module was removed in Python 3; hashlib replaces it
    md5 = None
| 23 | |
# Module metadata.
__version__ = '1.0.1-2'  # port of 1.0.1
__license__ = "GNU GPL 2"
__author__ = [
    'John Gruber <http://daringfireball.net/>',
    'Tollef Fog Heen <tfheen@err.no>',
    'Aaron Swartz <me@aaronsw.com>'
]
| 31 | |
def htmlquote(text):
    """Encodes `text` for raw use in HTML.

    The five characters that are significant in HTML text and attribute
    values (``&``, ``<``, ``>``, ``'`` and ``"``) are replaced by character
    references.  Returns the encoded string.
    """
    # The ampersand must be handled first; otherwise the ampersands that
    # introduce the other references would themselves be re-encoded.
    text = text.replace("&", "&amp;")
    text = text.replace("<", "&lt;")
    text = text.replace(">", "&gt;")
    text = text.replace("'", "&#39;")
    text = text.replace('"', "&quot;")
    return text
| 40 | |
def semirandom(seed):
    """Return a deterministic pseudo-random float in [0, 1] derived from `seed`.

    The value is the sum of the 16 bytes of the MD5 digest of `seed`
    divided by the largest possible sum (255 * 16), so the same seed always
    yields the same number.  `seed` may be text or bytes; text is encoded as
    UTF-8 before hashing.
    """
    if not isinstance(seed, bytes):
        seed = seed.encode("utf-8")
    # bytearray() yields integer byte values on both Python 2 and Python 3.
    digest = bytearray(hashlib.md5(seed).digest())
    return sum(digest) / (255*16.)
| 45 | |
class _Markdown:
    """Markdown-to-HTML converter; call `parse(text)` to convert a document.

    The conversion is a pipeline of regular-expression passes.  HTML blocks
    and characters that must survive later passes untouched are swapped for
    MD5 hex digests (see `escapetable` and `html_blocks`) and swapped back
    at the end.  Per-document state (`urls`, `titles`, `html_blocks`,
    `list_level`) is reset at the start of every `parse` call.
    """
    emptyelt = " />"   # how empty elements (<hr>, <br>, <img>) are closed
    tabwidth = 4

    escapechars = '\\`*_{}[]()>#+-.!'

    def _md5_hex(text):
        """Return the hexadecimal MD5 digest of `text` (text or bytes)."""
        if not isinstance(text, bytes):
            text = text.encode("utf-8")
        return hashlib.md5(text).hexdigest()

    # Maps every escapable character to its MD5 hex digest; the digest
    # stands in for the literal character until _UnescapeSpecialChars runs.
    escapetable = {}
    for char in escapechars:
        escapetable[char] = _md5_hex(char)
    # Needed as a plain function above (class body), as a static method below.
    _md5_hex = staticmethod(_md5_hex)

    r_multiline = re.compile("\n{2,}")
    r_stripspace = re.compile(r"^[ \t]+$", re.MULTILINE)

    def parse(self, text):
        """Convert the Markdown document `text` and return the HTML string."""
        self.urls = {}
        self.titles = {}
        self.html_blocks = {}
        self.list_level = 0

        # Normalise line endings and guarantee the text ends in a blank line.
        text = text.replace("\r\n", "\n")
        text = text.replace("\r", "\n")
        text += "\n\n"
        text = self._Detab(text)
        text = self.r_stripspace.sub("", text)
        text = self._HashHTMLBlocks(text)
        text = self._StripLinkDefinitions(text)
        text = self._RunBlockGamut(text)
        text = self._UnescapeSpecialChars(text)
        return text

    r_StripLinkDefinitions = re.compile(r"""
        ^[ ]{0,%d}\[(.+)\]:   # id = $1
          [ \t]*\n?[ \t]*
        <?(\S+?)>?            # url = $2
          [ \t]*\n?[ \t]*
        (?:
            (?<=\s)           # lookbehind for whitespace
            [\"\(]            # " is backslashed so it colorizes our code right
            (.+?)             # title = $3
            [\"\)]
            [ \t]*
        )?                    # title is optional
        (?:\n+|\Z)
        """ % (tabwidth-1), re.MULTILINE|re.VERBOSE)

    def _StripLinkDefinitions(self, text):
        """Remove link definitions from `text`, recording them on self.

        Each ``[id]: url "title"`` definition is stored in `self.urls` and
        (when a title is present) `self.titles`, keyed by the lower-cased id.
        """
        def replacefunc(matchobj):
            (t1, t2, t3) = matchobj.groups()
            #@@ case sensitivity?
            self.urls[t1.lower()] = self._EncodeAmpsAndAngles(t2)
            if t3 is not None:
                self.titles[t1.lower()] = t3.replace('"', '&quot;')
            return ""

        text = self.r_StripLinkDefinitions.sub(replacefunc, text)
        return text

    blocktagsb = r"p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|math"
    blocktagsa = blocktagsb + "|ins|del"

    r_HashHTMLBlocks1 = re.compile(r"""
        (               # save in $1
        ^               # start of line (with /m)
        <(%s)           # start tag = $2
        \b              # word break
        (.*\n)*?        # any number of lines, minimally matching
        </\2>           # the matching end tag
        [ \t]*          # trailing spaces/tabs
        (?=\n+|$)       # followed by a newline or end of document
        )
        """ % blocktagsa, re.MULTILINE | re.VERBOSE)

    r_HashHTMLBlocks2 = re.compile(r"""
        (               # save in $1
        ^               # start of line (with /m)
        <(%s)           # start tag = $2
        \b              # word break
        (.*\n)*?        # any number of lines, minimally matching
        .*</\2>         # the matching end tag
        [ \t]*          # trailing spaces/tabs
        (?=\n+|\Z)      # followed by a newline or end of document
        )
        """ % blocktagsb, re.MULTILINE | re.VERBOSE)

    r_HashHR = re.compile(r"""
        (?:
            (?<=\n\n)   # Starting after a blank line
            |           # or
            \A\n?       # the beginning of the doc
        )
        (               # save in $1
            [ ]{0,%d}
            <(hr)       # start tag = $2
            \b          # word break
            ([^<>])*?   #
            /?>         # the matching end tag
            [ \t]*
            (?=\n{2,}|\Z)# followed by a blank line or end of document
        )
        """ % (tabwidth-1), re.VERBOSE)

    r_HashComment = re.compile(r"""
        (?:
            (?<=\n\n)   # Starting after a blank line
            |           # or
            \A\n?       # the beginning of the doc
        )
        (               # save in $1
            [ ]{0,%d}
            (?:
                <!
                (--.*?--\s*)+
                >
            )
            [ \t]*
            (?=\n{2,}|\Z)# followed by a blank line or end of document
        )
        """ % (tabwidth-1), re.VERBOSE)

    def _HashHTMLBlocks(self, text):
        """Replace block-level HTML in `text` with MD5 placeholder keys.

        Each matched block is stored in `self.html_blocks` under the MD5 hex
        digest of its text, and the block is replaced by that key surrounded
        by blank lines.
        """
        def handler(m):
            key = self._md5_hex(m.group(1))
            self.html_blocks[key] = m.group(1)
            return "\n\n%s\n\n" % key

        text = self.r_HashHTMLBlocks1.sub(handler, text)
        text = self.r_HashHTMLBlocks2.sub(handler, text)
        text = self.r_HashHR.sub(handler, text)
        text = self.r_HashComment.sub(handler, text)
        return text

    #@@@ wrong!  (marker left by the original author; the concern is not
    # stated in the code)
    r_hr1 = re.compile(r'^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$', re.M)
    r_hr2 = re.compile(r'^[ ]{0,2}([ ]?-[ ]?){3,}[ \t]*$', re.M)
    r_hr3 = re.compile(r'^[ ]{0,2}([ ]?_[ ]?){3,}[ \t]*$', re.M)

    def _RunBlockGamut(self, text):
        """Apply all block-level transformations to `text` and return it."""
        text = self._DoHeaders(text)
        for x in [self.r_hr1, self.r_hr2, self.r_hr3]:
            text = x.sub("\n<hr%s\n" % self.emptyelt, text)
        text = self._DoLists(text)
        text = self._DoCodeBlocks(text)
        text = self._DoBlockQuotes(text)

        # We did this in parse()
        # to escape the source
        # now it's stuff _we_ made
        # so we don't wrap it in <p>s.
        text = self._HashHTMLBlocks(text)
        text = self._FormParagraphs(text)
        return text

    r_NewLine = re.compile(" {2,}\n")

    def _RunSpanGamut(self, text):
        """Apply all span-level (inline) transformations to `text`."""
        text = self._DoCodeSpans(text)
        text = self._EscapeSpecialChars(text)
        text = self._DoImages(text)
        text = self._DoAnchors(text)
        text = self._DoAutoLinks(text)
        text = self._EncodeAmpsAndAngles(text)
        text = self._DoItalicsAndBold(text)
        text = self.r_NewLine.sub(" <br%s\n" % self.emptyelt, text)
        return text

    def _EscapeSpecialChars(self, text):
        """Protect `*`/`_` inside HTML tags and backslash escapes elsewhere."""
        pieces = []
        for cur_token in self._TokenizeHTML(text):
            if cur_token[0] == "tag":
                # Inside a tag, emphasis characters must not be interpreted.
                tag = cur_token[1].replace('*', self.escapetable["*"])
                tag = tag.replace('_', self.escapetable["_"])
                pieces.append(tag)
            else:
                pieces.append(self._EncodeBackslashEscapes(cur_token[1]))
        return "".join(pieces)

    r_DoAnchors1 = re.compile(
        r""" (              # wrap whole match in $1
                \[
                (.*?)       # link text = $2
                # [for bracket nesting, see below]
                \]

                [ ]?        # one optional space
                (?:\n[ ]*)? # one optional newline followed by spaces

                \[
                (.*?)       # id = $3
                \]
             )
        """, re.S|re.VERBOSE)
    r_DoAnchors2 = re.compile(
        r""" (              # wrap whole match in $1
                \[
                (.*?)       # link text = $2
                \]
                \(          # literal paren
                [ \t]*
                <?(.+?)>?   # href = $3
                [ \t]*
                (           # $4
                ([\'\"])    # quote char = $5
                (.*?)       # Title = $6
                \5          # matching quote
                )?          # title is optional
                \)
             )
        """, re.S|re.VERBOSE)

    def _DoAnchors(self, text):
        """Turn reference-style and inline Markdown links into <a> tags."""
        # We here don't do the same as the perl version, as python's regex
        # engine gives us no way to match brackets.

        def handler1(m):
            # Reference-style link: [text][id]; an empty id means "use text".
            whole_match = m.group(1)
            link_text = m.group(2)
            link_id = m.group(3).lower()
            if not link_id:
                link_id = link_text.lower()
            title = self.titles.get(link_id, None)

            if link_id not in self.urls:
                # Unknown reference: leave the source text untouched.
                return whole_match

            url = self.urls[link_id]
            url = url.replace("*", self.escapetable["*"])
            url = url.replace("_", self.escapetable["_"])
            res = '<a href="%s"' % htmlquote(url)

            if title:
                title = title.replace("*", self.escapetable["*"])
                title = title.replace("_", self.escapetable["_"])
                res += ' title="%s"' % htmlquote(title)
            res += ">%s</a>" % htmlquote(link_text)
            return res

        def handler2(m):
            # Inline link: [text](url "title")
            link_text = m.group(2)
            url = m.group(3)
            title = m.group(6)

            url = url.replace("*", self.escapetable["*"])
            url = url.replace("_", self.escapetable["_"])
            res = '''<a href="%s"''' % htmlquote(url)

            if title:
                title = title.replace('"', '&quot;')
                title = title.replace("*", self.escapetable["*"])
                title = title.replace("_", self.escapetable["_"])
                res += ' title="%s"' % htmlquote(title)
            res += ">%s</a>" % htmlquote(link_text)
            return res

        text = self.r_DoAnchors1.sub(handler1, text)
        text = self.r_DoAnchors2.sub(handler2, text)
        return text

    r_DoImages1 = re.compile(
        r""" (              # wrap whole match in $1
                !\[
                (.*?)       # alt text = $2
                \]

                [ ]?        # one optional space
                (?:\n[ ]*)? # one optional newline followed by spaces

                \[
                (.*?)       # id = $3
                \]

             )
        """, re.VERBOSE|re.S)

    r_DoImages2 = re.compile(
        r""" (              # wrap whole match in $1
                !\[
                (.*?)       # alt text = $2
                \]
                \(          # literal paren
                [ \t]*
                <?(\S+?)>?  # src url = $3
                [ \t]*
                (           # $4
                ([\'\"])    # quote char = $5
                (.*?)       # title = $6
                \5          # matching quote
                [ \t]*
                )?          # title is optional
                \)
             )
        """, re.VERBOSE|re.S)

    def _DoImages(self, text):
        """Turn reference-style and inline Markdown images into <img> tags."""
        def handler1(m):
            # Reference-style image: ![alt][id]; an empty id means "use alt".
            whole_match = m.group(1)
            alt_text = m.group(2)
            link_id = m.group(3).lower()

            if not link_id:
                link_id = alt_text.lower()

            alt_text = alt_text.replace('"', "&quot;")
            if link_id not in self.urls:
                # Unknown reference: leave the source text untouched.
                return whole_match

            url = self.urls[link_id]
            url = url.replace("*", self.escapetable["*"])
            url = url.replace("_", self.escapetable["_"])
            res = '''<img src="%s" alt="%s"''' % (htmlquote(url), htmlquote(alt_text))
            if link_id in self.titles:
                title = self.titles[link_id]
                title = title.replace("*", self.escapetable["*"])
                title = title.replace("_", self.escapetable["_"])
                res += ' title="%s"' % htmlquote(title)
            res += self.emptyelt
            return res

        def handler2(m):
            # Inline image: ![alt](url "title")
            alt_text = m.group(2)
            url = m.group(3)
            # A missing title becomes '', so a title attribute (possibly
            # empty) is always emitted for inline images.
            title = m.group(6) or ''

            alt_text = alt_text.replace('"', "&quot;")
            title = title.replace('"', "&quot;")
            url = url.replace("*", self.escapetable["*"])
            url = url.replace("_", self.escapetable["_"])
            res = '<img src="%s" alt="%s"' % (htmlquote(url), htmlquote(alt_text))
            title = title.replace("*", self.escapetable["*"])
            title = title.replace("_", self.escapetable["_"])
            res += ' title="%s"' % htmlquote(title)
            res += self.emptyelt
            return res

        text = self.r_DoImages1.sub(handler1, text)
        text = self.r_DoImages2.sub(handler2, text)
        return text

    r_DoHeaders = re.compile(r"^(\#{1,6})[ \t]*(.+?)[ \t]*\#*\n+", re.VERBOSE|re.M)

    def _DoHeaders(self, text):
        """Convert setext (underlined) and atx (`#`-prefixed) headers."""
        def findheader(text, c, n):
            # A setext header is a non-blank line followed by a line made
            # only of `c` characters and then a blank line.  The underline
            # and the blank line are dropped; the title becomes <hN>.
            textl = text.split('\n')
            i = 0
            while i < len(textl):
                stripped = textl[i].strip()
                count = stripped.count(c)
                # The bounds guard keeps the neighbour lookups inside the list.
                if (0 < i < len(textl) - 1
                        and count > 0 and count == len(stripped)
                        and textl[i+1].strip() == ''
                        and textl[i-1].strip() != ''):
                    textl = textl[:i] + textl[i+1:]      # drop the underline
                    textl[i-1] = '<h'+n+'>'+self._RunSpanGamut(textl[i-1])+'</h'+n+'>'
                    textl = textl[:i] + textl[i+1:]      # drop the blank line
                i += 1
            return '\n'.join(textl)

        def handler(m):
            level = len(m.group(1))
            header = self._RunSpanGamut(m.group(2))
            return "<h%s>%s</h%s>\n\n" % (level, header, level)

        text = findheader(text, '=', '1')
        text = findheader(text, '-', '2')
        text = self.r_DoHeaders.sub(handler, text)
        return text

    rt_l = r"""
    (
      (
        [ ]{0,%d}
        ([*+-]|\d+[.])
        [ \t]+
      )
      (?:.+?)
      (
        \Z
      |
        \n{2,}
        (?=\S)
        (?![ \t]* ([*+-]|\d+[.])[ \t]+)
      )
    )
    """ % (tabwidth - 1)
    r_DoLists = re.compile('^'+rt_l, re.M | re.VERBOSE | re.S)
    r_DoListsTop = re.compile(
        r'(?:\A\n?|(?<=\n\n))'+rt_l, re.M | re.VERBOSE | re.S)

    def _DoLists(self, text):
        """Convert Markdown lists in `text` into <ul>/<ol> elements."""
        def handler(m):
            list_type = "ol"
            if m.group(3) in ["*", "-", "+"]:
                list_type = "ul"
            listn = m.group(1)
            listn = self.r_multiline.sub("\n\n\n", listn)
            res = self._ProcessListItems(listn)
            res = "<%s>\n%s</%s>\n" % (list_type, res, list_type)
            return res

        # Inside a list any line may start a sub-list; at the top level a
        # list must start the document or follow a blank line.
        if self.list_level:
            text = self.r_DoLists.sub(handler, text)
        else:
            text = self.r_DoListsTop.sub(handler, text)
        return text

    r_multiend = re.compile(r"\n{2,}\Z")
    r_ProcessListItems = re.compile(r"""
        (\n)?                           # leading line = $1
        (^[ \t]*)                       # leading whitespace = $2
        ([*+-]|\d+[.]) [ \t]+           # list marker = $3
        ((?:.+?)                        # list item text = $4
        (\n{1,2}))
        (?= \n* (\Z | \2 ([*+-]|\d+[.]) [ \t]+))
        """, re.VERBOSE | re.M | re.S)

    def _ProcessListItems(self, text):
        """Convert the body of one list into a run of <li> elements."""
        self.list_level += 1
        text = self.r_multiend.sub("\n", text)

        def handler(m):
            item = m.group(4)
            leading_line = m.group(1)

            if leading_line or self.r_multiline.search(item):
                # Item separated by blank lines: treat as block content.
                item = self._RunBlockGamut(self._Outdent(item))
            else:
                # Recursion for sub-lists, then inline processing.
                item = self._DoLists(self._Outdent(item))
                if item.endswith("\n"):
                    item = item[:-1]  # chomp
                item = self._RunSpanGamut(item)
            return "<li>%s</li>\n" % item

        text = self.r_ProcessListItems.sub(handler, text)
        self.list_level -= 1
        return text

    r_DoCodeBlocks = re.compile(r"""
        (?:\n\n|\A)
        (                 # $1 = the code block
        (?:
            (?:[ ]{%d} | \t)  # Lines must start with a tab or equiv
            .*\n+
        )+
        )
        ((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space/end of doc
        """ % (tabwidth, tabwidth), re.M | re.VERBOSE)

    def _DoCodeBlocks(self, text):
        """Wrap indented code blocks in <pre><code> elements."""
        def handler(m):
            codeblock = m.group(1)
            codeblock = self._EncodeCode(self._Outdent(codeblock))
            codeblock = self._Detab(codeblock)
            codeblock = codeblock.lstrip("\n")
            codeblock = codeblock.rstrip()
            res = "\n\n<pre><code>%s\n</code></pre>\n\n" % codeblock
            return res

        text = self.r_DoCodeBlocks.sub(handler, text)
        return text

    r_DoCodeSpans = re.compile(r"""
        (`+)            # $1 = Opening run of `
        (.+?)           # $2 = The code block
        (?<!`)
        \1              # Matching closer
        (?!`)
        """, re.I|re.VERBOSE)

    def _DoCodeSpans(self, text):
        """Wrap backtick-delimited spans in <code> elements."""
        def handler(m):
            c = m.group(2)
            c = c.strip()
            c = self._EncodeCode(c)
            return "<code>%s</code>" % c

        text = self.r_DoCodeSpans.sub(handler, text)
        return text

    def _EncodeCode(self, text):
        """Make `text` safe to show literally inside a <code> element.

        HTML-significant characters become entities, and characters that
        later passes would treat as Markdown syntax are swapped for their
        `escapetable` placeholders.
        """
        text = text.replace("&", "&amp;")  # must precede the other entities
        text = text.replace("<", "&lt;")
        text = text.replace(">", "&gt;")
        for c in "*_{}[]\\":
            text = text.replace(c, self.escapetable[c])
        return text

    r_DoBold = re.compile(r"(\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1", re.VERBOSE | re.S)
    r_DoItalics = re.compile(r"(\*|_) (?=\S) (.+?) (?<=\S) \1", re.VERBOSE | re.S)

    def _DoItalicsAndBold(self, text):
        """Convert `**x**`/`__x__` to <strong> and `*x*`/`_x_` to <em>."""
        # <strong> must go first, or its markers would be read as emphasis.
        text = self.r_DoBold.sub(r"<strong>\2</strong>", text)
        text = self.r_DoItalics.sub(r"<em>\2</em>", text)
        return text

    r_start = re.compile(r"^", re.M)
    r_DoBlockQuotes1 = re.compile(r"^[ \t]*>[ \t]?", re.M)
    r_DoBlockQuotes2 = re.compile(r"^[ \t]+$", re.M)
    r_DoBlockQuotes3 = re.compile(r"""
        (                       # Wrap whole match in $1
         (
           ^[ \t]*>[ \t]?       # '>' at the start of a line
           .+\n                 # rest of the first line
           (.+\n)*              # subsequent consecutive lines
           \n*                  # blanks
          )+
        )""", re.M | re.VERBOSE)
    r_protectpre = re.compile(r'(\s*<pre>.+?</pre>)', re.S)
    r_propre = re.compile(r'^ ', re.M)

    def _DoBlockQuotes(self, text):
        """Convert `>`-prefixed runs of lines into <blockquote> elements."""
        def prehandler(m):
            # Undo the indentation added below inside <pre> blocks.
            return self.r_propre.sub('', m.group(1))

        def handler(m):
            bq = m.group(1)
            bq = self.r_DoBlockQuotes1.sub("", bq)   # trim one level of quoting
            bq = self.r_DoBlockQuotes2.sub("", bq)   # trim whitespace-only lines
            bq = self._RunBlockGamut(bq)             # recurse
            bq = self.r_start.sub(" ", bq)           # indent the quoted content
            bq = self.r_protectpre.sub(prehandler, bq)
            return "<blockquote>\n%s\n</blockquote>\n\n" % bq

        text = self.r_DoBlockQuotes3.sub(handler, text)
        return text

    r_tabbed = re.compile(r"^([ \t]*)")

    def _FormParagraphs(self, text):
        """Wrap blank-line-separated chunks of `text` in <p> elements.

        A chunk that is a placeholder key from `self.html_blocks` is not
        wrapped; it is replaced by the stored HTML block instead.
        """
        text = text.strip("\n")
        grafs = self.r_multiline.split(text)

        for g, graf in enumerate(grafs):
            t = graf.strip() #@@?
            if t in self.html_blocks:
                grafs[g] = self.html_blocks[t]
            else:
                t = self._RunSpanGamut(t)
                t = self.r_tabbed.sub(r"<p>", t)
                grafs[g] = t + "</p>"

        return "\n\n".join(grafs)

    r_EncodeAmps = re.compile(r"&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)")
    r_EncodeAngles = re.compile(r"<(?![a-z/?\$!])")

    def _EncodeAmpsAndAngles(self, text):
        """Encode `&` that is not part of an entity and `<` that opens no tag."""
        text = self.r_EncodeAmps.sub("&amp;", text)
        text = self.r_EncodeAngles.sub("&lt;", text)
        return text

    def _EncodeBackslashEscapes(self, text):
        """Replace each backslash-escaped character with its placeholder."""
        for char in self.escapechars:
            text = text.replace("\\" + char, self.escapetable[char])
        return text

    r_link = re.compile(r"<((https?|ftp):[^\'\">\s]+)>", re.I)
    r_email = re.compile(r"""
        <
        (?:mailto:)?
        (
            [-.\w]+
            \@
            [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
        )
        >""", re.VERBOSE|re.I)

    def _DoAutoLinks(self, text):
        """Convert `<url>` and `<address@example.com>` into links."""
        text = self.r_link.sub(r'<a href="\1">\1</a>', text)

        def handler(m):
            l = m.group(1)
            return self._EncodeEmailAddress(self._UnescapeSpecialChars(l))

        text = self.r_email.sub(handler, text)
        return text

    r_EncodeEmailAddress = re.compile(r">.+?:")

    def _EncodeEmailAddress(self, text):
        """Return a mailto: link for the address `text`, entity-obfuscated.

        Each character is written as a hex entity, a decimal entity or
        itself; the choice is driven by `semirandom` of the output built so
        far, so the same address always yields the same markup.
        """
        encode = [
            lambda x: "&#%s;" % ord(x),
            lambda x: "&#x%X;" % ord(x),
            lambda x: x
        ]

        text = "mailto:" + text
        addr = ""
        for c in text:
            if c == ':':
                addr += c
                continue

            r = semirandom(addr)
            if r < 0.45:
                addr += encode[1](c)
            elif r > 0.9 and c != '@':
                # '@' is never left unencoded.
                addr += encode[2](c)
            else:
                addr += encode[0](c)

        text = '<a href="%s">%s</a>' % (addr, addr)
        # Drop the encoded "mailto:" prefix from the visible link text.
        text = self.r_EncodeEmailAddress.sub('>', text)
        return text

    def _UnescapeSpecialChars(self, text):
        """Swap every `escapetable` placeholder back to its character."""
        for key in self.escapetable:
            text = text.replace(self.escapetable[key], key)
        return text

    tokenize_depth = 6
    tokenize_nested_tags = '|'.join([r'(?:<[a-z/!$](?:[^<>]'] * tokenize_depth) + (')*>)' * tokenize_depth)
    r_TokenizeHTML = re.compile(
        r"""(?: <! ( -- .*? -- \s* )+ > ) |  # comment
            (?: <\? .*? \?> ) |              # processing instruction
            %s                               # nested tags
        """ % tokenize_nested_tags, re.I|re.VERBOSE)

    def _TokenizeHTML(self, text):
        """Split `text` into a list of ["tag", s] and ["text", s] tokens.

        Tokens are two-element lists in document order; empty text runs
        between adjacent tags are omitted.
        """
        pos = 0
        tokens = []
        for matchobj in self.r_TokenizeHTML.finditer(text):
            if pos < matchobj.start():
                tokens.append(["text", text[pos:matchobj.start()]])
            tokens.append(["tag", matchobj.group(0)])
            pos = matchobj.end()

        if pos < len(text):
            tokens.append(["text", text[pos:]])
        return tokens

    r_Outdent = re.compile(r"""^(\t|[ ]{1,%d})""" % tabwidth, re.M)

    def _Outdent(self, text):
        """Remove one level of indentation from every line of `text`."""
        text = self.r_Outdent.sub("", text)
        return text

    def _Detab(self, text):
        """Expand tabs in `text` to spaces using `self.tabwidth`."""
        return text.expandtabs(self.tabwidth)
| 675 | |
def Markdown(*args, **kw):
    """Convert Markdown text to HTML with a fresh `_Markdown` converter.

    All arguments are forwarded to `_Markdown.parse`; its result is returned.
    """
    return _Markdown().parse(*args, **kw)


# Lower-case alias, as used in the module docstring's calling example.
markdown = Markdown
| 678 | |
# Script entry point: convert the file named by the first command-line
# argument, or standard input when no argument is given, and print the HTML.
if __name__ == '__main__':
    if len(sys.argv) > 1:
        # The context manager closes the input file once it has been read.
        with open(sys.argv[1]) as source:
            print(Markdown(source.read()))
    else:
        print(Markdown(sys.stdin.read()))