import logging
import os.path
import re
from bs4 import BeautifulSoup
import mistune
logger = logging.getLogger(__name__)
def convert(text):
    """Convert markdown text into simple TEI markup.

    The result carries no namespace declarations and no wrapping
    element: it is meant to be dropped into an existing TEI document
    as text content, not to stand alone as a valid document.

    :param text: markdown source text.
    :returns: the TEI markup produced by rendering ``text``.
    """
    # Pad audio/video html so the markdown parser treats it as a block.
    prepared = TeiMarkdownRenderer.preprocess(text)
    render_tei = mistune.Markdown(renderer=TeiMarkdownRenderer())
    return render_tei(prepared)
class TeiMarkdownRenderer(mistune.Renderer):
    '''TEI Markdown renderer for use with :mod:`mistune` markdown
    parsing and rendering library.  Renderer is based on the built-in
    mistune HTML renderer.'''

    #: matches one complete html5 audio or video element.  The match is
    #: non-greedy and the closing tag must repeat the opening tag name,
    #: so several media elements in one text are matched separately
    #: instead of as a single block spanning everything between them.
    audiovideo_block_re = re.compile(
        r'(<(audio|video)\b.*?</\2>)',
        re.MULTILINE | re.DOTALL | re.UNICODE)

    #: common html5 audio/video file extensions and corresponding
    #: mimetype subtypes; used to infer the mimetype when it is not
    #: specified
    audiovideo_ext_mimetype = {
        # audio/ or video/ content prefix supplied based on html tag
        '.aac': 'aac',
        '.mp4': 'mp4',
        '.m4a': 'mp4',
        '.mp1': 'mpeg',
        '.mp2': 'mpeg',
        '.mp3': 'mpeg',
        '.mpg': 'mpeg',
        '.mpeg': 'mpeg',
        '.oga': 'ogg',
        '.ogg': 'ogg',
        '.wav': 'wav',
        '.webm': 'webm'
    }

    #: fallback mimetype subtype per html tag, used when the file
    #: extension is not listed in :attr:`audiovideo_ext_mimetype`
    default_mimetype = {
        'audio': 'mpeg',
        'video': 'mp4'
    }

    def __init__(self, **kwargs):
        # renderer options (e.g. ``escape``, ``skip_style``) are read
        # back by block_html and inline_html
        self.options = kwargs

    # NOTE: ``placeholder`` is deliberately not overridden; an earlier
    # draft returned an xml namespace processing instruction there, but
    # rendered output is expected to be inserted into an existing TEI
    # document rather than to declare its own namespace.

    @classmethod
    def preprocess(cls, text):
        '''Method to preprocess text to make sure it is converted properly.

        Currently, adds whitespace to ensure that any audio or video
        tags will be processed as an html block.

        :param text: markdown source text.
        :returns: text with blank lines around each audio/video element.
        '''
        # Add extra newlines around *any* audio/video blocks; doesn't
        # hurt to have extra whitespace, but without it the tags will
        # not be converted properly.
        return cls.audiovideo_block_re.sub(r'\n\n\1\n\n', text)

    def block_code(self, code, lang=None):
        """Rendering block level code.

        :param code: text content of the code block.
        :param lang: language of the given code.
        """
        # code is raw source text; escape it so characters such as
        # ``<`` and ``&`` do not break the surrounding xml
        code = mistune.escape(code.rstrip('\n'), smart_amp=False)
        attr = ''
        if lang:
            attr = ' lang="%s"' % mistune.escape(lang, quote=True)
        return '<code%s>%s</code>' % (attr, code)

    def block_quote(self, text):
        """Rendering <quote> with the given text.

        :param text: text content of the blockquote.
        """
        return '<quote>%s</quote>' % text.rstrip('\n')

    def block_html(self, html):
        """Rendering block level pure html content.

        html5 audio and video elements are converted to a TEI
        ``<media>`` element; any other html is passed through using
        the default mistune logic.

        :param html: text content of the html snippet.
        """
        # parse html block so it can be easily inspected & traversed
        soup = BeautifulSoup(html, 'xml')
        # audio or video tag, if present
        audiovideo = soup.audio or soup.video

        if audiovideo:
            src = content_type = None
            # url and type normally live on a nested <source> element
            source = audiovideo.source
            if source is not None:
                src = source.get('src', None)
                content_type = source.get('type', None)
            # ...but html5 also allows src directly on the media tag
            if not src:
                src = audiovideo.get('src', None)

            # A media element without any src is broken, non-playable
            # input; it is not converted and falls through to the
            # generic html handling below.
            if src:
                media_kind = audiovideo.name.lower()  # audio or video
                # mimetype is needed for tei media tag; if we don't
                # have one, try to guess from the url
                if not content_type:
                    ext = os.path.splitext(src)[1].lower()
                    # content subtype by extension, with fallback
                    subtype = self.audiovideo_ext_mimetype.get(
                        ext, self.default_mimetype[media_kind])
                    content_type = '%s/%s' % (media_kind, subtype)

                # attribute values were decoded by the parser, so
                # re-escape them fully before writing them back out
                return '<media mimeType="%s" url="%s"/>' % (
                    mistune.escape(content_type, quote=True,
                                   smart_amp=False),
                    mistune.escape(src, quote=True, smart_amp=False))

        # NOTE: default mistune logic here; probably not useful for TEI
        if self.options.get('skip_style') and \
           html.lower().startswith('<style'):
            return ''
        if self.options.get('escape'):
            return mistune.escape(html)
        return html

    def hrule(self):
        """Rendering method for horizontal rule."""
        return '<milestone rend="horizontal-rule"/>'

    def list(self, body, ordered=True):
        """Rendering list tags.

        :param body: body contents of the list.
        :param ordered: whether this list is ordered or not.
        """
        attr = ' rend="numbered"' if ordered else ''
        return '<list%s>%s</list>' % (attr, body)

    def list_item(self, text):
        """Rendering list item."""
        return '<item>%s</item>' % text

    def paragraph(self, text):
        """Rendering paragraph tags. Like ``<p>``."""
        return '<p>%s</p>' % text.strip(' ')

    def table(self, header, body):
        """Rendering table element. Wrap header and body in it.

        :param header: header part of the table.
        :param body: body part of the table.
        """
        return '<table><head>%s</head>%s</table>' % (header, body)

    def table_row(self, content):
        """Rendering a table row.

        :param content: content of current table row.
        """
        return '<row>%s</row>' % content

    def table_cell(self, content, **flags):
        """Rendering a table cell.

        :param content: content of current table cell.
        :param header: whether this is header or not.
        :param align: align of current table cell.
        """
        # a missing ``header`` flag is treated as a plain data cell
        role = 'label' if flags.get('header') else 'data'
        additional_attrs = ''
        if flags.get('align'):
            additional_attrs = ' rend="%s"' % flags['align']
        return '<cell role="%s"%s>%s</cell>' % (
            role, additional_attrs, content)

    def double_emphasis(self, text):
        """Rendering **strong** text.

        :param text: text content for emphasis.
        """
        return '<emph rend="bold">%s</emph>' % text

    def emphasis(self, text):
        """Rendering *emphasis* text.

        :param text: text content for emphasis.
        """
        return '<emph rend="italic">%s</emph>' % text

    def codespan(self, text):
        """Rendering inline `code` text.

        :param text: text content for inline code.
        """
        text = mistune.escape(text.rstrip(), smart_amp=False)
        return '<code>%s</code>' % text

    def linebreak(self):
        """Rendering line break."""
        return '<lb/>'

    def strikethrough(self, text):
        """Rendering ~~strikethrough~~ text.

        :param text: text content for strikethrough.
        """
        return '<del>%s</del>' % text

    def text(self, text):
        """Rendering unformatted text.

        :param text: text content.
        """
        return mistune.escape(text)

    def autolink(self, link, is_email=False):
        """Rendering a given link or email address.

        :param link: link content or email address.
        :param is_email: whether this is an email or not.
        """
        label = mistune.escape(link)
        if is_email:
            return '<email>%s</email>' % label
        # the target is an attribute value, so quotes must be escaped too
        target = mistune.escape(link, quote=True)
        return '<ref target="%s">%s</ref>' % (target, label)

    def link(self, link, title, text):
        """Rendering a given link with content and title.

        :param link: url for the ``target`` attribute of the ``<ref>``.
        :param title: title content, rendered as the ``n`` attribute.
        :param text: text content for description.
        """
        if link.startswith('javascript:'):
            link = ''
        attr = ''
        if title:
            attr = ' n="%s"' % mistune.escape(title, quote=True)
        # escape the url so ``&`` or quotes cannot break the attribute
        target = mistune.escape(link, quote=True)
        return '<ref target="%s"%s>%s</ref>' % (target, attr, text)

    def image(self, src, title, text):
        """Rendering a image with title and text.

        :param src: source link of the image.
        :param title: title text of the image.
        :param text: alt text of the image.
        """
        if src.startswith('javascript:'):
            src = ''
        text = mistune.escape(text, quote=True)

        # markdown doesn't include mimetype, but it's required in TEI
        # infer mimetype based on image suffix in the url
        image_suffix = src.rsplit('.', 1)[-1].lower()
        mimetype = 'image/*'
        if image_suffix in ['gif', 'png', 'jpeg']:
            mimetype = 'image/%s' % image_suffix
        elif image_suffix == 'jpg':
            mimetype = 'image/jpeg'

        # attribute name is ``mimeType``, matching the media element
        # generated for audio/video in block_html
        parts = ['<media mimeType="%s" url="%s">' % (
            mimetype, mistune.escape(src, quote=True))]
        if title or text:
            parts.append('<desc>')
            if title:
                parts.append('<head>%s</head>' % mistune.escape(title))
            if text:
                parts.append('<p>%s</p>' % text)
            parts.append('</desc>')
        parts.append('</media>')
        return ''.join(parts)

    def inline_html(self, html):
        """Rendering span level pure html content.

        Italic and bold tags are converted to ``<emph>`` and named
        anchors to ``<anchor>``; anything else is passed through using
        the default mistune logic.

        :param html: text content of the html snippet.
        """
        # use beautiful soup to parse and read element name, attributes
        # NOTE: using xml parser had inconsistent results; using lxml
        # wraps contents inside html body tags, so access content there
        soup = BeautifulSoup(html, 'lxml')
        body = soup.find('body')
        # only expect one element here; fragments that produce no body
        # content (e.g. html comments) fall through to the default
        element = None
        if body is not None and body.contents:
            element = body.contents[0]
        # plain strings have no usable tag name
        tag_name = getattr(element, 'name', None)

        if tag_name:
            # text was entity-decoded by the parser; re-escape it fully.
            # get_text() keeps the text of elements with mixed children.
            text_content = mistune.escape(
                element.get_text(), smart_amp=False)

            if tag_name in ['i', 'em']:
                return '<emph rend="italic">%s</emph>' % text_content
            if tag_name in ['b', 'strong']:
                return '<emph rend="bold">%s</emph>' % text_content
            if tag_name == 'a':
                # convert name anchor to <anchor xml:id="###"/>
                # **preliminary** (anchor not valid in all contexts)
                el_id = element.get('id', None) or \
                    element.get('name', None)
                if el_id:
                    return '<anchor xml:id="%s">%s</anchor>' % (
                        mistune.escape(el_id, quote=True,
                                       smart_amp=False),
                        text_content)

        if self.options.get('escape'):
            return mistune.escape(html)
        return html

    def newline(self):
        """Rendering newline element."""
        # TODO
        return ''