from lxml import html, etree
import re

# NOTE(review): ``Library`` is not imported anywhere in the visible source;
# presumably it is ``django.template.Library`` -- confirm and import it.
register = Library()

# Matches one unwanted inline CSS declaration: ``font``/``padding``/``margin``
# (optionally with a ``-suffix``) or ``line-height``, together with its value.
# The declaration may end with ``;`` or with the end of the attribute value,
# so the last declaration of a ``style`` attribute is matched even when it
# has no trailing semicolon.
css_cleanup_regex = re.compile(
    r'((font|padding|margin)(-[^:]+)?|line-height):\s*[^;]+(?:;|$)'
)


def _cleanup_elements(elem):
    """
    Recursively remove empty elements and unwanted inline CSS.

    An element whose text content (including that of its descendants) is
    empty or whitespace-only is dropped from the tree. Otherwise, the
    declarations matched by ``css_cleanup_regex`` are removed from its
    ``style`` attribute, if it has one, and its children are processed
    the same way.

    NOTE(review): elements that never contain text (e.g. ``<img>``,
    ``<br>``) also count as empty here and are dropped -- confirm that
    this is intended.
    """
    if elem.text_content().strip() == '':
        # drop_tree() needs a parent to detach from; a parentless root
        # (e.g. the <html> element of a full document) is left in place.
        if elem.getparent() is not None:
            elem.drop_tree()
        return

    if 'style' in elem.attrib:
        elem.attrib['style'] = css_cleanup_regex.sub('', elem.attrib['style'])

    # Iterate over a snapshot: children may be dropped from ``elem`` while
    # we walk them.
    for sub in list(elem):
        _cleanup_elements(sub)


@register.simple_tag
def cleanup_html(string):
    """
    Make generated HTML (i.e. output from the WYSIWYG editor) look almost
    decent.

    The markup is parsed, cleaned with ``_cleanup_elements`` and serialised
    again; trailing whitespace is stripped from every line and blank lines
    are removed.

    Returns the cleaned HTML as text, or ``string`` unchanged when it
    cannot be parsed (including empty input).
    """
    try:
        elem = html.fromstring(string)
    except (etree.XMLSyntaxError, etree.ParserError):
        # ParserError is what lxml raises for empty documents.
        return string

    _cleanup_elements(elem)
    # Serialise to text rather than bytes so the result can be joined with
    # '\n' on Python 3 as well.
    html_string = html.tostring(elem, encoding='unicode')
    stripped_lines = (line.rstrip() for line in html_string.splitlines())
    return '\n'.join(line for line in stripped_lines if line != '')