import re

# Presentational/layout tags whose markup is dropped; the text between an
# opening and closing tag is left in place.
_STRIPPED_TAGS = ("font", "div", "span", "img", "hr", "table", "td", "tr")

# One alternation instead of one regex per tag. The trailing \b keeps "tr"
# from also matching longer names such as <track>.
_TAG_RE = re.compile(r"</?(?:%s)\b[^>]*>" % "|".join(_STRIPPED_TAGS))

# Patterns deleted (replaced by "") on every pass, applied in this order.
# NOTE(review): two further entries used to sit at the head of this list but
# were empty patterns (no-ops under re.sub) and have been removed. They may
# once have targeted additional markup (e.g. XML declarations) -- confirm
# against the upstream version of this snippet.
_JUNK_RES = tuple(
    re.compile(pattern, re.DOTALL)
    for pattern in (
        # Namespaced elements such as <o:p>...</o:p>, including their content.
        # Non-greedy so that text between two sibling elements survives.
        r"<(\w+:\w+)\b[^>]*>.*?</\1\s*>",
        r'class=".*?"',
        # Comments; "." in place of "!" also covers the standard <!-- --> form.
        r"<.--.*?-->",
        r"""align=["'][^"']*["']""",
        r"""style=["'][^"']*["']""",
        r"\{mso-[^}]*\}",
        # Elements containing only whitespace and/or &nbsp;. The closing tag
        # must repeat the opening name so that adjacent closers such as
        # "</b></p>" are not mistaken for an empty element.
        r"<(\w+)\b[^>]*>(?:&nbsp;|\s)*</\1\s*>",
    )
)

# NOTE(review): the tag collapsed by the final step is a reconstruction; only
# the "(...\s*){1,9999}" skeleton of the pattern was recoverable. <br> is
# assumed here -- confirm the intended tag and replacement text.
_BR_RUN_RE = re.compile(r"(?:<br\s*/?>\s*){1,9999}")


def _clean_pass(txt):
    """Apply one round of tag stripping and junk removal to *txt*."""
    txt = _TAG_RE.sub("", txt)
    for junk_re in _JUNK_RES:
        txt = junk_re.sub("", txt)
    return txt


def clean_word(txt, its):
    """Strip Word-style presentational markup from an HTML fragment.

    Each pass removes the tags listed in ``_STRIPPED_TAGS`` (keeping their
    text), then deletes namespaced elements, ``class``/``align``/``style``
    attributes, comments, ``{mso-...}`` rules and empty elements. Removing
    one empty element can leave its parent empty, which is why several
    passes may be needed.

    Args:
        txt: The HTML text to clean.
        its: Number of extra passes to run after the first one. Values of
            zero or below still run a single pass.

    Returns:
        The cleaned text, with every run of ``<br>`` tags collapsed into a
        single ``<br>`` followed by a newline.
    """
    passes_left = its
    while True:
        cleaned = _clean_pass(txt)
        # A pass that changes nothing is a fixed point: further passes would
        # be no-ops, so stop early.
        converged = cleaned == txt
        txt = cleaned
        if converged or passes_left <= 0:
            break
        passes_left -= 1
    return _BR_RUN_RE.sub("<br>\n", txt)