truncatehtml_at_word

Author:: zakj
Posted:: November 10, 2008
Language:: Python
Version:: 1.0
Score:: 1 (after 1 ratings)

Download
Raw

Largely adapted from Django's truncate_html_words. The difference is that this filter takes a number of characters as its argument and truncates at the last word boundary that fits within that count, rather than taking a number of words.

@register.filter
def truncatehtml_at_word(s, chars):
    """
    Truncate a string at the last word boundary that fits within the given
    number of characters.

    Only the characters of words count towards the limit: whitespace,
    punctuation, HTML tags, comments and character entities are skipped.
    HTML tags that are still open at the truncation point are closed, and
    '&nbsp;&hellip;' is appended to mark the cut.

    Arguments:
        s: the string (optionally containing HTML) to truncate.
        chars: the character budget; anything accepted by int().

    Returns:
        u'' when the budget is zero or negative.  ``s`` unchanged when
        ``chars`` is not a valid integer, or when every word of ``s`` fits
        within the budget.  Otherwise the truncated markup followed by any
        required closing tags and the ellipsis marker.
    """
    try:
        length = int(chars)
    except (TypeError, ValueError):
        # Template filters are expected to fail silently on a bad argument
        # instead of breaking the whole page render.
        return s
    if length <= 0:
        return u''
    re_words = truncatehtml_at_word.re_words
    re_tag = truncatehtml_at_word.re_tag
    html4_singlets = truncatehtml_at_word.html4_singlets

    # Count non-HTML characters and keep note of open tags.
    open_tags = []       # innermost open element first
    count = 0            # word characters accepted so far
    pos = 0              # where the next token search starts
    truncate_at = 0      # end of the last token that is kept
    truncated = False    # True once a word had to be dropped
    while count < length:
        m = re_words.search(s, pos)
        if not m:
            # The whole string was consumed without exhausting the budget:
            # nothing needs truncating, so keep trailing punctuation,
            # whitespace and entities intact.
            return s
        pos = m.end(0)
        word = m.group(1)
        if word:
            # It's an actual non-HTML word.  If adding this word would exceed
            # our length threshold, then we're done.
            if count + len(word) > length:
                truncated = True
                break
            # Otherwise, update our truncation point to include the word.
            count += len(word)
            truncate_at = pos
            continue
        # Check for tag.
        tag = re_tag.match(m.group(0))
        if not tag:
            # A character entity or something that is not a well-formed tag;
            # it is neither counted nor tracked.
            continue
        closing_tag, tagname, self_closing = tag.groups()
        tagname = tagname.lower()  # Element names are always case-insensitive
        if (self_closing or tagname in html4_singlets
                or tagname.startswith(('!', '?'))):
            # Void elements, comments, doctypes and processing instructions
            # never need a closing tag.
            pass
        elif closing_tag:
            # Check for match in open tags list
            try:
                i = open_tags.index(tagname)
            except ValueError:
                # Stray closing tag with no matching opener: ignore it.
                pass
            else:
                # SGML: An end tag closes, back to the matching start tag, all
                # unclosed intervening start tags with omitted end tags
                open_tags = open_tags[i+1:]
        else:
            # Add it to the start of the open tags list
            open_tags.insert(0, tagname)
        truncate_at = pos

    if not truncated:
        # The budget was used up exactly.  Only truncate if another real word
        # follows; trailing tags, entities and punctuation alone do not
        # justify cutting the string or adding an ellipsis.
        if not any(rest.group(1) for rest in re_words.finditer(s, pos)):
            return s

    # A word was dropped: cut, close whatever is still open (innermost
    # first) and always mark the truncation.
    closers = ''.join('</%s>' % name for name in open_tags)
    return s[:truncate_at] + closers + '&nbsp;&hellip;'
# Tokenizer: a character entity, a tag/comment, or (group 1) a word made of
# word characters and hyphens.
truncatehtml_at_word.re_words = re.compile(r'&.*?;|<.*?>|(\w[\w-]*)', re.U)
# Groups: (1) '/' of a closing tag, (2) the element name, (3) the trailing '/'
# of a self-closing tag.  The name stops at whitespace, '/' or '>' so that
# '<br/>' and tab-separated attributes are parsed correctly.
truncatehtml_at_word.re_tag = re.compile(r'<(/)?([^\s/>]+)[^>]*?(/)?>')
# <p> is included here despite not being a true singlet to avoid adding
# incorrect closing tags to something like "para 1 <p> para 2 <p>".
truncatehtml_at_word.html4_singlets = ('br', 'col', 'link', 'base', 'img',
                                       'param', 'area', 'hr', 'input', 'p')

Comments

Please login first before commenting.

truncatehtml_at_word

More like this

Comments