Detect blog platform

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import pycurl
from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
from urlparse import urlparse

def get_doc(url):
    '''Fetch url with pycurl and return the response body.

    Both the connect phase and the whole transfer are limited to 5 seconds,
    and up to 5 HTTP redirects are followed.

    Returns the body as a string, or None when anything goes wrong. The
    failure is swallowed on purpose: this is a best-effort fetch and
    callers only check whether a document came back.
    '''
    try:
        import StringIO
        c = pycurl.Curl()
        try:
            b = StringIO.StringIO()
            c.setopt(pycurl.URL, url)
            c.setopt(pycurl.WRITEFUNCTION, b.write)
            c.setopt(pycurl.TIMEOUT, 5)
            c.setopt(pycurl.CONNECTTIMEOUT, 5)
            # MAXREDIRS only takes effect when redirect following is
            # switched on; libcurl leaves FOLLOWLOCATION off by default.
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            #c.setopt(pycurl.PROXY, 'localhost:7654') # ssh tunnelling
            #c.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)

            c.perform()
            return b.getvalue()
        finally:
            # Release the handle even when perform() raises.
            c.close()
    except Exception:
        # Not a bare except: Ctrl-C and SystemExit must still propagate.
        return None

def get_domain(d):
    '''Return the last two dot-separated labels of host name d.

    'foo.www.google.com' -> 'google.com'. A name with fewer than two dots,
    or one whose second-to-last dot is the very first character, is
    returned unchanged.
    '''
    last_dot = d.rfind('.')
    # Search only the part before the last dot for the one preceding it.
    second_last_dot = d.rfind('.', 0, last_dot)
    if second_last_dot <= 0:
        return d
    return d[second_last_dot + 1:]

# Names of the blog service providers / platforms this module can report.
# The order is kept stable so positional access (BSPs[0] .. BSPs[8]) keeps
# returning the same names.
BSPs = (
    'Blogger',
    'Live Spaces',
    'LiveJournal',
    'WordPress',
    'AOL Journal',
    'XANGA',
    'Typepad',
    'MySpace',
    'Movable Type',
)

# Registered domain (as produced by get_domain) -> hosted platform name.
DomainName2BSPMap = {
    'blogspot.com': 'Blogger',
    'live.com': 'Live Spaces',
    'livejournal.com': 'LiveJournal',
    'wordpress.com': 'WordPress',
    'aol.com': 'AOL Journal',
    'xanga.com': 'XANGA',
    'typepad.com': 'Typepad',
    'myspace.com': 'MySpace',
}

# Lower-case signature -> platform name. guess() looks for these
# signatures as substrings of the lower-cased <meta name="generator">
# content and of the lower-cased RSD engine name.
BSPMetaGeneratorMap = {
    'blogger': 'Blogger',
    'wordpress': 'WordPress',
    'typepad': 'Typepad',
    'movable type': 'Movable Type',
    'movabletype': 'Movable Type',
    'live spaces': 'Live Spaces',
}

def _match_platform(text):
    '''Return the platform whose signature occurs in text, or None.'''
    for signature, platform in BSPMetaGeneratorMap.items():
        if signature in text:
            return platform
    return None


def _guess_from_generator(html):
    '''Return the platform named by <meta name="generator">, or None.'''
    metaStrainer = SoupStrainer('meta', attrs={'name':'generator'})
    metas = [meta for meta in BeautifulSoup(html, parseOnlyThese=metaStrainer)]
    if not metas:
        return None
    # A generator tag without a content attribute carries no information.
    generator = metas[0].get('content')
    if not generator:
        return None
    return _match_platform(generator.lower())


def _guess_from_rsd(url, html):
    '''Return the platform named by the page's RSD document, or None.'''
    from urlparse import urljoin

    linkStrainer = SoupStrainer('link', title="RSD")
    links = [link for link in BeautifulSoup(html, parseOnlyThese=linkStrainer)]
    if not links:
        return None
    href = links[0].get('href')
    if not href:
        return None
    try:
        # The href may be relative to the page; get_doc is handed a plain
        # str, as before.
        rsd_url = str(urljoin(url, href))
    except UnicodeError:
        return None

    rsd = get_doc(rsd_url)
    if not rsd:
        return None
    soup = BeautifulStoneSoup(rsd)
    try:
        enginename = soup.rsd.service.enginename.string.lower()
    except AttributeError:
        # One of the rsd/service/engineName elements (or its text) is
        # missing.
        return None
    return _match_platform(enginename)


def guess(url):
    '''Guess the blog platform behind a url.

    1. If the url's domain is a known hosted platform, return it;
    2. If the page's meta generator matches a known signature, return it;
    3. If the RSD engine name matches a known signature, return it;
    4. Else return 'Other'.

    Only the host part is lower-cased for the domain lookup; the url is
    fetched exactly as given, because paths may be case-sensitive.

    Returns one of the names in BSPs, or 'Other' (also when the page
    cannot be fetched).

    REF:
        http://cyber.law.harvard.edu/blogs/gems/tech/rsd.html
    '''
    netloc = urlparse(url)[1].lower()
    # Drop any "user:password@" prefix and ":port" suffix before matching.
    host = netloc.split('@')[-1].split(':')[0]
    domain = get_domain(host)
    if domain in DomainName2BSPMap:
        return DomainName2BSPMap[domain]

    html = get_doc(url)
    if not html:
        return 'Other'

    platform = _guess_from_generator(html)
    if platform is None:
        platform = _guess_from_rsd(url, html)
    if platform is None:
        return 'Other'
    return platform

if __name__ == '__main__':
    urls = [
    'http://www.boingboing.net/', # Movable Type
    'http://www.engadget.com/', # Other
    'http://www.gizmodo.com/',
    'http://www.techcrunch.com/', # WordPress
    'http://www.huffingtonpost.com/',
    'http://www.lifehacker.com/',
    'http://arstechnica.com/',
    'http://postsecret.blogspot.com/',
    'http://www.dailykos.com/',
    'http://michellemalkin.com/',
    'http://www.tmz.com/',
    'http://www.ilemoned.com/', # WordPress
    'http://headrush.typepad.com/',
    'http://thinkprogress.org/',
    'http://googleblog.blogspot.com/',
    'http://sethgodin.typepad.com/',
    'http://yanxi.bokewu.com/',
    'http://www.crooksandliars.com/',
    'http://www.kotaku.com/',
    'http://www.beppegrillo.it/', # Movable Type

    'http://rateyourstudents.blogspot.com/',
    'http://ninas72.spaces.live.com/',
    'http://eshm.livejournal.com/',
    'http://blogs4brownback.wordpress.com/',
    'http://journals.aol.com/dailypulseblog/citizenjournalism/',
    'http://www.xanga.com/MonchiKi',
    'http://etherbrian.typepad.com/',
    'http://www.myspace.com/nathanfillion',
    ]

    for url in urls:
        print url, guess(url)

Comments

twinsant (on May 23, 2007):

Test code

from blogdetect import get_domain, guess

import unittest

class TestBlogDetect(unittest.TestCase):
    """Checks for blogdetect.guess and blogdetect.get_domain.

    Every guess() test for a url whose domain is not in the hosted-platform
    table fetches the live page, so those tests need network access and
    depend on what the sites currently serve.
    """

    def testBoingBoing(self):
        """BoingBoing.net uses RSD detection"""
        url = 'http://www.boingboing.net/'
        platform = guess(url)
        self.assertEqual(platform, 'Movable Type')

    # Hosted platforms: recognised from the domain name alone.

    def testBlogger(self):
        url = 'http://rateyourstudents.blogspot.com/'
        platform = guess(url)
        self.assertEqual(platform, 'Blogger')

    def testLiveSpaces(self):
        url = 'http://ninas72.spaces.live.com/'
        platform = guess(url)
        self.assertEqual(platform, 'Live Spaces')

    def testLiveJournal(self):
        url = 'http://eshm.livejournal.com/'
        platform = guess(url)
        self.assertEqual(platform, 'LiveJournal')

    def testWordPress(self):
        url = 'http://blogs4brownback.wordpress.com/'
        platform = guess(url)
        self.assertEqual(platform, 'WordPress')

    def testAolJounral(self):
        url = 'http://journals.aol.com/dailypulseblog/citizenjournalism/'
        platform = guess(url)
        self.assertEqual(platform, 'AOL Journal')

    def testXanga(self):
        url = 'http://www.xanga.com/MonchiKi'
        platform = guess(url)
        self.assertEqual(platform, 'XANGA')

    def testTypepad(self):
        url = 'http://etherbrian.typepad.com/'
        platform = guess(url)
        self.assertEqual(platform, 'Typepad')

    def testMySpace(self):
        url = 'http://www.myspace.com/nathanfillion'
        platform = guess(url)
        self.assertEqual(platform, 'MySpace')

    # Self-hosted or unknown sites: decided from the fetched page.

    def testEngadget(self):
        url = 'http://www.engadget.com/'
        platform = guess(url)
        self.assertEqual(platform, 'Other')

    def testTechCrunch(self):
        url = 'http://www.techcrunch.com/'
        platform = guess(url)
        self.assertEqual(platform, 'WordPress')

    def testNonExistDomain(self):
        url = 'http://www.nonexistdomain.com/'
        platform = guess(url)
        self.assertEqual(platform, 'Other')

    def testGetDomain(self):
        self.assertEqual(get_domain('foo.www.google.com'), 'google.com')
        self.assertEqual(get_domain('www.google.com'), 'google.com')
        self.assertEqual(get_domain('google.com'), 'google.com')
        self.assertEqual(get_domain(''), '')

#

(Forgotten your password?)

You may use Markdown syntax here, but raw HTML will be removed.