"""Guess which blog service provider (BSP) hosts a blog, given its URL."""

import StringIO
from urlparse import urljoin, urlparse

import pycurl
from BeautifulSoup import BeautifulSoup, SoupStrainer, BeautifulStoneSoup


def get_doc(url):
    """Fetch *url* with pycurl and return the response body.

    The lookup is best effort: any failure while setting up or performing
    the transfer yields ``None`` instead of raising.
    """
    body = StringIO.StringIO()
    c = None
    try:
        c = pycurl.Curl()
        c.setopt(pycurl.URL, url)
        c.setopt(pycurl.WRITEFUNCTION, body.write)
        c.setopt(pycurl.TIMEOUT, 5)
        c.setopt(pycurl.CONNECTTIMEOUT, 5)
        # MAXREDIRS only takes effect when redirect following is enabled.
        c.setopt(pycurl.FOLLOWLOCATION, 1)
        c.setopt(pycurl.MAXREDIRS, 5)
        # To route through an ssh tunnel (SOCKS5 proxy), enable:
        # c.setopt(pycurl.PROXY, 'localhost:7654')
        # c.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)
        c.perform()
    except Exception:
        # Deliberately broad (best effort), but unlike a bare ``except`` this
        # lets KeyboardInterrupt / SystemExit propagate.
        return None
    finally:
        # Release the handle even when the transfer failed.
        if c is not None:
            c.close()
    return body.getvalue()


def get_domain(d):
    """Return the part of host name *d* after its second-to-last dot.

    For example ``'foo.blogspot.com'`` gives ``'blogspot.com'``.  When there
    is no such dot (or it is the very first character), *d* is returned
    unchanged.
    """
    pos = d.rfind('.', 0, d.rfind('.'))
    if pos > 0:
        return d[pos + 1:]
    return d


# Known blog service providers; the maps below refer to them by index.
BSPs = (
    'Blogger',       # 0
    'Live Spaces',   # 1
    'LiveJournal',   # 2
    'WordPress',     # 3
    'AOL Journal',   # 4
    'XANGA',         # 5
    'Typepad',       # 6
    'MySpace',       # 7
    'Movable Type',  # 8
)

# Domain (as returned by get_domain) -> provider.
DomainName2BSPMap = {
    'blogspot.com': BSPs[0],
    'live.com': BSPs[1],
    'livejournal.com': BSPs[2],
    'wordpress.com': BSPs[3],
    'aol.com': BSPs[4],
    'xanga.com': BSPs[5],
    'typepad.com': BSPs[6],
    'myspace.com': BSPs[7],
}

# Lower-case keyword found in a meta generator / RSD engine name -> provider.
BSPMetaGeneratorMap = {
    'blogger': BSPs[0],
    'wordpress': BSPs[3],
    'typepad': BSPs[6],
    'movable type': BSPs[8],
    'movabletype': BSPs[8],
    'live spaces': BSPs[1],
}


def _match_platform(text):
    """Return the provider whose keyword occurs in lower-cased *text*, or None."""
    for keyword, platform in BSPMetaGeneratorMap.items():
        if keyword in text:
            return platform
    return None


def _platform_from_generator(html):
    """Return the provider named by the first <meta name="generator">, or None."""
    strainer = SoupStrainer('meta', attrs={'name': 'generator'})
    metas = list(BeautifulSoup(html, parseOnlyThese=strainer))
    if not metas:
        return None
    # A generator tag without a content attribute must not abort the guess.
    content = metas[0].get('content')
    if not content:
        return None
    return _match_platform(content.lower())


def _platform_from_rsd(html, base_url):
    """Return the provider named by the page's RSD document, or None.

    *base_url* is the URL *html* was fetched from; it is used to resolve a
    relative RSD link.
    """
    strainer = SoupStrainer('link', title="RSD")
    links = list(BeautifulSoup(html, parseOnlyThese=strainer))
    if not links:
        return None
    href = links[0].get('href')
    if not href:
        return None
    try:
        # get_doc is handed a plain str; a link that cannot be converted is
        # skipped rather than crashing the guess.
        rsd_url = str(urljoin(base_url, href))
    except UnicodeError:
        return None
    rsd = get_doc(rsd_url)
    if not rsd:
        return None
    soup = BeautifulStoneSoup(rsd)
    try:
        engine = soup.rsd.service.enginename.string.lower()
    except AttributeError:
        # Some element of the rsd/service/enginename chain is missing.
        return None
    return _match_platform(engine)


def guess(url):
    """Guess the blog platform behind *url*.

    1. If the URL's domain matches a known provider, return it;
    2. If the page's meta generator matches, return it;
    3. If the RSD engine name matches, return it;
    4. Otherwise return 'Other'.

    REF: http://cyber.law.harvard.edu/blogs/gems/tech/rsd.html
    """
    # Only the host is case-insensitive; the URL itself is fetched unchanged
    # so case-sensitive paths keep working.  ``hostname`` also drops any port
    # or userinfo that would defeat the domain lookup.
    host = (urlparse(url).hostname or '').lower()
    platform = DomainName2BSPMap.get(get_domain(host))
    if platform is not None:
        return platform

    html = get_doc(url)
    if not html:
        return 'Other'

    platform = _platform_from_generator(html)
    if platform is None:
        platform = _platform_from_rsd(html, url)
    if platform is None:
        return 'Other'
    return platform


if __name__ == '__main__':
    urls = [
        'http://www.boingboing.net/',  # Movable Type
        'http://www.engadget.com/',  # Other
        'http://www.gizmodo.com/',
        'http://www.techcrunch.com/',  # WordPress
        'http://www.huffingtonpost.com/',
        'http://www.lifehacker.com/',
        'http://arstechnica.com/',
        'http://postsecret.blogspot.com/',
        'http://www.dailykos.com/',
        'http://michellemalkin.com/',
        'http://www.tmz.com/',
        'http://www.ilemoned.com/',  # WordPress
        'http://headrush.typepad.com/',
        'http://thinkprogress.org/',
        'http://googleblog.blogspot.com/',
        'http://sethgodin.typepad.com/',
        'http://yanxi.bokewu.com/',
        'http://www.crooksandliars.com/',
        'http://www.kotaku.com/',
        'http://www.beppegrillo.it/',  # Movable Type
        'http://rateyourstudents.blogspot.com/',
        'http://ninas72.spaces.live.com/',
        'http://eshm.livejournal.com/',
        'http://blogs4brownback.wordpress.com/',
        'http://journals.aol.com/dailypulseblog/citizenjournalism/',
        'http://www.xanga.com/MonchiKi',
        'http://etherbrian.typepad.com/',
        'http://www.myspace.com/nathanfillion',
    ]
    for url in urls:
        # Single-argument form prints "url platform" under Python 2 and 3.
        print('%s %s' % (url, guess(url)))