Paste number 46603: Reddit parser

Index of paste annotations: 1 | 2

Paste number 46603: Reddit parser
Pasted by: shabda
1 year, 2 weeks ago
None
Paste contents:
Raw Source | XML | Display As
import itertools
import re
import time
import urllib2
from urlparse import urlparse

from BeautifulSoup import *

# --- Crawl configuration ----------------------------------------------------
# Listing URL; the current offset is appended to it for every request.
base_url = 'http://programming.reddit.com/top?offset='
# The crawl keeps requesting pages while fewer links than this were collected.
tot_links = 1000
opener = urllib2.build_opener()

# --- Accumulators filled by the crawl loop ----------------------------------
# hrefs, titles and points are extended page by page; soups keeps every
# parsed page.
all_hrefs, all_titles, all_points, soups = [], [], [], []

# Offset used for the next request; the loop resets it to the number of links
# gathered so far.
i = 0
def prin(x): print x,
while i < tot_links:
    fin_url = '%s%s' %(base_url, i)
    print fin_url
    req = urllib2.Request(fin_url)
    req.add_header('user-agent', 'reddit-crawler. (I am a friendly robot). shabda.raaj.nospam@gmail.removeme.com')
    data = opener.open(req).read()
    soup = BeautifulSoup(data)
    links = soup.findAll('a', {'class':'title'})
    try:
       points_ = [soup_el.find('span').string for soup_el in soup.findAll('td', {'class':'wide little'})]
       pat = re.compile('\d*')
       points = [eval(pat.findall(point)[0]) for point in points_]
    except:
        pass
    hrefs = [el['href'] for el in links]
    titles = [el.string for el in links]
    all_hrefs.extend(hrefs)
    all_titles.extend(titles)
    all_points.extend(points)
    soups.append(soup)
    i = len(all_hrefs)
    threading.Event().wait(1)
    
print '****fun with reddit urls(base_url = %s)****' % base_url
sites = [urlparse(href)[1] for href in all_hrefs]
def comp(x, y):
    """cmp-style comparator that orders pairs by their second element, largest first.

    Returns a negative number when x[1] is greater than y[1], zero when they
    are equal and a positive number otherwise.
    """
    left_value = x[1]
    right_value = y[1]
    return right_value - left_value
def comp2(tup):
    """Key function: return the element at index 1 of *tup*."""
    second = tup[1]
    return second
g_sites = [(a,len(list(b))) for a,b in itertools.groupby(sorted(sites))]
g_sites.sort(comp)
singles = [site for site in g_sites if site[1] < 2]
print 'total sites are %s' % len(sites)
print 'total unique sites %s' % len(g_sites)
print 'top 20 sites are %s' %  g_sites [:20]
print 'Sites with only one entry %s' % len(singles)
points_href = zip(all_hrefs, all_points)
print 'maximum points are %s by %s' % (max(points_href, key = comp2)[1], max(points_href, key = comp2)[0])
#print 'maximum points are %s' % max(all_points, comp2)
print 'average points are %s' % (sum(all_points)/(len(all_points)+0.0))

title_lens = [(title, len(title.split())) for title in all_titles]
title_lens.sort(comp)
print 'average title length %s' % (sum([lens[1] for lens in title_lens])/(len(all_titles)+0.0))
print 'largest title has length %s and is %s' % (title_lens[0][1], title_lens[0][0])

words = []
for titles in all_titles:
    words.extend(titles.split())
len(words)
words_list = [(a,len(list(b))) for a,b in itertools.groupby(sorted(words))]
words_list.sort(comp)
print '50 most common words are %s' % words_list[:50]
words_ = [word.encode('utf-8').lower() for word in words]
words_list = [(a,len(list(b))) for a,b in itertools.groupby(sorted(words_))]
words_list.sort(comp)
print '50 most common words, ignoring case are %s' % words_list[:50]

# Dump the collected data as tab-separated rows: href, title, points.
filename = 'c:/%s.txt' % urlparse(base_url)[1]
# Binary mode: every row already ends in an explicit '\r\n', which text mode
# on Windows would expand to '\r\r\n'.
f = open(filename, 'wb')
try:
    # zip() stops at the shortest list, so a row is only written when all
    # three values exist.
    for href, title, score in zip(all_hrefs, all_titles, all_points):
        # str() on a unicode href raises UnicodeEncodeError for non-ASCII
        # URLs; encode explicitly, as is done for the title.
        if isinstance(href, unicode):
            href = href.encode('utf-8')
        f.write(href)
        f.write('\t')
        f.write(title.encode('utf-8'))
        f.write('\t')
        f.write(str(score))
        f.write('\r\n')
finally:
    # Close the file even when a write fails part-way through.
    f.close()



    

Annotations for this paste:

Annotation number 1: License
Pasted by: shabda
1 year, 2 weeks ago
Paste contents:
Raw Source | Display As
This is released under the "Do whatever you want with it, but do not hold me responsible for your actions" License.

Annotation number 2: Description
Pasted by: shabda
1 year, 2 weeks ago
Paste contents:
Raw Source | Display As
Its description can be read at <a href="http://seodummy.blogspot.com/2007/08/python-fun-with-reddit-urls.html">http://seodummy.blogspot.com/2007/08/python-fun-with-reddit-urls.html</a>

Colorize as:
Show Line Numbers
Index of paste annotations: 1 | 2

Ads absolutely not by Google

Lisppaste pastes can be made by anyone at any time. Imagine a fearsomely comprehensive disclaimer of liability. Now fear, comprehensively.