import itertools
import re
import time
import urllib2
from urlparse import urlparse

from BeautifulSoup import *
# Crawl configuration and shared state used by the rest of the script.

# Listing URL; the crawl loop appends the current offset to it.
base_url = 'http://programming.reddit.com/top?offset='
# The crawl loop keeps requesting pages while fewer links than this are held.
tot_links = 1000
opener = urllib2.build_opener()

# Accumulators: hrefs, titles and points gain entries per page, soups one
# parsed document per page.
all_hrefs, all_titles, all_points, soups = [], [], [], []

# Offset sent with the next request (set to len(all_hrefs) after each page).
i = 0
def prin(x): print x,
# Crawl the listing page by page, collecting each link's href, title and
# score until at least tot_links links are held.

# Compiled once instead of on every page.  '\d+' cannot match the empty
# string, unlike the earlier '\d*', so a match is always a real number.
pat = re.compile(r'\d+')

while i < tot_links:
    fin_url = '%s%s' % (base_url, i)
    print(fin_url)
    req = urllib2.Request(fin_url)
    req.add_header('user-agent', 'reddit-crawler. (I am a friendly robot). shabda.raaj.nospam@gmail.removeme.com')
    data = opener.open(req).read()
    soup = BeautifulSoup(data)
    links = soup.findAll('a', {'class': 'title'})
    if not links:
        # The next offset is len(all_hrefs); a page without links would leave
        # it unchanged and the same URL would be requested forever.
        break

    hrefs = [el['href'] for el in links]

    titles = []
    for el in links:
        title = el.string
        if title is None:
            # .string is None when the anchor does not wrap a single text
            # node; fall back to all of its text so later split()/encode()
            # calls always receive a string.
            title = u''.join(el.findAll(text=True))
        titles.append(title)

    # One score per score cell.  The text is parsed with int() rather than
    # eval(): scraped content must never be executed.  A cell without a
    # readable number contributes 0 instead of aborting the whole page, which
    # used to leave `points` stale (or unbound on the first page).
    points = []
    for soup_el in soup.findAll('td', {'class': 'wide little'}):
        span = soup_el.find('span')
        text = None
        if span is not None:
            text = span.string
        found = None
        if text:
            found = pat.search(text)
        if found:
            points.append(int(found.group()))
        else:
            points.append(0)

    all_hrefs.extend(hrefs)
    all_titles.extend(titles)
    all_points.extend(points)
    soups.append(soup)
    i = len(all_hrefs)

    # Be polite to the server: pause one second between requests.
    time.sleep(1)
# Report banner, then the network-location part of every collected link.
banner = '****fun with reddit urls(base_url = %s)****' % base_url
print(banner)

sites = []
for href in all_hrefs:
    # Index 1 of the urlparse() result is the network location (host).
    sites.append(urlparse(href)[1])
def comp(x, y):
    """Old-style comparison function ordering pairs by descending second item.

    Returns a positive number when y's second item is larger than x's, a
    negative number when it is smaller, and 0 when they are equal.
    """
    second_of_y = y[1]
    second_of_x = x[1]
    return second_of_y - second_of_x
def comp2(tup):
    """Key function: return the second item of tup."""
    second_item = tup[1]
    return second_item
# Per-site link counts: sorting first makes equal sites adjacent, so each
# groupby() run is one site and its length is that site's count.
g_sites = []
for site_name, occurrences in itertools.groupby(sorted(sites)):
    g_sites.append((site_name, len(list(occurrences))))

# Highest count first; the sort is stable, so ties keep their sorted order.
g_sites.sort(key=comp2, reverse=True)

singles = [entry for entry in g_sites if entry[1] < 2]

print('total sites are %s' % len(sites))
print('total unique sites %s' % len(g_sites))
print('top 20 sites are %s' % g_sites[:20])
print('Sites with only one entry %s' % len(singles))
# Score statistics: the highest-scoring link and the average score.
points_href = zip(all_hrefs, all_points)
if all_hrefs and all_points:
    # max() is evaluated once; it returns the first pair with the top score.
    best_href, best_points = max(points_href, key=comp2)
    print('maximum points are %s by %s' % (best_points, best_href))
    # float() forces true division under Python 2.
    print('average points are %s' % (sum(all_points) / float(len(all_points))))
else:
    # No score cell was parsed: max() would raise ValueError and the average
    # would divide by zero, so report the situation instead.
    print('no points were collected')
# Title statistics: (title, word count) pairs, longest title first.
title_lens = []
for title in all_titles:
    title_lens.append((title, len(title.split())))
title_lens.sort(key=comp2, reverse=True)

total_words = sum(word_count for _, word_count in title_lens)
print('average title length %s' % (total_words / (len(all_titles) + 0.0)))

longest_title, longest_len = title_lens[0]
print('largest title has length %s and is %s' % (longest_len, longest_title))
# Word-frequency statistics over all titles, case-sensitive first and then
# with case ignored.
words = []
for title in all_titles:
    words.extend(title.split())

# Sorting makes equal words adjacent, so each groupby() run is one word and
# its length is that word's frequency.
words_list = [(word, len(list(group)))
              for word, group in itertools.groupby(sorted(words))]
# Most frequent first; the sort is stable, so ties keep their sorted order.
words_list.sort(key=comp2, reverse=True)
print('50 most common words are %s' % words_list[:50])

# Fold case BEFORE encoding: lower() on UTF-8 bytes only changes ASCII
# letters, so words differing only in the case of a non-ASCII letter used to
# be counted separately.
words_ = [word.lower().encode('utf-8') for word in words]
words_list = [(word, len(list(group)))
              for word, group in itertools.groupby(sorted(words_))]
words_list.sort(key=comp2, reverse=True)
print('50 most common words, ignoring case are %s' % words_list[:50])
# Dump every collected link as "href<TAB>title<TAB>points" lines into a text
# file named after the crawled host.
filename = 'c:/%s.txt' % urlparse(base_url)[1]

# Binary mode: the rows already end in '\r\n', and text mode on Windows would
# expand the '\n' again and produce '\r\r\n'.
f = open(filename, 'wb')
try:
    # zip() stops at the shortest list, so only complete rows are written
    # even if the three lists ended up with different lengths.
    for href, title, score in zip(all_hrefs, all_titles, all_points):
        # Encode the href like the title; str() on a unicode href raises
        # UnicodeEncodeError as soon as a link contains a non-ASCII character.
        row = '\t'.join([href.encode('utf-8'), title.encode('utf-8'), str(score)])
        f.write(row + '\r\n')
finally:
    # Always release the handle, even when a row fails to encode or write.
    f.close()
# This is released under the "Do whatever you want with it, but do not hold me responsible for your actions" license.