| Paste number 46603: | Reddit parser |
| Pasted by: | shabda |
| 1 year, 2 weeks ago | |
| None | |
| Paste contents: |
"""Crawl the top links of programming.reddit.com and print simple statistics.

The script pages through ``base_url`` until ``tot_links`` links have been
collected, then reports per-site counts, point statistics, title lengths and
word frequencies, and finally writes one tab-separated row per link to a
text file named after the crawled host.

Parsing needs the ``BeautifulSoup`` package (imported inside
``parse_listing``).  The standard-library imports fall back to their
Python 3 names so that the pure helpers stay importable there as well.
"""
import io
import re
import sys
import time
from collections import Counter

try:  # Python 2 module names, as used by the original script
    import urllib2
    from urlparse import urlparse
except ImportError:  # Python 3 renamed these modules
    import urllib.request as urllib2
    from urllib.parse import urlparse

base_url = 'http://programming.reddit.com/top?offset='
tot_links = 1000

USER_AGENT = ('reddit-crawler. (I am a friendly robot). '
              'shabda.raaj.nospam@gmail.removeme.com')
CRAWL_DELAY = 1  # seconds to pause between page requests
OUTPUT_PATTERN = 'c:/%s.txt'  # filled in with the host part of base_url

_POINTS_RE = re.compile(r'\d+')


def prin(x):
    """Write *x* followed by a space instead of a newline."""
    sys.stdout.write('%s ' % (x,))


def parse_points(text):
    """Return the first run of digits in *text* as an int, or None.

    None is returned for empty/missing text and for text without digits.
    """
    if not text:
        return None
    match = _POINTS_RE.search(text)
    # NOTE: the previous version ran eval() on this scraped text; int() is
    # used instead so page content is never executed as code.
    return int(match.group()) if match else None


def pair_entries(hrefs, titles, points):
    """Combine parallel lists into ``(href, title, points)`` tuples.

    *points* is truncated or padded with None to the length of *hrefs*, so a
    page with missing score cells can never shift or drop later rows.
    """
    padded = list(points[:len(hrefs)])
    padded.extend([None] * (len(hrefs) - len(padded)))
    return list(zip(hrefs, titles, padded))


def link_title(link):
    """Return the text of a title link, even when it has nested markup."""
    if link.string is not None:
        return link.string
    # .string is None when the tag has several children; join its text nodes.
    return u''.join(link.findAll(text=True))


def parse_listing(html):
    """Extract ``(href, title, points)`` tuples from one listing page.

    Links are the ``a`` tags with class ``title``; points are read from the
    first ``span`` inside each ``td`` with class ``wide little``.
    """
    # Imported here so the pure helpers above stay usable without the
    # third-party parser installed.
    from BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup(html)
    links = soup.findAll('a', {'class': 'title'})
    points = []
    for cell in soup.findAll('td', {'class': 'wide little'}):
        span = cell.find('span')
        points.append(parse_points(span.string if span is not None else None))
    return pair_entries(
        [link['href'] for link in links],
        [link_title(link) for link in links],
        points,
    )


def crawl(limit=None):
    """Fetch listing pages until *limit* entries are collected.

    *limit* defaults to ``tot_links``.  Returns a list of
    ``(href, title, points)`` tuples.  Crawling stops early when a page
    yields no links, because the offset could not advance any further.
    """
    if limit is None:
        limit = tot_links
    opener = urllib2.build_opener()
    entries = []
    while len(entries) < limit:
        fin_url = '%s%s' % (base_url, len(entries))
        print(fin_url)
        req = urllib2.Request(fin_url)
        req.add_header('user-agent', USER_AGENT)
        page = parse_listing(opener.open(req).read())
        if not page:
            break
        entries.extend(page)
        time.sleep(CRAWL_DELAY)
    return entries


def rank_by_count(items):
    """Return ``(item, count)`` pairs, most frequent first.

    Items with equal counts are listed in ascending item order.
    """
    return sorted(Counter(items).items(), key=lambda pair: (-pair[1], pair[0]))


def average(values):
    """Return the arithmetic mean of *values* as a float (0.0 when empty)."""
    values = list(values)
    if not values:
        return 0.0
    return sum(values) / float(len(values))


def report(entries):
    """Print site, point, title and word statistics for *entries*."""
    print('****fun with reddit urls(base_url = %s)****' % base_url)

    sites = [urlparse(href)[1] for href, _, _ in entries]
    g_sites = rank_by_count(sites)
    singles = [site for site in g_sites if site[1] < 2]
    print('total sites are %s' % len(sites))
    print('total unique sites %s' % len(g_sites))
    print('top 20 sites are %s' % g_sites[:20])
    print('Sites with only one entry %s' % len(singles))

    # Entries whose score could not be parsed are left out of point stats.
    scored = [(href, points) for href, _, points in entries
              if points is not None]
    if scored:
        best = max(scored, key=lambda pair: pair[1])
        print('maximum points are %s by %s' % (best[1], best[0]))
        print('average points are %s' % average(p for _, p in scored))

    titles = [title for _, title, _ in entries]
    title_lens = sorted(
        [(title, len(title.split())) for title in titles],
        key=lambda pair: pair[1],
        reverse=True,
    )
    if title_lens:
        print('average title length %s' % average(n for _, n in title_lens))
        print('largest title has length %s and is %s'
              % (title_lens[0][1], title_lens[0][0]))

    words = [word for title in titles for word in title.split()]
    print('50 most common words are %s' % rank_by_count(words)[:50])
    lowered = [word.lower() for word in words]
    print('50 most common words, ignoring case are %s'
          % rank_by_count(lowered)[:50])


def format_row(href, title, points):
    """Return one tab-separated, CRLF-terminated output row as text.

    A missing score (None) is written as an empty field.
    """
    return u'%s\t%s\t%s\r\n' % (href, title, u'' if points is None else points)


def save(entries, filename=None):
    """Write *entries* as UTF-8 rows to *filename* and return the path used.

    *filename* defaults to ``OUTPUT_PATTERN`` filled in with the host part
    of ``base_url``.
    """
    if filename is None:
        filename = OUTPUT_PATTERN % urlparse(base_url)[1]
    # newline='' keeps the explicit '\r\n' from being translated again.
    with io.open(filename, 'w', encoding='utf-8', newline='') as out:
        out.writelines(format_row(*entry) for entry in entries)
    return filename


def main():
    """Crawl, print the report and save the collected rows."""
    entries = crawl()
    report(entries)
    save(entries)


if __name__ == '__main__':
    main()
Annotations for this paste:
| Annotation number 1: | License |
| Pasted by: | shabda |
| 1 year, 2 weeks ago | |
| Paste contents: |
| This is released under the "Do whatever you want with it, but do not hold me responsible for your actions" License. |
| Annotation number 2: | Description |
| Pasted by: | shabda |
| 1 year, 2 weeks ago | |
| Paste contents: |
| Its description can be read at <a href="http://seodummy.blogspot.com/2007/08/python-fun-with-reddit-urls.html">http://seodummy.blogspot.com/2007/08/python-fun-with-reddit-urls.html</a> |