# FAS_scraper.py
# v.1.0 (March 1, 2010)
# Mel Chua

# This is a quick proof-of-concept scraper inspired by Diana Martin's research
# on the Fedora community; she's trying to get a gauge on who in Fedora
# is an "active contributor," so I suggested making a tiny scraper to gather
# all the FAS-authenticated activity of a user from existing webpages.
# I'm pretty sure most of these services have APIs that would do the job
# better and less kludgily, but this is just to see if it's a useful thing.

# We'll be using http://twill.idyll.org/python-api.html
# in order to interface with the web.
from twill.commands import *

# FAS usernames - you'd probably automatically get these
# usernames from FAS somehow, but they'd end up in a
# list of strings. These usernames were picked somewhat randomly from
# people logged into #fedora-mktg at the time I wrote this code,
# to get different amounts of activity in different places (i.e.
# ianweller has many packages, I'm extremely active on the wiki, and so on).
usernames = ['mchua', 'dianam', 'nmarques', 'ianweller', 'rbergero',
             'hiemanshu', 'mmcgrath', 'pfrields', 'spevack', 'jfalco']

# FAS services - the first half of the URLs we're looking at.
# They're all strings; this is a quick and dirty way of doing it.
# I'd love to pop this in some classes and make a little FAS-data-getting
# library that others can play with.
pkgdb = 'https://admin.fedoraproject.org/pkgdb/users/packages/'
wiki = 'https://fedoraproject.org/wiki/Special:Contributions/'
services = [pkgdb, wiki]

# All this loop does is dump the HTML from each page into an output file
# called <username>.html in the directory this script is in. At the end of
# the program's run, you'll have a series of <username>.html files with all
# the activity for that username, and can do things like line counts, etc.
# to gauge the value of activity in them.
#
# They also display nicely in a browser, though the html is just straight-out
# concatenated (so there are multiple <html> tags in there, multiple <head>
# tags, etc.)
for username in usernames:
    outputfile = open(username + '.html', 'w')  # saves as <username>.html
    for service in services:
        go(service + username)        # go to the URL
        outputfile.write(show())      # write the HTML at that URL to outputfile
    outputfile.close()
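
# Quick sketch of the "line counts, etc." idea above: re-open each
# <username>.html file the loop just wrote and print a raw line count
# as a (very rough) proxy for that user's activity. Assumes the scraping
# loop has already run in this same directory.
for username in usernames:
    scraped = open(username + '.html')
    linecount = len(scraped.readlines())  # crude measure: lines of scraped HTML
    scraped.close()
    print username, linecount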