LaptopTestingDataScraper
|
Size: 15260
Comment: fixed #ing
|
Size: 15302
Comment: sign
|
| Deletions are marked like this. | Additions are marked like this. |
| Line 23: | Line 23: |
Hope this is helpful! -- ThomasMHinkle |
I found that the Ubuntu Laptop Testing Team pages had lots of data that I wanted, but not presented in a way that was convenient for someone looking to find a laptop with features X,Y & Z, as I was. The following is a python script to grab data from the laptop testing pages and allow you to query the information as a sqlite database. This script does three things:
- Scrape the laptop testing information from the wiki
- Dump that information into csv files suitable for import into a database.
- Load that information into a SQLite database which can easily be queried to find, for example, laptops with support for a given feature.
Running the script dumps you to an interactive python shell where you can inspect the sqlite DB with some convenience methods. For example,
{{{$ ./scrape_laptop_info.py ... (processing stuff)... >>> get_laptops_for_criteria(Sleep='Yes',Hibernates='Yes',distro="Breezy%") [(' Dell ', ' Inspiron 9300 ', 'Breezy (current development)'), (' IBM/Lenovo ', ' ThinkPad T43 ', 'Breezy'), (' IBM/Lenovo ', ' ThinkPad X41 ', 'Breezy Final (current stable)'), (' Acer ', ' TravelMate 8100 ', 'Breezy (current development)'), (' Fujitsu ', ' Amilo A7640 ', 'Breezy (current development)'), (' Zitech ', ' Z223 ', 'Breezy (current development)')]}}}
The above asks for records that match Breezy% that have working Sleep and Hibernate. I could allow for Yes* or Yes, with problems by changing Sleep='Yes' to Sleep='Yes%'.
Hope this is helpful! -- ThomasMHinkle
Here is the script in its entirety:
import urllib, exceptions, csv, re
import BeautifulSoup
# Written by Thomas M. Hinkle 2006 - feel free to copy, modify,
# re-use, and share as you like.
#
# This script requires BeautifulSoup
# http://www.crummy.com/software/BeautifulSoup/download/BeautifulSoup.py
#
# USAGE: Running this script will scrape the laptop testing team
# pages, create 2 csv files with the data, and dump you on a python
# commandline from which you can inspect a sqlite database of the
# laptop info.
#
# If you don't have sqlite, this script will create the 2 csv files
# and then fail. You can of course still make use of those csv files
# as you like.
#
# If you don't have BeautifulSoup, this script will fail utterly.
wikipage = 'https://wiki.ubuntu.com/LaptopTestingTeam'
class FormatChange (exceptions.Exception):
pass
def get_string_from_element (el):
if el.string: return el.string
else:
s = ''
for c in el.contents:
s += get_string_from_element(c)
return s
def get_soup (url):
sock = urllib.urlopen(url)
return BeautifulSoup.BeautifulSoup(sock.read())
def process_link_table (tbl):
rows = tbl.fetchChildren('tr')
rows = rows[1:]
lst = []
for r in rows:
ret = {}
cells = r.fetchChildren('td')
ret['make'] = get_string_from_element(cells[0].findChild('p'))
ret['model'] = get_string_from_element(cells[1].findChild('p'))
ret['submitter'] = get_string_from_element(cells[2].findChild('a'))
ret['link'] = cells[3].findChild('a').attrs[0][1]
lst.append(ret)
return lst
def grab_list (url=wikipage):
"""Return 2 lists of webpages with laptop information.
The first list will be the canonical supplied list, presumably
more reliable.
The second list will be community supplied.
"""
scraper = get_soup(url)
try: header = scraper.fetch(name='h3',text='Canonical Supplied')[0]
except IndexError:
raise FormatChange("Can't find <h3>Canonical Supplied</h3>")
next = header.next
while not hasattr(next,'name'): next = next.next
if next.name != 'div': raise FormatChange("""We expect a div after <h3>Canonical Supplied</h3>""")
header = scraper.fetch(name='h3',text='Community Supplied')[0]
tbl = next.findChild('table')
canon_supplied = process_link_table(tbl)
for d in canon_supplied: d['Canonical supplied'] = True
# Community supplied
try:
header = scraper.fetch(name='h3',text='Community Supplied')[0]
except IndexError:
raise FormatChange("Can't find <h3>Community Supplied</h3>")
next = header.next
while not hasattr(next,'name'): next = next.next
if next.name != 'div': raise FormatChange("""We expect a div after <h3>Community Supplied</h3>""")
tbl = next.findChild('table')
community_supplied = process_link_table(tbl)
for d in community_supplied: d['Canonical supplied'] = False
canon_supplied.extend(community_supplied)
return canon_supplied
class InfoGetter:
def __init__ (self, url=wikipage):
self.base = url
def get_laptop_list (self):
self.data = grab_list(self.base)
self.laptop_columns = ['make','model','submitter']
self.record_columns = ['distro']
def get_info (self):
for d in self.data:
d['info']=self.process_laptop_page(d)
def process_laptop_page (self,d):
print 'Fetching info for ',d['make'],d['model'],d['submitter']
try:
soup = get_soup(urllib.basejoin(self.base,d['link']))
except:
print 'Problem fetching',d['link']
import traceback; traceback.print_exc()
self.data.remove(d)
return
hwtable = soup.first('table')
if not hwtable:
print 'Problem fetching',d['link']
print 'There appears to be no data table!'
self.data.remove(d)
return
works = hwtable.first('strong',text='Works?')
works_row = works.findParent('tr')
distr_row = works_row.findNext('tr')
distr_cells = distr_row.fetch('td')
distros = [
get_string_from_element(c.first('p')).strip().strip('in ').strip('?')
for c in distr_cells
]
d['distros'] = distros
rows = distr_row.fetchNextSiblings('tr')
for r in rows:
try:
values = [get_string_from_element(c.first('p')).strip()
for c in r.fetch('td')]
except AttributeError: # if strip fails because we get None instead of string
values = []
if len(values) < 3:
continue
name = values[0]; values = values[1:] #reverse pop!
name = name.strip().title()
# Add to our list of output DB columns
if name not in self.record_columns: self.record_columns.append(name)
bugno = values.pop()
#ret = {'any':False}
ret = {}
for n,v in enumerate(values):
if n >= len(distros):
print 'Unusual value',name,v
print 'This is value ',n
print 'We only have ',len(distros),'distros:',distros
break
distro = distros[n]
if v.lower().strip(', ;.') in ['yes','y','true']:
ret[distro] = 'Yes'
#ret['any'] = 'Yes'
elif v.lower() in ['no','n','false']:
ret[distro] = 'No'
else:
#print 'Strange value for ',name,distro,':',v
ret[distro] = v
#if not ret['any']: ret['any'] = v
d[name] = ret
def output_csv (self, base_filename='ubuntu_laptop_compatability_db'):
laptops = file(base_filename + '_laptops.csv','w')
laptop_writer = csv.writer(laptops)
laptop_writer.writerow(['LaptopID']+self.laptop_columns)
records = file(base_filename + '_records.csv','w')
record_writer = csv.writer(records)
record_writer.writerow(['Laptop']+self.record_columns)
laptop_no = 0
for d in self.data:
# We give each laptop a unique ID, which is then used
# to join the laptop table with the records table
laptop_fields = [d.get(c,'') for c in self.laptop_columns]
#laptop_fields = [str(laptop_no)] + ['"%s"'%f for f in laptop_fields]
laptop_writer.writerow([laptop_no] + laptop_fields)
distros = d.get('distros',[])
if not distros: print 'No records for ',d
# Write one record per distro
for dist in distros:
record_fields = [laptop_no,dist]
for r in self.record_columns[1:]: #ignore dist
val = d.get(r,None)
if not val:
record_fields.append('')
else:
record_fields.append('%s'%val.get(dist,''))
record_writer.writerow(record_fields)
laptop_no += 1
laptops.close()
records.close()
print "Wrote data to two CSV files:"
print "Laptop info: ",base_filename + '_laptops.csv'
print "Record info: ",base_filename + '_records.csv'
class LaptopDatabase:
"""A simple interface to a SQLite database of Ubuntu Laptop information.
Our SQL column names are going to be the ubuntu fields stripped of
spaces, numbers and punctuation.
For example,
Sleep? becomes Sleep
External
3D Acceleration? becomes DAcceleration
"""
def __init__ (self,
db_file='ubuntu_laptop_db.db'
):
import sqlite
self.connection = sqlite.connect(db_file)
self.cursor = self.connection.cursor()
def list_laptop_fields (self):
self.cursor.execute('SELECT * FROM sqlite_master WHERE name="laptops"')
return ld.cursor.fetchone()[4]
def list_record_fields (self):
self.cursor.execute('SELECT * FROM sqlite_master WHERE name="records"')
return ld.cursor.fetchone()[4]
def load_tables (self, base_filename='ubuntu_laptop_compatability_db'):
laptop_file = base_filename + '_laptops.csv'
records_file = base_filename + '_records.csv'
self.load_from_csv_file(laptop_file,'laptops')
self.load_from_csv_file(records_file,'records')
self.connection.commit()
def load_from_csv_file (self,filename,table_name):
infile = file(filename,'r')
reader = csv.reader(infile)
headers = reader.next()
# Column names must be plain old letters
headers = [re.sub('[^A-Za-z]','',h) for h in headers]
sql = "CREATE TABLE %s ("%table_name
idrow = headers[0]
rows = ["%(idrow)s int"%locals()]
rows += ["%s str"%h for h in headers[1:]]
sql += ", ".join(rows)
sql += ")"
self.cursor.execute(sql)
for row in reader:
sql = "INSERT INTO %(table_name)s"%locals()
sql += " (" + ", ".join(headers) + ') '
sql += "VALUES "
sql += "("
vals = [int(row[0])] + row[1:]
sql += ", ".join(len(vals)*["%s"])
sql += ")"
self.cursor.execute(sql,vals)
def execute (self, sql):
self.cursor.execute(sql)
return self.cursor.fetchall()
def get_laptops_for_criteria (self, **criteria):
"""Get laptops that match ALL criteria.
Criteria are keyword arguments of the form
column="value"
If criteria contains a %, we use LIKE instead of =
For example,
get_laptops_for_criteria(Sleep="Yes%") will execute the SQL
SELECT... WHERE Sleep LIKE "Yes%"
get_laptops_for_criteria(Sleep="Yes") will execute the SQL
SELECT... WHERE Sleep="Yes%"
"""
base = """SELECT laptops.make,laptops.model,records.distro
FROM laptops JOIN records on laptops.LaptopID=records.Laptop
"""
where_statements = []
for k,v in criteria.items():
k = re.sub('[^A-Za-z]','',k)
if v.find('%')>0:
where_statements.append('%s LIKE "%s"'%(k,v))
else:
where_statements.append('%s="%s"'%(k,v))
if where_statements:
base += "WHERE " + " AND ".join(where_statements)
return self.execute(base)
def get_hibernate_and_sleep (self):
return self.get_laptops_for_criteria(**{'Hibernates':'Yes',
'Sleep':'Yes'})
def get_almost_hibernate_and_sleep (self):
return self.get_laptops_for_criteria(**{'Sleep':'Yes%',
'Hibernates':'Yes%'}
)
def get_ideal_laptops (self):
return self.get_laptops_for_criteria(
**{
'Sleep':"Yes%",
'Hibernates':"Yes%",
'3DAcceleration':"Yes%",
'ExternalMonitorExtendDesktop':"Yes%",
}
)
if __name__ == '__main__':
import os, os.path
if not os.path.exists('ubuntu_laptop_db.db'):
if (not os.path.exists('ubuntu_laptop_compatability_db_laptops.csv')
or
not os.path.exists('ubuntu_laptop_compatability_db_records.csv')
):
print "Grabbing information from Ubuntu wiki"
ig = InfoGetter()
print "Get list of laptop pages"
ig.get_laptop_list()
print "Scrape info from laptop pages"
ig.get_info()
print "Write csv files"
ig.output_csv()
else:
print "Using previously existing csv files"
try:
ld = LaptopDatabase()
except:
print 'You have laptop info in dictionaries in ig'
print 'You have two csv files with all your data in the current directory'
print 'Unforunately, we had trouble loading a SQL database'
raise
print 'Loading laptop database from csv files'
print '(Delete those files if you want to regenerate them)'
ld.load_tables()
else:
print 'Loading previously existing database from ubuntu_laptop_db.db'
print '(Move or delete this DB if you want to regenerate it)'
ld = LaptopDatabase()
print 'You have a database sitting in ld'
print
print 'If you know some python and some SQL, you can go to town...'
print 'Type help(get_laptops_for_criteria) for info on the database object'
print "get_laptops_for_criteria is what you'll want to do most of your work with"
print "For example... "
print "get_laptops_for_criteria(Sleep='Yes') returns a list of laptops that Sleep"
print
print 'list_record_fields() will give you a sense of what the DB columns are.'
print
print 'Type Control-D to exit (or Control-Z on Windows)'
ld = LaptopDatabase()
execute = ld.execute
get_laptops_for_criteria = ld.get_laptops_for_criteria
cursor = ld.cursor
connection = ld.connection
list_record_fields = ld.list_record_fields
list_laptop_fields = ld.list_laptop_fieldsLaptopTestingDataScraper (last edited 2010-02-07 00:21:39 by vpn-8061f40e)