for people with large data sets
[log in]
Here's some code I wrote to generate a list of ASINs by crawling Amazon's similarity service:
import urllib, re, gzip
# Access key substituted into the AWSAccessKeyId parameter of every request
# URL built by qurl(); replace the placeholder before running.
AWS_ID = '[put your AWS ID here]'


def qurl(n):
    """Return the SimilarityLookup request URL for item id *n*.

    The item id is interpolated as-is (no URL escaping is applied).
    """
    return (
        'http://ecs.amazonaws.com/onca/xml?Service=AWSECommerceService'
        '&AWSAccessKeyId=%s&Operation=SimilarityLookup&ItemId=%s'
    ) % (AWS_ID, n)


# Captures each whole <Items>...</Items> section of a response; re.S lets
# '.' match newlines so a section may span several lines.
data_r = re.compile('(<Items>.*?</Items>)', re.S)
# Captures the text inside every <ASIN>...</ASIN> element.
asin_r = re.compile('<ASIN>(.*?)</ASIN>', re.S)

# Both outputs are opened in append mode so repeated runs keep adding to the
# same files. open() replaces the file() builtin: it is the documented way
# to open a file and, unlike file(), is not limited to Python 2.
tried_fh = open('tried', 'a')
data_fh = gzip.open('data.xml.gz', 'a')
def handleone(asin):
    """Query the similarity service for one ASIN and record the result.

    Every ``<Items>...</Items>`` section of the response is appended to the
    gzip data file, and *asin* is appended (one per line) to the 'tried' log.
    Both files are flushed before returning so progress is on disk after
    each item.

    Returns the list of whitespace-stripped ASIN strings found anywhere in
    the response text.
    """
    myqurl = qurl(asin)
    # Close the HTTP response explicitly: the crawl issues one request per
    # item, so leaving each response for the garbage collector leaks
    # connections over a long run.
    response = urllib.urlopen(myqurl)
    try:
        text = response.read()
    finally:
        response.close()

    for items_xml in data_r.findall(text):
        data_fh.write(items_xml)
    data_fh.flush()

    # Log the ASIN only after its data has been written.
    tried_fh.write(asin + '\n')
    tried_fh.flush()

    return [found.strip() for found in asin_r.findall(text)]
# Seed ASIN: crawler() adds it to the work set unless it is already listed
# in the 'tried' log.
patient_0 = '0805063897'
def crawler():
    """Crawl similarity results until no untried ASIN remains.

    State is rebuilt from disk so an interrupted crawl can resume:

    * ``done`` is every ASIN listed in the 'tried' log;
    * ``todo`` is every ASIN mentioned in the saved gzip data that is not in
      ``done``, plus the seed ``patient_0`` if it has not been tried.

    Each iteration takes an arbitrary ASIN from ``todo``, fetches it with
    handleone(), and queues any returned ASINs not yet done. A progress line
    is printed per item. Returns None when ``todo`` is empty.
    """
    # try/finally (rather than relying on garbage collection) guarantees the
    # read handles are closed before the long-running crawl loop starts.
    tried_file = open('tried')
    try:
        done = set(line.strip() for line in tried_file)
    finally:
        tried_file.close()

    todo = set()
    data_file = gzip.open('data.xml.gz')
    try:
        for line in data_file:
            for asin in asin_r.findall(line):
                if asin not in done:
                    todo.add(asin)
    finally:
        data_file.close()

    if patient_0 not in done:
        todo.add(patient_0)

    # Stop once the work set is empty; popping from an empty set would
    # raise KeyError.
    while todo:
        asin = todo.pop()
        pointers = handleone(asin)
        done.add(asin)
        for p in pointers:
            if p not in done:
                todo.add(p)
        # A single pre-formatted argument prints the same text whether
        # print is a statement or a function.
        print('%s (%s todo; %s done)' % (asin, len(todo), len(done)))
# Start the crawl; this call sits at module top level, so it runs as soon as
# the file is executed.
crawler()
last modified January 16