theinfo.org

for people with large data sets

[log in]

Amazon ASIN crawler

[edit] [history]

Here's some Python 2 code I wrote to generate a list of ASINs by crawling Amazon's similarity service:

import urllib, re, gzip

# Access key inserted into every request URL; replace the placeholder first.
AWS_ID = '[put your AWS ID here]'


def qurl(n):
    """Return the SimilarityLookup request URL for item ID *n*."""
    return ('http://ecs.amazonaws.com/onca/xml?Service=AWSECommerceService'
            '&AWSAccessKeyId=%s&Operation=SimilarityLookup&ItemId=%s'
            % (AWS_ID, n))


# Matches each complete <Items>...</Items> fragment (re.S lets '.' span lines).
data_r = re.compile(r'(<Items>.*?</Items>)', re.S)
# Captures the text inside each <ASIN>...</ASIN> element.
asin_r = re.compile(r'<ASIN>(.*?)</ASIN>', re.S)

# Append-mode log that receives one looked-up ASIN per line.
tried_fh = open('tried', 'a')
# Append-mode gzip archive that receives the <Items> fragments of each response.
data_fh = gzip.open('data.xml.gz', 'a')

def handleone(asin):
    """Look up one ASIN, archive the response, and return the ASINs it names.

    Fetches the SimilarityLookup URL for *asin*, appends every
    ``<Items>...</Items>`` fragment of the response to ``data_fh``, and
    appends *asin* to ``tried_fh``. Both files are flushed after every
    lookup; ``crawler()`` re-reads them at startup to rebuild its state.

    Returns a list with the whitespace-stripped contents of every
    ``<ASIN>`` element found in the response text.
    """
    response = urllib.urlopen(qurl(asin))
    try:
        text = response.read()
    finally:
        # Close the connection even if the read fails, so handles do not
        # accumulate over a long crawl.
        response.close()

    for fragment in data_r.findall(text):
        data_fh.write(fragment)
    data_fh.flush()

    tried_fh.write(asin + '\n')
    tried_fh.flush()

    return [found.strip() for found in asin_r.findall(text)]

# Seed ASIN: crawler() adds it to the to-do set unless it has already been tried.
patient_0 = '0805063897'
def crawler():
    """Crawl ASINs reachable from ``patient_0`` until none are left to try.

    State is rebuilt from disk on every start: ASINs listed in the
    ``tried`` file count as done, and every ASIN found in ``data.xml.gz``
    that is not done is queued. ``patient_0`` is queued as well unless it
    is already done.

    Each iteration removes an arbitrary ASIN from the to-do set, looks it
    up with ``handleone``, queues any returned ASINs that are not done,
    and prints a progress line. Returns ``None`` once the to-do set is
    empty.
    """
    tried = open('tried')
    try:
        done = set(line.strip() for line in tried)
    finally:
        tried.close()

    todo = set()
    archive = gzip.open('data.xml.gz')
    try:
        for line in archive:
            for found in asin_r.findall(line):
                # Strip so these compare equal to the stripped entries in
                # `done` and to the stripped ASINs handleone() returns.
                found = found.strip()
                if found not in done:
                    todo.add(found)
    finally:
        archive.close()

    if patient_0 not in done:
        todo.add(patient_0)

    # Stop when the frontier is exhausted; popping from an empty set would
    # raise KeyError.
    while todo:
        asin = todo.pop()
        pointers = handleone(asin)
        done.add(asin)
        todo.update(p for p in pointers if p not in done)
        print('%s (%s todo; %s done)' % (asin, len(todo), len(done)))

# Start the crawl.
crawler()

[edit] [history]

last modified January 16