textbooks.com code
Posted by Anonymous on Fri 7th Jan 2011 00:52
raw | new post

import random
import re
import urllib2

# Matches a page-count fragment such as "320 pages" or "xii, 320 p.".
# Group 1 is the run of digits, commas and spaces directly before " p." / " pages".
# Raw string so that the backslash in `\.` reaches the regex engine untouched.
PAGE_REGEX = re.compile(r'([0-9, ]+) p(?:\.|(?:ages))')

def getBetween(searched, start, end, startI=0):
    """Extract the text enclosed by the next `start` ... `end` pair.

    Scanning begins at index `startI` of `searched`.

    Returns a tuple (text, nextIndex): the substring strictly between the
    two delimiters and the index just past the closing delimiter, suitable
    for passing back in as `startI`. When either delimiter is missing the
    result is ('', -1).
    """
    notFound = ('', -1)

    openPos = searched.find(start, startI)
    if openPos < 0:
        return notFound

    # The enclosed text starts right after the opening delimiter.
    bodyStart = openPos + len(start)
    bodyEnd = searched.find(end, bodyStart)
    if bodyEnd < 0:
        return notFound

    return searched[bodyStart:bodyEnd], bodyEnd + len(end)

def getIsbns(url):
    isbns = []
    done = False
    for pg in xrange(1, 5000):
        print 'start page %d' % pg
        html = urllib2.urlopen(url + '?PG=%d' % pg).read()
        newI = 0
        first = True
        while True:
            foo, newI = getBetween(html, "<span class='regGray11px'>(", '</span>', newI)
            if not foo:
                if first:
                    done = True
                break
            first = False
            isbn = getBetween(foo, 'ISBN10: ', ';')[0]
            if isbn and 'Hardback' in foo:
                isbns.append(isbn)
        if done:
            break
    return isbns

def getNumPages(isbn):
    """return None if not found"""
    loc = urllib2.urlopen('http://isbndb.com/search-all.html?kw=' + isbn).geturl()
    match = re.search(PAGE_REGEX, urllib2.urlopen(loc).read())
    if match:
        print match.group(1)
        return max(int(p) for p in match.group(1).replace(',', '').split())
    return None

isbns = list(set(getIsbns('http://www.textbooks.com/Catalog/KME/Calculus-for-K-12.php')))
random.shuffle(isbns)
print len(isbns)

nums = []
for isbn in isbns:
    num = getNumPages(isbn)
    if num:
        nums.append(num)
        print isbn, nums