- import random
- import re
- import urllib2
# Matches a page-count phrase such as "320 p." or "xii, 320 pages".
# Group 1 captures the run of digits, commas and spaces that precedes the
# " p." / " pages" suffix. A raw string keeps the backslash in `\.` literal
# instead of relying on Python passing an unknown escape through.
PAGE_REGEX = re.compile(r'([0-9, ]+) p(?:\.|(?:ages))')
def getBetween(searched, start, end, startI=0):
    """Return the text found between two delimiters, plus a resume index.

    Searches `searched` for `start` beginning at index `startI`, then for
    the first `end` that follows it.

    Returns a tuple `(text, nextIndex)`:
      * `text` is the substring strictly between `start` and `end`;
      * `nextIndex` is the index just past `end`, suitable for passing back
        in as `startI` to continue scanning.

    If either delimiter is missing, returns `('', -1)`. Note that an empty
    `text` with a non-negative index means the delimiters were adjacent,
    not that the search failed.
    """
    sInd = searched.find(start, startI)
    if sInd == -1:
        return '', -1

    # Skip over the opening delimiter so the result excludes it.
    sInd += len(start)
    eInd = searched.find(end, sInd)
    if eInd == -1:
        return '', -1

    return searched[sInd:eInd], eInd + len(end)
def _scanCatalogPage(html):
    """Return `(isbns, sawEntry)` for one catalog page's HTML.

    `isbns` lists the ISBN10 values of entries whose text mentions
    'Hardback'; `sawEntry` is True if at least one non-empty listing span
    was found on the page at all.
    """
    found = []
    sawEntry = False
    pos = 0
    while True:
        entry, pos = getBetween(html, "<span class='regGray11px'>(", '</span>', pos)
        if not entry:
            # Either no more spans, or an empty one: stop scanning this page.
            break
        sawEntry = True
        isbn = getBetween(entry, 'ISBN10: ', ';')[0]
        if isbn and 'Hardback' in entry:
            found.append(isbn)
    return found, sawEntry


def getIsbns(url, maxPages=4999):
    """Collect ISBN10s of hardback listings from a paginated catalog.

    Requests `url + '?PG=<n>'` for n = 1, 2, ... and scans each page for
    listing spans. Stops at the first page that yields no listing span, or
    after `maxPages` pages, whichever comes first.

    Parameters:
      url      -- catalog URL without the `?PG=` query suffix.
      maxPages -- upper bound on the number of pages fetched
                  (default 4999, i.e. pages 1 through 4999).

    Returns a list of ISBN10 strings in page order; duplicates are kept.
    Prints a progress line for every page requested. Errors raised by
    `urllib2.urlopen` are not caught here.
    """
    isbns = []
    for pg in xrange(1, maxPages + 1):
        print('start page %d' % pg)
        response = urllib2.urlopen(url + '?PG=%d' % pg)
        try:
            html = response.read()
        finally:
            # Release the connection even if the read fails.
            response.close()

        pageIsbns, sawEntry = _scanCatalogPage(html)
        if not sawEntry:
            # A page with no listings marks the end of the catalog.
            break
        isbns.extend(pageIsbns)
    return isbns
def getNumPages(isbn):
    """Return the page count isbndb.com reports for `isbn`, or None.

    Opens the isbndb.com search URL for `isbn` and searches the body of the
    page the request finally lands on for the first `PAGE_REGEX` match. The
    captured text may hold several numbers (it is split on whitespace after
    commas are removed); the largest one is returned.

    Returns None if the page has no match, or if the match captured only
    commas/spaces and therefore contains no number. Prints the captured
    text when a match is found. Errors raised by `urllib2.urlopen` are not
    caught here.
    """
    # urlopen follows redirects, so this response already holds the page
    # that geturl() would name; read it directly instead of fetching twice.
    response = urllib2.urlopen('http://isbndb.com/search-all.html?kw=' + isbn)
    try:
        body = response.read()
    finally:
        response.close()

    match = PAGE_REGEX.search(body)
    if not match:
        return None

    print(match.group(1))
    counts = [int(p) for p in match.group(1).replace(',', '').split()]
    if not counts:
        # e.g. ", p." matches the pattern but captures no digits.
        return None
    return max(counts)
# Driver: gather the unique hardback ISBNs from one textbooks.com catalog,
# visit them in random order, and accumulate the page counts that could be
# found for them.
isbns = list(set(getIsbns('http://www.textbooks.com/Catalog/KME/Calculus-for-K-12.php')))
random.shuffle(isbns)
print(len(isbns))

nums = []
for isbn in isbns:
    num = getNumPages(isbn)
    if num:
        nums.append(num)
    # NOTE(review): the paste lost its indentation, so it is unclear whether
    # this progress line ran for every ISBN or only when a count was found;
    # per-ISBN is assumed here -- confirm against the original script.
    print('%s %s' % (isbn, nums))
textbooks.com code
Posted by Anonymous on Fri 7th Jan 2011 00:52
raw | new post
Submit a correction or amendment below (click here to make a fresh posting)
After submitting an amendment, you'll be able to view the differences between the old and new posts easily.