# -*- coding: utf-8 -*-
import re
import sys

import requests

# Python 2 only: reload() restores sys.setdefaultencoding, which the
# interpreter deletes at start-up; it is needed before the UTF-8
# default-encoding call that follows.
reload(sys)
# Python 2 hack: make unicode<->str coercion default to UTF-8 so the scraped
# Chinese text can be formatted and written to file without explicit
# encoding. Guarded so the module still imports on Python 3 (no-op there).
if hasattr(sys, 'setdefaultencoding'):
    sys.setdefaultencoding('utf-8')


class Spider(object):
    """Scraper for douban.com's Book Top 250 chart.

    Fetches one chart page at a time, extracts per-book fields with
    regular expressions and appends them to ``bookList.txt``.

    NOTE(review): the HTML tags inside the regex patterns below were
    corrupted in the original source; they have been reconstructed from
    douban's chart markup -- verify against a live page before relying
    on them.
    """

    def __init__(self):
        print('开始爬取豆瓣图书top250的内容。。。。。。')

    # Fetch a URL and return the page source as text.
    def getSourceCode(self, url):
        """Return the decoded body of *url* (no status check, best effort)."""
        html = requests.get(url)
        return html.text

    # Split the page into the per-book content chunks:
    # {title, author/publisher line, score, vote count}.
    def getEveryBookContent(self, sourceCode):
        """Return a list of HTML fragments, one per book on the page.

        Each book on the chart sits in its own <table width="100%"> element.
        """
        return re.findall(r'<table width="100%">(.*?)</table>',
                          sourceCode, re.S)

    # Pull the individual fields out of one book's content chunk.
    def getBookInfo(self, eachBookContent):
        """Extract one book's data from its HTML chunk.

        Returns a dict with keys ``title``, ``author``, ``discussNum``
        and ``score`` (all strings). Raises AttributeError if a pattern
        does not match (re.search returning None).
        """
        bookInfo = {}
        # Title text sits inside the first link; strip whitespace, <br/>
        # and the <span> wrappers douban puts around parts of the name.
        bookInfo['title'] = re.sub(
            r'( |\n|<br/>|</?span.*?>)', "",
            re.search(r'<a href=.*?>(.*?)</a>', eachBookContent,
                      re.S).group(1))
        bookInfo['author'] = re.search(
            r'<p class="pl">(.*?)</p>', eachBookContent, re.S).group(1)
        # Vote count is the first parenthesized text, e.g. "(12345人评价)".
        bookInfo['discussNum'] = re.sub(
            r'( |\n)', "",
            re.search(r'\((.*?)\)', eachBookContent, re.S).group(1))
        bookInfo['score'] = re.search(
            r'<span class="rating_nums">(.*?)</span>', eachBookContent,
            re.S).group(1)
        return bookInfo

    # Persist the collected records.
    def saveBookInfo(self, bookList):
        """Append every record in *bookList* to bookList.txt.

        Uses ``with`` so the handle is closed even if a write fails
        (the original leaked it on error).
        """
        with open("bookList.txt", "a") as f:
            for each in bookList:
                f.writelines('书 名:\t {}\n'.format(each['title']))
                f.writelines('作 者:\t {}\n'.format(each['author']))
                f.writelines('评论数:\t {}\n'.format(each['discussNum']))
                f.writelines('评 分:\t {}\n\n'.format(each['score']))

    def start(self, url):
        """Scrape one chart page and append its books to the output file."""
        sourceCode = self.getSourceCode(url)
        everyBookContent = self.getEveryBookContent(sourceCode)
        bookList = [self.getBookInfo(each) for each in everyBookContent]
        self.saveBookInfo(bookList)
if __name__ == '__main__':
    douban = Spider()
    # The chart is paginated 25 books per page: start=0, 25, ..., 225.
    # (The original initialized url to the start=0 page and immediately
    # overwrote it inside a manual while-counter loop.)
    for offset in range(0, 250, 25):
        url = 'http://book.douban.com/top250?start={}'.format(offset)
        douban.start(url)