python 爬虫爬取豆瓣读书信息 程序主要是由 6 个函数组成: get_html():请求页面,返回页面 html 源码。 get_pageurl(base_url):从 html 源码中提取分页链接部分字段。 bookinfo(url):提取图书信息,以列表形式返回。 get_num(person):判断评价人数,没有评价人数的按 10 人处理。 write2csv():将图书信息保存为 csv 文件。 main():程序执行的主函数。 程序思路: 1.分析豆瓣读书链接,分析分页链接规律。 2.循环提取链接中书本信息。 3.将书本信息保存为 csv 文件。
# Parse book information from one Douban listing page.
def bookinfo(url):
    """Scrape one Douban tag-listing page and return its books.

    Each book becomes a dict with keys 类型/书籍名称/作者/译者/豆瓣评分/
    出版社/出版日期/价格/评价人数/简介. Entries whose publication line
    cannot be parsed are skipped.

    Bug fixed: the original fallback branch for books *without* a
    translator built the info dict but never appended it to the result,
    so all translator-less books were silently dropped.
    """
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'lxml')
    # The tag (genre) is the last path segment of the URL before the query string.
    tag = url.split("?")[0].split("/")[-1]
    booknames = soup.select('#subject_list ul div.info h2 a')
    details = soup.select('#subject_list ul div.info div.pub')
    ratings = soup.select('#subject_list div.info div.star.clearfix span.rating_nums')  # rating score
    peoples = soup.select('#subject_list div.star.clearfix span.pl')  # number of raters
    intros = soup.select('#subject_list ul div.info p')
    data = []
    for bookname, detail, rating, person, intro in zip(booknames, details, ratings, peoples, intros):
        try:
            # Publication line is "author / [translator /] press / date / price".
            # maxsplit=4 keeps any extra '/' inside the price field, as the original did.
            parts = detail.get_text().split('/', 4)
            if len(parts) == 5:
                translator = parts[1]
                press, date_raw, price_raw = parts[2], parts[3], parts[4]
            else:
                # No translator field: re-split so press/date/price line up.
                # Fewer than 4 fields raises IndexError -> entry skipped below.
                parts = detail.get_text().split('/', 3)
                translator = ""
                press, date_raw, price_raw = parts[1], parts[2], parts[3]
            info = {
                '类型': tag,
                '书籍名称': bookname.get_text().split()[0],
                '作者': parts[0].strip('\n '),
                '译者': translator,
                '豆瓣评分': rating.get_text(),
                '出版社': press,
                '出版日期': date_raw.split('-')[0],  # keep only the year
                '价格': price_raw.strip('\n '),
                '评价人数': get_num(person),
                '简介': intro.get_text(),
            }
        except (IndexError, TypeError):
            # Malformed entry -- skip it, matching the original's nested fallbacks.
            continue
        data.append(info)
    return data
# Parse the rater count; entries with no usable count default to 10.
def get_num(person):
    """Return the number of raters parsed from a ``span.pl`` tag.

    The tag text looks like ``(1234人评价)``. When the slice between the
    leading "(" and the trailing "人评价)" is not an integer (e.g.
    "(少于10人评价)"), the book is treated as having 10 raters.
    """
    raw = person.get_text().split()[0]
    try:
        # Strip the leading "(" and the 4-char trailing "人评价)".
        return int(raw[1:len(raw) - 4])
    except ValueError:
        return 10
def main():
    """Drive the crawl: walk every tag page, scrape each listing, save to CSV.

    NOTE(review): ``data`` is collected but ``write2csv`` is called with
    the URL rather than the data -- confirm write2csv's contract; it
    likely should receive ``data`` instead.
    """
    base_url = 'https://book.douban.com/tag/?view=cloud'
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # recommended timer for measuring elapsed time.
    start = time.perf_counter()
    for urls in get_pageurl(base_url):
        # Douban paginates 20 books per page; crawl the first 50 pages of each tag.
        page_urls = [urls + "?start={}&type=T".format(str(i)) for i in range(0, 1000, 20)]
        for url in page_urls:
            data = bookinfo(url)
            write2csv(url)
            # Random pause between pages to be gentle on the site (anti-crawler measure).
            time.sleep(random.randint(0, 9))
    end = time.perf_counter()
    print('Time Usage:', end - start)  # report total crawl time
主要代码部分如上所示,全部代码见原文链接。