# Scrape the Douban book Top 250 listing pages and write one CSV row per book.
# Relies on `csv`, `requests` and lxml's `etree` already being in scope.

# Browser-like User-Agent sent with every page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}

# Listing pages are addressed by a `start` offset: 0, 25, ..., 225.
urls = ["https://book.douban.com/top250?start={}".format(str(i)) for i in range(0, 250, 25)]

# The context manager closes the file even if a request or an XPath lookup
# raises part-way through the crawl.
with open("H:/doubanbook.csv", 'wt', newline='', encoding='utf-8') as fp:
    writer = csv.writer(fp)
    writer.writerow(('name', 'url', 'auther', 'publisher', 'date', 'price', 'rate', 'comment'))

    for page_url in urls:
        # `headers` must be passed by keyword: the second positional argument
        # of requests.get() is `params`, which would put the User-Agent in
        # the query string instead of the HTTP request headers.
        html = requests.get(page_url, headers=headers)
        selector = etree.HTML(html.text)
        infos = selector.xpath('//tr[@class="item"]')

        for info in infos:
            name = info.xpath('td/div/a/@title')[0]
            book_url = info.xpath('td/div/a/@href')[0]

            # The first <p> text is a '/'-separated string. The author is the
            # first field; publisher, date and price are taken from the end,
            # so extra fields in between do not shift them.
            fields = info.xpath('td/p/text()')[0].split('/')
            author = fields[0]
            publish = fields[-3]
            date = fields[-2]
            price = fields[-1]

            rate = info.xpath('td/div/span[2]/text()')[0]

            # Some rows have no comment span; fall back to a placeholder.
            comments = info.xpath('td/p/span/text()')
            comment = comments[0] if comments else "空"

            writer.writerow((name, book_url, author, publish, date, price, rate, comment))
爬取豆瓣图书TOP250
最新推荐文章于 2024-04-26 18:35:17 发布