Attaching the code this time for a quick look. It is much the same as before: a simple page-scraping script, still very easy. Some sites do ban your IP, though, in which case you need to configure a proxy to get through.
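For that IP-ban case, requests lets you route traffic through a proxy via its proxies parameter. Here is a minimal sketch; the address 127.0.0.1:8888 is just a placeholder, so substitute a proxy you actually have:

import requests

# Placeholder proxy address -- swap in a real proxy of your own.
proxies = {
    'http': 'http://127.0.0.1:8888',
    'https': 'http://127.0.0.1:8888',
}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

# Any request made with proxies=... is relayed through the proxy,
# so the target site sees the proxy's IP instead of yours.
resp = requests.get('https://book.douban.com/top250',
                    headers=headers, proxies=proxies, timeout=10)
print(resp.status_code)

If the proxy requires credentials, requests accepts them in the URL itself, e.g. http://user:pass@host:port.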
The full script:
import csv
import requests
from lxml import etree

# Open the output CSV; replace the placeholder with your own save location.
# utf-8-sig writes a BOM so Excel displays the Chinese text correctly.
f = open(r"path/to/output.csv", 'w+', newline='', encoding='utf-8-sig')
write = csv.writer(f)
write.writerow(('title', 'author_publish_data', 'mark', 'pf'))

# Douban shows 25 books per page, so start=0,25,...,225 covers the top 250.
urls = ['https://book.douban.com/top250?start={}'.format(str(i)) for i in range(0, 250, 25)]
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'}

for url in urls:
    html = requests.get(url, headers=headers)
    selector = etree.HTML(html.text)
    infos = selector.xpath('//td[@valign="top"]')
    for info in infos:
        title = info.xpath('div[1]/a/@title')
        if not title:
            # The cover-image cell also has valign="top"; skip it.
            continue
        author_publish_data = info.xpath('p[@class="pl"]/text()')
        mark = info.xpath('div[2]/span[2]/text()')
        pf = info.xpath('p/span/text()')
        # xpath() returns lists; unwrap the first match (or '' if missing).
        write.writerow((title[0],
                        author_publish_data[0] if author_publish_data else '',
                        mark[0] if mark else '',
                        pf[0] if pf else ''))
f.close()
Just have a quick look; it works the same way as the earlier examples.