import requests
from lxml import etree
import csv
# URL template for the Douban Top 250 listing pages. The "{}" placeholder is
# filled through url.format(...) with the value of the `start` query parameter.
url = "https://movie.douban.com/top250?start={}&filter="
def getcode(url, timeout=10):
    """Download a page and return its body decoded as UTF-8 text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as a string.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status, so an error page is never parsed as if it were a listing.
        requests.exceptions.RequestException: On connection failures or
            timeouts.
    """
    # A browser-like User-Agent is sent instead of the requests default.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
    }
    res = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on HTTP errors rather than returning the error page's HTML.
    res.raise_for_status()
    # Force UTF-8 so res.text does not depend on requests' encoding guess.
    res.encoding = 'utf-8'
    return res.text
def setcode(resource):
    """Parse the HTML of one listing page into a list of movie records.

    Every ``<div class="info">`` element in the page yields one record.

    Args:
        resource: HTML source of the page, as a string.

    Returns:
        A list of dicts, one per movie, each with the keys ``title``
        (main and alternative titles concatenated), ``score``, ``quote`` and
        ``url``. ``score``, ``quote`` and ``url`` are strings and fall back to
        ``''`` when the corresponding node is missing from the entry.

    Side effects:
        Prints the parsed list to stdout once before returning.
    """
    html = etree.HTML(resource)
    masterlist = []
    # xpath() returns a list of matching elements; handle them one by one.
    for res in html.xpath('//div[@class="info"]'):
        title = res.xpath('div[@class="hd"]//span[@class="title"]/text()')
        othertitle = res.xpath('div[@class="hd"]/a/span[@class="other"]/text()')
        score = res.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')
        quote = res.xpath('div[@class="bd"]/p[@class="quote"]/span/text()')
        link = res.xpath('div[@class="hd"]/a/@href')
        # A fresh dict is built for every entry: reusing a single dict across
        # iterations would make every list element alias the same object, so
        # all rows would end up showing the last movie.
        masterlist.append({
            'title': ''.join(title + othertitle),
            # xpath() yields lists; keep only the first text node so the CSV
            # holds plain values instead of a list repr such as "['9.7']".
            'score': score[0] if score else '',
            'quote': quote[0] if quote else '',
            'url': link[0] if link else '',
        })
    print(masterlist)
    return masterlist
def writedata(masterlist):
    """Dump the movie records to ``douban.csv`` in the working directory.

    The file is overwritten on every call. It starts with a header row and
    then holds one row per record, with the columns in the order
    score, quote, url, title.

    Args:
        masterlist: Iterable of dicts keyed by ``score``, ``quote``, ``url``
            and ``title``.
    """
    columns = ["score", "quote", "url", "title"]
    # newline='' lets the csv module control line endings itself.
    with open('douban.csv', 'w', encoding='utf-8', newline='') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=columns)
        csv_writer.writeheader()
        csv_writer.writerows(masterlist)
if __name__ == '__main__':
    # Walk the listing with start offsets 0, 25, ..., 225, collect the
    # records parsed from every page, then write them all out in one go.
    masterlist = []
    for offset in range(0, 250, 25):
        page_html = getcode(url.format(offset))
        masterlist.extend(setcode(page_html))
    writedata(masterlist)
# (3-1) Example: scraping Douban with XPath
# Latest recommended article published on 2024-07-09 23:19:55