用 Python 爬取信息的方式有很多,今天就教大家如何使用 XPath 爬取豆瓣 Top250 的电影名称、评分、链接和引言,并将结果存储到 CSV 文件中。
python版本:3.7.4
相关模块:第三方模块 lxml 和 requests,以及 Python 标准库自带的 csv 模块
环境搭建:安装 Python 并将其添加到环境变量,然后用 pip 安装第三方模块(pip install lxml requests)
代码如下:
import requests
from lxml import etree
import csv
# URL template for the Douban Top 250 listing; `{}` is filled with the value
# of the `start` query parameter (see the str.format call in the main block).
doubanUrl='https://movie.douban.com/top250?start={}&filter='
# Fetch the HTML source of a page.
def getSource(url, timeout=10):
    """Download *url* and return the response body as UTF-8 decoded text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The response body as a string.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
    """
    # Browser-like User-Agent header sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
    }
    res = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page to the
    # parser, which would silently yield zero movies for that page.
    res.raise_for_status()
    res.encoding = 'utf-8'
    return res.text
# Extract the movie information from one listing page.
def getEveryItem(source):
    """Parse one listing page and return its movies as a list of dicts.

    Args:
        source: HTML text of a listing page.

    Returns:
        A list with one dict per ``<div class="info">`` element. Each dict
        has the keys ``title`` (all "title" and "other" span texts
        concatenated), ``url``, ``star`` (rating text) and ``quote``.
        A link, rating or quote that is absent from the markup is stored
        as an empty string.
    """
    def firstOrEmpty(nodes):
        # An XPath query returns a list; take the first hit, or '' when
        # nothing matched, so one incomplete entry cannot abort the scrape.
        return nodes[0] if nodes else ''

    html_element = etree.HTML(source)
    movieList = []
    for eachMovie in html_element.xpath('//div[@class="info"]'):
        title = eachMovie.xpath('div[@class="hd"]/a/span[@class="title"]/text()')  # movie title(s)
        otherTitle = eachMovie.xpath('div[@class="hd"]/a/span[@class="other"]/text()')  # alternative titles
        link = eachMovie.xpath('div[@class="hd"]/a/@href')  # detail-page link
        star = eachMovie.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')  # rating
        quote = eachMovie.xpath('div[@class="bd"]/p[@class="quote"]/span/text()')  # one-line quote
        # Resulting structure: [{movie 1}, {movie 2}, ...]
        movieList.append({
            'title': ''.join(title + otherTitle),
            'url': firstOrEmpty(link),
            'star': firstOrEmpty(star),
            'quote': firstOrEmpty(quote),
        })
    # Echo the parsed entries once per page as progress feedback.
    print(movieList)
    return movieList
# Save the data.
def writeData(movieList):
    """Write the movies to ``douban_top250.csv`` in the current directory.

    The file is overwritten, encoded as UTF-8, and starts with a header row
    in the column order title, star, quote, url.

    Args:
        movieList: Iterable of dicts keyed by ``title``, ``star``, ``quote``
            and ``url``.

    Raises:
        ValueError: If a dict contains a key that is not one of the columns
            (the default ``csv.DictWriter`` behaviour).
    """
    # newline='' hands line-ending control to the csv module; without it
    # every row is followed by a blank line on Windows.
    with open('douban_top250.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['title', 'star', 'quote', 'url'])
        writer.writeheader()  # header row
        writer.writerows(movieList)
if __name__ == "__main__":
    # The listing is split into 10 pages of 25 entries each; the URL
    # template takes the offset of the first entry on the page.
    movieList = []
    for offset in range(0, 250, 25):
        pageHtml = getSource(doubanUrl.format(offset))
        movieList.extend(getEveryItem(pageHtml))
    # Write everything that was collected to the CSV file.
    writeData(movieList)