xpath爬取豆瓣韩剧数据
需求:爬取豆瓣韩剧的标题、评分、评论以及详情页地址。
1、导入模块
import requests
from lxml import etree
import csv
2、分析每一页链接的规律
https://www.douban.com/doulist/2942804/?start=0&sort=seq&playable=0&sub_type= 第1页
https://www.douban.com/doulist/2942804/?start=25&sort=seq&playable=0&sub_type= 第2页
https://www.douban.com/doulist/2942804/?start=50&sort=seq&playable=0&sub_type= 第3页
所以 n=(page-1) * 25
3、获取url、模拟浏览器
# URL template for the doulist pages; {} receives the 0-based item offset
# (page N starts at offset (N-1) * 25, per the pattern shown above).
baseurl='https://www.douban.com/doulist/2942804/?start={}&sort=seq&playable=0&sub_type='
# Desktop Chrome User-Agent header so the site serves the normal HTML page.
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
4、定义函数
返回源码
def getSource(url):
    """Fetch *url* with the shared browser-like headers and return the HTML.

    Args:
        url: Absolute URL of one doulist list page.

    Returns:
        The response body decoded as UTF-8 text.
    """
    resp = requests.get(url, headers=headers)
    body = resp.content
    return body.decode('utf-8')
爬取标题、评分、评论以及详情页地址
def getmain(source):
    """Parse one list page and extract title, score, comment and detail link.

    Args:
        source: HTML text of a doulist page (as returned by ``getSource``).

    Returns:
        A list of dicts, one per item, with keys ``title``, ``score``,
        ``comment`` and ``link``; any field that is absent in the HTML
        defaults to ``''``.
    """
    element = etree.HTML(source)
    movies = element.xpath('//div[@class="mod"]')
    movieList = []
    for movie in movies:
        # xpath() returns a (possibly empty) list for each field.
        title = movie.xpath('div[@class="bd doulist-subject"]/div[@class="title"]/a/text()')
        score = movie.xpath('div[@class="bd doulist-subject"]/div[@class="rating"]/span[@class="rating_nums"]/text()')
        comment = movie.xpath('div[@class="ft"]/div[@class="comment-item content"]/blockquote/text()')
        link = movie.xpath('div[@class="bd doulist-subject"]/div[@class="post"]/a/@href')
        # Guard against missing nodes. The comment text is the SECOND text
        # node of the blockquote (index 1, per the original scraper); the old
        # `if comment:` guard still raised IndexError on a one-element
        # result, so check the length explicitly.
        movieDict = {
            'title': title[0] if title else '',
            'score': score[0] if score else '',
            'comment': comment[1] if len(comment) > 1 else '',
            'link': link[0] if link else '',
        }
        movieList.append(movieDict)
    # Print once after the loop; the original printed the growing list on
    # every iteration, producing quadratic debug output.
    print(movieList)
    return movieList
写入数据
def writeData(movieList):
    """Dump the scraped records to ``hanjuTV.csv`` with a header row.

    Args:
        movieList: list of dicts with keys 'title', 'score', 'comment',
            'link' (extra keys are not expected).
    """
    fields = ['title', 'score', 'comment', 'link']
    # newline='' hands line-ending control to the csv module (avoids blank
    # rows on Windows).
    with open('hanjuTV.csv', 'w', encoding='utf-8', newline='') as out:
        writer = csv.DictWriter(out, fieldnames=fields)
        writer.writeheader()
        writer.writerows(movieList)
if __name__ == '__main__':
    all_movies = []
    # Item offsets 0, 25, ..., 225 cover the first 10 list pages.
    for offset in range(0, 250, 25):
        page_url = baseurl.format(offset)
        # Download the page, parse it, and accumulate its records.
        html = getSource(page_url)
        all_movies.extend(getmain(html))
    writeData(all_movies)