Target page: https://movie.douban.com/chart
Crawl logic:
Level 1: get basic info for the top 10 movies on the Douban new-movie chart
yield -> level 2
Level 2: get each movie's description
yield -> level 3
Level 3: get the movie's short-comment list (build the next page's URL and page through it with a requests loop)
yield -> level 4
09/03 update:
Level 4: get the movie's review list
yield -> level-4 review detail, then return
yield -> MongoDB
todo: still working on the IP-ban problem...
09/04 update: scrape a pool of free proxy IPs and crawl through those proxies (using the scrapy_proxies plugin). Code has been updated.
-> To improve: the IP pool should run as a separate, long-lived service that periodically refreshes the usable IPs in a database or file; a sketch of that idea follows. (PS: the free proxies used in the code are not very reliable... this only demonstrates the approach. If you need stable, fast proxies, pay for a service -.-)
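As a sketch of that "separate service" idea: a small standalone script could reuse the same fetch-and-verify logic and rewrite ip.txt on a timer, so the spider only ever reads the file. This is a minimal sketch under assumptions: check_proxy mirrors the telnetlib check in initIp() below, while fetch_candidates and the interval are placeholders, not code from this project.

import time
import telnetlib


def check_proxy(ip, port):
    # Same reachability check as the spider's initIp() below
    try:
        telnetlib.Telnet(ip, port=int(port), timeout=1)
        return True
    except Exception:
        return False


def refresh_pool(fetch_candidates, interval=600):
    # fetch_candidates is a placeholder: any function returning [(ip, port), ...]
    while True:
        usable = ["http://%s:%s" % (ip, port)
                  for ip, port in fetch_candidates() if check_proxy(ip, port)]
        # Rewrite the file that the spider / scrapy_proxies reads
        with open('ip.txt', 'w', encoding='utf-8') as f:
            f.write("\n".join(usable) + "\n")
        time.sleep(interval)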
Main code:
from lxml import etree
import requests
import telnetlib
import scrapy


class DoubanmovieSpider(scrapy.Spider):
    # Spider id, must be unique
    name = 'doubanmovie'
    # Allowed domains (all crawling is restricted to these)
    allowed_domains = ['movie.douban.com']
    # Start URL
    start_urls = ['https://movie.douban.com/chart/']
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
    }

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Build the proxy IP pool before crawling starts
        self.initIp()
    # Parse the response of each HTTP request
    # Level 1 -> basic movie info
    def parse(self, response):
        # One div per movie entry
        divResultList = response.xpath("//div[@class='pl2']")
        for result in divResultList:
            data = {}
            name = result.xpath(".//a/text()").extract_first().replace('/', '').strip()
            aliasName = result.xpath(".//a/span/text()").extract_first()
            info = result.xpath(".//p/text()").extract_first()
            rank = result.xpath(".//span[@class='rating_nums']/text()").extract_first()
            rankPeople = result.xpath(".//span[@class='pl']/text()").extract_first()
            linkUrl = result.xpath(".//a/@href").extract_first()
            data['name'] = name
            data['aliasName'] = aliasName
            data['info'] = info
            data['rank'] = rank
            data['rankPeople'] = rankPeople
            data['linkUrl'] = linkUrl
            yield scrapy.Request(url=linkUrl, callback=self.movieDetail, meta={'data': data}, dont_filter=True)
    # Level 2 -> movie detail
    def movieDetail(self, response):
        # Data collected at the previous level
        data = response.meta['data']
        movieDetail = {}
        # Plot synopsis
        description = response.xpath("//div[@class='indent']/span/text()").extract_first().strip()
        movieDetail['description'] = description
        data['movieDetail'] = movieDetail
        # Relative URL of the short-comment list
        suffixUrl = response.xpath("//div[@id='hot-comments']/a/@href").extract_first()
        longUrl = response.xpath("//section[@id='reviews-wrapper']/p/a/@href").extract_first()
        # Full URL of the short-comment list
        shortReviewUrl = data['linkUrl'] + suffixUrl
        # Full URL of the review list
        longReviewUrl = data['linkUrl'] + longUrl
        data['longLinkUrl'] = longReviewUrl
        yield scrapy.Request(url=shortReviewUrl, callback=self.shortReview, meta={'data': data}, dont_filter=True)
    # Level 3 -> short-comment list
    def shortReview(self, response):
        data = response.meta['data']
        shortReviewBaseUrl = response.url
        limit = 20
        # Start from 0 so the first page of comments is not skipped
        start = 0
        shortReviewList = []
        while True:
            # Build the next page's URL by appending start/limit parameters
            url = shortReviewBaseUrl + "&start=" + str(start) + "&limit=" + str(limit)
            start = start + 20
            res = requests.get(url=url, headers=self.headers).content.decode('utf8')
            xpathHtml = etree.HTML(res)
            xpathList = xpathHtml.xpath("//div[@class='comment-item']")
            if not xpathList:
                break
            for xpathResult in xpathList:
                result = {}
                # Reviewer name
                people = xpathResult.xpath(".//span[@class='comment-info']/a/text()")
                # Review time
                time = str(xpathResult.xpath(".//span[@class='comment-time ']/text()")[0]).replace("\n", "").strip()
                # Review content
                content = xpathResult.xpath(".//span[@class='short']/text()")
                result['people'] = people
                result['time'] = time
                result['content'] = content
                shortReviewList.append(result)
            # A short page means we have reached the last page
            if len(xpathList) < 20:
                break
        data['shortReviewList'] = shortReviewList
        longLinkUrl = data['longLinkUrl']
        yield scrapy.Request(url=longLinkUrl, callback=self.longReview, meta={'data': data}, dont_filter=True)
    # Level 4 -> review list
    def longReview(self, response):
        data = response.meta['data']
        longReviewUrl = response.url
        # Start from 0 so the first page of reviews is not skipped
        start = 0
        longReviewList = []
        while True:
            url = longReviewUrl + "?start=" + str(start)
            start = start + 20
            res = requests.get(url=url, headers=self.headers).content.decode('utf8')
            xpathHtml = etree.HTML(res)
            xpathList = xpathHtml.xpath("//div[@class='main review-item']")
            if not xpathList:
                break
            for xpathResult in xpathList:
                result = {}
                # Reviewer name
                name = xpathResult.xpath(".//header/a[@class='name']/text()")
                # Rating
                score = xpathResult.xpath(".//span[1]/@title")
                # Review time
                time = xpathResult.xpath(".//span[2]/text()")
                # Review title
                title = xpathResult.xpath(".//div[@class='main-bd']/h2/a/text()")
                # Link to the full review
                linkUrl = str(xpathResult.xpath(".//div[@class='main-bd']/h2/a/@href")[0])
                # Full review content
                content = self.longReviewContentDetail(linkUrl)
                result['name'] = name
                result['score'] = score
                result['time'] = time
                result['title'] = title
                result['linkUrl'] = linkUrl
                result['content'] = content
                longReviewList.append(result)
            # A short page means we have reached the last page
            if len(xpathList) < 20:
                break
        data['longReviewList'] = longReviewList
        yield data
    # Full content of a single review
    def longReviewContentDetail(self, url):
        detail = {}
        res = requests.get(url=url, headers=self.headers).content.decode('utf8')
        xpathHtml = etree.HTML(res)
        xpathList = xpathHtml.xpath("//div[@id='link-report']")
        detail['content'] = str(xpathList[0].xpath(".//p/text()"))
        detail['contentImageUrl'] = xpathList[0].xpath(".//div[@class='image-wrapper']//img/@src")
        return detail
    # Initialize the proxy IP pool
    def initIp(self):
        print("Initializing the IP pool...")
        tempIpList = []
        # www.66ip.cn lists free proxies by area; walk the first 30 area pages
        for i in range(1, 31):
            res = requests.get(url='http://www.66ip.cn/areaindex_' + str(i) + '/1.html',
                               headers=self.headers).content.decode('gbk')
            xpathHtml = etree.HTML(res)
            xpathList = xpathHtml.xpath("//div[@class='containerbox boxindex']//table//tr")
            # Skip the two header rows, then read the IP and port cells of each row
            for row in xpathList[2:]:
                ip = row.xpath("./td[1]/text()")
                port = row.xpath("./td[2]/text()")
                if ip and port:
                    tempIpList.append({'ip': ip[0], 'port': port[0]})
        # Check each proxy and save the usable ones to a txt file
        print(tempIpList)
        ipList = []
        for ipHost in tempIpList:
            try:
                telnetlib.Telnet(ipHost['ip'], port=int(ipHost['port']), timeout=1)
            except Exception:
                print(ipHost['ip'] + ":" + ipHost['port'] + " is no longer usable -.-")
            else:
                print("http://" + ipHost['ip'] + ":" + ipHost['port'] + " is usable ^-^.")
                ipList.append("http://" + ipHost['ip'] + ":" + ipHost['port'])
        with open('ip.txt', 'w', encoding='utf-8') as file_write:
            for ip in ipList:
                file_write.write(ip + "\n")
        print("IP pool initialized, " + str(len(ipList)) + " usable IPs~")
Middleware that switches to a random User-Agent:
from fake_useragent import UserAgent


# Downloader middleware that sets a random User-Agent on every request
class UserAgentMiddleware(object):
    def __init__(self, crawler):
        super(UserAgentMiddleware, self).__init__()
        self.ua = UserAgent(verify_ssl=False)
        # UA type comes from the RANDOM_UA_TYPE setting, e.g. 'random' or 'chrome'
        self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        def get_user_agent():
            return getattr(self.ua, self.ua_type)

        request.headers.setdefault('User-Agent', get_user_agent())
Settings needed to use the proxies:
DOWNLOADER_MIDDLEWARES = {
    # 'moviedoubanSpider.middlewares.MoviedoubanspiderDownloaderMiddleware': 543,
    'moviedoubanSpider.UserAgentMiddleware.UserAgentMiddleware': 543,
    'scrapy_proxies.RandomProxy': 100,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
}
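scrapy_proxies also has to be pointed at the proxy list. Going by its README, the extra settings look roughly like this; the path and retry values are illustrative, and RANDOM_UA_TYPE is the setting read by the UserAgentMiddleware above:

# Retry banned/failed responses through other proxies
RETRY_TIMES = 10
RETRY_HTTP_CODES = [500, 503, 504, 400, 403, 404, 408]

# File written by initIp(); one proxy URL per line
PROXY_LIST = 'ip.txt'
# 0 = pick a different random proxy from the list for every request
PROXY_MODE = 0

# UA type used by UserAgentMiddleware
RANDOM_UA_TYPE = 'random'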
Data:
PS: why use requests instead of plain Scrapy?
Because when crawling the paginated level-3 data, the asynchrony between the two yields broke the way the data was assembled for storage... I don't know how to solve it yet -.- so at level 3 I don't yield; I just call requests inside the loop to do the paging.
The two-yield version is below (a possible fix is sketched right after it); pointers from the pros are welcome...
import scrapy


class DoubanmovieSpider(scrapy.Spider):
    # Spider id, must be unique
    name = 'doubanmovie'
    # Allowed domains (all crawling is restricted to these)
    allowed_domains = ['movie.douban.com']
    # Start URL
    start_urls = ['https://movie.douban.com/chart/']
    # Temporary buffer for the comment list
    tempList = []
    start = 20

    # Parse the response of each HTTP request
    # Level 1 -> basic movie info
    def parse(self, response):
        # One div per movie entry
        divResultList = response.xpath("//div[@class='pl2']")
        for result in divResultList:
            data = {}
            name = result.xpath(".//a/text()").extract_first().replace('/', '').strip()
            aliasName = result.xpath(".//a/span/text()").extract_first()
            info = result.xpath(".//p/text()").extract_first()
            rank = result.xpath(".//span[@class='rating_nums']/text()").extract_first()
            rankPeople = result.xpath(".//span[@class='pl']/text()").extract_first()
            linkUrl = result.xpath(".//a/@href").extract_first()
            data['name'] = name
            data['aliasName'] = aliasName
            data['info'] = info
            data['rank'] = rank
            data['rankPeople'] = rankPeople
            data['linkUrl'] = linkUrl
            yield scrapy.Request(url=linkUrl, callback=self.movieDetail, meta={'data': data})

    # Level 2 -> movie detail
    def movieDetail(self, response):
        # Data collected at the previous level
        data = response.meta['data']
        movieDetail = {}
        # Plot synopsis
        description = response.xpath("//div[@class='indent']/span/text()").extract_first().strip()
        movieDetail['description'] = description
        data['movieDetail'] = movieDetail
        # Relative URL of the short-comment list
        suffixUrl = response.xpath("//div[@id='hot-comments']/a/@href").extract_first()
        # Full URL of the short-comment list
        shortReviewUrl = data['linkUrl'] + suffixUrl
        yield scrapy.Request(url=shortReviewUrl, callback=self.shortReviewFor, meta={'data': data})

    # Loop over the level-3 page URLs and hand them to the level-3 handler
    # (broken: the yielded requests run asynchronously, so tempList is read
    # below before the callbacks have filled it)
    def shortReviewFor(self, response):
        data = response.meta['data']
        print(data['name'])
        shortReviewBaseUrl = response.url
        limit = 20
        for i in range(20):
            url = shortReviewBaseUrl + "&start=" + str(self.start) + "&limit=" + str(limit)
            self.start = self.start + 20
            yield scrapy.Request(url=url, callback=self.shortReviewList)
        # This runs before the requests above have been processed...
        data['shortReviewList'] = self.tempList
        print(data)
        self.tempList = []
        self.start = 20

    # Level 3 -> short-comment list
    def shortReviewList(self, response):
        # Comment entries
        evaluateList = response.xpath("//div[@class='comment-item']")
        shortReviewList = []
        for evaluate in evaluateList:
            shortReviewMap = {}
            # Reviewer name
            people = evaluate.xpath(".//span[@class='comment-info']/a/text()").extract_first()
            # Review time
            time = str(evaluate.xpath(".//span[@class='comment-time ']/text()").extract_first()).replace("\n", "").strip()
            # Review content
            content = evaluate.xpath(".//span[@class='short']/text()").extract()
            shortReviewMap['people'] = people
            shortReviewMap['time'] = time
            shortReviewMap['content'] = content
            shortReviewList.append(shortReviewMap)
        self.tempList += shortReviewList
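One common Scrapy pattern that avoids both requests and the shared tempList is to chain the pages instead of fanning them out: carry the accumulated list in meta, have each callback request only the next page, and yield the finished item once the last page is reached. A minimal sketch under those assumptions (shortReviewPage is a hypothetical method name, not code from this project):

    # Sketch: sequential pagination carrying state in meta
    def shortReviewFor(self, response):
        data = response.meta['data']
        data['shortReviewList'] = []
        # Fetch the first page; each page schedules the next one itself
        url = response.url + "&start=0&limit=20"
        yield scrapy.Request(url=url, callback=self.shortReviewPage,
                             meta={'data': data, 'start': 0})

    def shortReviewPage(self, response):
        data = response.meta['data']
        start = response.meta['start']
        comments = response.xpath("//div[@class='comment-item']")
        for evaluate in comments:
            data['shortReviewList'].append({
                'people': evaluate.xpath(".//span[@class='comment-info']/a/text()").extract_first(),
                'content': evaluate.xpath(".//span[@class='short']/text()").extract(),
            })
        if len(comments) == 20:
            # Full page: there may be more, so request the next page
            base = response.url.split('&start=')[0]
            next_url = base + "&start=" + str(start + 20) + "&limit=20"
            yield scrapy.Request(url=next_url, callback=self.shortReviewPage,
                                 meta={'data': data, 'start': start + 20})
        else:
            # Last page reached: the item is complete and can finally be yielded
            yield data

Because the item travels through meta with each request, no state is shared across movies, so the async scheduling no longer corrupts the stored data.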
The scraped data looks like this (screenshot in the original post)...
Full project:
GitHub: https://github.com/chuanzige/douban_newMovie_top10
Gitee: https://gitee.com/QingJiaoWoChuanZiGe/douban_newMovie_top10