# Tencent Video information scraping program source code [notes]
# -*- coding: utf-8 -*-
import scrapy
from ..items import TencentItem,CommentItem
import re,requests,json
class TencentSpiderSpider(scrapy.Spider):
    """Crawl the Tencent Video movie listing, movie detail pages and comments.

    Flow:
      1. ``parse`` reads the category filter links on the listing start page.
      2. ``detail_parse`` handles one listing page: for every movie it
         downloads the comment feed (yielding ``CommentItem`` objects),
         schedules the movie detail page, and finally follows the
         "next page" link.
      3. ``movie_parse`` combines the detail-page fields with the overview
         fields passed through ``Request.meta`` into a ``TencentItem``.
    """

    name = 'tencent_spider'
    allowed_domains = ['v.qq.com']
    start_urls = ['https://v.qq.com/x/list/movie']

    # URL templates; the ``{}`` slot receives an href/query fragment or an id.
    _LIST_URL = 'https://v.qq.com/x/list/movie{}'
    _COMMENT_ID_URL = 'https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&callback=jQuery&op=3&cid={}'
    _COMMENT_URL = 'http://coral.qq.com/article/{}/comment/'
    _USER_URL = 'http://video.coral.qq.com/review/user/{}'

    # Headers sent with the comment-feed request only.
    _COMMENT_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) '
                      'Gecko/20100101 Firefox/53.0',
    }

    # Extracts the comment_id value from the JSONP text returned by
    # _COMMENT_ID_URL; compiled once instead of once per movie.
    _COMMENT_ID_RE = re.compile(r'comment_id":"(.*?)"')

    # Seconds to wait on each blocking ``requests`` call, so that one
    # unresponsive endpoint cannot hang the crawl indefinitely.
    _REQUEST_TIMEOUT = 10

    def parse(self, response):
        """Schedule one listing request per category filter link."""
        category_hrefs = response.xpath(
            '//div[@class="mod_row_filter"]/ul/li/a/@href').extract()
        for href in category_hrefs:
            yield scrapy.Request(url=self._LIST_URL.format(href),
                                 callback=self.detail_parse)

    def detail_parse(self, response):
        """Process one listing page.

        Yields, for each movie on the page, its ``CommentItem`` objects
        followed by a request for the movie detail page (handled by
        ``movie_parse``), and at the end a request for the next listing
        page when a "next page" link is present.
        """
        # Overview data of the movies shown on this listing page.
        movie_links = response.xpath(
            '//div[@class="mod_figures mod_figure_v"]/ul/li/a/@href').extract()
        movie_titles = response.xpath(
            '//div[@class="figure_title_score"]/strong/a/text()').extract()
        score_texts = response.xpath(
            '//div[@class="figure_score"]//text()').extract()
        total_score = self._pair_scores(score_texts)
        # Play counts, one per movie.
        movie_playCounts = response.xpath(
            '//div[@class="figure_count"]/span/text()').extract()
        # Count shown in the page's option text (same value for every movie).
        movie_account = response.xpath(
            '//span[@class="option_txt"]/em/text()').extract_first('')

        for x, movie_link in enumerate(movie_links):
            # The per-field lists come from independent XPath queries and
            # may differ in length; fall back to '' instead of raising
            # IndexError and losing the rest of the page.
            movie_title = self._item_at(movie_titles, x)

            for comment in self._fetch_comments(movie_link, movie_title):
                yield comment

            # Visit the movie detail page, carrying the overview fields.
            yield scrapy.Request(
                url=movie_link,
                callback=self.movie_parse,
                meta={'movie_link': movie_link,
                      'movie_title': movie_title,
                      'score': self._item_at(total_score, x),
                      'movie_playCount': self._item_at(movie_playCounts, x),
                      'movie_account': movie_account},
            )

        # Next listing page.
        next_pg = response.xpath(
            '//a[@class="page_next"]/@href').extract_first('')
        self.logger.debug('next page href: %r', next_pg)
        if next_pg:
            yield scrapy.Request(url=self._LIST_URL.format(next_pg),
                                 callback=self.detail_parse)

    def movie_parse(self, response):
        """Build a ``TencentItem`` from a movie detail page.

        Overview fields (title, score, play count, count, link) are read
        from ``response.meta``; the abstract and the people links are read
        from the page itself. ``'#'`` is used as the placeholder when the
        page carries no director information.
        """
        # Introduction section.
        abstract = response.xpath(
            '//div[@class="mod_row_box"]/div[2]/ul/li/div[2]/p/text()'
        ).extract_first('')
        directors = response.xpath(
            '//div[@class="mod_row_box"]/div[2]/ul/li/div[1]/a//text()'
        ).extract()
        director_links = response.xpath(
            '//div[@class="mod_row_box"]/div[2]/ul/li/div[1]/a//@href'
        ).extract()

        if directors:
            # First entry is treated as the director, the rest as the cast.
            director = directors[0]
            act = ','.join(directors[1:])
            # Names may be present without any href; do not index blindly.
            director_link = director_links[0] if director_links else '#'
            act_link = ','.join(director_links[1:])
        else:
            director = '#'
            act = '#'
            director_link = '#'
            act_link = '#'

        movie = TencentItem()
        # Introduction fields.
        movie['abstract'] = abstract
        movie['director'] = director
        movie['act'] = act
        movie['director_link'] = director_link
        movie['act_link'] = act_link
        # Overview fields forwarded by detail_parse.
        movie['movie_title'] = response.meta['movie_title']
        movie['score'] = response.meta['score']
        movie['movie_playCount'] = response.meta['movie_playCount']
        movie['movie_link'] = response.meta['movie_link']
        movie['movie_account'] = response.meta['movie_account']
        yield movie

    @staticmethod
    def _item_at(values, index, default=''):
        """Return ``values[index]``, or ``default`` when out of range."""
        return values[index] if index < len(values) else default

    @staticmethod
    def _pair_scores(text_nodes):
        """Join the non-blank score text nodes two by two.

        Whitespace-only nodes are discarded, then consecutive nodes are
        concatenated pairwise (presumably integer part + fractional part
        of one score - verify against the page markup). A trailing
        unpaired node is dropped.
        """
        parts = [text for text in text_nodes if text.strip()]
        return [first + second
                for first, second in zip(parts[::2], parts[1::2])]

    def _fetch_text(self, url, headers=None):
        """Download ``url`` with ``requests``; return its text or ``None``.

        NOTE: this is a blocking call made from inside a Scrapy callback,
        so the crawl waits while it runs; the timeout bounds that wait.
        """
        try:
            return requests.get(url, headers=headers,
                                timeout=self._REQUEST_TIMEOUT).text
        except requests.RequestException as err:
            self.logger.warning('request to %s failed: %s', url, err)
            return None

    def _fetch_comments(self, movie_link, movie_title):
        """Yield a ``CommentItem`` for every comment of one movie.

        The cid is the last path segment of ``movie_link`` without its
        extension. It is exchanged for a comment_id via _COMMENT_ID_URL,
        and that id addresses the JSON comment feed. Any failure along
        the way is logged and ends the generator without yielding, so the
        caller can still schedule the movie detail page.
        """
        cid = movie_link.split('/')[-1].split('.')[0]

        id_page = self._fetch_text(self._COMMENT_ID_URL.format(cid))
        if id_page is None:
            return
        match = self._COMMENT_ID_RE.search(id_page)
        if match is None:
            self.logger.warning('no comment_id found for cid %s', cid)
            return

        comment_page = self._fetch_text(
            self._COMMENT_URL.format(match.group(1)),
            headers=self._COMMENT_HEADERS)
        if comment_page is None:
            return
        try:
            payload = json.loads(comment_page)
        except ValueError as err:
            self.logger.warning('invalid comment JSON for cid %s: %s',
                                cid, err)
            return

        data = payload.get('data') if isinstance(payload, dict) else None
        entries = data.get('commentid') if isinstance(data, dict) else None
        # An empty or missing list means the movie has no comments.
        for detail in entries or []:
            try:
                time_difference = detail['timeDifference']
                content = detail['content']
                up = detail['up']
                rep = detail['rep']
                userid = detail['userinfo']['userid']
            except (KeyError, TypeError) as err:
                self.logger.warning(
                    'skipping malformed comment for cid %s: %r', cid, err)
                continue

            comment = CommentItem()
            comment['movie_title'] = movie_title
            comment['timeDifference'] = time_difference  # publish time
            comment['content'] = content  # comment text
            comment['up'] = up  # likes
            # Labelled "dislikes" in the original notes - TODO confirm
            # against the comment API.
            comment['rep'] = rep
            comment['userid'] = userid
            comment['userLink'] = self._USER_URL.format(userid)
            yield comment