腾讯视频电影信息爬取(scrapy框架下采用MySQL数据库)

一、准备

 1.在 cmd 命令行依次执行:pip install pymysql、pip install lxml、pip install requests

 2.创建scrapy项目并进行MySQL数据库配置

 具体配置过程可见转载博客:http://blog.csdn.net/qq_31518899/article/details/76576537 Scrapy连接MySQL数据库

 若有不明白之处,可对照本项目具体代码 items.py 中的 MySQLConnect 类与 settings.py 的代码,比较后再进行相应的修改操作

二、代码块

 1.settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for the Tencent project.
#
# Only the settings this crawler actually relies on are kept active.
# For everything else, see:
#     https://doc.scrapy.org/en/latest/topics/settings.html

BOT_NAME = 'Tencent'

SPIDER_MODULES = ['Tencent.spiders']
NEWSPIDER_MODULE = 'Tencent.spiders'

# The listing pages we crawl are blocked by robots.txt rules we do not
# want applied, so robots.txt is deliberately ignored.
ROBOTSTXT_OBEY = False

# Headers attached to every request; a desktop Firefox User-Agent keeps
# the site from serving a degraded/bot response.
DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
  'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
}

# Item pipelines (lower number = runs earlier in the chain).
ITEM_PIPELINES = {
   'Tencent.pipelines.TencentPipeline': 300,
   #'Tencent.pipelines.MySQLConnectPipeline':305,
}

# --- MySQL connection settings ---
MYSQL_HOST = '127.0.0.1'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'root'   # your own database password
MYSQL_PORT = 3306
MYSQL_DB = 'test'         # your own database name
CHARSET = 'utf8'
 2.pipelines.py
 
# -*- coding: utf-8 -*-
import scrapy
import xlwt
import pymysql
from scrapy.conf import settings
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class TencentPipeline(object):
    """Pipeline that delegates persistence to the item itself.

    Each item class defines an ``insert_data`` hook that writes the item
    into MySQL; this pipeline simply invokes it and passes the item on
    unchanged so later pipelines (if any) still receive it.
    """

    def process_item(self, item, spider):
        """Trigger the item's own insert_data hook and return the item."""
        item.insert_data(item)
        return item

 3.tencent_spider.py
 
# -*- coding: utf-8 -*-
import scrapy
from ..items import TencentItem,CommentItem
import re,requests,json


class TencentSpiderSpider(scrapy.Spider):
    """Spider crawling Tencent Video's movie listings.

    Flow: ``parse`` discovers the category filter links, ``detail_parse``
    walks each category's paginated listing (yielding CommentItems fetched
    from Tencent's comment JSON API and following each movie link), and
    ``movie_parse`` builds a TencentItem from a movie detail page.
    """
    name = 'tencent_spider'
    allowed_domains = ['v.qq.com']
    start_urls = ['https://v.qq.com/x/list/movie']

    def parse(self, response):
        """Extract every category filter link and queue its listing page."""
        category_part = response.xpath('//div[@class="mod_row_filter"]/ul/li/a/@href').extract()
        for href in category_part:
            detail_url='https://v.qq.com/x/list/movie{}'.format(href)
            yield scrapy.Request(url=detail_url,
                                 callback=self.detail_parse
            )
    def detail_parse(self,response):
        """Parse one listing page: yield comments and follow each movie."""
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 ' \
                          'Firefox/53.0'}
        # Movie info on the category-filtered listing page
        movie_links=response.xpath('//div[@class="mod_figures mod_figure_v"]/ul/li/a/@href').extract()
        movie_titles=response.xpath('//div[@class="figure_title_score"]/strong/a/text()').extract()
        movie_scores=response.xpath('//div[@class="figure_score"]//text()').extract()
        score_list=[]
        total_score=[]
        # Drop the whitespace-only text nodes so only real score fragments remain
        for movie_score in movie_scores:
            if movie_score !='\n\t\t\t\t\t\t\t' and movie_score!='\n\t\t\t\t\t\t':
                score_list.append(movie_score)
        #print(score_list)
        # Each score apparently arrives as two adjacent text nodes; glue
        # consecutive pairs back into one score string.
        j=0
        while j in range(0,len(score_list)-1):
            score=score_list[j]+score_list[j+1]
            j += 2
            total_score.append(score)
        #print(total_score)
        movie_playCounts=response.xpath('//div[@class="figure_count"]/span/text()').extract()# play count
        movie_account=response.xpath('//span[@class="option_txt"]/em/text()').extract_first('')# total number of movies
        # Visit each movie's detail page
        for x in range(0,len(movie_links)):
            # The movie link ends in the cid, e.g.
            # https://ncgi.video.qq.com/...&op=3&[cid=b5i4g9z3u5h31jy].
            # The cid is sent to the comment-id JSON endpoint; the returned
            # comment_id is then used to build the comment page URL whose
            # JSON body holds the actual comments.
            cid=movie_links[x].split('/')[-1]# take the cid from the link
            cid=cid.split('.')[0]
            #print(cid)
            # Resolve comment_id for this movie.
            # NOTE(review): requests.get is a blocking call inside a Scrapy
            # callback, and re.search() returns None (AttributeError on
            # .group) when the pattern is absent -- confirm/robustify.
            comment_id_url='https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&callback=jQuery&op=3&cid={}'.format(cid)
            html=requests.get(comment_id_url).text
            pattern=re.compile(r'comment_id":"(.*?)"')
            comment_id=re.search(pattern,html).group(1)
            #print(comment_id)
            # Fetch the comment page JSON
            comment_url='http://coral.qq.com/article/{}/comment/'.format(comment_id)
            comment_html=requests.get(comment_url,headers=headers).text
            dict=json.loads(comment_html)# parse the JSON payload (NOTE(review): shadows the builtin `dict`)
            data_dict = dict['data']
            commentid_list = data_dict['commentid']
            if commentid_list:# the movie has comments
                for detail in commentid_list:
                    comment =CommentItem()
                    comment['movie_title'] = movie_titles[x]# movie title
                    comment['timeDifference'] = detail['timeDifference']# publish time
                    comment['content'] = detail['content']# comment body
                    comment['up'] = detail['up']# upvotes
                    comment['rep'] = detail['rep']# downvotes
                    userinfo_dict = detail['userinfo']# user info (dict)
                    userid = userinfo_dict['userid']
                    comment['userid']=userid# user id
                    comment['userLink']='http://video.coral.qq.com/review/user/{}'.format(userid)# user profile link
                    yield comment

            yield  scrapy.Request(url=movie_links[x],
                                callback=self.movie_parse,
                                 meta={'movie_link':movie_links[x],
                                     'movie_title':movie_titles[x],
                                       'score':total_score[x],
                                       'movie_playCount':movie_playCounts[x],
                                       'movie_account':movie_account}
            )
        # Next page of the listing
        next_pg = response.xpath('//a[@class="page_next"]/@href').extract_first('')
        print(next_pg)
        if next_pg:
            next_url = 'https://v.qq.com/x/list/movie{}'.format(next_pg)
            yield scrapy.Request(url=next_url,
                                 callback=self.detail_parse
                                 )
    def movie_parse(self,response):
       """Parse a movie detail page into a TencentItem.

       The listing-page values (title, score, play count, link, count)
       arrive via response.meta.
       """
       # Synopsis section
       abstract=response.xpath('//div[@class="mod_row_box"]/div[2]/ul/li/div[2]/p/text('
                                ')').extract_first('')
       directors=response.xpath('//div[@class="mod_row_box"]/div[2]/ul/li/div[1]/a//text()').extract()
       director_links = response.xpath('//div[@class="mod_row_box"]/div[2]/ul/li/div[1]/a//@href').extract()
       if directors:# director info present: first entry is the director, the rest are cast
            director=directors[0]
            act=','.join(directors[1:])
            director_link=director_links[0]
            act_link=','.join(director_links[1:])
       else:
           director ='#'
           act = '#'
           director_link = '#'
           act_link = '#'
       # Overview section (passed through from the listing page)
       movie_title=response.meta['movie_title']
       score=response.meta['score']
       movie_playCount=response.meta['movie_playCount']
       movie_account=response.meta['movie_account']
       movie_link=response.meta['movie_link']

       movie=TencentItem()
        # synopsis fields
       movie['abstract']=abstract
       movie['director']=director
       movie['act']=act
       movie['director_link']=director_link
       movie['act_link']=act_link
        # overview fields
       movie['movie_title']=movie_title
       movie['score']=score
       movie['movie_playCount']=movie_playCount
       movie['movie_link']=movie_link
       movie['movie_account']=movie_account
       yield movie
此为主要代码
 
4. items.py
 
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy
import pymysql
from scrapy.conf import settings

class MySQLConnect(scrapy.Item):
    """Helper that runs one SQL statement against the configured MySQL DB.

    NOTE(review): subclassing scrapy.Item is unnecessary for a pure
    helper, but the base class is kept so existing references still work.
    """
    @staticmethod
    def mysqlConnect(sql, params=None):
        """Open a connection, execute *sql*, commit on success.

        sql    -- the SQL statement to execute
        params -- optional parameter tuple; when given, the statement is
                  executed as a parameterized query (preferred for values
                  scraped from the web, to avoid SQL injection)
        """
        # Bug fix: settings.py defines MYSQL_HOST, not MYSQL_HOSTS --
        # the old key silently yielded None and the connection failed.
        host = settings['MYSQL_HOST']
        user = settings['MYSQL_USER']
        psd = settings['MYSQL_PASSWORD']
        db = settings['MYSQL_DB']
        charset = settings['CHARSET']
        port = settings['MYSQL_PORT']
        # Database connection
        con = pymysql.connect(host=host, user=user, passwd=psd, db=db, charset=charset, port=port)
        try:
            # Database cursor
            cur = con.cursor()
            try:
                # pymysql treats args=None exactly like no args, so plain
                # string statements keep working unchanged.
                cur.execute(sql, params)
                print("insert success")  # debug output
            except Exception as e:
                print('Insert error:', e)
                con.rollback()
            else:
                con.commit()
        finally:
            # Always release the connection, even if cursor() itself raises.
            con.close()

class TencentItem(scrapy.Item):
    """Item holding one movie's synopsis and overview data."""

    # --- synopsis section ---
    abstract = scrapy.Field()
    director = scrapy.Field()
    director_link = scrapy.Field()
    act = scrapy.Field()
    act_link = scrapy.Field()

    # --- overview section ---
    movie_title = scrapy.Field()
    score = scrapy.Field()
    movie_playCount = scrapy.Field()
    movie_account = scrapy.Field()
    movie_link = scrapy.Field()

    def insert_data(self, item):
        """Persist this movie into the `abstract` and `overview` tables.

        NOTE(review): values are interpolated straight into the SQL text;
        scraped text containing a quote will break the statement (SQL
        injection risk). Consider a parameterized query.
        """
        abstract_values = (item['movie_title'], item['director'],
                           item['director_link'], item['act'],
                           item['act_link'], item['abstract'])
        sql1 = "insert into abstract(movieTitle,director,directorLink,act,actLink,abstract)values('%s','%s','%s','%s','%s','%s');" % abstract_values
        print('TencentItem insert.....................')
        MySQLConnect.mysqlConnect(sql1)
        overview_values = (item['movie_title'], item['score'],
                           item['movie_playCount'], item['movie_link'])
        sql3 = "insert into overview(movieTitle,score,playCount,link)values('%s','%s','%s','%s');" % overview_values
        MySQLConnect.mysqlConnect(sql3)
class CommentItem(scrapy.Item):
    """Item holding a single user comment on a movie."""

    movie_title = scrapy.Field()
    timeDifference = scrapy.Field()
    content = scrapy.Field()
    up = scrapy.Field()
    rep = scrapy.Field()
    userLink = scrapy.Field()
    userid = scrapy.Field()

    def insert_data(self, item):
        """Persist this comment into the `comment` table.

        NOTE(review): values are interpolated directly into the SQL string
        (injection risk with scraped text), and the column names
        `timeDiffrence`/`ref` mirror the existing table schema -- do not
        "fix" their spelling without also altering the table.
        """
        comment_values = (item['userid'], item['userLink'],
                          item['timeDifference'], item['content'],
                          item['up'], item['rep'], item['movie_title'])
        sql2 = "insert into comment(userID,userLink,timeDiffrence,content,praise,ref,movieTitle)values('%s','%s','%s','%s','%s','%s','%s');" % comment_values
        print('CommentItem insert................')
        MySQLConnect.mysqlConnect(sql2)
具体项目可见码云: https://gitee.com/YunZhiBiDuan3555/TengXunShiPinWangYeBanDianYingXinXiPaQu.git




  • 0
    点赞
  • 9
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值