Scraping movie information from hao123

Target site URL: http://v.hao123.baidu.com

Fields to scrape: movie title, detail-page URL, leading actors, director, rating, country, synopsis

Database table design

CREATE TABLE movies_info (
  id int(10) unsigned NOT NULL AUTO_INCREMENT,
  movie_title varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,
  movie_title_url varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,
  protagonist varchar(100) DEFAULT NULL,
  movie_score varchar(20) DEFAULT NULL,
  movie_director varchar(100) DEFAULT NULL,
  movie_country varchar(20) DEFAULT NULL,
  movie_content longtext,
  PRIMARY KEY (id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
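The pipeline later connects to a database named movies, so that database has to exist before the table above is created. A minimal sketch with pymysql, assuming MySQL runs locally with the same credentials the pipeline uses further down:

import pymysql

# Sketch only: create the `movies` database, then run the CREATE TABLE statement above inside it.
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='wulinlin', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("CREATE DATABASE IF NOT EXISTS movies DEFAULT CHARACTER SET utf8")
conn.close()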

Create the spider project

scrapy startproject movies_spider

cd movies_spider

scrapy genspider movies v.hao123.baidu.com
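scrapy genspider generates the spider template at movies_spider/spiders/movies.py; the items.py, pipelines.py, and settings.py files edited below all live inside the inner movies_spider package.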

Edit the items file

import scrapy


class MoviesSpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    movie_title = scrapy.Field()      # movie title
    movie_title_url = scrapy.Field()  # detail-page URL
    protagonist = scrapy.Field()      # leading actors
    movie_score = scrapy.Field()      # rating
    movie_director = scrapy.Field()   # director
    movie_country = scrapy.Field()    # country
    movie_content = scrapy.Field()    # synopsis
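A Scrapy Item behaves like a dict, which is exactly what the pipeline relies on later (item['movie_title'], dict(item)). A quick illustration with made-up values:

from movies_spider.items import MoviesSpiderItem

# Fields can be set by keyword or by key; dict(item) is what the JSON pipeline serializes later.
item = MoviesSpiderItem(movie_title='example title', movie_score='8.0')
item['movie_country'] = 'example country'
print(dict(item))  # a plain dict containing the three fields set above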

Create the spider

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request

from movies_spider.items import MoviesSpiderItem


class MoviesSpider(scrapy.Spider):
    name = 'movies'
    allowed_domains = ['v.hao123.baidu.com']
    start_urls = ['http://v.hao123.baidu.com/v/search?channel=movie&decade=2018&pn=1']

    def parse(self, response):
        """Collect every link on the list page and hand it to the downloader."""
        # Each href is the URL of one movie's detail page
        movies_urls = response.xpath("//ul[@class='wg-c-list clearfix']/li/a/@href").getall()
        for url in movies_urls:
            # urljoin handles both relative and absolute hrefs
            yield Request(url=response.urljoin(url), callback=self.parse_detail)

        # Extract the next-page link and hand it back to parse()
        next_href = response.xpath("//div[@class='c-pagination clearfix']/a[last()]/@href").get()
        if next_href:
            # Only schedule the next page if the link actually exists
            yield Request(url=response.urljoin(next_href), callback=self.parse)

    def parse_detail(self, response):
        # Parse the fields on the detail page
        movie_title = response.xpath("//div[@class='items clearfix']/h1/text()").get()  # title
        movie_title_url = response.url  # detail-page URL
        protagonist = ','.join(response.xpath("//span[@monkey='actor']//a/text()").getall())  # leading actors
        movie_score = response.xpath("//span[@class='score']/text()").get()  # rating
        movie_director = ','.join(response.xpath("//span[@monkey='director']/a/text()").getall())  # director
        movie_country = ','.join(response.xpath("//span[@monkey='area']/a/text()").getall())  # country
        movie_content = response.xpath("//p[contains(@class,'abstract')]/em/text()").get()  # synopsis
        movies_info = MoviesSpiderItem(
            movie_title=movie_title,
            movie_title_url=movie_title_url,
            protagonist=protagonist,
            movie_score=movie_score,
            movie_director=movie_director,
            movie_country=movie_country,
            movie_content=movie_content
        )
        yield movies_info
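The XPath expressions above depend on the current page markup, so it is worth checking them interactively before a full crawl. One way is to run scrapy shell against a detail page (the URL is whatever detail link the list page yields) and evaluate the same selectors used in parse_detail():

# Typed inside `scrapy shell <detail-page url>`, where `response` is already provided:
response.xpath("//div[@class='items clearfix']/h1/text()").get()        # title
response.xpath("//span[@monkey='actor']//a/text()").getall()            # leading actors
response.xpath("//span[@class='score']/text()").get()                   # rating
response.xpath("//p[contains(@class,'abstract')]/em/text()").get()      # synopsis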

Edit the pipelines file

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import codecs
import json

import pymysql


class MoviesSpiderPipeline(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,  # must be an int, not a string
            'user': 'root',
            'password': 'wulinlin',
            'database': 'movies',
            'charset': 'utf8'
        }
        self.conn = pymysql.connect(**dbparams)  # open the database connection
        self.cursor = self.conn.cursor()         # create a cursor
        self._sql = None
        self.file = codecs.open('movies_info.json', 'w', encoding='utf-8')  # open the JSON file

    def process_item(self, item, spider):
        # Run the INSERT statement
        self.cursor.execute(self.sql, (
            item['movie_title'], item['movie_title_url'],
            item['protagonist'], item['movie_score'],
            item['movie_director'], item['movie_country'],
            item['movie_content'])
        )
        self.conn.commit()  # commit the transaction
        # Also append the item to the JSON file
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # Close the file and the database connection once the spider finishes
        self.file.close()
        self.cursor.close()
        self.conn.close()

    @property
    def sql(self):
        if not self._sql:
            self._sql = """
            insert into movies_info
            (id, movie_title, movie_title_url, protagonist, movie_score, movie_director, movie_country, movie_content)
            values (null, %s, %s, %s, %s, %s, %s, %s)
            """
        return self._sql
# Pipeline that saves items to a JSON file
class JsonPipeline(object):

    def __init__(self):
        self.file = codecs.open('movies_info.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()
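Hard-coding the connection parameters in __init__ works, but a pipeline can also read them from settings.py through from_crawler(). A minimal sketch, assuming MYSQL_* keys are added to settings.py (those key names are my own, not Scrapy built-ins), placed in the same pipelines.py so the pymysql import above is reused:

# Sketch only (not from the original post): same INSERT as MoviesSpiderPipeline,
# but the connection parameters come from settings.py via from_crawler().
class MysqlFromSettingsPipeline(object):

    def __init__(self, host, user, password, database):
        self.conn = pymysql.connect(host=host, user=user, password=password,
                                    database=database, charset='utf8')
        self.cursor = self.conn.cursor()

    @classmethod
    def from_crawler(cls, crawler):
        s = crawler.settings
        return cls(
            host=s.get('MYSQL_HOST', '127.0.0.1'),
            user=s.get('MYSQL_USER', 'root'),
            password=s.get('MYSQL_PASSWORD', ''),
            database=s.get('MYSQL_DATABASE', 'movies'),
        )

    def process_item(self, item, spider):
        self.cursor.execute(
            "insert into movies_info (id, movie_title, movie_title_url, protagonist, movie_score,"
            " movie_director, movie_country, movie_content) values (null, %s, %s, %s, %s, %s, %s, %s)",
            (item['movie_title'], item['movie_title_url'], item['protagonist'],
             item['movie_score'], item['movie_director'], item['movie_country'],
             item['movie_content']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()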

Edit the settings file

DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36',
}

ITEM_PIPELINES = {
   'movies_spider.pipelines.MoviesSpiderPipeline': 300,
   # 'movies_spider.pipelines.JsonPipeline': 301,
}

ROBOTSTXT_OBEY = False

DOWNLOAD_DELAY = 3
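If the from_crawler sketch from the pipelines section is used, the matching keys (again, self-chosen names, not Scrapy built-ins) would also be added here:

# Read only by the MysqlFromSettingsPipeline sketch above
MYSQL_HOST = '127.0.0.1'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'wulinlin'
MYSQL_DATABASE = 'movies'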

 

Create a launch script

from scrapy import cmdline
cmdline.execute('scrapy crawl movies'.split())
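Save the script (for example as main.py) in the project root next to scrapy.cfg and run it with python main.py; it is equivalent to running scrapy crawl movies on the command line.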

 
