Target site URL: http://v.hao123.baidu.com
Fields to scrape: movie title, detail-page URL, leading actors, director, rating, country, and plot summary
Database table design
CREATE TABLE movies_info (
    id int(10) unsigned NOT NULL AUTO_INCREMENT,
    movie_title varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,
    movie_title_url varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,
    protagonist varchar(100) DEFAULT NULL,
    movie_score varchar(20) DEFAULT NULL,
    movie_director varchar(100) DEFAULT NULL,
    movie_country varchar(20) DEFAULT NULL,
    movie_content longtext,
    PRIMARY KEY (id)
) ENGINE=InnoDB AUTO_INCREMENT=696 DEFAULT CHARSET=utf8;
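The pipeline below connects to a database named movies, so that database must exist before the first run. A minimal sketch, assuming a local MySQL server (the name has to match the pipeline's connection settings):

CREATE DATABASE IF NOT EXISTS movies DEFAULT CHARSET utf8;
USE movies;
-- then run the CREATE TABLE statement above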
Create the crawler project
scrapy startproject movies_spider
cd movies_spider
scrapy genspider movies v.hao123.baidu.com
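These commands generate the usual Scrapy project skeleton; the files edited in the following steps live here (standard layout, shown for orientation):

movies_spider/
├── scrapy.cfg
└── movies_spider/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── movies.py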
Modify the items file
import scrapy


class MoviesSpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    movie_title = scrapy.Field()      # movie title
    movie_title_url = scrapy.Field()  # detail-page URL
    protagonist = scrapy.Field()      # leading actors
    movie_score = scrapy.Field()      # rating
    movie_director = scrapy.Field()   # director
    movie_country = scrapy.Field()    # country
    movie_content = scrapy.Field()    # plot summary
Create the spider
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
# from urllib import parse
from movies_spider.items import MoviesSpiderItem


class MoviesSpider(scrapy.Spider):
    name = 'movies'
    allowed_domains = ['v.hao123.baidu.com']
    start_urls = ['http://v.hao123.baidu.com/v/search?channel=movie&decade=2018&pn=1']

    def parse(self, response):
        """Collect every detail-page URL on the list page and hand it to the downloader."""
        # Grab the URL of each movie on the current list page
        movies_urls = response.xpath("//ul[@class='wg-c-list clearfix']/li/a/@href").getall()
        for index in movies_urls:
            # Each URL here points at one movie's detail page
            yield Request(url=index, callback=self.parse_detail)

        # Extract the next list page and hand it to Scrapy for download
        next_page = response.xpath("//div[@class='c-pagination clearfix']/a[last()]/@href").get()
        if next_page:
            # If there is a next page, schedule it for download
            yield Request(url='http://v.hao123.baidu.com' + next_page, callback=self.parse)

    def parse_detail(self, response):
        """Parse the data on a movie's detail page."""
        movie_title = response.xpath("//div[@class='items clearfix']/h1/text()").get()  # movie title
        movie_title_url = response.url  # detail-page URL
        protagonist = ','.join(response.xpath("//span[@monkey='actor']//a/text()").getall())  # leading actors
        movie_score = response.xpath("//span[@class='score']/text()").get()  # rating
        movie_director = ','.join(response.xpath("//span[@monkey='director']/a/text()").getall())  # director
        movie_country = ','.join(response.xpath("//span[@monkey='area']/a/text()").getall())  # country
        movie_content = response.xpath("//p[contains(@class,'abstract')]/em/text()").get()  # plot summary

        movies_info = MoviesSpiderItem(
            movie_title=movie_title,
            movie_title_url=movie_title_url,
            protagonist=protagonist,
            movie_score=movie_score,
            movie_director=movie_director,
            movie_country=movie_country,
            movie_content=movie_content,
        )
        yield movies_info
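The XPath expressions above can be sanity-checked before a full run with scrapy shell; a quick interactive session against the same start page the spider uses:

scrapy shell "http://v.hao123.baidu.com/v/search?channel=movie&decade=2018&pn=1"
>>> response.xpath("//ul[@class='wg-c-list clearfix']/li/a/@href").getall()[:3]
>>> response.xpath("//div[@class='c-pagination clearfix']/a[last()]/@href").get()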
Modify the pipelines file
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
import json
import codecs


class MoviesSpiderPipeline(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,  # note: must be an integer
            'user': 'root',
            'password': 'wulinlin',
            'database': 'movies',
            'charset': 'utf8'
        }
        self.conn = pymysql.connect(**dbparams)  # open the database connection
        self.cursor = self.conn.cursor()         # create a cursor
        self._sql = None
        self.file = codecs.open('movies_info.json', 'w', encoding='utf-8')  # open the output file

    def process_item(self, item, spider):
        # Execute the insert statement
        self.cursor.execute(self.sql, (
            item['movie_title'],
            item['movie_title_url'],
            item['protagonist'],
            item['movie_score'],
            item['movie_director'],
            item['movie_country'],
            item['movie_content'])
        )
        self.conn.commit()  # commit the transaction
        # Also write the item to the JSON file
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # Close the file and connection only when the spider finishes,
        # not after the first item
        self.file.close()
        self.conn.close()

    @property
    def sql(self):
        # Cache the statement on self._sql; testing self.sql here would recurse forever
        if not self._sql:
            self._sql = """
                insert into movies_info
                (id, movie_title, movie_title_url, protagonist, movie_score,
                 movie_director, movie_country, movie_content)
                values (null, %s, %s, %s, %s, %s, %s, %s)
            """
        return self._sql


# Save the items to a JSON file
class JsonPipeline(object):
    def __init__(self):
        self.file = codecs.open('movies_info.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()
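After a run, a short pymysql snippet reusing the connection parameters from the pipeline above can confirm that rows actually reached the table; a quick sanity check, not part of the project itself:

import pymysql

# Same connection parameters as MoviesSpiderPipeline above
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='wulinlin', database='movies', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("SELECT COUNT(*) FROM movies_info")
    print(cursor.fetchone()[0])  # number of movies stored so far
conn.close()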
Modify the settings file
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36',
}

ITEM_PIPELINES = {
    'movies_spider.pipelines.MoviesSpiderPipeline': 300,
    # 'movies_spider.pipelines.JsonPipeline': 301,
}

ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
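As an aside, if only the JSON output is needed, Scrapy's built-in feed exports can replace the custom JSON pipeline entirely; the standard -o flag writes items straight to a file:

scrapy crawl movies -o movies_info.json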
Create a launch script
from scrapy import cmdline

cmdline.execute('scrapy crawl movies'.split())
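Save the script as, say, main.py in the project root (the directory containing scrapy.cfg) and start the crawl with python main.py; the filename itself is arbitrary.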