# The code below contains detailed comments.
# -*- coding: utf-8 -*-
import scrapy
from movie.items import MovieItem
class AllMovieSpider(scrapy.Spider):
    """Crawl movie names from the first five list pages of 4567tv.tv and
    each movie's description from its detail page.

    Yields ``MovieItem`` objects with ``movie_name`` (from the list page)
    and ``detail`` (from the detail page).
    """
    name = 'all_movie'
    # allowed_domains = ['4567tv.tv']
    start_urls = ['https://www.4567tv.tv/index.php/vod/show/id/1/page/1.html']
    domain = "https://www.4567tv.tv"
    # Generic URL template, valid for page 2 and onward.
    url = "https://www.4567tv.tv/index.php/vod/show/id/1/page/%d.html"
    # Next list page to request (page 1 comes from start_urls).
    page = 2

    def parse(self, response):
        """Parse one list page: request every movie's detail page, then
        paginate through pages 2..5."""
        all_li = response.xpath("//div[@class='stui-pannel_bd']//ul//li")
        for li in all_li:
            movie_name = li.xpath(".//h4/a/text()").get()
            movie_url = self.domain + li.xpath(".//h4/a/@href").get()
            item = MovieItem()
            item['movie_name'] = movie_name
            # Request the detail page; pass the partially-filled item to the
            # callback through meta (the callback reads response.meta['item']).
            yield scrapy.Request(url=movie_url, callback=self.detail_parse,
                                 meta={'item': item})
        # BUG FIX: the original condition was inverted (`if page <= 5: return`
        # stopped pagination immediately) and the follow-up request used
        # `callback=self.page`, which is an int, not a callable. Paginate
        # while page <= 5 and call self.parse on the next list page.
        if self.page <= 5:
            new_url = self.url % self.page
            self.page += 1
            yield scrapy.Request(url=new_url, callback=self.parse)

    def detail_parse(self, response):
        """Parse a detail page: fill in the description and yield the item."""
        # Recover the item passed from parse() via the request's meta dict.
        item = response.meta['item']
        detail = response.xpath("//div[@class='stui-content__detail']//p[@class='desc detail hidden-xs']"
                                "//span[@class='detail-content']/text()").get()
        item['detail'] = detail
        yield item
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
class MoviePipeline(object):
def __init__(self):
self.conn = pymysql.Connect(host='127.0.0.1',
port=3306,
user='root',
password='root',
db='yju')
self.cursor = self.conn.cursor()
def open_spider(self, spider):
print("爬虫开始了、、、")
def process_item(self, item, spider):
try:
sql = "insert into movie(movie_name, detail) values ('%s', '%s')" % (item['movie_name'], item['detail'])
self.cursor.execute(sql)
self.conn.commit()
except Exception as e:
print("插入数据出现错误")
print("!" * 50)
print(e)
self.conn.rollback()
return item
def close_spider(self, spider):
print("爬虫结束了、、、")