First, create the project.
Open a cmd window in the directory that will hold the project and run:
scrapy startproject dytt
Then go into the project and generate the spider:
scrapy genspider dy dytt8.net
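Since dy.py below is written as a CrawlSpider, the crawl template can optionally generate matching boilerplate up front (the plain command above works too; the class just has to be edited by hand):

scrapy genspider -t crawl dy dytt8.net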
Edit settings.py:
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 1
ITEM_PIPELINES = {
    'dytt.pipelines.DyttPipeline': 300,
}
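These settings set a real browser User-Agent, disable robots.txt checking, throttle requests to one per second, and register the pipeline written below. For a quick test run without Redis, Scrapy's built-in exporter can also dump the scraped items to a file straight from the command line (the filename here is just an example):

scrapy crawl dy -o movies.json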
Edit items.py and add the fields:
import scrapy

class DyttItem(scrapy.Item):
    name = scrapy.Field()      # movie title
    date = scrapy.Field()      # release date shown on the listing page
    haibaio = scrapy.Field()   # poster image URL
    content = scrapy.Field()   # description text from the detail page
    zhongzi = scrapy.Field()   # download (torrent) link
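Worth noting: the spider below fills a plain dict, so these field definitions are never actually enforced. To use them, the item class would be imported and instantiated in dy.py instead; a minimal sketch:

from dytt.items import DyttItem

item = DyttItem()            # behaves like a dict, but rejects undeclared keys
item['name'] = 'some title'  # only fields declared on DyttItem are accepted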
Edit dy.py:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class DySpider(CrawlSpider):
    name = 'dy'
    allowed_domains = ['dytt8.net']
    # Start page URL
    start_urls = ['https://www.dytt8.net/html/gndy/dyzz/list_23_1.html']
    # Find the pagination links with a regex, bs4, or XPath (with a regex, match the href)
    rules = (
        Rule(LinkExtractor(restrict_xpaths="//div[@class='x']//a"),
             callback='parse_item', follow=True),
    )
    # Callback for every listing page the rule matches
    def parse_item(self, response):
        # XPath to the block holding all the data needed from the listing page
        movielist = response.xpath("//div[@class='co_content8']//table")
        # Walk each record and store the wanted fields in a dict
        for movie in movielist:
            item = {}
            item['name'] = movie.xpath(".//tr[2]/td[2]//a/text()").extract_first()
            item['date'] = movie.xpath(".//tr[3]/td[2]//font/text()").extract_first()
            # print(item)
            # Find the detail-page link and prepend the domain by hand
            next_url = "https://www.dytt8.net" + movie.xpath(".//tr[2]/td[2]//a/@href").extract_first()
            # print(next_url)
            # Pass the item along and set a callback to handle the detail page
            yield scrapy.Request(url=next_url, callback=self.parse_info, meta={'item': item})

    def parse_info(self, response):
        # Pull out the dict passed in through meta
        item = response.meta["item"]
        item["haibaio"] = response.xpath("//div[@id='Zoom']//p[1]//img[1]/@src").extract_first()
        item["content"] = response.xpath("//div[@id='Zoom']//p[1]/text()").extract()
        item["zhongzi"] = response.xpath("//div[@id='Zoom']//table//a/text()").extract_first()
        # print(item["content"])
        # print(item)
        # Hand the item to the pipeline so it gets saved
        yield item
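Passing data through meta works, but Scrapy 1.7 and later recommend cb_kwargs for this; a minimal sketch of how the hand-off above would change (same names, abbreviated bodies):

            # in parse_item: pass the dict as a keyword argument instead of via meta
            yield scrapy.Request(url=next_url, callback=self.parse_info,
                                 cb_kwargs={'item': item})

    # parse_info then receives it as a normal parameter; no response.meta lookup
    def parse_info(self, response, item):
        item["zhongzi"] = response.xpath("//div[@id='Zoom']//table//a/text()").extract_first()
        yield item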
Edit pipelines.py:
# Import the json module
import json
# Import the redis module
import redis
class DyttPipeline(object):
    # Open the Redis connection when the spider starts
    def open_spider(self, spider):
        self.rds = redis.StrictRedis(host='www.fanjianbo.com', port=6379, db=14)
    # Save each item; note it must be converted to a dict before serializing
    def process_item(self, item, spider):
        print(item)
        self.rds.lpush('stxiang', json.dumps(dict(item)))
        return item
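Finally, start the crawl from the project root with scrapy crawl dy. Once it has run, the saved records can be checked straight from Redis; a quick sketch reusing the pipeline's connection parameters:

import json
import redis

rds = redis.StrictRedis(host='www.fanjianbo.com', port=6379, db=14)
print(rds.llen('stxiang'))      # number of movies saved so far
raw = rds.lindex('stxiang', 0)  # peek at the newest record (lpush adds at the head)
if raw:
    print(json.loads(raw))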