A beginner's learning notes.
Framework diagram source: https://www.cnblogs.com/funsion/p/6854307.html
With the help of Redis, the scrapy-redis framework supports resumable crawling and a shared duplicate filter, so URLs that have already been crawled are never fetched twice; it also allows several processes, or several machines, to crawl at the same time for a distributed crawl. The framework's built-in pipeline can store the scraped data in Redis, or a custom pipeline can store it elsewhere; in this example a custom pipeline writes the data into MySQL.
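With the default scrapy-redis settings used in step 5 below, the request queue, the duplicate-filter fingerprints and the items written by the built-in pipeline live in Redis under the keys jd:requests, jd:dupefilter and jd:items (jd being the spider name; these are the scrapy-redis default key patterns). A minimal sketch for peeking at them with redis-py:
import redis

r = redis.Redis.from_url('redis://192.168.52.10:6379/2')
print(r.zcard('jd:requests'))      # pending requests (priority queue stored as a sorted set)
print(r.scard('jd:dupefilter'))    # fingerprints of requests already seen
print(r.lrange('jd:items', 0, 4))  # first few items written by RedisPipeline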
1. items.py
Define the item fields first.
import scrapy
class JingdongItem(scrapy.Item):
    id = scrapy.Field()          # product id
    name = scrapy.Field()        # product name
    price = scrapy.Field()       # product price
    sell_num = scrapy.Field()    # sales volume
    good_pre = scrapy.Field()    # positive-review rate
    jd_wl = scrapy.Field()       # whether shipped by JD Logistics
    publisher = scrapy.Field()   # publisher
    detail_url = scrapy.Field()  # URL of the book's detail page
2. jd.py
The main spider. Parsing relies mostly on XPath: the search result page is crawled first, then the detail page of each book, so two methods are defined: parse and parse_second. No real sales figure could be found on the page, so the sales number is generated randomly from the review count... lol
import scrapy
import random
from jingdong.items import JingdongItem
# ------ import
from scrapy_redis.spiders import RedisSpider
# ------ inherit from RedisSpider
class JdSpider(RedisSpider):
    page = 1
    name = 'jd'
    redis_key = 'start'
    # ------ commented out: scrapy-redis reads its start URL from Redis instead
    # allowed_domains = ['search.jd.com', 'item.jd.com']
    # start_urls = [
    #     'https://search.jd.com/Search?keyword=%E8%AE%A1%E7%AE%97%E6%9C%BA%E4%B9%A6%E7%B1%8D&suggest=1.his.0.0&wq=%E8%AE%A1%E7%AE%97%E6%9C%BA%E4%B9%A6%E7%B1%8D&pvid=f51170888f154edea5564ee586a28f72&page=1&click=0']
    # ------ add an __init__ method to build allowed_domains dynamically
    def __init__(self, *args, **kwargs):
        domain = kwargs.pop('domain', '')
        self.allowed_domains = filter(None, domain.split(','))
        super(JdSpider, self).__init__(*args, **kwargs)
    def parse(self, response, *args, **kwargs):
        li_list = response.xpath('//div[@id="J_goodsList"]//li')
        for li in li_list:
            price = li.xpath('.//div[@class="p-price"]//i/text()').extract_first()  # product price
            second_url = li.xpath('.//div[@class="p-img"]/a/@href').extract_first()  # detail page url
            second_url = 'https:' + second_url
            # product id; https://item.jd.com/<id>.html opens the product's detail page
            id = second_url.split('/')[3].split('.')[0]
            yield scrapy.Request(url=second_url, callback=self.parse_second, meta={'price': price, 'id': id})
        if self.page < 100:
            self.page += 1
            strs = self.page * 2 - 1  # JD search uses odd page numbers: logical page n maps to page=2n-1
            url = 'https://search.jd.com/Search?keyword=%E8%AE%A1%E7%AE%97%E6%9C%BA%E4%B9%A6%E7%B1%8D&suggest=1.his.0.0&wq=%E8%AE%A1%E7%AE%97%E6%9C%BA%E4%B9%A6%E7%B1%8D&pvid=f51170888f154edea5564ee586a28f72&page=' + str(
                strs) + '&click=0'
            yield scrapy.Request(url=url, callback=self.parse)
    def parse_second(self, response, *args, **kwargs):
        name = response.xpath(
            '//div[@class="crumb-wrap"]/div[@class="w"]//div[@class="item ellipsis"]/text()').extract_first()
        sell_num_str = response.xpath('//div[@id="comment-count"]/a/text()').extract_first()[:-1]
        if '万' == sell_num_str[-1]:
            sell_num_temp = int(sell_num_str[:-1]) * 10000  # sales volume; '万' means x10,000
            sell_num = random.randrange(sell_num_temp, sell_num_temp * 2)
        else:
            sell_num_temp = int(sell_num_str)
            sell_num = random.randrange(sell_num_temp, sell_num_temp * 2)
        if response.xpath('//*[@id="J_LogisticsService"]/div/div[1]'):
            jd_wl = 'YES'
        else:
            jd_wl = 'NO'
        good_pre = response.xpath('//div[@class="percent-con"]/text()').extract_first()
        publisher = response.xpath(
            '//div[@class="p-parameter"]/ul/li[@clstag="shangpin|keycount|product|chubanshe_3"]/a/text()').extract_first()
        price = response.meta['price']
        id = response.meta['id']
        detail_url = 'https://item.jd.com/' + id + '.html'
        yield JingdongItem(id=id, name=name, price=price, good_pre=good_pre, jd_wl=jd_wl, sell_num=sell_num,
                           publisher=publisher, detail_url=detail_url
                           )
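Because JdSpider inherits from RedisSpider and defines no start_urls, it starts idle and waits for a URL to appear under its redis_key. A minimal sketch for seeding that key with redis-py (redis-cli LPUSH works just as well); the URL is the commented-out start URL from above and the Redis address is the REDIS_URL configured in settings.py:
import redis

r = redis.Redis.from_url('redis://192.168.52.10:6379/2')
# push the first search page into the 'start' list; the idle spider picks it up
r.lpush('start', 'https://search.jd.com/Search?keyword=%E8%AE%A1%E7%AE%97%E6%9C%BA%E4%B9%A6%E7%B1%8D&suggest=1.his.0.0&wq=%E8%AE%A1%E7%AE%97%E6%9C%BA%E4%B9%A6%E7%B1%8D&pvid=f51170888f154edea5564ee586a28f72&page=1&click=0')
After seeding, run scrapy crawl jd in one or several processes (or on several machines); they all pull requests from the same Redis queue, which is what makes the crawl distributed.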
3. middlewares.py
Two downloader middlewares. The request header only gets a random User-Agent; it turns out the pages can be crawled even without a cookie. Packet capture also shows that one request to a JD search page returns only 30 items, and scrolling to the bottom loads another 30, so a headless Selenium driver performs the scroll and the scrolled page is returned directly as the response. (After running for a while the crawler usually hits human verification; an IP pool can work around that, see the sketch after the middleware code.)
import time
import random
from selenium import webdriver
from scrapy.http import HtmlResponse
from jingdong.settings import USER_AGENT_LIST
from selenium.webdriver.chrome.options import Options
# random User-Agent
class UAMiddleware:
    def process_request(self, request, spider):
        ua = random.choice(USER_AGENT_LIST)
        request.headers['User-Agent'] = ua

def share_browser():
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    path = r'C:\Program Files\Google\Chrome\Application\chrome.exe'
    chrome_options.binary_location = path
    browser = webdriver.Chrome(options=chrome_options)
    return browser
# render the page with headless Chrome and scroll so all 60 items load
class SeleniumMiddleware:
    def process_request(self, request, spider):
        url = request.url
        browser = share_browser()
        js_bottom = 'document.documentElement.scrollTop=100000'
        browser.get(url)
        browser.execute_script(js_bottom)
        time.sleep(1)
        browser.execute_script(js_bottom)
        time.sleep(1)
        body = browser.page_source
        response = HtmlResponse(url=url, body=body, request=request, encoding='utf-8')
        browser.close()
        return response
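As mentioned above, rotating proxies is one way to push back the human-verification page. A minimal sketch of such a middleware for middlewares.py (the module already imports random); PROXY_LIST and its addresses are assumptions, not part of the original project, and the class would also need an entry in DOWNLOADER_MIDDLEWARES:
# hypothetical proxy-pool middleware: attach a random proxy to each request that
# goes through Scrapy's own downloader. PROXY_LIST is an assumed setting such as
# PROXY_LIST = ['http://123.123.123.123:8080', 'http://111.111.111.111:3128']
class ProxyMiddleware:
    def process_request(self, request, spider):
        proxy_list = spider.settings.getlist('PROXY_LIST')
        if proxy_list:
            request.meta['proxy'] = random.choice(proxy_list)
Note that pages rendered by SeleniumMiddleware never reach Scrapy's downloader, so for those requests the proxy would instead have to be handed to Chrome, for example via the --proxy-server argument.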
4. pipelines.py
Save the items to MySQL.
from jingdong.settings import DB
import pymysql
class DBPipline:
    def open_spider(self, spider):
        self.host = DB['HOST']
        self.port = DB['PORT']
        self.user = DB['USER']
        self.password = DB['PASSWORD']
        self.name = DB['NAME']
        self.charset = DB['CHARSET']
        self.connect()

    def connect(self):
        self.conn = pymysql.connect(host=self.host,
                                    port=self.port,
                                    user=self.user,
                                    password=self.password,
                                    db=self.name,
                                    charset=self.charset)
        self.cursor = self.conn.cursor()
    def process_item(self, item, spider):
        # parameterized query so quotes in book titles cannot break the SQL
        sql = ('insert into it_book(id_book,name,price,sell_num,good_pre,jd_wl,publisher,detail_url) '
               'values(%s,%s,%s,%s,%s,%s,%s,%s)')
        self.cursor.execute(sql, (
            item['id'],
            item['name'],
            item['price'],
            item['sell_num'],
            item['good_pre'],
            item['jd_wl'],
            item['publisher'],
            item['detail_url']
        ))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
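The INSERT above assumes that the it_book table already exists in the scrapy_test database. A minimal one-off sketch for creating it with pymysql; the column names match the pipeline, but the column types are assumptions:
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='123456', db='scrapy_test', charset='utf8')
with conn.cursor() as cursor:
    # column types are guesses based on the item fields; adjust as needed
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS it_book (
            id_book    VARCHAR(32),
            name       VARCHAR(255),
            price      VARCHAR(32),
            sell_num   INT,
            good_pre   VARCHAR(32),
            jd_wl      VARCHAR(8),
            publisher  VARCHAR(128),
            detail_url VARCHAR(255)
        ) DEFAULT CHARSET = utf8
    """)
conn.commit()
conn.close()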
5. settings.py
The main settings: set the MySQL and Redis addresses (start both services, and create the database and the matching table in MySQL, as sketched above), switch the duplicate filter and the scheduler to the scrapy-redis modules, enable resumable crawling, and register the pipelines and middlewares.
# Scrapy settings for jingdong project
BOT_NAME = 'jingdong'
SPIDER_MODULES = ['jingdong.spiders']
NEWSPIDER_MODULE = 'jingdong.spiders'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.70'
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"  # use the scrapy-redis duplicate filter
SCHEDULER = "scrapy_redis.scheduler.Scheduler"  # use the scrapy-redis scheduler
SCHEDULER_PERSIST = True  # resumable crawling: keep the queue and fingerprints in Redis instead of clearing them
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
# SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"
ITEM_PIPELINES = {
    'jingdong.pipelines.DBPipline': 500,
    'scrapy_redis.pipelines.RedisPipeline': 501,  # also writes the scraped items into Redis; a custom pipeline can target another database or a file instead
}
# Redis connection: 6379 is the Redis port and the trailing /2 selects Redis database number 2
REDIS_URL = "redis://192.168.52.10:6379/2"
LOG_LEVEL = 'DEBUG'
# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# SPIDER_MIDDLEWARES = {
#
# }
DOWNLOADER_MIDDLEWARES = {
'jingdong.middlewares.UAMiddleware': 401,
'jingdong.middlewares.SeleniumMiddleware': 402
}
REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
USER_AGENT_LIST = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
"Opera/8.0 (Windows NT 5.1; U; en)",
]
DB = {
'HOST': '127.0.0.1',
'PORT': 3306,
'USER': 'root',
'PASSWORD': '123456',
'NAME': 'scrapy_test',
'CHARSET': 'utf8'
}
DOWNLOAD_DELAY = 1  # send one request per second
The results, viewed in Navicat: