Scrapy architecture diagram
First, you need to understand how Scrapy works. I won't explain that here; there are plenty of resources online that cover it.
Declaring item objects
Once you have decided which fields need to be scraped, define them in items.py:
import scrapy


class IwatchImgItem(scrapy.Item):
    """
    Item definition for downloading the product images.
    In order: image URLs, image objects, saved image paths, and the id returned after the insert.
    """
    image_urls = scrapy.Field()
    images = scrapy.Field()
    image_paths = scrapy.Field()
    id = scrapy.Field()


class IwatchBrandItem(IwatchImgItem):
    """
    Brand field definitions.
    In order: brand Chinese name, brand English name, brand URL, brand image URL,
    brand trademark image URL, brand introduction.
    """
    brand_chname = scrapy.Field()
    brand_enname = scrapy.Field()
    brand_url = scrapy.Field()
    brand_img_url = scrapy.Field()
    brand_trademark_url = scrapy.Field()
    brand_introduction = scrapy.Field()


class IwatchSerieItem(IwatchImgItem):
    """
    Field definitions for a brand's series.
    In order: brand name, series name, series URL, series introduction, series image URL.
    """
    brand_chname = scrapy.Field()
    serie_name = scrapy.Field()
    serie_url = scrapy.Field()
    serie_introduction = scrapy.Field()
    serie_img_url = scrapy.Field()


class IwatchwatchItem(IwatchImgItem):
    """
    Field definitions for the watches (products) within a series.
    In order: watch name, watch URL, watch image URL, price in USD, price in HKD, price in EUR,
    basic info, movement, exterior, functions.
    """
    iwatch_name = scrapy.Field()
    iwatch_url = scrapy.Field()
    iwatch_img_url = scrapy.Field()
    iwatch_price_yuan = scrapy.Field()
    iwatch_price_hkd = scrapy.Field()
    iwatch_price_euro = scrapy.Field()
    iwatch_base = scrapy.Field()
    iwatch_mechanism = scrapy.Field()
    iwatch_exterior = scrapy.Field()
    iwatch_fuction = scrapy.Field()
There are four item classes here. IwatchBrandItem, IwatchSerieItem and IwatchwatchItem all inherit from IwatchImgItem, because all three need the project's images downloaded and pushed to OSS.
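The image_urls / images / image_paths fields in IwatchImgItem line up with what Scrapy's built-in ImagesPipeline expects. A minimal settings.py sketch for enabling image downloads would look roughly like this (the store path below is a placeholder, not from the original project; the OSS upload itself is handled later by a custom pipeline):

# settings.py (sketch; values are placeholders)
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,  # built-in image downloading pipeline
}
IMAGES_STORE = '/tmp/iwatch_images'     # where downloaded images are written locally
IMAGES_URLS_FIELD = 'image_urls'        # item field holding the image URLs (Scrapy default)
IMAGES_RESULT_FIELD = 'images'          # item field receiving the download results (Scrapy default)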
Writing the spider
import scrapy
from spider_iwatch.items import IwatchBrandItem, IwatchSerieItem, IwatchwatchItem


class IwatchSpider(scrapy.Spider):
    name = 'iwatch'  # spider name, must be unique
    allowed_domains = ['']  # domains the spider is allowed to crawl
    start_urls = ['']  # the start URL; removed here for confidentiality

    def parse(self, response):
        div_list = response.xpath(".//div[@class='item_one']")
        for div in div_list:
            li_list = div.xpath(".//div[@class='list_main']//li")
            for li in li_list:
                # parsing details omitted
                brandItem = IwatchBrandItem()
                follow_serie_url = li.xpath(".//div[@class='item_btn']/a/@href").extract_first()
                yield scrapy.Request(follow_serie_url, callback=self.parse_serie,
                                     meta={'brand_chname': brand_chname}, priority=30)
                yield brandItem

    def parse_serie(self, response):
        # parsing details omitted
        a_list = response.xpath(".//div[@class='c_wc_list']//a")
        for a in a_list:
            serie_url = a.xpath("./@href").extract_first().replace(".html", "_1.html")
            yield scrapy.Request(serie_url, callback=self.parse_iwatch_page_first)
            yield serieitem

    def parse_iwatch_page_first(self, response):
        li_list = response.xpath(".//div[@class='w_class_list']//li")
        for li in li_list:
            iwatch_url = li.xpath("./a/@href").extract_first()
            yield scrapy.Request(iwatch_url, callback=self.parse_iwatch)
        try:
            page = response.xpath(".//div[@class='manu']/a/text()")[-2].extract()
        except:
            page = 1
        for i in range(1, int(page)):
            next_link = response.url.replace("_1.html", "_{}.html".format(str(i + 1)))
            yield scrapy.Request(next_link, callback=self.parse_iwatch_page_next)

    def parse_iwatch_page_next(self, response):
        li_list = response.xpath(".//div[@class='w_class_list']//li")
        for li in li_list:
            iwatch_url = li.xpath("./a/@href").extract_first()
            yield scrapy.Request(iwatch_url, callback=self.parse_iwatch)

    def parse_iwatch(self, response):
        watchitem = IwatchwatchItem()
        # parsing details omitted
        yield watchitem
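One detail worth calling out from the spider above: parse() attaches the brand name to the follow-up request through meta, and the callback reads it back from response.meta. A minimal, self-contained sketch of that pattern (all names and URLs here are illustrative, not taken from the project):

import scrapy

class MetaDemoSpider(scrapy.Spider):
    # Illustrative only: shows the meta hand-off used between parse() and parse_serie() above.
    name = 'meta_demo'
    start_urls = ['http://example.com/']

    def parse(self, response):
        # Attach a value to the next request; priority bumps it up in the scheduler queue.
        yield scrapy.Request('http://example.com/next',
                             callback=self.parse_next,
                             meta={'brand_chname': 'some-brand'},
                             priority=30)

    def parse_next(self, response):
        # The meta dict travels with the request and is exposed again on the response.
        brand_chname = response.meta['brand_chname']
        self.logger.info('got brand: %s', brand_chname)

The spider itself is started with the usual command, using the name attribute: scrapy crawl iwatch.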
Processing the data in the pipeline
pipeline.py code:
import hashlib, logging
import os, oss2, requests
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from spider_iwatch.items import IwatchBrandItem, IwatchSerieItem, IwatchwatchItem
from scrapy.exceptions import DropItem, NotConfigured
from functools import reduce
from spider_iwatch.common.iwatch_sql import DBHelper
from spider_iwatch.common import config
from spider_iwatch.utils.redis_filter.request_filter import RequestFilter


class IwatchDataMysqlPipeline(object):
    def __init__(self):
        self.rf = RequestFilter()

    def process_item(self, item, spider):
        # Save brand information
        if isinstance(item, IwatchBrandItem):
            brand_id = DBHelper().get_brand_id(item['brand_chname'])
            if brand_id == False:
                image_paths = reduce(lambda x, y: x + ';' + y, item['image_paths'])