1. Environment Setup
Python 3.8.3
PyCharm
Third-party packages required by the project:
pip install scrapy fake-useragent requests selenium virtualenv -i https://pypi.douban.com/simple
1.1 Create a virtual environment
Switch to the target directory and create the environment:
virtualenv .venv
Remember to activate the virtual environment after creating it.
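The activation command depends on the operating system:
On Windows: .venv\Scripts\activate
On Linux/macOS: source .venv/bin/activate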
1.2 Create the project
scrapy startproject <project_name>
1.3 Open the project in PyCharm and set the virtual environment created above as the project interpreter.
1.4 Create the JD spider
scrapy genspider <spider_name> <url>
1.5 Edit allowed_domains in the generated spider: Scrapy expects bare domain names there, so remove the https:// prefix.
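Judging by the imports and spider code below, the concrete commands for this project would have looked roughly like:
scrapy startproject lianjia
scrapy genspider ji_computer_detail search.jd.com
with allowed_domains then edited to ['search.jd.com', 'item.jd.com'].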
2. Problem Analysis
The crawl runs in two stages: first collect the basic information from the search results page, then follow each product link and scrape the detailed information from its detail page. When crawling JD directly, only 40 records come back per page, because the rest of the list is lazy-loaded as the page scrolls; to work around this, a Selenium-based downloader middleware is written inside the Scrapy framework so that the fully rendered page, with all of its items, is returned.
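A minimal sketch of what such a middleware can look like, assuming Chrome with a matching chromedriver on the PATH; the class name follows the jdDownloaderMiddleware entry registered in the settings below, but the author's actual implementation may differ in its waits and selectors:

import time

from scrapy.http import HtmlResponse
from selenium import webdriver


class jdDownloaderMiddleware:
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')  # render without opening a browser window
        self.driver = webdriver.Chrome(options=options)

    def process_request(self, request, spider):
        # Only render search pages with Selenium; detail pages go through
        # Scrapy's normal downloader.
        if 'search.jd.com' not in request.url:
            return None
        self.driver.get(request.url)
        # Scroll to the bottom so the lazy-loaded items are fetched.
        self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(2)  # give the AJAX requests time to finish
        # Returning a response short-circuits the download and hands the
        # rendered HTML straight to the spider.
        return HtmlResponse(url=request.url, body=self.driver.page_source,
                            encoding='utf-8', request=request)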
The fields to scrape are:
Product price
Number of product reviews
Product shop
Product SKU (which can be searched on JD directly to locate the product)
Product title
Product details
3. Spider
import re

import scrapy

from lianjia.items import jd_detailItem


class JiComputerDetailSpider(scrapy.Spider):
    name = 'ji_computer_detail'
    allowed_domains = ['search.jd.com', 'item.jd.com']
    # The keyword parameter is the URL-encoded search term "笔记本电脑" (laptop).
    start_urls = [
        'https://search.jd.com/Search?keyword=%E7%AC%94%E8%AE%B0%E6%9C%AC%E7%94%B5%E8%84%91&suggest=1.def.0.base&wq=%E7%AC%94%E8%AE%B0%E6%9C%AC%E7%94%B5%E8%84%91&page=1&s=1&click=0']

    def parse(self, response):
        # Every <li> in the result list is one product card.
        lls = response.xpath('//ul[@class="gl-warp clearfix"]/li')
        for ll in lls:
            item = jd_detailItem()
            computer_price = ll.xpath('.//div[@class="p-price"]/strong/i/text()').extract_first()
            computer_commit = ll.xpath('.//div[@class="p-commit"]/strong/a/text()').extract_first()
            computer_p_shop = ll.xpath('.//div[@class="p-shop"]/span/a/text()').extract_first()
            item['computer_price'] = computer_price
            item['computer_commit'] = computer_commit
            item['computer_p_shop'] = computer_p_shop
            # Hand the partially filled item to the detail-page callback via meta.
            meta = {
                'item': item
            }
            shop_detail_url = ll.xpath('.//div[@class="p-img"]/a/@href').extract_first()
            shop_detail_url = 'https:' + shop_detail_url  # hrefs are protocol-relative
            yield scrapy.Request(url=shop_detail_url, callback=self.detail_parse, meta=meta)
        # Queue the remaining result pages; Scrapy's duplicate filter drops the
        # repeats that are re-yielded when parse handles those pages in turn.
        for i in range(2, 200, 2):
            next_page_url = f'https://search.jd.com/Search?keyword=%E7%AC%94%E8%AE%B0%E6%9C%AC%E7%94%B5%E8%84%91&suggest=1.def.0.base&wq=%E7%AC%94%E8%AE%B0%E6%9C%AC%E7%94%B5%E8%84%91&page={i}&s=116&click=0'
            yield scrapy.Request(url=next_page_url, callback=self.parse)

    def detail_parse(self, response):
        item = response.meta.get('item')
        computer_sku = response.xpath('//a[@class="notice J-notify-sale"]/@data-sku').extract_first()
        item['computer_sku'] = computer_sku
        # Guard against a missing node, then strip all internal whitespace.
        computer_title = (response.xpath('//div[@class="sku-name"]/text()').extract_first() or '').strip()
        computer_title = ''.join(re.findall(r'\S', computer_title))
        item['computer_title'] = computer_title
        computer_detail = (response.xpath('string(//ul[@class="parameter2 p-parameter-list"])').extract_first() or '').strip()
        computer_detail = ''.join(re.findall(r'\S', computer_detail))
        item['computer_detail'] = computer_detail
        yield item
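Once the item, middleware, and pipeline pieces below are in place, the spider is run from the project root in the usual way:
scrapy crawl ji_computer_detail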
4. Item
import scrapy


class jd_detailItem(scrapy.Item):
    # define the fields for your item here like:
    computer_sku = scrapy.Field()
    computer_price = scrapy.Field()
    computer_title = scrapy.Field()
    computer_commit = scrapy.Field()
    computer_p_shop = scrapy.Field()
    computer_detail = scrapy.Field()
5. Settings
import random

from fake_useragent import UserAgent

ua = UserAgent()
# fake-useragent picks one random User-Agent when the crawler starts.
USER_AGENT = ua.random
ROBOTSTXT_OBEY = False
# random.uniform is evaluated once at import time, so every request uses the
# same fixed delay somewhere between 0.5 and 1 second.
DOWNLOAD_DELAY = random.uniform(0.5, 1)
DOWNLOADER_MIDDLEWARES = {
    'lianjia.middlewares.jdDownloaderMiddleware': 543
}
ITEM_PIPELINES = {
    'lianjia.pipelines.jd_csv_Pipeline': 300
}
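Note that USER_AGENT = ua.random fixes a single User-Agent for the whole crawl. If per-request rotation is wanted instead, a small downloader middleware along these lines would do it; RandomUserAgentMiddleware is a hypothetical name, not part of this project:

from fake_useragent import UserAgent


class RandomUserAgentMiddleware:
    def __init__(self):
        self.ua = UserAgent()

    def process_request(self, request, spider):
        # Overwrite the User-Agent header with a fresh random value per request.
        request.headers['User-Agent'] = self.ua.random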
6. Pipelines
class jd_csv_Pipeline:
    # def process_item(self, item, spider):
    #     return item
    def open_spider(self, spider):