你现在正在做的事,纵然很苦很累,只要它值得,也请你坚持下去,以后回首曾经你才有资格说,我也是这样过来的
#媳妇要买笔记本,上网爬点信息给她参考
前一篇 原 数据分析—1.数据获取----淘宝订单信息获取
这次遇到的问题就是商品价格在js中,需要破解JS的url
解决办法,进入商品详情页,刷新,F12抓包,一个一个看返回的信息,肯定会有一个返回信息是你想要的,接下来就是组合url
该js返回的信息如下
##settings.py
"""
@author: cht
@time: 2019/8/17 13:23
"""
# -*- coding: utf-8 -*-
# Scrapy settings for Suning project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'Suning'
SPIDER_MODULES = ['Suning.spiders']
NEWSPIDER_MODULE = 'Suning.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Suning (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'Suning.middlewares.SuningSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'Suning.middlewares.SuningDownloaderMiddleware': 543,
# }
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
# 'Suning.pipelines.SuningPipeline': 300,
'Suning.pipelines.MySQLPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
LOG_LEVEL = 'INFO'
LOG_LEVEL = 'INFO'
LOG_LEVEL = 'INFO'
LOG_LEVEL = 'INFO'
# Mysql数据库的配置信息
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'suning' # 数据库名字,请修改
MYSQL_USER = 'root' # 数据库账号,请修改
MYSQL_PASSWD = 'cht555' # 数据库密码,请修改
MYSQL_PORT = 3306
MYSQL_CHAR = 'utf8'
##items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class SuningItem(scrapy.Item):
    """Container for one scraped Suning product record.

    Filled in by the spider across two callbacks: link/project_id on the
    search-result page, the rest on the product detail page.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    link = scrapy.Field()        # absolute product detail-page URL
    project_id = scrapy.Field()  # product id parsed from the detail URL
    name = scrapy.Field()        # product name (taken from the page <title>)
    shop_name = scrapy.Field()   # seller / shop name
    price = scrapy.Field()       # price fetched from the separate price endpoint
##pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
class SuningPipeline(object):
    """Default no-op pipeline: hands every item through unchanged."""

    def process_item(self, item, spider):
        """Return the item untouched so later pipeline stages receive it."""
        return item
from pymysql import cursors
from twisted.enterprise import adbapi
from scrapy.utils.project import get_project_settings
# NOTE(review): SETTINGS appears unused in the visible code — MySQLPipeline
# receives settings via its from_settings() hook instead. Verify before removing.
SETTINGS = get_project_settings()
# 管道文件 pipelines. XinlangSpider.py中返回的内容进行存储
class MySQLPipeline(object):
    """Stores scraped Suning items into MySQL asynchronously.

    Uses twisted's adbapi connection pool (backed by pymysql) so database
    inserts run off the reactor thread and do not block the crawl.
    """

    @classmethod
    def from_settings(cls, settings):
        """Scrapy factory hook: build the pipeline from project settings.

        Reads the MYSQL_* options defined in settings.py and wraps a
        pymysql connection pool around them.
        """
        dbparams = dict(
            host=settings['MYSQL_HOST'],
            # Bug fix: MYSQL_PORT is defined in settings.py but was never
            # passed to the pool, silently forcing the default port.
            port=int(settings.get('MYSQL_PORT', 3306)),
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            # Consistency fix: use the MYSQL_CHAR setting instead of a
            # hard-coded 'utf8' (charset is needed to avoid mojibake).
            charset=settings.get('MYSQL_CHAR', 'utf8'),
            cursorclass=cursors.DictCursor,
            use_unicode=False,
        )
        # ** expands the dict into keyword arguments (host=..., db=..., ...).
        dbpool = adbapi.ConnectionPool('pymysql', **dbparams)
        return cls(dbpool)

    def __init__(self, dbpool):
        # Connection pool created by from_settings().
        self.dbpool = dbpool

    def process_item(self, item, spider):
        """Called by Scrapy for every item: schedule an async insert."""
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self._handle_error, item, spider)
        return item

    def _conditional_insert(self, tx, item):
        """Insert one item; runs in a pool thread with cursor `tx`."""
        sql = "insert into suningshow(project_id,name,shop_name,link,price) values(%s,%s,%s,%s,%s)"
        params = (item['project_id'], item["name"], item["shop_name"], item["link"], item["price"])
        print('商品信息', params)
        tx.execute(sql, params)
        print('插入数据库成功')

    def _handle_error(self, failure, item, spider):
        """Errback for failed inserts: log the twisted Failure."""
        # (typo fixed: parameter was named `failue`; it is only ever
        # passed positionally by addErrback, so callers are unaffected)
        print('--------------database operation exception!!-----------------')
        print(failure)
##suning.py
# -*- coding: utf-8 -*-
import requests
from Suning.items import SuningItem
import scrapy
import re
from scrapy import Request
class SuningSpider(scrapy.Spider):
    """Crawls Suning search results for laptops (笔记本) and scrapes each
    product's link, id, name, shop name and price.
    """
    name = 'suning'
    allowed_domains = ['suning.com']
    start_urls = ['http://suning.com/']

    def parse(self, response):
        """Generate one search-result-page request per page number.

        "%E7%AC%94%E8%AE%B0%E6%9C%AC" is the URL-encoded form of 笔记本
        (laptop). The site showed 50 result pages at the time of writing.
        """
        # NOTE(review): range(51) requests cp=0..50, i.e. 51 pages for "50
        # pages" of results — confirm whether Suning's `cp` parameter is
        # 0-based; this may be off by one at one end.
        for j in range(51):
            url = "https://search.suning.com/%E7%AC%94%E8%AE%B0%E6%9C%AC/&iy=0&isNoResult=0&cp="+str(j)
            yield Request(url, callback=self.parseMainPage)

    def parseMainPage(self, response):
        """Parse one search-result page: build an item per product and
        schedule the product detail-page request."""
        urls = response.xpath('//*[@id="product-list"]/ul/li/div/div/div[1]/div/a')
        for url in urls:
            item = SuningItem()
            url = url.xpath('@href').extract()  # list of hrefs (normally one)
            all_url = response.urljoin(url[0])  # build the absolute product URL
            item['link'] = all_url  # product link
            print("link>>>>>>>",all_url)
            # Product id = filename part of the URL, without its extension.
            id= all_url.split("/")[-1].split(".")[0]
            item['project_id'] = id
            print("商品id....",id)
            for link in url:
                # Follow each extracted href to the detail page, passing the
                # partially-filled item along via meta.
                url = response.urljoin(link)
                yield Request(url, meta={'meta': item}, callback=self.parseDetails)

    def parseDetails(self, response):
        """Parse a product detail page and fetch the price from the
        separate JS price endpoint."""
        item = response.meta['meta']
        id = item['project_id']
        print("id",item['project_id'])
        # NOTE(review): extract()[0] raises IndexError when the shop-name node
        # is absent (e.g. layout variants) — currently unhandled.
        shop_name = response.xpath('//div[@class="header-shop-inline"]/a[1]/text()').extract()[0]
        print("商店名称>>", shop_name)
        item['shop_name'] = shop_name
        # Product name: page <title> with the trailing boilerplate cut off.
        item['name'] = response.xpath('/html/head/title/text()').extract()[0].strip().split("【价格 图片 品牌 报价】")[0]
        print("商品名称》》%s" % item['name'])
        # The price is not in the HTML; it comes from a JS endpoint whose URL
        # is reconstructed from the product id. (The original comment said
        # "京东/JD" — this is actually Suning's price API.)
        priceurl = "https://pas.suning.com/nspcsale_0_0000000%s_0000000%s_0000000000_130_571_5710101_157122_1000323_9315_12499_Z001.html"%(id,id)
        # print(priceurl)
        # NOTE(review): requests.get is a blocking call inside a Scrapy
        # callback and stalls the reactor — consider yielding a scrapy
        # Request to the price URL instead.
        data = requests.get(priceurl).text
        # print("pcdata:",data)
        # re.findall(r'\"p\"\:\"(.*?)\"}]\)', price)
        # Pull the "netPrice" field out of the JS response and parse as float.
        price = float(re.findall(r'\"netPrice\":\"(.*?)\",',data)[0])
        item["price"] = price
        print("商品价格:",price)
        yield item
结果: