你现在正在做的事,纵然很苦很累,只要它值得,也请你坚持下去,以后回首曾经你才有资格说,我也是这样过来的
#媳妇要买笔记本,上网爬点信息给她参考
前一篇 原 数据分析—1.数据获取----淘宝订单信息获取
这次遇到的问题就是商品价格在js中,需要破解JS的url
解决办法,进入商品详情页,刷新,F12抓包,一个一个看返回的信息,肯定会有一个返回信息是你想要的,接下来就是组合url
该js返回的信息如下
##settings.py
"""
@author: cht
@time: 2019/8/17 13:23
"""
# -*- coding: utf-8 -*-
# Scrapy settings for Suning project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'Suning'
SPIDER_MODULES = ['Suning.spiders']
NEWSPIDER_MODULE = 'Suning.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'Suning (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'Suning.middlewares.SuningSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'Suning.middlewares.SuningDownloaderMiddleware': 543,
# }
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
# 'Suning.pipelines.SuningPipeline': 300,
'Suning.pipelines.MySQLPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
LOG_LEVEL = 'INFO'
LOG_LEVEL = 'INFO'
LOG_LEVEL = 'INFO'
LOG_LEVEL = 'INFO'
# Mysql数据库的配置信息
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'suning' # 数据库名字,请修改
MYSQL_USER = 'root' # 数据库账号,请修改
MYSQL_PASSWD = 'cht555' # 数据库密码,请修改
MYSQL_PORT = 3306
MYSQL_CHAR = 'utf8'
##items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class SuningItem(scrapy.Item):
    """Container for one scraped Suning product record.

    Filled in by the spider across two callbacks: link/project_id on the
    search-result page, the rest on the product detail page.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    link = scrapy.Field()        # absolute product detail-page URL
    project_id = scrapy.Field()  # product id parsed from the detail URL
    name = scrapy.Field()        # product name (taken from the page <title>)
    shop_name = scrapy.Field()   # seller / shop name
    price = scrapy.Field()       # price fetched from the separate price endpoint
##pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
class SuningPipeline(object):
    """Default no-op pipeline: hands every item through unchanged."""

    def process_item(self, item, spider):
        """Return the item untouched so later pipeline stages receive it."""
        return item
from pymysql import cursors
from twisted.enterprise import adbapi
from scrapy.utils.project import get_project_settings
# NOTE(review): SETTINGS appears unused in the visible code — MySQLPipeline
# receives settings via its from_settings() hook instead. Verify before removing.
SETTINGS = get_project_settings()
# 管道文件 pipelines. XinlangSpider.py中返回的内容进行存储
class MySQLPipeline(object):
    """Stores scraped Suning items into MySQL asynchronously.

    Uses twisted's adbapi connection pool (backed by pymysql) so database
    inserts run off the reactor thread and do not block the crawl.
    """

    @classmethod
    def from_settings(cls, settings):
        """Scrapy factory hook: build the pipeline from project settings.

        Reads the MYSQL_* options defined in settings.py and wraps a
        pymysql connection pool around them.
        """
        dbparams = dict(
            host=settings['MYSQL_HOST'],
            # Bug fix: MYSQL_PORT is defined in settings.py but was never
            # passed to the pool, silently forcing the default port.
            port=int(settings.get('MYSQL_PORT', 3306)),
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            # Consistency fix: use the MYSQL_CHAR setting instead of a
            # hard-coded 'utf8' (charset is needed to avoid mojibake).
            charset=settings.get('MYSQL_CHAR', 'utf8'),
            cursorclass=cursors.DictCursor,
            use_unicode=False,
        )
        # ** expands the dict into keyword arguments (host=..., db=..., ...).
        dbpool = adbapi.ConnectionPool('pymysql', **dbparams)
        return cls(dbpool)

    def __init__(self, dbpool):
        # Connection pool created by from_settings().
        self.dbpool = dbpool

    def process_item(self, item, spider):
        """Called by Scrapy for every item: schedule an async insert."""
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self._handle_error, item, spider)
        return item

    def _conditional_insert(self, tx, item):
        """Insert one item; runs in a pool thread with cursor `tx`."""
        sql = "insert into suningshow(project_id,name,shop_name,link,price) values(%s,%s,%s,%s,%s)"
        params = (item['project_id'], item["name"], item["shop_name"], item["link"], item["price"])
        print('商品信息', params)
        tx.execute(sql, params)
        print('插入数据库成功')

    def _handle_error(self, failure, item, spider):
        """Errback for failed inserts: log the twisted Failure."""
        # (typo fixed: parameter was named `failue`; it is only ever
        # passed positionally by addErrback, so callers are unaffected)
        print('--------------database operation exception!!-----------------')
        print(failure)
##suning.py
# -*- coding: utf-8 -*-
import requests
from Suning.items import SuningItem
import scrapy
import re
from scrapy import Request
class SuningSpider(scrapy.Spider):
    """Crawls Suning search results for laptops (笔记本) and scrapes each
    product's link, id, name, shop name and price.
    """
    name = 'suning'
    allowed_domains = ['suning.com']
    start_urls = ['http://suning.com/']

    def parse(self, response):
        """Generate one search-result-page request per page number.

        "%E7%AC%94%E8%AE%B0%E6%9C%AC" is the URL-encoded form of 笔记本
        (laptop). The site showed 50 result pages at the time of writing.
        """
        # NOTE(review): range(51) requests cp=0..50, i.e. 51 pages for "50
        # pages" of results — confirm whether Suning's `cp` parameter is
        # 0-based; this may be off by one at one end.
        for j in range(51):
            url = "https://search.suning.com/%E7%AC%94%E8%AE%B0%E6%9C%AC/&iy=0&isNoResult=0&cp="+str(j)
            yield Request(url, callback=self.parseMainPage)

    def parseMainPage(self, response):
        """Parse one search-result page: build an item per product and
        schedule the product detail-page request."""
        urls = response.xpath('//*[@id="product-list"]/ul/li/div/div/div[1]/div/a')
        for url in urls:
            item = SuningItem()
            url = url.xpath('@href').extract()  # list of hrefs (normally one)
            all_url = response.urljoin(url[0])  # build the absolute product URL
            item['link'] = all_url  # product link
            print("link>>>>>>>",all_url)
            # Product id = filename part of the URL, without its extension.
            id= all_url.split("/")[-1].split(".")[0]
            item['project_id'] = id
            print("商品id....",id)
            for link in url:
                # Follow each extracted href to the detail page, passing the
                # partially-filled item along via meta.
                url = response.urljoin(link)
                yield Request(url, meta={'meta': item}, callback=self.parseDetails)

    def parseDetails(self, response):
        """Parse a product detail page and fetch the price from the
        separate JS price endpoint."""
        item = response.meta['meta']
        id = item['project_id']
        print("id",item['project_id'])
        # NOTE(review): extract()[0] raises IndexError when the shop-name node
        # is absent (e.g. layout variants) — currently unhandled.
        shop_name = response.xpath('//div[@class="header-shop-inline"]/a[1]/text()').extract()[0]
        print("商店名称>>", shop_name)
        item['shop_name'] = shop_name
        # Product name: page <title> with the trailing boilerplate cut off.
        item['name'] = response.xpath('/html/head/title/text()').extract()[0].strip().split("【价格 图片 品牌 报价】")[0]
        print("商品名称》》%s" % item['name'])
        # The price is not in the HTML; it comes from a JS endpoint whose URL
        # is reconstructed from the product id. (The original comment said
        # "京东/JD" — this is actually Suning's price API.)
        priceurl = "https://pas.suning.com/nspcsale_0_0000000%s_0000000%s_0000000000_130_571_5710101_157122_1000323_9315_12499_Z001.html"%(id,id)
        # print(priceurl)
        # NOTE(review): requests.get is a blocking call inside a Scrapy
        # callback and stalls the reactor — consider yielding a scrapy
        # Request to the price URL instead.
        data = requests.get(priceurl).text
        # print("pcdata:",data)
        # re.findall(r'\"p\"\:\"(.*?)\"}]\)', price)
        # Pull the "netPrice" field out of the JS response and parse as float.
        price = float(re.findall(r'\"netPrice\":\"(.*?)\",',data)[0])
        item["price"] = price
        print("商品价格:",price)
        yield item
结果: