用scrapy+selenium + phantomjs 爬取vip网页,保存为json格式,写入到mysql数据库,下载图片(二)

接上一篇


weipin.py文件的代码 :

# -*- coding: utf-8 -*-
import scrapy
from weipinhui.items import WeipinhuiItem
import urllib.parse

class WeipinSpider(scrapy.Spider):
    """Spider that scrapes vip.com search-result pages for a keyword.

    The search keyword and the inclusive page range are read from stdin
    while the class body executes, and one search URL per page is appended
    to ``start_urls``.
    """

    name = 'weipin'
    allowed_domains = ['vip.com']
    keyword = input("请输入搜索内容: ")
    start_page = input("请输入开始页: ")
    end_page = input("请输入结束页:")
    start_urls = []
    # Build the URL of every requested result page (end page inclusive).
    for page in range(int(start_page), int(end_page) + 1):
        data = {
            "keyword": keyword,
            "page": page
        }
        data = urllib.parse.urlencode(data)
        url = 'https://category.vip.com/suggest.php?'
        urls = url + data
        start_urls.append(urls)

    @staticmethod
    def _prefixed(prefix, value):
        """Return ``prefix + value``, or ``None`` when *value* is missing.

        ``extract_first()`` returns ``None`` if the XPath matched nothing;
        concatenating that with a string would raise ``TypeError`` and abort
        parsing of the whole page.
        """
        if value is None:
            return None
        return prefix + value

    def parse(self, response):
        """Yield one ``WeipinhuiItem`` per product block found in *response*.

        Product blocks are the ``div`` elements whose ``id`` starts with
        ``J_pro_``. Fields whose node is absent are set to ``None`` instead
        of raising.
        """
        div_list = response.xpath("//div[starts-with(@id,'J_pro_')]")
        for div in div_list:
            # A fresh item per product: re-yielding one shared instance would
            # let later iterations overwrite data that consumers still hold.
            item = WeipinhuiItem()
            item["brand"] = div.xpath(".//h4/a/span/text()").extract_first()
            item["title"] = div.xpath("./div/h4/a/@title").extract_first()
            item["old_price"] = self._prefixed(
                "¥", div.xpath(".//del/text()").extract_first())
            item["new_price"] = self._prefixed(
                "¥",
                div.xpath(".//div[@class='goods-price-wrapper']/em/span[2]/text()").extract_first())
            item["discount"] = div.xpath(
                "./div/div[@class='goods-info goods-price-info']/span/text()").extract_first()
            # The scheme is prepended to the scraped image and product links.
            item["img_url"] = self._prefixed(
                "http:",
                div.xpath(".//div[@class='goods-image']/a/img/@src").extract_first())
            item["url"] = self._prefixed(
                "http:", div.xpath(".//h4/a/@href").extract_first())
            yield item





中间件 

middlewares.py代码:

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from time import sleep
from scrapy.http import HtmlResponse
from scrapy import signals
from selenium import webdriver

class WeipinhuiSpiderMiddleware(object):
    """Pass-through spider middleware.

    Every hook forwards its input unchanged; the only side effect is a log
    line emitted when the spider opens.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``."""
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened,
            signal=signals.spider_opened,
        )
        return middleware

    def process_spider_input(self, response, spider):
        """Accept every response on its way into the spider (returns None)."""
        return None

    def process_spider_output(self, response, result, spider):
        """Re-yield everything the spider produced, unmodified."""
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Do not handle spider exceptions here (returns None)."""
        return None

    def process_start_requests(self, start_requests, spider):
        """Re-yield the spider's start requests, unmodified."""
        yield from start_requests

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)


# Downloader middleware
class WeipinhuiDownloaderMiddleware(object):
    """Downloader middleware that renders ``weipin`` pages in PhantomJS.

    Requests of the spider named ``weipin`` are loaded in a headless browser
    and scrolled so that lazily loaded content ends up in the HTML handed to
    the spider. Requests of any other spider are left to Scrapy's normal
    downloader.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``."""
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        """Download *request* with PhantomJS when the spider is ``weipin``.

        Returns an ``HtmlResponse`` built from the rendered page source for
        the ``weipin`` spider, or ``None`` so that Scrapy continues with its
        regular download for any other spider.
        """
        if spider.name != "weipin":
            return None

        # NOTE(review): webdriver.PhantomJS is deprecated in recent Selenium
        # releases - confirm the installed version still provides it.
        # NOTE(review): the sleep() calls below block for about 13 seconds per
        # request - confirm that is acceptable for this crawl.
        driver = webdriver.PhantomJS()
        try:
            driver.get(request.url)
            sleep(5)
            # Scroll down in small steps, pausing between them, and finally
            # jump to the bottom of the page before reading the source.
            for _ in range(8):
                driver.execute_script("window.scrollBy(0,500)")
                sleep(1)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
            body = driver.page_source
            final_url = driver.current_url
        finally:
            # Always shut the browser down; otherwise every request leaves a
            # PhantomJS process running.
            driver.quit()

        return HtmlResponse(final_url, body=body, encoding="utf-8", request=request)

    def process_response(self, request, response, spider):
        """Return the downloaded *response* unchanged."""
        return response

    def process_exception(self, request, exception, spider):
        """Do not handle download exceptions here (returns None)."""
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        spider.logger.info('Spider opened: %s' % spider.name)

管道

pipelines.py文件代码:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json
import urllib.request

import pymysql
from scrapy.utils.project import get_project_settings

class WeipinhuiPipeline(object):
    """Collect every scraped item and dump them all to ``weipin.json``.

    Items are buffered in memory and written as a single JSON array when
    the spider closes.
    """

    def __init__(self):
        # The output file is opened (and truncated) as soon as the pipeline
        # object is created.
        self.fp = open("weipin.json", "w", encoding="utf-8")
        self.items = []

    def open_spider(self, spider):
        """Nothing to set up when the spider opens."""
        return None

    def process_item(self, item, spider):
        """Buffer a plain-dict copy of *item* and pass *item* on unchanged."""
        record = dict(item)
        self.items.append(record)
        return item

    def close_spider(self, spider):
        """Write the buffered items as one JSON array and close the file."""
        # Image download is disabled here. The original approach was to call
        # urllib.request.urlretrieve(item["img_url"], "./img/" + file_name)
        # for each buffered item, with file_name being the last path segment
        # of the image URL.
        payload = json.dumps(self.items, ensure_ascii=False)
        self.fp.write(payload)
        self.fp.close()


# Pipeline that writes every scraped item into MySQL
class MysqlPipeline(object):
    """Insert each scraped item as one row of the ``goods`` table.

    Connection parameters are read from the project settings ``DB_HOST``,
    ``DB_PORT``, ``DB_USER``, ``DB_PWD``, ``DB_NAME`` and ``DB_CHARSET``;
    the connection is opened when the pipeline object is created.
    """

    # Item fields in the order of the value placeholders of the INSERT.
    _FIELDS = ("brand", "title", "new_price", "discount",
               "old_price", "img_url", "url")

    def __init__(self):
        settings = get_project_settings()
        self.host = settings["DB_HOST"]
        self.port = settings["DB_PORT"]
        self.user = settings["DB_USER"]
        self.pwd = settings["DB_PWD"]
        self.name = settings["DB_NAME"]
        self.charset = settings["DB_CHARSET"]
        self.connect()

    def connect(self):
        """Open the MySQL connection and create the cursor used for inserts."""
        self.conn = pymysql.connect(host=self.host,
                                    port=self.port,
                                    user=self.user,
                                    password=self.pwd,
                                    db=self.name,
                                    charset=self.charset)
        self.cursor = self.conn.cursor()

    def open_spider(self, spider):
        """Nothing to set up when the spider opens."""
        pass

    def process_item(self, item, spider):
        """Insert *item* into ``goods`` and return it for later pipelines.

        The values are passed to the driver as query parameters rather than
        being interpolated into the SQL text, so quotes in scraped strings
        can neither break the statement nor inject SQL. Missing fields are
        sent as ``None``. If the insert or commit fails, the transaction is
        rolled back and the exception is re-raised.
        """
        sql = "INSERT INTO goods VALUES(NULL, %s, %s, %s, %s, %s, %s, %s);"
        params = tuple(item.get(field) for field in self._FIELDS)
        try:
            self.cursor.execute(sql, params)
            self.conn.commit()
        except Exception:
            # Leave the connection in a clean state before propagating.
            self.conn.rollback()
            raise
        return item

    def close_spider(self, spider):
        """Close the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.conn.close()






阅读更多
版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/cats_miao/article/details/79965101
文章标签: python
个人分类: python爬虫
想对作者说点什么? 我来说一句

没有更多推荐了,返回首页

关闭
关闭
关闭