Create an Excel file containing each site's homepage URL:
host_tag_网站名称_主域名_子域名.xlsx
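The workbook itself is maintained by hand, but as a minimal sketch, an equivalent file could be generated with pandas like this. The sheet and column names match the ones referenced later in this walkthrough (host, tag, 主网站, 主域名); every row value below is a placeholder, and writing .xlsx requires openpyxl or xlsxwriter to be installed:

# Sketch only: builds a one-row workbook with the sheet/column names used later in this post.
import pandas as pd

df = pd.DataFrame({
    'host': ['www.example.com'],             # placeholder sub-domain host
    'tag': ['示例'],                          # placeholder tag
    '主网站': ['示例网站'],                   # placeholder site name
    '主域名': ['http://www.example.com/'],    # placeholder main-domain homepage URL
})
df.to_excel('host_tag_网站名称_主域名_子域名.xlsx',
            sheet_name='网站|一级域名|子版块|子域', index=False)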
Build the FileUtils utility project that reads the Excel file.
Create a new project named FileUtils.
Edit file_utils.py:
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 26 10:42:49 2018
@author: Administrator
"""
import pandas as pd


class FileUtils:

    @staticmethod
    def read_from_excel_show_sheets(file):
        """Print the sheet names of the workbook."""
        xl = pd.ExcelFile(file)
        print(xl.sheet_names)

    @staticmethod
    def read_from_excel_show_5_entries(file, sheet):
        """Print the first five rows of the given sheet."""
        xl = pd.ExcelFile(file)
        df1 = xl.parse(sheet)
        print(df1.head())

    @staticmethod
    def read_from_excel_show_columns(file, sheet, columns):
        """Print only the given columns of the sheet."""
        xl = pd.ExcelFile(file)
        df1 = xl.parse(sheet)
        print(df1[columns])

    @staticmethod
    def read_from_excel_to_dict(file, sheet, columns):
        """Map each distinct columns[0] value to the list of its distinct columns[1] values."""
        xl = pd.ExcelFile(file)
        df1 = xl.parse(sheet)
        df2 = df1[columns]
        tuples = zip(df2[columns[0]], df2[columns[1]])
        distinct_tuple_set = set(tuples)
        result_dict = {}
        for name, domain in distinct_tuple_set:
            if name not in result_dict:
                result_dict[name] = []
            domain_list = result_dict[name]
            if domain not in domain_list:
                domain_list.append(domain)
        # {name1: [domain, ...], name2: [domain, ...]}
        return result_dict


# Example calls (paths and sheet name are specific to the original environment):
# FileUtils.read_from_excel_show_sheets("E:/work/@@@/开发事宜/大数据平台/2. MySQL数据到HDFS/current_config/host_tag_网站名称_主域名_子域名.xlsx")
# FileUtils.read_from_excel_show_5_entries("E:/work/@@@/开发事宜/大数据平台/2. MySQL数据到HDFS/current_config/host_tag_网站名称_主域名_子域名.xlsx", "网站|一级域名|子版块|子域")
# FileUtils.read_from_excel_show_columns("E:/work/@@@/开发事宜/大数据平台/2. MySQL数据到HDFS/current_config/host_tag_网站名称_主域名_子域名.xlsx", "网站|一级域名|子版块|子域", ['host', 'tag'])
# print(FileUtils.read_from_excel_to_dict("E:/work/@@@/开发事宜/大数据平台/2. MySQL数据到HDFS/current_config/host_tag_网站名称_主域名_子域名.xlsx", "网站|一级域名|子版块|子域", ["主网站", "主域名"]))
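For reference, read_from_excel_to_dict returns a plain dict mapping each distinct site name to the list of its distinct domains. A minimal usage sketch (the shortened path here is a placeholder for the real one in the commented calls above, and the values shown in the comment are only illustrative):

# Usage sketch: the shortened path is a placeholder; see the commented calls above for the real one.
result_dict = FileUtils.read_from_excel_to_dict(
    "host_tag_网站名称_主域名_子域名.xlsx",
    "网站|一级域名|子版块|子域",
    ["主网站", "主域名"])
# Expected shape (illustrative values only):
# {'示例网站A': ['http://www.example-a.com/'],
#  '示例网站B': ['http://www.example-b.com/', 'http://m.example-b.com/']}
for name, domains in result_dict.items():
    print(name, domains)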
Create the Scrapy project MainDomainCrawler:
scrapy startproject MainDomainCrawler
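scrapy startproject generates the standard project skeleton; the files edited in the rest of this walkthrough live under the inner MainDomainCrawler package:

MainDomainCrawler/
    scrapy.cfg
    MainDomainCrawler/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py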
Edit items.py:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class MaindomaincrawlerItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    domain = scrapy.Field()
    url = scrapy.Field()
    name = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
Edit pipelines.py:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

#import json
from scrapy.exporters import JsonItemExporter


class MaindomaincrawlerPipeline(object):

    # Version 1: collect items in memory and dump them all when the spider closes
    '''my_items = []

    def process_item(self, item, spider):
        self.my_items.append(dict(item))
        return item

    def close_spider(self, spider):
        with open('items1.json', 'w', encoding='utf-8') as f:
            f.write(json.dumps(self.my_items, ensure_ascii=False, indent=4))'''

    # Version 2: write one JSON document per line (JSON Lines)
    '''def open_spider(self, spider):
        self.file = open('items2.jl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False, indent=4) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()'''

    # Version 3: use Scrapy's built-in JsonItemExporter
    def __init__(self):
        self.file = open('items3.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False, indent=4)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
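Version 3 writes all scraped items as a single JSON array to items3.json in the working directory. A quick sketch for inspecting the output after a crawl (assumes the file exists and the crawl produced at least one item):

# Sketch: load the exported items back and look at the first one.
import json

with open('items3.json', encoding='utf-8') as f:
    items = json.load(f)
print(len(items))
if items:
    print(items[0]['url'], items[0]['title'])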
Create MainDomainSpider2.py under MainDomainCrawler/MainDomainCrawler/spiders:
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 18 17:55:45 2018
@author: Administrator
"""
from scrapy import Spider, Request
from selenium import webdriver
from os import path
import sys

sys.path.append(path.abspath('E:/workspace_python/FileUtils'))
from file_utils import FileUtils

result_dict = FileUtils.read_from_excel_to_dict("E:/work/92lucky/开发事宜/大数据平台/2. MySQL数据到HDFS/current_config/host_tag_网站名称_主域名_子域名.xlsx", "网站|一级域名|子版块|子域", ["主网站", "主域名"])

from MainDomainCrawler.items import MaindomaincrawlerItem


class MainDomainSpider2(Spider):
    name = "MainDomainSpider2"

    def __init__(self):
        self.browser = webdriver.Firefox(executable_path=r'E:\software\python\geckodriver-v0.21.0-win64\geckodriver.exe')
        # The timeout is defined when the driver object is created; how to define it depends on the browser.
        # set_page_load_timeout below is a method of selenium's webdriver.py module.
        self.browser.set_page_load_timeout(60)  # give each page at most 60 seconds to load

    def closed(self, spider):
        print("spider closed")
        self.browser.close()

    def start_requests(self):
        url_list = []
        for k, values in result_dict.items():
            for value in values:
                if value != 'inaccessible' and value != 'https://m.lvmama.com/':
                    url_list.append(value)
        # start_urls = ['https://m.51test.net/', 'http://www.xuehuiwang.com.cn/', 'http://www.studyez.com/', 'http://www.zhaokao.net/']
        start_urls = url_list
        for url in start_urls:
            print('---------------------------------------------------')
            yield Request(url=url, callback=self.parse)

    def parse(self, response):
        url = response.url
        domain = url.split('/')[2]
        key = ''
        for k, value_list in result_dict.items():
            for value in value_list:
                if value == url:
                    key = k
                    break
        title = response.selector.xpath('//title/text()[normalize-space(.)]').extract_first()
        textlist = response.selector.xpath('//*[not(self::script or self::style or self::title)]/text()[normalize-space(.)]').extract()
        text = []
        for i in range(0, len(textlist)):
            text.append(textlist[i].strip())
        item = MaindomaincrawlerItem()
        item['url'] = url
        item['domain'] = domain
        item['name'] = key
        item['title'] = title
        item['text'] = text
        return item
Edit middlewares.py. Selenium is used to simulate a real browser visit so that content loaded dynamically by JavaScript is also retrieved; whatever is visible in the browser becomes crawlable.
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
from scrapy.http import HtmlResponse
from selenium.common.exceptions import TimeoutException
import time


class SeleniumMiddleware(object):

    def process_request(self, request, spider):
        if spider.name == 'MainDomainSpider2':
            try:
                # Observed issue: when visiting https://m.lvmama.com/ the browser stayed open well
                # past 60 seconds without timing out; debugging showed the program was stuck inside
                # WebDriver.get(request.url). With set_page_load_timeout set to 1 or 2 seconds there
                # was no problem at all.
                # Tried "pip install eventlet", but it did not help.
                driver = spider.browser
                url = request.url
                driver.get(url)
                spider.browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            except TimeoutException as e:
                print('page load timed out')
                spider.browser.execute_script('window.stop()')
            time.sleep(2)
            return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source, encoding="utf-8", request=request)


class MaindomaincrawlerSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class MaindomaincrawlerDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
Edit settings.py:
# -*- coding: utf-8 -*-
# Scrapy settings for MainDomainCrawler project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'MainDomainCrawler'
SPIDER_MODULES = ['MainDomainCrawler.spiders']
NEWSPIDER_MODULE = 'MainDomainCrawler.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'MainDomainCrawler (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'MainDomainCrawler.middlewares.MaindomaincrawlerSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'MainDomainCrawler.middlewares.SeleniumMiddleware': 543,
}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'MainDomainCrawler.pipelines.MaindomaincrawlerPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Run the spider:
scrapy crawl MainDomainSpider2
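Run the command from the project root (the directory containing scrapy.cfg). When the crawl finishes, the pipeline writes the collected items to items3.json in the directory the command was run from.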