Create an Excel file containing each site's homepage URL:
host_tag_网站名称_主域名_子域名.xlsx
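The workbook itself is maintained by hand, but as a minimal sketch, an equivalent file could be generated with pandas like this. The sheet and column names match the ones referenced later in this walkthrough (host, tag, 主网站, 主域名); every row value below is a placeholder, and writing .xlsx requires openpyxl or xlsxwriter to be installed:

# Sketch only: builds a one-row workbook with the sheet/column names used later in this post.
import pandas as pd

df = pd.DataFrame({
    'host': ['www.example.com'],             # placeholder sub-domain host
    'tag': ['示例'],                          # placeholder tag
    '主网站': ['示例网站'],                   # placeholder site name
    '主域名': ['http://www.example.com/'],    # placeholder main-domain homepage URL
})
df.to_excel('host_tag_网站名称_主域名_子域名.xlsx',
            sheet_name='网站|一级域名|子版块|子域', index=False)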
Build the FileUtils utility project that reads the Excel file.
Create a new project named FileUtils.
Edit file_utils.py:
# -*- coding: utf-8 -*-
"""
Created on Thu Jul 26 10:42:49 2018
@author: Administrator
"""
import pandas as pd


class FileUtils:

    @staticmethod
    def read_from_excel_show_sheets(file):
        """Print the sheet names of the workbook."""
        xl = pd.ExcelFile(file)
        print(xl.sheet_names)

    @staticmethod
    def read_from_excel_show_5_entries(file, sheet):
        """Print the first five rows of the given sheet."""
        xl = pd.ExcelFile(file)
        df1 = xl.parse(sheet)
        print(df1.head())

    @staticmethod
    def read_from_excel_show_columns(file, sheet, columns):
        """Print only the given columns of the sheet."""
        xl = pd.ExcelFile(file)
        df1 = xl.parse(sheet)
        print(df1[columns])

    @staticmethod
    def read_from_excel_to_dict(file, sheet, columns):
        """Map each distinct columns[0] value to the list of its distinct columns[1] values."""
        xl = pd.ExcelFile(file)
        df1 = xl.parse(sheet)
        df2 = df1[columns]
        tuples = zip(df2[columns[0]], df2[columns[1]])
        distinct_tuple_set = set(tuples)
        result_dict = {}
        for name, domain in distinct_tuple_set:
            if name not in result_dict:
                result_dict[name] = []
            domain_list = result_dict[name]
            if domain not in domain_list:
                domain_list.append(domain)
        # {name1: [domain, ...], name2: [domain, ...]}
        return result_dict


# Example calls (paths and sheet name are specific to the original environment):
# FileUtils.read_from_excel_show_sheets("E:/work/@@@/开发事宜/大数据平台/2. MySQL数据到HDFS/current_config/host_tag_网站名称_主域名_子域名.xlsx")
# FileUtils.read_from_excel_show_5_entries("E:/work/@@@/开发事宜/大数据平台/2. MySQL数据到HDFS/current_config/host_tag_网站名称_主域名_子域名.xlsx", "网站|一级域名|子版块|子域")
# FileUtils.read_from_excel_show_columns("E:/work/@@@/开发事宜/大数据平台/2. MySQL数据到HDFS/current_config/host_tag_网站名称_主域名_子域名.xlsx", "网站|一级域名|子版块|子域", ['host', 'tag'])
# print(FileUtils.read_from_excel_to_dict("E:/work/@@@/开发事宜/大数据平台/2. MySQL数据到HDFS/current_config/host_tag_网站名称_主域名_子域名.xlsx", "网站|一级域名|子版块|子域", ["主网站", "主域名"]))
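For reference, read_from_excel_to_dict returns a plain dict mapping each distinct site name to the list of its distinct domains. A minimal usage sketch (the shortened path here is a placeholder for the real one in the commented calls above, and the values shown in the comment are only illustrative):

# Usage sketch: the shortened path is a placeholder; see the commented calls above for the real one.
result_dict = FileUtils.read_from_excel_to_dict(
    "host_tag_网站名称_主域名_子域名.xlsx",
    "网站|一级域名|子版块|子域",
    ["主网站", "主域名"])
# Expected shape (illustrative values only):
# {'示例网站A': ['http://www.example-a.com/'],
#  '示例网站B': ['http://www.example-b.com/', 'http://m.example-b.com/']}
for name, domains in result_dict.items():
    print(name, domains)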
Create the Scrapy project MainDomainCrawler:
scrapy startproject MainDomainCrawler
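scrapy startproject generates the standard project skeleton; the files edited in the rest of this walkthrough live under the inner MainDomainCrawler package:

MainDomainCrawler/
    scrapy.cfg
    MainDomainCrawler/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py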
Edit items.py:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class MaindomaincrawlerItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    domain = scrapy.Field()
    url = scrapy.Field()
    name = scrapy.Field()
    title = scrapy.Field()
    text = scrapy.Field()
Edit pipelines.py:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

#import json
from scrapy.exporters import JsonItemExporter


class MaindomaincrawlerPipeline(object):

    # Version 1: collect items in memory and dump them all when the spider closes
    '''my_items = []

    def process_item(self, item, spider):
        self.my_items.append(dict(item))
        return item

    def close_spider(self, spider):
        with open('items1.json', 'w', encoding='utf-8') as f:
            f.write(json.dumps(self.my_items, ensure_ascii=False, indent=4))'''

    # Version 2: write one JSON document per line (JSON Lines)
    '''def open_spider(self, spider):
        self.file = open('items2.jl', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False, indent=4) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()'''

    # Version 3: use Scrapy's built-in JsonItemExporter
    def __init__(self):
        self.file = open('items3.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False, indent=4)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
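Version 3 writes all scraped items as a single JSON array to items3.json in the working directory. A quick sketch for inspecting the output after a crawl (assumes the file exists and the crawl produced at least one item):

# Sketch: load the exported items back and look at the first one.
import json

with open('items3.json', encoding='utf-8') as f:
    items = json.load(f)
print(len(items))
if items:
    print(items[0]['url'], items[0]['title'])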
Create MainDomainSpider2.py under MainDomainCrawler/MainDomainCrawler/spiders:
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 18 17:55:45 2018
@author: Administrator
"""
from scrapy import Spider, Request
from selenium import webdriver
from os import path
import sys

sys.path.append(path.abspath('E:/workspace_python/FileUtils'))
from file_utils import FileUtils

result_dict = FileUtils.read_from_excel_to_dict("E:/work/92lucky/开发事宜/大数据平台/2. MySQL数据到HDFS/current_config/host_tag_网站名称_主域名_子域名.xlsx", "网站|一级域名|子版块|子域", ["主网站", "主域名"])

from MainDomainCrawler.items import MaindomaincrawlerItem


class MainDomainSpider2(Spider):
    name = "MainDomainSpider2"

    def __init__(self):
        self.browser = webdriver.Firefox(executable_path=r'E:\software\python\geckodriver-v0.21.0-win64\geckodriver.exe')
        # The timeout is defined when the driver object is created; how to define it depends on the browser.
        # set_page_load_timeout below is a method of selenium's webdriver.py module.
        self.browser.set_page_load_timeout(60)  # give each page at most 60 seconds to load

    def closed(self, spider):
        print("spider closed")
        self.browser.close()

    def start_requests(self):
        url_list = []
        for k, values in result_dict.items():
            for value in values:
                if value != 'inaccessible' and value != 'https://m.lvmama.com/':
                    url_list.append(value)
        # start_urls = ['https://m.51test.net/', 'http://www.xuehuiwang.com.cn/', 'http://www.studyez.com/', 'http://www.zhaokao.net/']
        start_urls = url_list
        for url in start_urls:
            print('---------------------------------------------------')
            yield Request(url=url, callback=self.parse)

    def parse(self, response):
        url = response.url
        domain = url.split('/')[2]
        key = ''
        for k, value_list in result_dict.items():
            for value in value_list:
                if value == url:
                    key = k
                    break
        title = response.selector.xpath('//title/text()[normalize-space(.)]').extract_first()
        textlist = response.selector.xpath('//*[not(self::script or self::style or self::title)]/text()[normalize-space(.)]').extract()
        text = []
        for i in range(0, len(textlist)):
            text.append(textlist[i].strip())
        item = MaindomaincrawlerItem()
        item['url'] = url
        item['domain'] = domain
        item['name'] = key
        item['title'] = title
        item['text'] = text
        return item
Edit middlewares.py. Selenium is used to simulate a real browser visit so that content loaded dynamically by JavaScript is also retrieved; whatever is visible in the browser becomes crawlable.
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
from scrapy.http import HtmlResponse
from selenium.common.exceptions import TimeoutException
import time


class SeleniumMiddleware(object):

    def process_request(self, request, spider):
        if spider.name == 'MainDomainSpider2':
            try:
                # Observed issue: when visiting https://m.lvmama.com/ the browser stayed open well
                # past 60 seconds without timing out; debugging showed the program was stuck inside
                # WebDriver.get(request.url). With set_page_load_timeout set to 1 or 2 seconds there
                # was no problem at all.
                # Tried "pip install eventlet", but it did not help.
                driver = spider.browser
                url = request.url
                driver.get(url)
                spider.browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            except TimeoutException as e:
                print('page load timed out')
                spider.browser.execute_script('window.stop()')
            time.sleep(2)
            return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source, encoding="utf-8", request=request)


class MaindomaincrawlerSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class MaindomaincrawlerDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
Edit settings.py:
# -*- coding: utf-8 -*-
# Scrapy settings for MainDomainCrawler project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'MainDomainCrawler'
SPIDER_MODULES = ['MainDomainCrawler.spiders']
NEWSPIDER_MODULE = 'MainDomainCrawler.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'MainDomainCrawler (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'MainDomainCrawler.middlewares.MaindomaincrawlerSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'MainDomainCrawler.middlewares.SeleniumMiddleware': 543,
}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'MainDomainCrawler.pipelines.MaindomaincrawlerPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Run the spider:
scrapy crawl MainDomainSpider2
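Run the command from the project root (the directory containing scrapy.cfg). When the crawl finishes, the pipeline writes the collected items to items3.json in the directory the command was run from.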