Spider_Scrapy_Redis_NIMS_PoLyInfo [distributed crawler for polymer-materials data]

Distributed crawler
Project structure

PoLyInfo登录账号.xlsx

Account usage
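
The spider reads its accounts from PoLyInfo登录账号.xlsx and tracks per-account usage in a weights column. The workbook itself is not reproduced here, but from the columns the code touches ('username', 'password', 'weights', 'status' with values 健在/去世) a minimal sheet can be sketched as follows; the column names are inferred from the code, not from an official template, and the rows are placeholders:

# Minimal sketch of the account workbook, inferred from the columns that
# MaintainUser/UserInRedis read ('username', 'password', 'weights', 'status').
# The example rows are placeholders, not real accounts.
import pandas as pd

accounts = pd.DataFrame([
    {'username': 'user01@example.com', 'password': 'xxxxx', 'weights': 0, 'status': '健在'},
    {'username': 'user02@example.com', 'password': 'xxxxx', 'weights': 0, 'status': '健在'},
])
accounts.to_excel('PoLyInfo登录账号.xlsx', index=False)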

polyInfo.py (spider)

# -*- coding: utf-8 -*-
"""
1. Distributed spider
2020.04.28 潘叶舟

Crawling too fast gets the account banned.
A single machine can also run this on its own.

If the crawl stalls, it is usually the proxy that is blocking:
    HeadersPoolAndLogin.py, at the s.get(url, headers=headers, proxies=proxies) call
    settings.py, at the NIMS_PoLyInfo.middlewares.ProxyDownloadMiddleware entry
    Toggle the proxy on and off at these two places.
    Recommendation: leave the proxy off; a bit slower, but stable.

Two ways to crawl this site:
    1. Throttle and crawl a little every day, keeping every account safe (roughly 500 pages per account).
    2. Put several machines on one account, run at full speed, finish in one burst, and let the account get banned.
"""
import re
import math
import redis
import scrapy
import threading
from urllib import parse
from scrapy.conf import settings
from scrapy.exceptions import CloseSpider
from scrapy_redis.spiders import RedisSpider
from NIMS_PoLyInfo.spiders.InteractiveMongoDB import DataToMongo
from NIMS_PoLyInfo.spiders.HeadersPoolAndLogin import Login, MaintainUser
from NIMS_PoLyInfo.spiders.InteractiveRedis import UrlToRedis, HeadersToRedis, UserInRedis


class PolyinfoSpider(RedisSpider):
    name = 'polyInfo'

    allowed_domains = [
        'polymer.nims.go.jp',
        'mits.nims.go.jp'
    ]

    # Initialize the account credentials   ps: this runs before open_spider
    UserInRedis().user_to_redis()
    count_n = 0

    # create the Redis connection
    pool = redis.ConnectionPool(
        host=settings['REDIS_HOST'],
        port=settings['REDIS_PORT'],
        decode_responses=True,
        password=settings['REDIS_PARAMS']['password'],
        db=0)

    conn = redis.Redis(connection_pool=pool)
    UTR = UrlToRedis(conn)      # object that pushes urls into Redis
    HTR = HeadersToRedis()      # object that manages request headers in Redis
    DTM = DataToMongo()         # object that talks to MongoDB
    MU = MaintainUser()         # object that maintains the accounts
    Login = Login()             # login object

    instr = input("Send the initial URL?\n"
                  "ps: only the master needs to send it on its very first start; "
                  "resuming an interrupted crawl or starting a worker does not.\n"
                  "  Send: 1\n  Do not send: 2\n")

    if instr == '1':
        UTR.start_urls()  # the master pushes the first url; every other url is generated from it

    def parse(self, response):
        """把不同的 response 交给不同的函数处理; 通过eval调用的
        eval('self.parse%s(response)'%mytype) = self.parse??(response)
        """

        mytype = parse.unquote(re.findall('&mytype=(\d{2})', response.url)[0])
        eval('self.parse%s(response)' % mytype)

        # 每发送 CONCURRENT_REQUESTS/2 这么多次请求启动一次维护函数 maintain_headers()
        self.count_n += 1
        remainder_n = self.count_n % (settings['CONCURRENT_REQUESTS'] / 2)
        if remainder_n == 0:
            t_mh = threading.Thread(target=self.Login.maintain_headers)   # 维护请求头
            t_mh.start()                                                  # 开启线程
            a_number = self.MU.average_number()                           # 账号使用次数平均数
            print('a_number', a_number)
            if a_number > 500:                                            # 如果使用次数大于该值
                self.parse_close()                                        # 主动退出爬虫  ps:会把现有任务处理完才退出

    # Parse the urls under the Polymer folder in the left-hand tree and request each one
    def parse00(self, response):
        html_data = response.text
        # match the urls
        url_list = re.findall(
            """ajaxGet[(]'(.+?)',.+#contents""",
            re.findall(""">Polymer</a></span>[\s\S]+return false;">Property</a></span>""",
                       html_data)[0])
        del (url_list[-1])
        i = 0
        for url in url_list:  # queue a request for each url

            # testing: only follow the second folder
            i += 1
            if i == 2:
                self.UTR.next_urls(url + '&mytype=01')
            else:
                pass

    # Match the Preview urls under "Polymer Database(PoLyInfo)Hitxxx" (the play icon)
    def parse01(self, response):
        html_data = response.text
        url_list = re.findall(
            """Polymer Database[\s\S]+ajaxGet[(]'(.+)', '#contents'""",
            html_data)
        for url in url_list:  # queue a request for each url
            self.UTR.next_urls(url + '&o=0' + '&mytype=02')

    # Pagination within each sub-folder
    def parse02(self, response):
        url = re.sub('&o=0&mytype=02', '', response.url)
        html_data = response.text

        # match the total number of results
        page_all = re.findall('Results<span>'
                              '\d+</span>-<span>'
                              '\d+</span>of<span>('
                              '\d+)</span>', html_data)[0]

        # total number of results
        page_all = int(page_all)
        # current offset
        page = 0

        # queue a request for each page (the offset steps by 20)
        while page < page_all:
            self.UTR.next_urls(url + '&o=%s' % str(page) + '&mytype=03')
            page += 20
            break  # testing: only the first page

    # Match the urls that lead into the detail pages
    def parse03(self, response):
        html_data = response.text
        url_list = re.findall("""idata" href="javascript:window_open[(]'(.+?)'""", html_data)

        # queue a request for each url
        for url in url_list:
            self.UTR.next_urls(url + '&mytype=04')

    # Level-0 detail page
    def parse04(self, response):
        html_data = response.text  # page source

        # store the page source
        id = re.findall(r'<td class=[\s\S]*?ID.*?:[\s\S]*?\s*<td class=.*?>(.+?)</td>',         # match the id
                        html_data)[0]
        db_data = {'html': html_data, 'url':                                                    # build the record
            response.url, 'id': id, 'level': 0}
        # self.DTM.data_to_mongo(db_data)                                                       # store in MongoDB

        if 'P010001' in id:  # testing: only follow this one polymer
            # parse the links to the level-1 detail pages
            url_front = 'https://polymer.nims.go.jp'                                            # first half of the url
            # extract the clickable links and their titles
            url_and_title_list = re.findall('<li><a href="(.+)">(.+?)</a><br>', html_data)
            for url_and_title in url_and_title_list:
                url_last = url_and_title[0]                                                     # second half of the url
                title0 = url_and_title[1]                                                       # title
                url = url_front + url_last                                                      # join the url

                # assign a type and title according to the link title,
                # push into the Redis start_urls list, and let parse() dispatch it
                # title0 is the link title shown on the level-0 detail page
                if 'external site' in title0:
                    pass
                elif 'Summary' in title0:
                    self.UTR.next_urls(url + '&mytype=05&myid=%s&mytitle0=%s' % (id, title0))
                elif 'Polymerization' in title0:
                    self.UTR.next_urls(url + '&mytype=06&myid=%s&mytitle0=%s' % (id, title0))
                elif 'copolymers' in title0:
                    self.UTR.next_urls(url + '&mytype=07&myid=%s&mytitle0=%s' % (id, title0))
                elif 'polymer' in title0:
                    self.UTR.next_urls(url + '&mytype=08&myid=%s&mytitle0=%s' % (id, title0))
                elif 'NMR' in title0:
                    self.UTR.next_urls(url + '&mytype=09&myid=%s&mytitle0=%s' % (id, title0))
                elif 'Property' in title0:
                    self.UTR.next_urls(url + '&mytype=10&myid=%s&mytitle0=%s' % (id, title0))
                else:
                    print('unclassified title', title0, ':', response.url)

    def parse05(self, response):
        """标题 Summary of property data 页面"""

        html_data = response.text  # page source
        id = parse.unquote(re.findall('&myid=(.+?)&', response.url)[0])                 # id; parse.unquote() decodes the url
        title0 = parse.unquote(re.findall('&mytitle0=(.+)', response.url)[0])           # title
        db_data = {'html': html_data, 'url': response.url,  # build the record
                   'id': id, 'level': 1, 'title0': title0}
        # self.DTM.data_to_mongo(db_data)                                               # store in MongoDB

        # parse the next-level urls
        url_front = 'https://polymer.nims.go.jp'  # first half of the url
        url_last_list = re.findall('<td class=".+?"><a href="(.+?)\d+">(.+?)</*a>', html_data)
        for url_last in url_last_list:
            url = url_front + url_last[0]
            title1 = url_last[1]
            self.UTR.next_urls(url + '&page=1&pagesize=100&mytype=11&myid=%s&mytitle0=%s&mytitle1=%s'
                               % (id, title0, title1))

    def parse06(self, response):
        """标题Polymerization页面"""

        html_data = response.text  # 获取网页文本
        id = parse.unquote(re.findall('&myid=(.+?)&', response.url)[0])         # 获取id   parse.unquote() url解码
        title0 = parse.unquote(re.findall('&mytitle0=(.+)', response.url)[0])   # 获取title
        db_data = {'html': html_data, 'url': response.url,                      # 构建数据
                   'id': id, 'level': 1, 'title0': title0}
        # self.DTM.data_to_mongo(db_data)                                       # 存入MongoDB

    def parse11(self, response):
        """
        Pages one level below Summary of property data
        """

        html_data = response.text  # page source
        id = parse.unquote(re.findall('&myid=(.+?)&', response.url)[0])         # id; parse.unquote() decodes the url
        title0 = parse.unquote(re.findall('&mytitle0=(.+)&', response.url)[0])  # level-0 detail-page title
        title1 = parse.unquote(re.findall('&mytitle1=(.+)', response.url)[0])   # level-1 detail-page title
        db_data = {'html': html_data, 'url': response.url,  # build the record
                   'id': id, 'level': 2, 'title0': title0,
                   'title1': title1}
        self.DTM.data_to_mongo(db_data)  # store in MongoDB

        # pagination
        previous_url = response.url                                            # current url
        if 'page=1&' in previous_url:                                          # only on the first page
            total_number = int(                                                # total number of records
                re.findall('<b>Number of .+?: (\d+)</b><br>',
                           html_data)[0])
            page_all = math.ceil(total_number / 100)                           # number of pages, rounded up
            for page in range(2, page_all + 1):                                # generate the remaining pages
                next_url = re.sub('page=1', 'page=%s' % page, previous_url)    # build each next-page url
                self.UTR.next_urls(next_url)                                   # push the url into the Redis queue

        # go into the next-level pages
        url_and_id_list = re.findall(                                          # match the url and Sample ID
            '<TD class=".+?><a href="(.+?)">(.+?)</a></td>',
            html_data)
        url_front = 'https://polymer.nims.go.jp/PoLyInfo/cgi-bin/'             # first half of the url
        for url_and_id in url_and_id_list:
            url_last = url_and_id[0]
            title2 = url_and_id[1]
            next_level_url = url_front + url_last + \
                             '&mytype=12&myid=%s&mytitle0=%s&mytitle1=%s&mytitle2=%s'\
                             % (id, title0, title1, title2)
            self.UTR.next_urls(next_level_url)

    def parse12(self, response):
        """
        Pages two levels below Summary of property data
        (the final Summary of property data pages)
        """

        html_data = response.text                                                 # page source
        id = parse.unquote(re.findall('&myid=(.+?)&', response.url)[0])           # id; parse.unquote() decodes the url
        title0 = parse.unquote(re.findall('&mytitle0=(.+)&', response.url)[0])    # level-0 detail-page title
        title1 = parse.unquote(re.findall('&mytitle1=(.+)&', response.url)[0])    # level-1 detail-page title
        title2 = parse.unquote(re.findall('&mytitle2=(.+)', response.url)[0])     # level-2 detail-page title
        db_data = {'html': html_data, 'url': response.url,                        # build the record
                   'id': id, 'level': 3, 'title0': title0,
                   'title1': title1, 'title2': title2}
        self.DTM.data_to_mongo(db_data)                                           # store in MongoDB

    # Shut Scrapy down
    def parse_close(self):
        print('shutting down...')
        raise CloseSpider('CloseSpider...')
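
A note on the dispatch in parse(): it selects the handler with eval('self.parse%s(response)' % mytype). A getattr-based version does the same routing and fails more gracefully when a mytype without a handler (e.g. 07-10 above) gets queued. A minimal sketch, keeping the same '&mytype=NN' convention, as a drop-in body for parse():

    # Sketch: the same '&mytype=NN' routing done with getattr instead of eval,
    # so a missing handler is logged instead of raising inside the callback.
    def parse(self, response):
        mytype = parse.unquote(re.findall(r'&mytype=(\d{2})', response.url)[0])
        handler = getattr(self, 'parse%s' % mytype, None)
        if handler is None:
            self.logger.warning('no handler for mytype=%s: %s', mytype, response.url)
            return
        handler(response)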

HeadersPoolAndLogin.py

# -*- coding: utf-8 -*-
"""
1. Login
2. Header pool
2020.04.28 潘叶舟
"""
import re
import time
import json
import random
import requests
import threading
import scrapy.conf
from urllib.parse import quote
from fake_useragent import UserAgent
from requests.adapters import HTTPAdapter
from NIMS_PoLyInfo.spiders.InteractiveRedis import HeadersToRedis, UserInRedis


class Login():

    def __init__(self, login_url=''):

        self.username, self.password = MaintainUser().min_user()                   # account credentials
        self.HTR = HeadersToRedis()                                                # class that syncs headers with Redis
        self.proxies = {'http': '127.0.0.1:10808'}                                 # proxy
        self.user_agent = UserAgent().random                                       # random UserAgent
        self.splash_url = 'http://192.168.0.213:8050//execute?lua_source='         # Splash endpoint
        self.conn = self.HTR.conn                                                  # Redis connection pool

        # Lua script that Splash runs to log in
        self.js_code = """function main(splash)
splash:set_user_agent("%s")
splash:go("%s")
splash:wait(5)
input = splash:select("#username")
input:send_text('%s')
input = splash:select("#password")
input:send_text('%s')
submit = splash:select('#kc-login')
submit:mouse_click()
splash:wait(15)
return {
    html = splash:html(),
    har = splash:har(),
  }
end"""%(self.user_agent, login_url, self.username, self.password)

    # Fetch headers  ps: they include the Cookie
    def get_html_header(self):

        monitor = False                                     # records whether Splash returned a header; if not, the request is re-sent
        # check whether Splash is stuck
        def if_clogged():
            time.sleep(60)                                  # wait a while before checking
            if monitor:                                     # a response came back
                pass                                        # nothing to do
            else:                                           # Splash is stuck
                print('Splash stuck... resending the login request...')
                main()                                      # resend the login request

        # t_ic = threading.Thread(target=if_clogged)          # create the if_clogged() watchdog thread
        # t_ic.setDaemon(True)                                # daemon thread; if the main thread already got a header, if_clogged() never matters
        # t_ic.start()                                        # start the thread
        url = self.splash_url + quote(self.js_code)         # Splash /execute endpoint plus the Lua login script
        r = requests.get(url)                               # call Splash
        response_data = json.loads(r.text)                  # parse the JSON returned by Splash into a dict
        if 'error' in response_data:                        # if an error field shows up, resend the request
            print('status code:', response_data['error'])
            print('resending the login request...')
            main()

        # if the following text shows up, the account has been banned
        elif 'Account is disabled, contact admin.' in str(response_data):
            print(self.username, 'account has been banned...')
            MaintainUser().limit(self.username)             # update the Redis data and the local Excel file
            main()                                          # resend the login request

        else:
            # regex out the last response headers from the Splash payload
            headers_list = re.findall("'headers': \[({.+?})\]", str(response_data))
            headers_list_of_cookie = []
            for headers_dic in headers_list:
                if "'Cookie'" in str(headers_dic):
                    headers_list_of_cookie.append(eval(headers_dic))
            key_list = []
            value_list = []
            for hloci in headers_list_of_cookie[-1]:
                key_list.append(hloci['name'])
                value_list.append(hloci['value'])
            headers = dict(zip(key_list, value_list))
            headers = {'username': self.username, 'headers': headers}

            # a header was obtained, so flip monitor to True;
            # if_clogged() checks this later and resends the request if it is still False
            monitor = True
            if headers:
                HeadersToRedis().to_redis(headers)  # store the headers in redis

        # count this account usage
        MaintainUser().usage_count(self.username)

    # Get the login url  ps: the login url is dynamic; it has to be reached from a fixed page that redirects to it
    def get_login_url(self):

        url = 'https://mits.nims.go.jp/MatNaviSearch/?_e=i&w=1&m=p&tx=polyethylene&by=0&cva=on&lot=and&sy=01&_lang=en'
        headers = {}
        headers['User-Agent'] = self.user_agent

        # Attach HTTPAdapters to the Session instance to enable retries;
        # max_retries is the maximum number of retries
        s = requests.Session()
        s.mount('http://', HTTPAdapter(max_retries=10))
        s.mount('https://', HTTPAdapter(max_retries=10))

        # local proxy port
        proxies = {"http": "https://127.0.0.1:10809", "https": "https://127.0.0.1:10809"}
        # r = s.get(url, headers=headers, proxies=proxies, timeout=20)       # send the request through the proxy
        r = s.get(url, headers=headers, timeout=20)  # send the request (no proxy)
        html_data = r.text

        # match the urls
        url_list = re.findall("""idata" href="javascript:window_open[(]'(.+?)'""", html_data)
        # return one url at random
        return random.choice(url_list)

    # Maintain the header pool
    def maintain_headers(self):

        headers_key_list = self.HTR.get_headers_key()                       # this project's headers keys in Redis
        for headers_key in headers_key_list:
            redis_keys_timestamp = float(headers_key.split(":")[-1])        # timestamp embedded in the key
            now_timestamp = time.time()                                     # current timestamp
            redis_keys_timestamp = now_timestamp - redis_keys_timestamp     # age of the header
            if redis_keys_timestamp > 3600:                                 # delete headers older than this, in seconds
                self.conn.delete(headers_key)                               # delete

        if len(headers_key_list) < 20:                                      # if Redis holds fewer headers than this, add one
            main()


# Account maintenance
class MaintainUser():
    def __init__(self):
        self.UIR = UserInRedis()                                            # class that syncs accounts with Redis
        self.HTR = HeadersToRedis()                                         # class that syncs headers with Redis

    # Get the least-used account
    def min_user(self):
        user_df_all = self.UIR.get_user_data()                              # all account data from Redis
        user_df_all_jz = user_df_all.loc[user_df_all['status'] == '健在']   # accounts that are still usable ('健在' = alive)
        user_index = user_df_all_jz['weights'].idxmin()                     # index of the least-used account
        user_df = user_df_all[user_index:user_index + 1]                    # that single row
        for i, user in user_df.iterrows():                                  # iterate to extract it
            return user['username'], user['password']

    # Count account usage (used in middlewares.py)
    def usage_count(self, username):
        user_df_all = self.UIR.get_user_data()                              # all account data from Redis
        # print(user_df_all)
        user_df_one = user_df_all.loc[user_df_all['username'] == username]  # look up by username
        for user_index, user in user_df_one.iterrows():                     # get that row's index
            user_df_all.loc[user_index, 'weights'] += 1                     # weight + 1
            self.UIR.user_to_redis(user_df_all)                             # write the data back to Redis

    def limit(self, username):
        user_df_all = self.UIR.get_user_data()                               # all account data from Redis
        user_df_one = user_df_all.loc[user_df_all['username'] == username]   # look up by username
        for user_index, user in user_df_one.iterrows():                      # get that row's index
            user_df_all.loc[user_index, 'status'] = '去世'                   # mark the account as banned ('去世' = dead)
            self.UIR.user_to_redis(user_df_all)                              # write the data back to Redis
        user_df_all.to_excel('PoLyInfo登录账号.xlsx', index=None)            # also update the Excel file

    # Get the header of the least-used account
    def get_redis_header(self):
        user_all_df = self.UIR.get_user_data()                               # all account data
        headers_all_df = self.HTR.get_headers_value()                        # current headers data
        if headers_all_df.empty:                                             # no headers in redis
            return None, None, None
        headers_s = set(headers_all_df['username'])                          # accounts that currently have headers
        user_now_df = user_all_df[user_all_df['username'].isin(headers_s)]   # restrict to those accounts
        user_index = user_now_df['weights'].idxmin()                         # index of the least-used account

        user_df = user_all_df[user_index:user_index + 1]                     # that account's row
        for i, user in user_df.iterrows():                                   # extract the account
            # find all headers for this account and pick one at random
            user_df_one = headers_all_df.loc[headers_all_df['username'] == user['username']].sample()
            for i2, user2 in user_df_one.iterrows():
                header = user2['headers']                                    # the final header
                headers_key = user2['headers_key']                           # the header's redis key
                username = user2['username']
                return header, headers_key, username                         # consumed by the UserAgentmiddleware

    # Average account usage
    def average_number(self):
        user_all_df = self.UIR.get_user_data()                               # all account data
        print(user_all_df)
        user_df_all_jz = user_all_df.loc[user_all_df['status'] == '健在']    # accounts that are still usable
        return user_df_all_jz['weights'].mean()                              # mean usage across accounts


def main():

    login_url = Login().get_login_url()             # get the login url
    Login(login_url).get_html_header()              # fetch the headers


if __name__ == "__main__":
   main()
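
Each record that get_html_header() pushes via HeadersToRedis.to_redis() is a str() of {'username': ..., 'headers': {...}} stored under a key like polyInfo:headers:<timestamp>. A quick way to peek at the current pool, assuming the same Redis settings as settings.py, is:

# Sketch: inspect the header pool written above. Assumes the Redis host/port/
# password from settings.py; keys look like 'polyInfo:headers:<timestamp>'.
import redis

conn = redis.Redis(host='127.0.0.1', port=6379, password='xxxxx', decode_responses=True)
for key in conn.keys('polyInfo:headers:*'):
    record = eval(conn.get(key))          # stored via str(dict) in to_redis()
    print(key, record['username'], list(record['headers'].keys()))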

InteractiveMongoDB.py

# -*- coding: utf-8 -*-
"""
1. Helpers for talking to MongoDB
2020.04.26 潘叶舟
"""
import pymongo
import threading
from scrapy.conf import settings


# Store data in MongoDB
class DataToMongo():

    def __init__(self):
        self.MONGO_URI = settings['MONGO_URI']                         # MongoDB address
        self.MONGO_DB = settings['MONGO_DB']                           # database name
        self.MONGO_COLL = settings['MONGO_COLL']                       # collection name
        self.db = pymongo.MongoClient(self.MONGO_URI)[self.MONGO_DB]   # connect to MongoDB
        if 'MONGO_AUTHENTICATE' in settings:
            self.db.authenticate(name=settings['MONGO_AUTHENTICATE']['name'],
                                 password=settings['MONGO_AUTHENTICATE']['password'],
                                 source=settings['MONGO_AUTHENTICATE']['source'])

    # Store a record in MongoDB
    def data_to_mongo(self, data):
        def data_to_mongo2(data):
            self.db[self.MONGO_COLL].insert_one(data)                     # insert a single document
        # ps: the threaded insert below is commented out here, so nothing is actually written
        # t_mh = threading.Thread(target=data_to_mongo2, args=(data,))    # create the insert thread
        # t_mh.start()                                                    # start the thread
        print('stored in Mongo')
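
The records written by data_to_mongo() keep the raw html next to url/id/level/title* fields, so a stored page can be pulled back for offline parsing roughly like this; the collection name follows the MONGO_COLL pattern in settings.py, and the date suffix below is only an example:

# Sketch: read back a stored detail page for offline parsing. The collection
# name follows settings.py's MONGO_COLL pattern; adjust the date suffix.
import pymongo

coll = pymongo.MongoClient('mongodb://127.0.0.1:27017')['Japan']['NIMS_PoLyInfo--2020.4.28']
doc = coll.find_one({'id': 'P010001', 'level': 2})
if doc:
    print(doc['url'], doc['title0'], doc['title1'], len(doc['html']))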

InteractiveRedis.py

# -*- coding: utf-8 -*-
"""
1. Helpers for talking to Redis
2020.04.28 潘叶舟
"""
import time
import redis
import pandas as pd
from scrapy.conf import settings


# Push urls into Redis
class UrlToRedis:

    def __init__(self, conn=None):
        if conn:
            self.conn = conn
        else:
            pool = redis.ConnectionPool(host=settings['REDIS_HOST'], port=settings['REDIS_PORT'], decode_responses=True,
                                        password=settings['REDIS_PARAMS']['password'], db=0)
            self.conn = redis.Redis(connection_pool=pool)

    # push the very first url into Redis
    def start_urls(self):
        url = r'https://mits.nims.go.jp' \
              r'/MatNaviSearch/?tx=polyethylene&lot=and&cva=on&csy=on&by=0&e=i&m=g&_lang=en&mytype=00'
        self.conn.lpush(settings['NAME'] + ':start_urls', url)

    # push follow-up urls into Redis
    def next_urls(self, url):
        self.conn.lpush(settings['NAME'] + ':start_urls', url)


# Store request headers in Redis
class HeadersToRedis:

    def __init__(self):

        # create the redis connection
        pool = redis.ConnectionPool(host=settings['REDIS_HOST'],
                                    port=settings['REDIS_PORT'],
                                    decode_responses=True,
                                    password=settings['REDIS_PARAMS']['password'],
                                    db=0)
        self.conn = redis.Redis(connection_pool=pool)

    # store a header in Redis
    def to_redis(self, headers):
        self.conn.set(settings['NAME'] + ':headers:' + str(time.time()), str(headers))

    # delete an expired header
    def delete_headers(self, headers_elem):
        self.conn.delete(headers_elem)
        print('deleting expired header...', headers_elem)

    def get_headers_key(self):
        return self.conn.keys(settings['NAME'] + ':headers:' + '*')

    # Get all header records from Redis
    def get_headers_value(self):
        headers_value_list = []
        headers_key_list = self.conn.keys(settings['NAME'] + ':headers:' + '*')
        for headers_key in headers_key_list:
            headers_value = self.conn.get(headers_key)
            headers_value = eval(headers_value)
            headers_value['headers_key'] = headers_key
            headers_value_list.append(headers_value)
        df = pd.DataFrame(headers_value_list)
        return df


# Account-related helpers
class UserInRedis:

    def __init__(self):
        # create the redis connection pool
        pool = redis.ConnectionPool(host=settings['REDIS_HOST'], port=settings['REDIS_PORT'], decode_responses=True, password=settings['REDIS_PARAMS']['password'], db=0)
        self.conn = redis.Redis(connection_pool=pool)

    # Fetch the account data
    def get_user_data(self):

        # reading raw bytes from redis cannot go through the decode_responses pool (it raises an encoding error), so open a separate connection
        rs = redis.StrictRedis(host=settings['REDIS_HOST'], password=settings['REDIS_PARAMS']['password'])
        df_bytes = rs.get(settings['NAME'] + ':users')
        df = pd.read_msgpack(df_bytes)      # bytes -> DataFrame
        rs.close()
        return df

    # Store the account credentials in redis
    def user_to_redis(self, df=None):
        """If a DataFrame is passed in, store it; otherwise store the contents of 'PoLyInfo登录账号.xlsx'."""

        if df is None:
            df = pd.read_excel('PoLyInfo登录账号.xlsx')
        df_bytes = df.to_msgpack()
        self.conn.set(settings['NAME'] + ':users', df_bytes)
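
Note that to_msgpack()/read_msgpack() only exist in pandas versions before 1.0; they were removed later. On a newer pandas the same DataFrame round trip through Redis needs a different serializer. A pickle-based stand-in using the same 'polyInfo:users' key might look like the sketch below; this is an adaptation, not part of the original project:

# Sketch: DataFrame <-> Redis round trip without the removed msgpack API,
# using pickle instead. Same 'polyInfo:users' key as UserInRedis.
import pickle

import pandas as pd
import redis

conn = redis.Redis(host='127.0.0.1', port=6379, password='xxxxx')   # raw bytes, no decode_responses

def users_to_redis(df: pd.DataFrame) -> None:
    conn.set('polyInfo:users', pickle.dumps(df))

def users_from_redis() -> pd.DataFrame:
    return pickle.loads(conn.get('polyInfo:users'))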

middlewares.py

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

import redis
import threading
from scrapy.conf import settings
from scrapy import signals
from fake_useragent import UserAgent
from scrapy.downloadermiddlewares.retry import RetryMiddleware
from NIMS_PoLyInfo.spiders.HeadersPoolAndLogin import Login, MaintainUser
from NIMS_PoLyInfo.spiders.InteractiveRedis import HeadersToRedis, UrlToRedis
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


# random UserAgent
ua = UserAgent()

# connection pool
pool = redis.ConnectionPool(host=settings['REDIS_HOST'], port=settings['REDIS_PORT'], decode_responses=True, password=settings['REDIS_PARAMS']['password'], db=0)
conn = redis.Redis(connection_pool=pool)
UTR = UrlToRedis(conn)


# Handle the 302 redirect back to the login page
class LoginAgain(UserAgentMiddleware):
    """
    Logic for the login-redirect url:
        the site decides the headers have expired,
        302-redirects to the login page,
        this LoginAgain middleware catches it,
        rewrites the url back to the pre-redirect url and re-sends that request.
        ps: this cannot be handled inside the spider; probably due to how the site is set up,
        it ends up bouncing between different login pages forever...
    """

    def process_request(self, request, spider):
        print('re-login middleware', request.url)
        meta = request.meta
        if 'login-matnavi' in request.url:
            headers_elem = meta['headers_elem']               # RedisKey of the header to delete
            url = meta['redirect_urls'][-1]                   # url before the redirect
            HeadersToRedis().delete_headers(headers_elem)     # delete the expired header
            UTR.next_urls(url)                                # re-queue the original url
            print('resending', meta)
            request._set_url(url)                             # rewrite the url on this request


class UserAgentmiddleware(UserAgentMiddleware):

    # random headers
    def process_request(self, request, spider):

        print('random-header middleware:', request.url)
        headers, headers_elem, username = MaintainUser().get_redis_header()
        if headers:
            for headers_i in headers:                                           # copy every header field onto the request
                request.headers[headers_i] = headers[headers_i]
            request.headers["User-Agent"] = ua.random                           # random UserAgent
            request.meta['headers_elem'] = headers_elem                         # pass the Redis key via meta; deleted when the login expires
            MaintainUser().usage_count(username)                                # count this account usage
        else:                                                                   # no headers in Redis
            print('header pool being refilled (middlewares)...')
            # ps: no special error handling here;
            # urls that need no headers can still be fetched,
            # urls that do need headers are re-sent by the LoginAgain middleware after the redirect

            # add headers to Redis
            t_mh = threading.Thread(target=Login().maintain_headers)            # refill the header pool
            t_mh.start()                                                        # start the thread


# Handle failed URLs
class GetFailedUrl(RetryMiddleware):

    def process_exception(self, request, exception, spider):

        # retry the request on the following errors
        if 'TCP connection timed out' in str(exception):
            return request
        elif 'twisted.python.failure.Failure twisted.internet.error' in str(exception):
            return request
        elif 'User timeout caused connection failure' in str(exception):
            return request


# proxy middleware
class ProxyDownloadMiddleware(object):

    def process_request(self, request, spider):
        request.meta['proxy'] = settings['PROXY']


class NimsPolyinfoSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class NimsPolyinfoDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
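
Before starting a long run it can be worth checking that a pooled header (Cookie included) is still accepted by the site. A minimal check that reuses MaintainUser.get_redis_header() and the search URL from get_login_url() could look like this; only the HTTP status is inspected, and what exactly marks a live session is an assumption:

# Sketch: sanity-check one pooled header before crawling. A redirect (3xx)
# towards the login page would suggest the cookie has expired.
import requests
from NIMS_PoLyInfo.spiders.HeadersPoolAndLogin import MaintainUser

header, headers_key, username = MaintainUser().get_redis_header()
if header:
    url = ('https://mits.nims.go.jp/MatNaviSearch/'
           '?_e=i&w=1&m=p&tx=polyethylene&by=0&cva=on&lot=and&sy=01&_lang=en')
    r = requests.get(url, headers=header, timeout=20, allow_redirects=False)
    print(username, headers_key, r.status_code)
else:
    print('no headers in the pool yet')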

settings.py

# -*- coding: utf-8 -*-
from scrapy_redis.queue import LifoQueue, PriorityQueue
import os
import datetime
date = datetime.datetime.now()

NAME = 'polyInfo'
BOT_NAME = 'NIMS_PoLyInfo'

SPIDER_MODULES = ['NIMS_PoLyInfo.spiders']
NEWSPIDER_MODULE = 'NIMS_PoLyInfo.spiders'

# LIFO queue: depth-first url order
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'

# log level
# LOG_LEVEL = 'WARNING'

# do not obey robots.txt
ROBOTSTXT_OBEY = False

# disable Scrapy's own cookie middleware  ps: if this stays on, the custom Cookie headers are ignored
COOKIES_ENABLED = False

# concurrency
CONCURRENT_REQUESTS = 20

# download timeout, in seconds
DOWNLOAD_TIMEOUT = 600

# do not treat truncated responses as errors
DOWNLOAD_FAIL_ON_DATALOSS = False

# use scrapy-redis's dedup filter instead of Scrapy's default
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# use scrapy-redis's scheduler instead of Scrapy's default
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# allow pausing; the request records in redis are kept
SCHEDULER_PERSIST = True

# proxy address
PROXY = 'http://127.0.0.1:10809'

REDIS_HOST = '127.0.0.1'
# REDIS_HOST = '192.168.0.xxx'
REDIS_PORT = 6379
REDIS_PARAMS = {'password': 'xxxxx'}

# MONGO_URI = 'mongodb://192.168.0.100:27017'
MONGO_URI = 'mongodb://127.0.0.1:27017'
MONGO_DB = "Japan"
MONGO_COLL = "NIMS_PoLyInfo--{}.{}.{}".format(date.year, date.month, date.day)
# MONGO_AUTHENTICATE = {'name': 'root', 'password': 'xxxxx', 'source': 'admin'}


DOWNLOADER_MIDDLEWARES = {
    'NIMS_PoLyInfo.middlewares.UserAgentmiddleware': 600,            # random-headers middleware
    'NIMS_PoLyInfo.middlewares.GetFailedUrl': 544,                   # failed-url handling
    # 'NIMS_PoLyInfo.middlewares.ProxyDownloadMiddleware': 510,      # proxy
    'NIMS_PoLyInfo.middlewares.LoginAgain': 530,                     # re-login
}

# ITEM_PIPELINES = {
#    'NIMS_PoLyInfo.pipelines.NimsPolyinfoPipeline': 300,
# }

# path for saving page source
SOURCE = r'D:\CrawledData\Japan\NIMS_PoLyInfo'
# create the path
os.makedirs(SOURCE, exist_ok=True)
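
With the settings above in place, the master seeds the polyInfo:start_urls list (the spider also offers to do this interactively at start-up) and every node then runs scrapy crawl polyInfo. Seeding by hand, equivalent to UrlToRedis.start_urls(), looks like:

# Sketch: seed the scrapy-redis start_urls list by hand, equivalent to
# UrlToRedis.start_urls(); afterwards each node runs `scrapy crawl polyInfo`.
import redis

conn = redis.Redis(host='127.0.0.1', port=6379, password='xxxxx', decode_responses=True)
seed = ('https://mits.nims.go.jp/MatNaviSearch/'
        '?tx=polyethylene&lot=and&cva=on&csy=on&by=0&e=i&m=g&_lang=en&mytype=00')
conn.lpush('polyInfo:start_urls', seed)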
