Python urllib proxy crawling (scheduled, incremental)

Written: 2018/4/3 15:15

Note: this program uses the proxies it scrapes to continue crawling through them, saves the results to a text file, refreshes every three hours, and crawls incrementally.

Environment: Python 3.6.4, Windows 10, third-party library (lxml)

1. time1.py: the scheduler

# coding:utf-8
__author__ = 'wzq'
from threading import Timer
from poolproxy import PoolProxy


def func():
    f = PoolProxy()
    f.run()


def tt(n):
    a = Timer(n, func)  # schedule func to run after n seconds
    a.start()
    a.join()  # block until the timer fires and func finishes


tt(0.0)  # run immediately
while 1:
    tt(10800.0)  # run again three hours later
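
The Timer(n, func) call followed by join() simply blocks for n seconds and then runs func, so the loop above performs one crawl immediately and then another roughly every three hours (plus however long each crawl takes). A minimal equivalent sketch using a plain sleep loop, if you prefer that form:

# coding:utf-8
import time
from poolproxy import PoolProxy

while True:
    PoolProxy().run()    # one full crawl/refresh pass
    time.sleep(10800.0)  # wait three hours before the next incremental pass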

2. poolproxy.py: the main program

#!/D:/python/Anaconda/python.exe
# coding:utf-8
__author__ = 'wzq'

from useragents import USER_AGENT
import random
import time
from urllib import request
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
import urllib
import json
import copy
import re


def first_check(func):
    """开始验证文件中已存在的ips"""

    def wrapper(self, *args, **kwargs):
        with open('poolproxy.txt', 'r', encoding='utf-8') as f:
            line = f.readline().strip('\n')
            if line:
                line0 = json.loads(line)
                lines_list = f.readlines()
                pool = ThreadPoolExecutor(max_workers=100)
                for x in lines_list:
                    x = x.strip('\n')
                    pool.submit(self.check_ip, x, spider_name='first')  # spider_name is arbitrary here; 'first' needs no special handling
                pool.shutdown()
                with open('poolproxy.txt', 'w', encoding='utf-8') as wf:
                    wf.write(json.dumps(line0))
                    wf.write('\n')
                for line in list(self.ips_useful):
                    with open('poolproxy.txt', 'a', encoding='utf-8') as af:
                        af.write(line + '\n')
            else:
                pass
        return func(self, *args, **kwargs)

    return wrapper


class PoolProxy(object):
    def __init__(self):
        self.ips_useful = set()  # final set of validated proxies
        self.latest_time = {}  # newest valid-IP timestamp per site, as read from the file
        self.latest_useful = {}  # newest valid-IP timestamp per site found during this run
        self.proxy_list = []  # list of working http proxies
        self.proxy = None  # proxy currently used to crawl the proxy sites
        self.file_list = []  # existing lines of the file

    @first_check
    def read_file(self):
        """获取文本内容"""
        with open('poolproxy.txt', 'r', encoding='utf-8') as f:
            first_line = f.readline().strip('\n')
            if first_line:  # file already has data: deep-copy the stored timestamps
                line = json.loads(first_line)
                self.latest_useful = line
                self.latest_time = copy.deepcopy(line)
                lines = f.readlines()
                for x in lines:
                    self.file_list.append(x.strip('\n'))
            else:  # file is empty: seed the initial timestamps
                t = int(time.mktime(time.strptime('2018-04-03 00:00:00', "%Y-%m-%d %H:%M:%S")))
                self.latest_useful = {'xici': t, 'kuaidaili': t, 'ihuan': t, 'ip3366': t}
                self.latest_time = copy.deepcopy(self.latest_useful)

    def get_response(self, start_url):
        """使用代理,根据url获取html内容"""
        headers = {'User-Agent': random.choice(USER_AGENT)}
        req = request.Request(url=start_url, headers=headers)
        while 1:
            if self.proxy:
                opener = urllib.request.build_opener(urllib.request.ProxyHandler(self.proxy))  # route the request through the current proxy
                try:
                    resp = opener.open(req, timeout=5)
                    html = etree.HTML(str(resp.read(), encoding='utf-8')) if 'ihuan' in start_url else etree.HTML(
                        resp.read())
                    return html
                except Exception as e:
                    if self.proxy_list:
                        self.proxy_list.pop()
                        self.proxy = self.proxy_list[-1] if self.proxy_list else None
                        time.sleep(1)
            else:
                try:
                    req.host = req.origin_req_host  # reset the request host after a failed proxy attempt
                    resp = request.urlopen(req, timeout=5)
                    html = etree.HTML(str(resp.read(), encoding='utf-8')) if 'ihuan' in start_url else etree.HTML(
                        resp.read())
                    return html
                except Exception as e:
                    print('ERROR', start_url, e)
                    time.sleep(1)

    def get_ips_xici(self):
        """根据html提取ips"""
        start_url = 'http://www.xicidaili.com/nn/1'
        while 1:
            ips_list = []
            html = self.get_response(start_url)
            print(start_url, 'current proxy:', self.proxy)
            # extract the fields
            ips = html.xpath('//*[@id="ip_list"]/tr/td[2]/text()')
            ports = html.xpath('//*[@id="ip_list"]/tr/td[3]/text()')
            styles = html.xpath('//*[@id="ip_list"]/tr/td[6]/text()')
            ips_time = html.xpath('//*[@id="ip_list"]/tr/td[10]/text()')
            next_url = 'http://www.xicidaili.com' + html.xpath('//*[@id="body"]/div[2]/a[last()]/@href')[0]
            for ip, port, style, ip_time in zip(ips, ports, styles, ips_time):
                ip_time = str(int(time.mktime(time.strptime('20' + ip_time + ':00', "%Y-%m-%d %H:%M:%S"))))
                item = ','.join([ip, port, style.lower(), ip_time, '0', '0'])
                ips_list.append(item)
            sign = self.get_item(ips_list, spider_name='xici')  # incremental check: True once already-seen records are reached
            if sign:
                break
            time.sleep(7)
            start_url = next_url

    def get_ips_ip3366(self):
        """根据html提取ips"""
        start_url = 'http://www.ip3366.net/?stype=1&page=1'
        page = 1
        while 1:
            page += 1
            ips_list = []
            html = self.get_response(start_url)
            print(start_url, 'current proxy:', self.proxy)
            # extract the fields
            ips = html.xpath('//*[@id="list"]/table/tbody/tr/td[1]/text()')
            ports = html.xpath('//*[@id="list"]/table/tbody/tr/td[2]/text()')
            styles = html.xpath('//*[@id="list"]/table/tbody/tr/td[4]/text()')
            ips_time = html.xpath('//*[@id="list"]/table/tbody/tr/td[8]/text()')
            next_url = 'http://www.ip3366.net/?stype=1&page={}'.format(page)
            for ip, port, style, ip_time in zip(ips, ports, styles, ips_time):
                ip_time = re.sub(r'[/]+', r'-', ip_time)
                ip_time = str(int(time.mktime(time.strptime(ip_time, "%Y-%m-%d %H:%M:%S"))))
                item = ','.join([ip, port, style.lower(), ip_time, '0', '0'])
                ips_list.append(item)
            sign = self.get_item(ips_list, spider_name='ip3366')  # incremental check: True once already-seen records are reached
            if sign:
                break
            time.sleep(7)
            start_url = next_url

    def get_ips_kuaidaili(self):
        """根据html提取ips"""
        start_url = 'https://www.kuaidaili.com/free/inha/1'
        page = 1
        while 1:
            page += 1
            ips_list = []
            html = self.get_response(start_url)
            print(start_url, 'current proxy:', self.proxy)
            # extract the fields
            ips = html.xpath('//*[@id="list"]/table/tbody/tr/td[1]/text()')
            ports = html.xpath('//*[@id="list"]/table/tbody/tr/td[2]/text()')
            styles = html.xpath('//*[@id="list"]/table/tbody/tr/td[4]/text()')
            ips_time = html.xpath('//*[@id="list"]/table/tbody/tr/td[7]/text()')
            next_url = 'https://www.kuaidaili.com/free/inha/' + str(page)
            for ip, port, style, ip_time in zip(ips, ports, styles, ips_time):
                ip_time = str(int(time.mktime(time.strptime(ip_time, "%Y-%m-%d %H:%M:%S"))))
                item = ','.join([ip, port, style.lower(), ip_time, '0', '0'])
                ips_list.append(item)
            sign = self.get_item(ips_list, spider_name='kuaidaili')  # incremental check: True once already-seen records are reached
            if sign:
                break
            time.sleep(7)
            start_url = next_url

    def get_ips_ihuan(self):
        """根据html提取ips"""
        start_url = 'https://ip.ihuan.me/?page=1&anonymity=2'  # 注意此网站首页自动刷新
        k = 0
        while 1:
            ips_list = []
            html = self.get_response(start_url)
            print(start_url, 'current proxy:', self.proxy)
            # extract the fields
            ips = html.xpath('//*[@class="table table-hover table-bordered"]/tbody/tr/td[1]/a/text()')
            ports = html.xpath('//*[@class="table table-hover table-bordered"]/tbody/tr/td[2]/text()')
            styles = html.xpath('//*[@class="table table-hover table-bordered"]/tbody/tr/td[5]/text()')
            ips_time = html.xpath('//*[@class="table table-hover table-bordered"]/tbody/tr/td[8]/text()')
            t = time.time()
            countrys = html.xpath('//*[@class="table table-hover table-bordered"]/tbody/tr/td[3]/a[1]/text()')
            if ips and ports and styles and ips_time and countrys:
                for ip, port, style, country in zip(ips, ports, styles, countrys):
                    if '中国' in country:  # keep only high-anonymity proxies located in China
                        style = 'https' if '支持' in style else 'http'
                        item = ','.join([str(ip), str(port), style.lower(), str(int(t)), '0', '0'])
                        ips_list.append(item)
            sign = self.get_item(ips_list, spider_name='ihuan')  # incremental check (return value not used for ihuan)
            k += 1
            if k > 100:  # re-crawl the front page for a limited number of rounds, then stop
                break
            time.sleep(5)

    def get_item(self, ips, spider_name):
        """多线程验证ips"""
        latest_time = self.latest_time[spider_name]  # newest timestamp already stored for this site
        pool = ThreadPoolExecutor(max_workers=100)
        for line in ips:
            line_list = line.rstrip().split(',')
            ip, port, style = line_list[0], line_list[1], line_list[2]
            ip_time, connect, success = str(int(line_list[3])), str(int(line_list[4])), str(int(line_list[5]))

            if int(ip_time) <= int(latest_time):
                return True  # reached records already handled in a previous pass: stop paging
            item = ','.join([ip, port, style, ip_time, connect, success])
            pool.submit(self.check_ip, item, spider_name)
        pool.shutdown()
        return False

    def check_ip(self, item, spider_name):
        """验证单个ip代理"""
        line_list = item.rstrip().split(',')
        ip, port, style = line_list[0], line_list[1], line_list[2]
        ip_time, connect, success = int(line_list[3]), int(line_list[4]), int(line_list[5])

        headers = {'User-Agent': random.choice(USER_AGENT)}
        proxy = {}
        if style == 'http':
            proxy = {'http': 'http://{0}:{1}'.format(ip, port)}
            url = 'http://httpbin.org/get?show_env=1'
        if style == 'https':
            proxy = {'https': 'https://{0}:{1}'.format(ip, port)}
            url = 'https://httpbin.org/get?show_env=1'
        req = request.Request(url, headers=headers)
        opener = urllib.request.build_opener(urllib.request.ProxyHandler(proxy))
        try:
            resp = opener.open(req, timeout=5)
            if resp.status == 200:
                html = str(resp.read(), encoding='utf8')
                if ip in html:
                    connect += 1
                    success += 1
                    items = ','.join([ip, port, style, str(ip_time), str(connect), str(success)])
                    self.ips_useful.add(items)

                    print(items)
                    if spider_name in self.latest_useful and ip_time > self.latest_useful[spider_name]:
                        self.latest_useful[spider_name] = ip_time  # track the newest valid timestamp per site

                    if style == 'http':
                        self.proxy_list.append(proxy)
                        self.proxy = self.proxy_list[-1]
            else:
                print(resp.status)
        except Exception as e:
            connect += 1

        # time_local = time.localtime(ip_time)  # convert to local time
        # dt = time.strftime("%Y-%m-%d %H:%M:%S", time_local)  # format as e.g. 2018-05-05 20:28:54
        # itemss = ','.join([ip, port, style, str(ip_time), str(connect), str(success)])
        # print(dt, itemss)

    def write_text(self):
        """最后将有效代理写入文件"""
        with open('poolproxy.txt', 'w', encoding='utf-8') as f:
            f.write(json.dumps(self.latest_useful))
            f.write('\n')
        for line in list(set(self.file_list) | self.ips_useful):
            with open('poolproxy.txt', 'a', encoding='utf-8') as af:
                af.write(line + '\n')

    def run(self):
        """主逻辑"""
        self.read_file()
        self.get_ips_xici()
        self.write_text()

        self.get_ips_ip3366()
        self.write_text()

        self.get_ips_kuaidaili()
        self.write_text()

        self.get_ips_ihuan()
        self.write_text()


if __name__ == '__main__':
    tt = PoolProxy()
    tt.run()

3. useragents.py

#!/D:/java/Anaconda/python.exe
# -*-coding:utf-8 -*-
__author__ = 'wzq'

USER_AGENT = [
    # IE
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; WOW64; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; ARM; Trident/6.0)',
    # chrome
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2',
    'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.2 (KHTML, like Gecko) Ubuntu/10.04 Chromium/15.0.874.106 Chrome/15.0.874.106 Safari/535.2',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.106 Safari/535.2',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7',

]

Create an empty poolproxy.txt yourself (all four files in the same directory).
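
After a run, poolproxy.txt holds one JSON line with the newest valid timestamp per site, followed by one comma-separated record per validated proxy (ip,port,style,timestamp,connect,success), as produced by write_text. An illustrative example (the values below are made up):

{"xici": 1522684800, "kuaidaili": 1522684800, "ihuan": 1522684800, "ip3366": 1522684800}
117.35.57.246,8118,http,1522737000,1,1
121.31.149.83,8123,https,1522736400,1,1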

Tested: it runs fine on an Ubuntu server (open the outbound security policy first).
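
Since the point is to reuse these proxies for further crawling, a downstream script can read poolproxy.txt, skip the JSON header line, and feed any record into urllib's ProxyHandler. A minimal sketch (the target URL here is just a placeholder):

# coding:utf-8
import json
import random
from urllib import request

with open('poolproxy.txt', 'r', encoding='utf-8') as f:
    timestamps = json.loads(f.readline())  # first line: per-site timestamps
    records = [line.strip().split(',') for line in f if line.strip()]

ip, port, style = random.choice(records)[:3]  # pick one validated proxy at random
proxy = {style: '{}://{}:{}'.format(style, ip, port)}
opener = request.build_opener(request.ProxyHandler(proxy))
resp = opener.open('http://httpbin.org/ip', timeout=5)  # placeholder target URL
print(resp.status, resp.read()[:200])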

# run the program in the background
# nohup python -u time1.py >poolproxy.log 2>&1 &
# follow the log in real time
# tail -f poolproxy.log

~/py36_projects/poolproxy » ls                                                                                                                     

__init__.py  poolproxy.log  poolproxy.py  poolproxy.txt  __pycache__  time1.py  useragents.py

