python3如何获取在线更新网站解锁？

最新推荐文章于 2023-01-04 11:56:12 发布

q56731523

最新推荐文章于 2023-01-04 11:56:12 发布

阅读量265

点赞数

文章标签： python linux 开发语言 数据抓取

本文链接：https://blog.csdn.net/weixin_44617651/article/details/127627039

版权

利用 Python 3 获取在线实时更新的“网站解锁”（即代理 IP）列表。下面的代码会从列表页面抓取 IP 和端口，并随机选用其中一个代理发起请求，大家不妨尝试一下！

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import re
import sys
import time
import datetime
import threading
from random import choice
import requests
import bs4
 
 
class Proxy:
    """Scrape a proxy listing page and issue requests through the proxies found.

    Attributes:
        url: Address of the page that lists the proxies.
        header: Headers sent when fetching ``url``.
        user_agent: Pool of User-Agent strings; ``done`` picks one at random
            for every request. A single non-empty string is used as-is.
        timeout: Seconds to wait for any HTTP request before giving up.
    """

    # These patterns mirror the markup of the listing page (inspect it with
    # the browser's developer tools if the page layout changes).
    # IP cell, e.g. <td>208.135.217.21</td>
    _IP_CELL = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')
    # Port cell, e.g. <td>808</td>
    _PORT_CELL = re.compile(r'<td>(\d+)</td>')

    def __init__(self, url='http://jshk.com.cn/nn', header='', user_agent='',
                 timeout=10):
        self.url = url
        self.header = header
        self.user_agent = user_agent
        self.timeout = timeout

    def getIpList(self):
        """Fetch the listing page and return its proxies as ``"ip:port"`` strings.

        Only the first ``<table>`` of the page is inspected. IPs and ports are
        collected separately and paired in document order, so the result is
        only meaningful when every row carries exactly one IP cell and one
        all-digit cell.

        Returns:
            A list of ``"ip:port"`` strings; empty when the page has no table.
        """
        # The timeout keeps a stalled server from blocking the caller forever.
        r = requests.get(self.url, headers=self.header, timeout=self.timeout)
        soup = bs4.BeautifulSoup(r.text, 'html.parser')
        if soup.table is None:
            # Error page or changed layout: nothing to extract.
            return []
        cells = str(soup.table.find_all("td"))
        addresses = self._IP_CELL.findall(cells)
        ports = self._PORT_CELL.findall(cells)
        # Combine IP and port, e.g. 125.135.217.7:808
        return [":".join(pair) for pair in zip(addresses, ports)]

    def _pick_user_agent(self):
        """Return one User-Agent from ``self.user_agent``, or None if it is empty."""
        agents = self.user_agent
        if not agents:
            return None
        if isinstance(agents, str):
            # choice() on a string would return a single character.
            return agents
        return choice(agents)

    def done(self, code=0, ips=None, link=''):
        """Request ``link`` through randomly chosen proxies until one connects.

        A proxy that fails to connect (or times out) is removed from ``ips``
        and another one is tried. ``ips`` is modified in place, so callers
        sharing one list between threads see the dead proxies disappear.

        Args:
            code: Sequence number of this call; only used in the log line.
            ips: Mutable list of ``"ip:port"`` strings to choose from.
            link: URL to request. The default empty string is a placeholder
                that ``requests`` rejects with an exception not handled here,
                so pass a real URL.

        Returns:
            ``False`` when no proxy could be used (the pool is or becomes
            empty); ``None`` after a request went through.
        """
        if ips is None:
            ips = []

        # Loop instead of recursing so a long run of dead proxies cannot
        # exhaust the recursion limit.
        while True:
            try:
                # Pick a random proxy.
                ip = choice(ips)
            except IndexError:
                # Nothing left to try.
                return False

            # NOTE: only the "http" scheme is mapped, so requests treats
            # https:// targets as having no proxy configured.
            proxies = {
                "http": ip,
            }
            headers_ = {
                "Accept": "*/*",
                "Accept-Encoding": "gzip, deflate, sdch",
                "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
                "Referer": "https://best.zhaopin.com/",
            }
            agent = self._pick_user_agent()
            if agent:
                headers_["User-Agent"] = agent

            try:
                # NOTE(security): verify=False disables TLS certificate checks.
                requests.get(link, headers=headers_, proxies=proxies,
                             verify=False, timeout=self.timeout)
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.Timeout):
                print("Connection Error")
                if not ips:
                    # Another thread emptied the pool in the meantime.
                    # Returning (rather than sys.exit()) has the same effect
                    # in a worker thread and does not kill a main-thread caller.
                    print("not ip")
                    return False
                # Drop the unusable proxy; a concurrent caller may have
                # removed it already.
                try:
                    ips.remove(ip)
                except ValueError:
                    pass
                continue

            date = datetime.datetime.now().strftime('%H:%M:%S')
            print(u"第%s次 [%s] [%s]： (剩余可用网站解锁数：%s)" % (code, date, ip, len(ips)))
            return
 
 
if __name__ == '__main__':
    # Page that lists the proxies to scrape.
    url = 'http://jshk.com.cn/nn'

    # Headers used when fetching the listing page itself.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
        "Referer": "http://jshk.com.cn",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36",
    }

    # Pool of User-Agent strings; one is picked at random for each request
    # sent through a proxy.
    user_agent = [
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0; Baiduspider-ads) Gecko/17.0 Firefox/17.0",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9b4) Gecko/2008030317 Firefox/3.0b4",
        "Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; BIDUBrowser 7.6)",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; Touch; LCJB; rv:11.0) like Gecko",
    ]

    proxy = Proxy(url, header=headers, user_agent=user_agent)

    # Shared, mutable pool of "ip:port" strings handed to every worker.
    ips = []
    # Python 3 merged xrange() and range() into a single range().
    for attempt in range(500):
        # Top up the pool on every 1000th iteration. With only 500 iterations
        # this fires once, at attempt 0.
        if not attempt % 1000:
            ips += proxy.getIpList()
        # Launch one worker thread per iteration, two seconds apart.
        worker = threading.Thread(target=proxy.done, args=(attempt, ips))
        worker.start()
        # time.sleep() takes its argument in seconds (fractions allowed).
        time.sleep(2)