Python基础爬虫-爬取代理池

 基础爬虫,爬取代理池并进行测试


import requests
import re
from requests import adapters

# Browser-like request headers; ipyc() sends these when fetching the
# proxy-list pages.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/79.0.3945.88 Safari/537.36'
    ),
}
def ip_66(data_text1):
    """Extract ``(ip, port)`` pairs from adjacent HTML table cells.

    Scans *data_text1* for ``<td>ip</td><td>port</td>`` cell pairs (ipyc()
    passes the body of the www.66ip.cn page). The result is also stored in
    the module-level ``IPS_66`` global.

    Args:
        data_text1: HTML text to scan.

    Returns:
        A list of ``(ip, port)`` string tuples in document order; empty when
        nothing matches.
    """
    global IPS_66
    # Raw string with escaped dots: an unescaped "." matches any character,
    # which let non-address text such as "1a2b3c4" pass as an IP.
    pattern = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td><td>(\d+)</td>', re.S)
    IPS_66 = pattern.findall(data_text1)
    return IPS_66
def port_p(data_text):
    """Extract the port numbers from ``<td data-title="PORT">`` cells.

    The result is also stored in the module-level ``DKS`` global, which
    pinjie() reads.

    Args:
        data_text: HTML text to scan.

    Returns:
        A list of port strings in document order; empty when nothing matches.
    """
    global DKS
    # Raw string so "\d" reaches the regex engine without relying on Python's
    # handling of an invalid string escape.
    pattern = re.compile(r'<td data-title="PORT">(\d+)</td>', re.S)
    DKS = pattern.findall(data_text)
    return DKS
def ip_ip(data_text):
    """Extract the dotted IPv4 addresses from ``<td data-title="IP">`` cells.

    The result is also stored in the module-level ``IPS`` global, which
    pinjie() reads.

    Args:
        data_text: HTML text to scan.

    Returns:
        A list of address strings in document order; empty when nothing
        matches.
    """
    global IPS
    # Raw string so the "\d" and "\." escapes reach the regex engine without
    # relying on Python's handling of invalid string escapes.
    pattern = re.compile(r'<td data-title="IP">(\d+\.\d+\.\d+\.\d+)</td>', re.S)
    IPS = pattern.findall(data_text)
    return IPS
def pinjie():
    """Join the scraped IPs and ports into ``ip:port`` strings and save them.

    Reads the module-level ``IPS`` and ``DKS`` lists (set by ip_ip() and
    port_p(), which must have run first), pairs them positionally, passes
    the result to dl_write() and returns it.

    Returns:
        A list of ``"ip:port"`` strings, e.g. ``['1.2.3.4:8080']``.
    """
    # zip() stops at the shorter list, so a page where the IP and PORT cell
    # counts differ no longer raises IndexError.
    ip_z = [f"{ip}:{port}" for ip, port in zip(IPS, DKS)]
    dl_write(ip_z)
    return ip_z
# def merge_list(list1,list2):
#      Python has a "zipper" function, zip(list1, list2), that can do this merge.
#     print(list(zip(list1,list2)))
def dl_write(ip_z):
    """Save the candidate proxies to ``ipcs.txt`` and start testing them.

    Each entry of *ip_z* is written on its own line (the file is
    overwritten), then test_corr() is called to check the saved proxies.

    Args:
        ip_z: Iterable of ``"ip:port"`` strings.
    """
    with open('ipcs.txt', 'w', encoding='utf-8') as out:
        out.writelines(entry + '\n' for entry in ip_z)
    test_corr()
def test_corr():
    """Check every proxy listed in ``ipcs.txt`` and keep the ones that work.

    Each non-blank line of ``ipcs.txt`` is used as an HTTP proxy for a GET
    request to ``http://icanhazip.com/`` with a 2 second timeout. A 200
    response counts as working. The working entries are passed to
    text_qualified() and returned.

    Returns:
        A list of working ``"ip:port"`` entries, each with a trailing
        newline, because text_qualified() writes them to the file verbatim.

    Raises:
        OSError: If ``ipcs.txt`` cannot be opened.
    """
    # Set the retry count once; it does not change between iterations.
    # NOTE(review): HTTPAdapter appears to bind this default when the class is
    # defined, so assigning it here may have no effect - confirm.
    adapters.DEFAULT_RETRIES = 3
    corr = []
    with open('ipcs.txt', 'r', encoding='utf-8') as f:
        for line in f:
            # Lines keep their trailing "\n"; strip it so it does not end up
            # inside the proxy URL or the printed messages.
            address = line.strip()
            if not address:
                continue
            proxy = f"http://{address}"
            try:
                res = requests.get(url="http://icanhazip.com/", timeout=2, proxies={"http": proxy})
            except (requests.RequestException, ValueError):
                # Dead or malformed proxy: report it and move on. ValueError
                # is caught in case URL-parsing errors surface unwrapped.
                print("错误")
                continue
            if res.status_code == 200:
                print("代理IP:" + address + "有效")
                corr.append(address + "\n")
            else:
                print("代理IP:" + address + "无效")
    text_qualified(corr)
    return corr
def text_qualified(corr):
    """Write the working proxy entries to ``ip.txt``, one per line.

    The file is overwritten. An entry that already ends with a newline is
    written unchanged; one that does not gets a newline appended so that
    entries are never fused onto a single line.

    Args:
        corr: Iterable of ``"ip:port"`` strings, with or without a trailing
            newline.
    """
    with open('ip.txt', 'w', encoding='utf-8') as f:
        for entry in corr:
            f.write(entry if entry.endswith('\n') else entry + '\n')
def ipyc():
    """Scrape fresh proxies, trying each known-good proxy from ``ip.txt``.

    For each entry in ``ip.txt`` the two proxy-list pages are requested with
    that entry configured as the HTTP proxy. On the first entry for which the
    kuaidaili page answers with status 200, the page is parsed with port_p()
    and ip_ip(), pinjie() joins and saves the result, ip_66() parses the
    66ip page, and the loop stops. Entries whose requests fail are skipped.

    Raises:
        OSError: If ``ip.txt`` cannot be opened.
    """
    # requests waits indefinitely without an explicit timeout (seconds).
    request_timeout = 10
    # Read everything up front and close the file: the pinjie() call chain
    # rewrites ip.txt, which should not happen while it is still open here.
    with open('ip.txt', 'r', encoding='utf-8') as f:
        addresses = [line.strip() for line in f]
    for address in addresses:
        if not address:
            continue
        # NOTE(review): only an "http" proxy is configured, so the https://
        # kuaidaili request presumably does not go through it - confirm that
        # this is intended.
        proxies = {"http": f"http://{address}"}
        try:
            data = requests.get('https://free.kuaidaili.com/free/inha/', headers=header,
                                proxies=proxies, timeout=request_timeout)
            data_1 = requests.get('http://www.66ip.cn/', headers=header,
                                  proxies=proxies, timeout=request_timeout)
        except (requests.RequestException, ValueError):
            # A dead or malformed proxy should not abort the run; try the next.
            continue
        if data.status_code == 200:
            port_p(data.text)
            ip_ip(data.text)
            pinjie()
            ip_66(data_1.text)
            break
# Run the scraper only when this file is executed as a script, so that
# importing it does not trigger file and network I/O.
if __name__ == "__main__":
    ipyc()
    
  

  • 2
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值