Scraping free IPs from proxy-list sites to build an IP proxy pool

I'm new to Python and this is my first crawler. I'm recording my learning process and sharing the code. Since I've only just started, the code isn't very concise and many cases weren't considered carefully; I'd appreciate any feedback so we can improve together.

# coding=utf-8
# The goal of this project is to scrape free proxy IPs from proxy-list sites, test them, and provide working IPs to other crawlers
import requests
import re
import random
import time
import os
from functools import reduce

"""

1.随机报头
2.ip地址池

"""
# 地址栏输入 “about:version”来获取用户代理,伪装成流浪器访问网站
# 注意点如果是txt文件可能存在换行符,在遍历打印显示不出来,在列表中打印能显示,网页提取数据时一样

file_name = "ip_adress.txt"  # name of the file that stores the proxy pool


path = "C:/Users/Administrator/Desktop/linxuan/internet_worm_project/"  # file path


ip_num = 20  # when the pool holds fewer usable IPs than this, start scraping; otherwise just re-check the existing ones


sleeptime = 60  # seconds to wait between loop iterations


def Txt_Create(path):  # path is the target file path
    if os.path.isfile(path):
        print(file_name + ' already exists')
    else:
        open(path, 'w').close()  # create an empty file


Txt_Create(path + file_name)  # create the file if it does not exist
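# Note: open(path, 'w') assumes every directory in `path` already exists; if it
# might not, os.makedirs(os.path.dirname(path), exist_ok=True) creates it first.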


def get_headers():
    """
    Build a request header with a random User-Agent
    :return: headers dict
    """
    user_agent = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50" ,
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    "UCWEB7.0.2.37/28/999",
    "NOKIA5700/ UCWEB7.0.2.37/28/999",
    "Openwave/ UCWEB7.0.2.37/28/999",
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
    # iPhone 6:
    "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25",

    ]

    random_user = random.choice(user_agent)

    headers = {
        'User-Agent': random_user
    }

    return headers


def spider():  # crawl the proxy listing pages
    """
    Fetch the listing pages and return their HTML
    :return: list of page sources
    """

    page = 5  # number of listing pages to fetch
    raw_url = "http://www.89ip.cn/"  # please switch to another site, and adjust the regular expressions below to match

    # url = " http://www.89ip.cn/tqdl.html?api=1&num=300&port=&address=&isp= "
    # this is the site's API, but the IPs extracted through it ...

    all_response = []  # HTML of every successfully fetched page
    for i in range(0, page):
        cnt = 8  # attempt number from which we fall back to the local IP
        url = raw_url + 'index_%d.html' % (i + 1)
        print("----------")
        print("[spider]   fetching page %d" % (i + 1))

        for j in range(0, 10):  # give up on this page after 10 failed attempts
            try:
                if len(ip_list) != 0 and j + 1 < cnt:  # use a pooled proxy while the pool is non-empty; otherwise fall through to the local IP

                    random_ip = random.choice(ip_list)  # pick a random IP from the pool
                    print("[spider] %d using proxy %s" % (j + 1, random_ip))
                    proxies_ip = {
                        "http": random_ip,
                        "https": random_ip
                    }
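                    # Note: requests maps each URL scheme to a proxy; a bare
                    # "ip:port" value is treated as "http://ip:port", so no
                    # explicit scheme prefix is needed for these free proxies.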
                    response = requests.get(url, headers=get_headers(), proxies=proxies_ip, timeout=5)
                    if response.status_code == 200:
                        all_response.append(response.text)
                        print("[spider]   fetched page %d" % (i + 1))
                        break  # stop retrying; without the break the same page would be fetched again
                    else:
                        print("[spider]   proxy request failed")
                else:
                    print("[spider]   using the local IP for this request")
                    response = requests.get(url, headers=get_headers(), timeout=5)
                    if response.status_code == 200:
                        all_response.append(response.text)
                        print("[spider]   fetched page %d" % (i + 1))
                        break
                    else:
                        print("[spider]   request failed")

            except Exception:
                print("[spider]   proxy request failed")

    print("==========")
    return all_response

def get_data(all_data):  # extract IP:port pairs from the scraped pages

    ip_address = []  # IPs extracted from the current batch of pages

    pat = r'<td>[^<]*?(\d*\.\d*\.\d*\.\d*)[^<]*?</td>'  # captures the IP column
    pat_1 = r'</td>[^<]*?<td>[^<]*?(\d{2,8})[^<]*?</td>[^<]*?<td>'  # captures the port column
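    # For reference, the two patterns are written for listing rows shaped roughly
    # like this (hypothetical markup, not copied from the live site):
    #   <tr><td>1.2.3.4</td><td>8080</td><td>SomeProvince Telecom</td></tr>
    # pat captures "1.2.3.4" from the first cell, pat_1 captures "8080" from the second.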

    rst = re.compile(pat).findall(all_data)
    rst_1 = re.compile(pat_1, re.S).findall(all_data)
    if len(rst) == len(rst_1):  # only pair IPs with ports when the two lists line up
        for i in range(0, len(rst)):
            if (rst[i] + ":" + rst_1[i]) in ip_list:
                print("[get_data]  %s is already in the pool, skipping" % (rst[i] + ":" + rst_1[i]))
            else:
                ip_address.append(rst[i] + ":" + rst_1[i])

    print("====================")
    return ip_address



def proxy_ip(data):
    """
    Test the freshly extracted IPs and append the working ones to the file
    :param data: list of "ip:port" strings
    :return:
    """
    print("[proxy_ip] %d candidates scraped" % len(data))
    for i in range(0, len(data)):
        print("[proxy_ip] %d testing ip: %s" % (i + 1, data[i]))
        proxies = {
            "http": data[i],
            "https": data[i]
        }
        if data[i] in ip_list:
            print("[proxy_ip]  %s is already in the pool, skipping the test" % data[i])
            continue
        else:
            try:
                response = requests.get('http://icanhazip.com', headers=get_headers(), proxies=proxies, timeout=5)
                if response.status_code == 200:
                    print("[proxy_ip]   test passed " + response.text)

                    fh = open(path + file_name, "a")
                    if data[i] not in ip_list:
                        fh.write(data[i] + '\r\n')
                    fh.close()  # close the file

                else:
                    print("[proxy_ip]   ip is dead")

            except Exception:
                print("[proxy_ip]   ip is dead")

    print("----------")


def update_ip():
    """
    Re-test every IP already in the pool file and drop the dead ones
    :return:
    """
    # with open(...) as ... would close the file automatically; here it is closed by hand
    ip_address = []
    fh = open(path + file_name, 'r', encoding='utf-8')  # open read-only for the validity check
    for i in fh:
        data = i.strip('\n')  # drop the trailing newline
        if data == "":
            print("[update_ip]  empty line, skipping")
            continue

        print("[update_ip]  checking ip validity: " + data)
        proxies = {
            "http": data,
            "https": data
        }

        try:
            response = requests.get('http://icanhazip.com', headers=get_headers(), proxies=proxies, timeout=5)
            if response.status_code == 200:
                print("[update_ip]  test passed  " + response.text)
                ip_address.append(data)
            else:
                print("[update_ip]  ip is dead")
        except Exception:
            print("[update_ip]  ip is dead")

        print("--------------------")
    fh.close()

    fh = open(path + file_name, 'w', encoding='utf-8')  # rewrite the file with only the live IPs
    for i in ip_address:
        fh.write(i)
        fh.write('\n')
    fh.close()
    print("====================")


if __name__ == "__main__":  # run the maintenance loop only when executed as a script

    while True:

        ip_list = []  # IPs already stored in the file; it must be rebuilt inside the loop on every pass

        with open(path + file_name, 'r') as fh:
            for i in fh:
                if i.strip('\n') != '':  # i[:-1] is another way to drop the trailing newline
                    ip_list.append(i.strip('\n').strip())

        if len(ip_list) < ip_num:

            print(" %d IPs in the pool, below the threshold, starting the crawler " % len(ip_list))

            # concatenate the fetched pages into one string; the '' initializer keeps
            # reduce from raising TypeError when spider() fetched nothing
            t = reduce(lambda x, y: x + y, spider(), '')
            proxy_ip(get_data(t))
            update_ip()

        else:
            print("%d IPs in the pool, threshold met, just re-checking them" % len(ip_list))
            print("====================")
            update_ip()

        time.sleep(sleeptime)
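
The whole point of the pool (per the comment at the top of the script) is to feed working IPs to other crawlers. Below is a minimal sketch of that consumer side, assuming the same path and file name as above; load_pool, fetch_via_pool, and the httpbin.org target are made up for this example and are not part of the original script.

# consumer_example.py -- a minimal sketch of a crawler that uses the pool file.
# Assumptions: the pool script above has already written verified "ip:port"
# lines to ip_adress.txt; the target URL is only a placeholder.
import random
import requests

POOL_FILE = "C:/Users/Administrator/Desktop/linxuan/internet_worm_project/ip_adress.txt"


def load_pool(pool_file):
    # read the pool file and return the non-empty "ip:port" lines
    with open(pool_file, 'r', encoding='utf-8') as fh:
        return [line.strip() for line in fh if line.strip()]


def fetch_via_pool(url, retries=3):
    # try the request through random pooled proxies, then fall back to the local IP
    pool = load_pool(POOL_FILE)
    for _ in range(retries):
        if not pool:
            break
        proxy = random.choice(pool)
        try:
            return requests.get(url, proxies={"http": proxy, "https": proxy}, timeout=5)
        except requests.RequestException:
            pool.remove(proxy)  # drop the dead proxy for this run only
    return requests.get(url, timeout=5)  # last resort: the local IP


if __name__ == "__main__":
    print(fetch_via_pool("http://httpbin.org/ip").text)

A consumer like this never modifies ip_adress.txt itself; the maintenance loop above stays the single writer, which avoids two processes rewriting the pool file at the same time.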
