python爬虫小白升仙_4-----爬取代理IP并多线程检测IP有效性-CSDN博客

本文链接：https://blog.csdn.net/a549742320/article/details/98774916

本文实现爬取网站 https://www.xicidaili.com/nn/ 上的高匿代理IP，并使用多线程检测这些IP的有效性。

涉及：

1. requests.get添加参数headers

2. 多页网页数据的爬取

3. 使用xpath获取的列表为空的问题的处理

4. 代理IP的有效性检测

5. 多线程的简单使用，提高ip验证的效率

网页信息：

请求头：
# Request headers sent with every page fetch: the target host plus a desktop
# Chrome User-Agent string.
headers = {
    "Host": "www.xicidaili.com",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/63.0.3239.132 Safari/537.36"
    ),
}

运行结果：

爬取5页，共得到500条记录；提取每条记录的ip、port和类型（http/https），将http代理与https代理两两配对，组成239组proxies；再逐组检测有效性，得到最终可用的proxies。

使用代理IP：requests.get(url, headers=headers, proxies=random.choice(proxies))，即每次请求时从proxies列表中随机选取一组代理。

问题：使用xpath获取的列表为空

原因：浏览器在渲染时会对HTML进行规范化，自动为table补上tbody标签，因此从浏览器开发者工具中复制的XPath会带有tbody；而requests获取到的原始HTML中并没有tbody，带tbody的路径匹配不到任何节点，于是返回空列表。把路径中的tbody去掉即可。
# ip_list=xml.xpath('//table[@id="ip_list"]//tbody//tr//td[2]/text()')  # earlier form: returned an empty list
ip_list = xml.xpath('//table[@id="ip_list"]//tr//td[2]/text()')  # IP column; works once tbody is dropped from the path

多线程：

# Check every candidate proxy concurrently, one thread per proxy.
threads = [
    # target: the method to run; args: its positional arguments
    threading.Thread(target=get_effective_ip.verify_ip, args=(position,))
    for position in range(len(get_effective_ip.complete_ip))
]
for worker in threads:
    worker.start()
# Block the main thread until every worker thread has finished.
for worker in threads:
    worker.join()
for proxy in get_effective_ip.effective_ip_list:
    print(proxy)

源码（运行前需先导入依赖：import requests、import threading、from lxml import etree）：

class Get_IP(object):
    """Scrape proxy addresses from xicidaili.com and record which ones respond.

    Call ``get_ip(pages)`` to fill ``complete_ip``, then call ``verify_ip(i)``
    for each index of ``complete_ip``; the proxies that pass end up in
    ``effective_ip_list``.
    """

    def __init__(self):
        # Candidate proxies; each entry is a mapping {"http": url, "https": url}
        # in the form passed to the ``proxies`` argument of requests.get.
        self.complete_ip = []
        # Entries of complete_ip whose check request returned status 200.
        self.effective_ip_list = []

    def get_ip(self, index):
        """Fetch the first ``index`` listing pages and populate ``complete_ip``.

        Pages are requested as ``/nn/1`` .. ``/nn/<index>``. The total number
        of scraped IP cells is printed, and the collected columns are handed
        to ``get_complete_ip``.

        Args:
            index: number of listing pages to fetch.

        Raises:
            requests.RequestException: if a page cannot be fetched, including
                when the request times out.
        """
        total_ip_list = []
        total_port_list = []
        total_http_list = []
        headers = {
            "Host": "www.xicidaili.com",
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/63.0.3239.132 Safari/537.36"
            ),
        }
        for page in range(1, index + 1):
            # Without a timeout an unresponsive server would block this call
            # indefinitely.
            response = requests.get(
                "https://www.xicidaili.com/nn/" + str(page),
                headers=headers,
                timeout=10,
            )
            xml = etree.HTML(response.text)
            # Browsers insert <tbody> when they normalise the HTML, so an XPath
            # copied from the browser contains it; the raw page does not, and a
            # path with tbody returns an empty list. Hence no tbody here.
            # NOTE(review): the columns are extracted independently, so they
            # stay aligned only if every row has text in all three cells.
            total_ip_list.extend(
                xml.xpath('//table[@id="ip_list"]//tr//td[2]/text()'))  # IP
            total_port_list.extend(
                xml.xpath('//table[@id="ip_list"]//tr//td[3]/text()'))  # port
            total_http_list.extend(
                xml.xpath('//table[@id="ip_list"]//tr//td[6]/text()'))  # HTTP or HTTPS
        print(len(total_ip_list))
        self.get_complete_ip(
            ip_list=total_ip_list,
            port_list=total_port_list,
            http_list=total_http_list,
        )

    def get_complete_ip(self, ip_list, port_list, http_list):
        """Turn scraped columns into proxy mappings appended to ``complete_ip``.

        Each (ip, port, scheme) triple becomes ``"http://ip:port"`` or
        ``"https://ip:port"``; rows whose scheme is neither ``"HTTP"`` nor
        ``"HTTPS"`` are skipped. The i-th HTTP proxy is then paired with the
        i-th HTTPS proxy, and any surplus on the longer side is dropped. The
        number of pairs is printed.

        Args:
            ip_list: IP address strings.
            port_list: port strings, parallel to ``ip_list``.
            http_list: scheme labels, parallel to ``ip_list``.
        """
        http_ip = []
        https_ip = []
        # zip stops at the shortest column, so lists of unequal length cannot
        # raise IndexError.
        for ip, port, scheme in zip(ip_list, port_list, http_list):
            if scheme == "HTTP":
                http_ip.append("http://" + ip + ":" + port)
            elif scheme == "HTTPS":
                https_ip.append("https://" + ip + ":" + port)
        # Each mapping needs one proxy of each scheme, so the smaller group
        # determines how many pairs can be built.
        group_num = min(len(http_ip), len(https_ip))
        print(group_num)
        for http_proxy, https_proxy in zip(http_ip, https_ip):
            self.complete_ip.append(dict(http=http_proxy, https=https_proxy))

    def verify_ip(self, index):
        """Check one candidate proxy and record it if it works.

        Sends a GET request to https://www.baidu.com/ through
        ``complete_ip[index]`` with a 3-second timeout. A proxy counts as
        effective when the response status is 200. Request failures
        (connection errors, timeouts, proxy errors) mean the proxy is unusable
        and are ignored on purpose.

        Args:
            index: position of the proxy mapping in ``complete_ip``.

        Raises:
            IndexError: if ``index`` is out of range for ``complete_ip``.
        """
        proxy = self.complete_ip[index]
        try:
            response = requests.get("https://www.baidu.com/", proxies=proxy, timeout=3)
        except requests.RequestException:
            # Only network-level failures are swallowed; programming errors
            # are left to surface.
            return
        if response.status_code == 200:
            self.effective_ip_list.append(proxy)
if __name__ == '__main__':
    get_effective_ip = Get_IP()
    get_effective_ip.get_ip(5)
    # Check every candidate proxy concurrently, one thread per proxy.
    threads = [
        # target: the method to run; args: its positional arguments
        threading.Thread(target=get_effective_ip.verify_ip, args=(position,))
        for position in range(len(get_effective_ip.complete_ip))
    ]
    for worker in threads:
        worker.start()
    # Block the main thread until every worker thread has finished.
    for worker in threads:
        worker.join()
    for proxy in get_effective_ip.effective_ip_list:
        print(proxy)

effective_ip_list 即为最终得到的有效代理IP列表。