Sharing a free proxy scraper — the code

The script below crawls free proxy list pages with a pool of threads, verifies each address against http://httpbin.org/ip, and appends the working ones to ip.csv.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time     : 20:49
# @Author   :lion
# @Site     :10kb
# File      :agency.py
# @Software :PyCharm
import csv
import threading
from queue import Queue
from threading import Thread

import requests
from lxml import etree

ip_text = "ip.txt"


class QuickAgency(Thread):
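    """Crawler thread for the kuaidaili.com free proxy list (defined here but not started in __main__ below)."""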
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 Edg/84.0.522.44",
        "Connection": "close"
    }

    def __init__(self, page_queue: Queue, joke_queue: Queue, *args, **kwargs):
        super(QuickAgency, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.joke_queue = joke_queue
        self.index = "https://www.kuaidaili.com"

    def run(self) -> None:
        while True:
            if self.page_queue.empty():
                break
            url = self.page_queue.get()
            try:
                response = requests.get(url, headers=self.headers, timeout=3).content  # kuaidaili free proxy page
                html = etree.HTML(response)
                ips = html.xpath("//*[@id='list']/table/tbody/tr/td[1]/text()")    # ip addresses
                ports = html.xpath("//*[@id='list']/table/tbody/tr/td[2]/text()")  # ports
                for i, p in zip(ips, ports):
                    # hand the raw ip/port pairs to the writer thread
                    self.joke_queue.put((i.strip(), p.strip()))
                print("Page %s finished, moving on to the next page..." % url.split("/")[-1])
            except Exception:
                # skip pages that fail to download or parse
                continue


class Free89(Thread):
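    """Crawler thread for 89ip.cn: extracts ip/port pairs and keeps only those that pass a check against httpbin.org/ip."""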
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 Edg/84.0.522.44",
    }

    def __init__(self, page_queue: Queue, joke_queue: Queue, *args, **kwargs):
        super(Free89, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.joke_queue = joke_queue

    def run(self) -> None:
        while True:
            if self.page_queue.empty():
                break
            try:
                urls = self.page_queue.get()
                print(urls)
                response = requests.get(urls, headers=self.headers, timeout=3.0).content.decode()
                html = etree.HTML(response)
                ip = html.xpath("//*[@class='layui-table']/tbody/tr/td[1]/text()")
                port = html.xpath("//*[@class='layui-table']/tbody/tr/td[2]/text()")
                for i, p in zip(ip, port):
                    i, p = i.strip(), p.strip()

                    proxy = {"http": "http://%s:%s" % (i, p)}
                    try:
                        # verify the proxy by fetching our own ip through it
                        r = requests.request("get", "http://httpbin.org/ip", proxies=proxy, timeout=3.0).content.decode()
                    except Exception:
                        print("proxy %s is not usable" % proxy)
                        continue
                    if str(i) in r:
                        print("proxy %s works" % proxy)
                        self.joke_queue.put((i, p))
                    else:
                        print("proxy %s is not usable" % proxy)
            except Exception:
                # skip pages that fail to download or parse
                pass


class Writer(Thread):
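    """Consumer thread: takes verified proxies off joke_queue and writes them to the CSV file, exiting after 40 s of inactivity."""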

    def __init__(self, joke_queue: Queue, writer, glock: threading.Lock, *args, **kwargs):
        super(Writer, self).__init__(*args, **kwargs)
        self.joke_queue = joke_queue
        self.writer = writer
        self.lock = glock

    def run(self) -> None:
        while True:
            try:
                joke_info = self.joke_queue.get(timeout=40)
                with self.lock:
                    self.writer.writerow(joke_info)
                print("Saved one working address...")
            except Exception as e:
                # queue.Empty after 40 s without new proxies: stop this writer
                print(e)
                break


if __name__ == '__main__':
    page_queue = Queue(50)
    joke_queue = Queue(50)

    glock = threading.Lock()
    ip_file = open("ip.csv", "a", newline="", encoding="utf-8")
    writer = csv.writer(ip_file)
    writer.writerow(("ip", "port"))

    for i in range(1, 41):
        # url = "https://www.kuaidaili.com/free/intr/%d" % i
        url = "http://www.89ip.cn/index_%d.html" % i
        page_queue.put(url)

    # crawler threads
    threads = []
    for i in range(15):
        # t = QuickAgency(page_queue, joke_queue)
        t = Free89(page_queue, joke_queue)
        t.start()
        threads.append(t)

    # writer threads
    for w in range(15):
        t = Writer(joke_queue, writer, glock)
        t.start()
        threads.append(t)

    # wait for all threads to finish, then flush and close the CSV file
    for t in threads:
        t.join()
    ip_file.close()
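
For reference, here is a minimal sketch of how the resulting ip.csv could be used afterwards. The file name and the httpbin.org test URL come from the script above; the helper name load_and_test_proxies is only an illustration and not part of the original code.

import csv

import requests


def load_and_test_proxies(path="ip.csv"):
    """Hypothetical helper: read the saved proxies and try each one once."""
    with open(path, newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the "ip,port" header row
        for row in reader:
            if len(row) < 2:
                continue
            ip_addr, port = row[0], row[1]
            proxies = {"http": "http://%s:%s" % (ip_addr, port)}
            try:
                r = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=3)
                print(ip_addr, "->", r.text.strip())
            except requests.RequestException:
                print(ip_addr, "-> failed")


if __name__ == "__main__":
    load_and_test_proxies()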