A Simple Application of an IP Proxy Pool

This post shows how to use Python to scrape free proxy IPs from the Kuaidaili site and verify that they actually work, and how to use paid proxies for multi-threaded scraping of Amazon product listings. The code covers proxy validation, persisting results to MongoDB and MySQL, and basic exception handling and retry strategies.

1. Collecting free IPs (almost none are valid)


import requests
import time
from lxml import etree
import pymongo
import random


class KuaiDaiLi:
    def __init__(self):
        self.db = pymongo.MongoClient(host='localhost', port=27017)
        self.collection = self.db.python.IP
        self.url = 'https://www.kuaidaili.com/free/inha/{}/'
        self.test_url = 'http://httpbin.org/ip'
        self.headers = {
            'User-Agent': 'xxx'
        }


    def get_data(self):
        for i in range(155, 256):
            response = requests.get(self.url.format(i), headers=self.headers)
            html = response.text
            self.parse_data(html)
        time.sleep(5 / random.randint(3, 4))  # throttle: roughly 1.3-1.7 s between pages


    def parse_data(self, html):
        element = etree.HTML(html)
        trs = element.xpath('//*[@id="list"]/table/tbody/tr[position()>1]')
        for tr in trs:
            item = {}
            # the data-title attributes are the Chinese column headers on the Kuaidaili page
            ip = tr.xpath('./td[@data-title="IP"]/text()')[0]
            port = tr.xpath('./td[@data-title="PORT"]/text()')[0]
            item['anonymous'] = tr.xpath('./td[@data-title="匿名度"]/text()')[0]
            item['style'] = tr.xpath('./td[@data-title="类型"]/text()')[0]
            item['location'] = tr.xpath('./td[@data-title="位置"]/text()')[0]
            item['speed'] = tr.xpath('./td[@data-title="响应速度"]/text()')[0]
            item['time'] = tr.xpath('./td[@data-title="最后验证时间"]/text()')[0]
            item['pay'] = tr.xpath('./td[@data-title="付费方式"]/text()')[0]
            # print(item)
            self.ip_verificate(ip, port, item)


    def ip_verificate(self, ip, port, item):
        proxies = {'http': 'http://' + ip + ':' + port}
        try:
            response = requests.get(self.test_url, headers=self.headers, proxies=proxies, timeout=2)
            if response.status_code == 200:
                item['ip'] = ip
                item['port'] = port
                print(response.text, item)
                self.save_data(item)
            else:
                print(f'{ip}:{port}, status code:', response.status_code)
        except Exception as e:
            print('proxy check failed:', ip, repr(e))


    def save_data(self, item):
        self.collection.insert_one(item)  # insert into python.IP; the original called insert_one on the client object
        print('inserted')


    def main(self):
        self.get_data()
        self.db.close()


if __name__ == '__main__':
    kdl_IP = KuaiDaiLi()
    kdl_IP.main()
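
With validated proxies stored in MongoDB, any other crawler can draw one at random before each request. Below is a minimal sketch of that consumer side, assuming the python.IP collection populated above; the get_random_proxy helper is illustrative and not part of the original code.

import random

import pymongo
import requests

client = pymongo.MongoClient(host='localhost', port=27017)
collection = client.python.IP  # the same collection the spider above writes to


def get_random_proxy():
    # hypothetical helper: draw one validated proxy at random from the pool
    docs = list(collection.find({}, {'ip': 1, 'port': 1}))
    if not docs:
        return None
    doc = random.choice(docs)
    return {'http': f"http://{doc['ip']}:{doc['port']}"}


proxy = get_random_proxy()
if proxy:
    response = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=5)
    print(response.text)  # should echo the proxy's address rather than your own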

2. Using paid IPs

The crawler below is a producer-consumer pipeline: one thread keeps ip_queue topped up from a paid proxy API, worker threads pull category, listing, and detail pages through the proxy, and a writer thread batches rows into MySQL.


import requests
import pymysql
import threading
from lxml import etree
from queue import Queue
import time
from pprint import pprint
import re
from retrying import retry
from feapder.network.user_agent import get
from loguru import logger


class Amazon:
    def __init__(self):
        self.base_url = 'https://www.amazon.cn/nav/ajax/hamburgerMainContent?ajaxTemplate=hamburgerMainContent&pageType=Gateway&hmDataAjaxHint=1&navDeviceType=desktop&isSmile=0&isPrime=0&isBackup=false&hashCustomerAndSessionId=c108bde04b677f19f2e5d7df74ff6ce0cad515fc&languageCode=zh_CN&environmentVFI=AmazonNavigationCards%2Fdevelopment%40B6122949553-AL2_x86_64&secondLayerTreeName=apparel_shoes%2Bcomputer_office%2Bhome_kitchen%2Bbeauty_pca%2Bkindle_ebook%2Bsports_outdoor%2Bgrocery%2Bbaby_toy%2Bphones_elec%2Bjewelry_watch%2Bhome_improvement%2Bvideo_game%2Bmusical_instrument%2Bcamera&customerCountryCode=null'
        self.headers = {
            "Connection": "keep-alive",
            "downlink": "10",
            "ect": "4g",
            "rtt": "50",
            "User-Agent": "xxx",
        }
        self.db = pymysql.connect(user='root', password='12345', host='localhost', database='python', port=3306, charset='utf8')
        self.cursor = self.db.cursor()
        self.ip_url = '<paid proxy API endpoint>'  # placeholder, supply your own
        self.ip_queue = Queue()
        self.classification_info_queue = Queue()
        self.good_info_queue = Queue()
        self.save_queue = Queue()


    def get_ip(self):
        # producer: keep one fresh paid IP available in the queue at all times
        while True:
            if self.ip_queue.empty():
                response = requests.get(self.ip_url)
                print(response.text)
                self.ip_queue.put(response.text)
            else:
                time.sleep(0.1)  # back off instead of busy-waiting at 100% CPU


    @retry(stop_max_attempt_number=3)
    def test_ip(self, url):
        ip = self.ip_queue.get()
        proxies = {'http': 'http://' + ip}  # the original 'http:' + ip is not a valid proxy URL
        self.headers['User-Agent'] = get()
        response = requests.get(url, headers=self.headers, proxies=proxies, timeout=2)
        assert response.status_code == 200, 'bad status code'  # a failed assert triggers a retry
        self.ip_queue.put(ip)  # the IP worked, so put it back for reuse
        return response


    def get_secondary_classification(self):
        response = self.test_ip(self.base_url)
        # pprint(response.json())
        html = response.json()['data']
        element = etree.HTML(html)
        li_list = element.xpath('//ul/li[position() > 2]')
        for li in li_list:
            classification_info = {}
            if li.xpath('./a/text()'):
                if '全部' in li.xpath('./a/text()')[0]:  # skip the "All ..." aggregate entries
                    continue
                if 'http' in li.xpath('./a/@href')[0]:  # skip absolute links that leave the category tree
                    continue
                classification_info['classification_name'] = li.xpath('./a/text()')[0]
                classification_href = li.xpath('./a/@href')[0]
                classification_info['classification_keyid'] = re.findall('.*?node=(.*?)&ref_=.*?', classification_href)[0]
                self.classification_info_queue.put(classification_info)
            # print(classification_info)


    def get_goods_info(self):
        while True:
            classification_info = self.classification_info_queue.get()
            start_url = f"https://www.amazon.cn/s?rh=n%3A{classification_info['classification_keyid']}&fs=true"
            # try:
            #     response = self.test_ip(start_url)
            # except Exception as e:
            #     logger.error(start_url)
            #     continue
            # element = etree.HTML(response.text)
            # max_index = element.xpath('//span[@class="s-pagination-strip"]/span[last()]/text()')[0]
            # for i in range(1, int(max_index) + 1):
            for i in range(1, 11):  # crawl the first 10 listing pages per category
                goods_url = start_url + f"&page={i}"
                try:
                    response = self.test_ip(goods_url)
                except Exception as e:
                    logger.error('listing page failed after retries: {}', goods_url)
                    continue
                element = etree.HTML(response.text)
                divs = element.xpath('//span/div[@class="s-main-slot s-result-list s-search-results sg-row"]/div[@data-component-type="s-search-result"]')
                for div in divs:
                    good_info = {}
                    good_info['classification_name'] = classification_info['classification_name']
                    good_info['goods_url'] = goods_url
                    h = div.xpath('.//div/h2/a[@class="a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal"]/@href')
                    # print(good_info)
                    if h:
                        good_info['good_url'] = 'https://www.amazon.cn' + h[0]
                        self.good_info_queue.put(good_info)
                    else:
                        continue
            self.classification_info_queue.task_done()


    def get_good_data(self):
        while True:
            good_info = self.good_info_queue.get()
            try:
                response = self.test_ip(good_info['good_url'])
            except Exception as e:
                logger.error('detail page failed after retries: {}', good_info['good_url'])
                continue
            element = etree.HTML(response.text)
            good_title = element.xpath('//div[@id="centerCol"]//h1/span/text()')[0] if element.xpath('//div[@id="centerCol"]//h1/span/text()') else element.xpath('//title/text()')[0]
            good_price = element.xpath('//div[@class="a-section a-spacing-none aok-align-center"]/span/span[@class="a-offscreen"]/text()')[0] \
                        if element.xpath('//div[@class="a-section a-spacing-none aok-align-center"]/span/span[@class="a-offscreen"]/text()') \
                        else '-'.join(element.xpath('//td[@class="a-span12"]//span[@class="a-offscreen"]/text()'))
            good_tup = (good_info['classification_name'], good_info['goods_url'], good_info['good_url'], good_title, good_price)
            print(good_tup)
            self.save_queue.put(good_tup)
            self.good_info_queue.task_done()


    def create_table(self):
        sql = """
        create table if not exists amazon(
            id int unsigned primary key auto_increment,
            classification_name varchar(50),
            goods_url varchar(400),
            good_url varchar(400),
            good_title varchar(400),
            good_price varchar(30)
        )
        """
        try:
            self.cursor.execute(sql)
            print('table created')
        except Exception as e:
            print('table creation failed', repr(e))


    def save_data(self):
        while True:
            data_list = []
            for i in range(30):
                try:
                    data = self.save_queue.get(timeout=3)
                    data_list.append((0,) + data)  # 0 lets MySQL assign the auto_increment id
                    self.save_queue.task_done()
                except Exception as e:
                    print('fewer than 30 rows queued', repr(e))
                    break
            if not data_list:
                continue
            sql = """
                insert into amazon(id, classification_name, goods_url, good_url, good_title, good_price)
                values(%s, %s, %s, %s, %s, %s)
            """
            try:
                self.cursor.executemany(sql, data_list)
                self.db.commit()
                print('batch saved')
            except Exception as e:
                self.db.rollback()
                print('save failed', repr(e))


    def main(self):
        self.create_table()
        threads = []
        t_ip = threading.Thread(target=self.get_ip)
        threads.append(t_ip)
        t_classification = threading.Thread(target=self.get_secondary_classification)
        threads.append(t_classification)
        for i in range(6):
            t_goods = threading.Thread(target=self.get_goods_info)
            threads.append(t_goods)
        for i in range(6):
            t_good = threading.Thread(target=self.get_good_data)
            threads.append(t_good)
        t_save = threading.Thread(target=self.save_data)
        threads.append(t_save)
        for t in threads:
            t.daemon = True  # setDaemon() is deprecated
            t.start()
        time.sleep(2)
        # join every queue in the pipeline (the original joined good_info_queue twice
        # and never joined classification_info_queue)
        for q in [self.classification_info_queue, self.good_info_queue, self.save_queue]:
            q.join()
        self.db.close()


if __name__ == '__main__':
    # rotate: start a new log file once the current one exceeds 500 MB
    logger.add("runtime_{time}.log", rotation="500 MB")
    amazon = Amazon()
    amazon.main()
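
The @retry(stop_max_attempt_number=3) decorator from the retrying package re-runs test_ip whenever it raises, and the assert on the status code raises too, so a dead proxy gets up to three attempts before the caller's except block sees the failure. A minimal sketch of that behavior (the flaky function below is illustrative only):

from retrying import retry

attempts = {'count': 0}


@retry(stop_max_attempt_number=3)
def flaky():
    # fails twice, succeeds on the third call; retrying swallows the first two errors
    attempts['count'] += 1
    assert attempts['count'] >= 3, f"attempt {attempts['count']} failed"
    return 'ok'


print(flaky(), 'after', attempts['count'], 'attempts')  # -> ok after 3 attempts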

3. Without a proxy

The same crawler, relying on random delays and rotating User-Agents instead of proxies:
import requests
import pymysql
import threading
from lxml import etree
from queue import Queue
import random
import time
from pprint import pprint
import re
from feapder.network.user_agent import get


class Amazon:
    def __init__(self):
        self.base_url = 'https://www.amazon.cn/nav/ajax/hamburgerMainContent?ajaxTemplate=hamburgerMainContent&pageType=Gateway&hmDataAjaxHint=1&navDeviceType=desktop&isSmile=0&isPrime=0&isBackup=false&hashCustomerAndSessionId=c108bde04b677f19f2e5d7df74ff6ce0cad515fc&languageCode=zh_CN&environmentVFI=AmazonNavigationCards%2Fdevelopment%40B6122949553-AL2_x86_64&secondLayerTreeName=apparel_shoes%2Bcomputer_office%2Bhome_kitchen%2Bbeauty_pca%2Bkindle_ebook%2Bsports_outdoor%2Bgrocery%2Bbaby_toy%2Bphones_elec%2Bjewelry_watch%2Bhome_improvement%2Bvideo_game%2Bmusical_instrument%2Bcamera&customerCountryCode=null'
        self.headers = {
            "Connection": "keep-alive",
            "downlink": "10",
            "ect": "4g",
            "rtt": "50",
            "User-Agent": "xxx",
        }
        self.db = pymysql.connect(user='root', password='12345', host='localhost', database='python', port=3306, charset='utf8')
        self.cursor = self.db.cursor()
        self.classification_info_queue = Queue()
        self.good_info_queue = Queue()
        self.save_queue = Queue()


    def get_secondary_classification(self):
        self.headers['User-Agent'] = get()
        response = requests.get(self.base_url, headers=self.headers)
        # pprint(response.json())
        html = response.json()['data']
        element = etree.HTML(html)
        li_list = element.xpath('//ul/li[position() > 2]')
        for li in li_list:
            classification_info = {}
            if li.xpath('./a/text()'):
                if '全部' in li.xpath('./a/text()')[0]:  # skip the "All ..." aggregate entries
                    continue
                if 'http' in li.xpath('./a/@href')[0]:  # skip absolute links that leave the category tree
                    continue
                classification_info['classification_name'] = li.xpath('./a/text()')[0]
                classification_href = li.xpath('./a/@href')[0]
                classification_info['classification_keyid'] = re.findall('.*?node=(.*?)&ref_=.*?', classification_href)[0]
                self.classification_info_queue.put(classification_info)
            # print(classification_info)


    def get_goods_info(self):
        while True:
            classification_info = self.classification_info_queue.get()
            start_url = f"https://www.amazon.cn/s?rh=n%3A{classification_info['classification_keyid']}&fs=true"
            # try:
            #     response = self.test_ip(start_url)
            # except Exception as e:
            #     logger.error(start_url)
            #     continue
            # element = etree.HTML(response.text)
            # max_index = element.xpath('//span[@class="s-pagination-strip"]/span[last()]/text()')[0]
            # for i in range(1, int(max_index) + 1):
            for i in range(1, 2):  # only the first listing page in this no-proxy demo
                self.headers['User-Agent'] = get()
                goods_url = start_url + f"&page={i}"
                response = requests.get(goods_url, headers=self.headers)
                element = etree.HTML(response.text)
                divs = element.xpath('//span/div[@class="s-main-slot s-result-list s-search-results sg-row"]/div[@data-component-type="s-search-result"]')
                for div in divs:
                    good_info = {}
                    good_info['classification_name'] = classification_info['classification_name']
                    good_info['goods_url'] = goods_url
                    h = div.xpath('.//div/h2/a[@class="a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal"]/@href')
                    # print(good_info)
                    if h:
                        good_info['good_url'] = 'https://www.amazon.cn' + h[0]
                        self.good_info_queue.put(good_info)
                    else:
                        continue
            self.classification_info_queue.task_done()


    def get_good_data(self):
        while True:
            time.sleep(random.uniform(1, 2))  # throttle, since there is no proxy to hide behind
            self.headers['User-Agent'] = get()
            good_info = self.good_info_queue.get()
            response = requests.get(good_info['good_url'], headers=self.headers)
            # print(response.text)
            element = etree.HTML(response.text)
            good_title = element.xpath('//div[@id="centerCol"]//h1/span/text()')[0] if element.xpath('//div[@id="centerCol"]//h1/span/text()') else element.xpath('//title/text()')[0]
            good_price = element.xpath('//div[@class="a-section a-spacing-none aok-align-center"]/span/span[@class="a-offscreen"]/text()')[0] \
                        if element.xpath('//div[@class="a-section a-spacing-none aok-align-center"]/span/span[@class="a-offscreen"]/text()') \
                        else '-'.join(element.xpath('//td[@class="a-span12"]//span[@class="a-offscreen"]/text()'))
            good_tup = (good_info['classification_name'], good_info['goods_url'], good_info['good_url'], good_title, good_price)
            print('@'*50, good_info['good_url'], good_title, good_price)
            self.save_queue.put(good_tup)
            self.good_info_queue.task_done()


    def create_table(self):
        sql = """
        create table if not exists amazon(
            id int unsigned primary key auto_increment,
            classification_name varchar(50),
            goods_url varchar(400),
            good_url varchar(400),
            good_title varchar(400),
            good_price varchar(30)
        )
        """
        try:
            self.cursor.execute(sql)
            print('table created')
        except Exception as e:
            print('table creation failed', repr(e))


    def save_data(self):
        while True:
            data_list = []
            for i in range(30):
                try:
                    data = self.save_queue.get(timeout=3)
                    data_list.append((0,) + data)  # 0 lets MySQL assign the auto_increment id
                    self.save_queue.task_done()
                except Exception:
                    break
            if not data_list:
                continue
            sql = """
                insert into amazon(id, classification_name, goods_url, good_url, good_title, good_price)
                values(%s, %s, %s, %s, %s, %s)
            """
            try:
                self.cursor.executemany(sql, data_list)
                self.db.commit()
                print('batch saved')
            except Exception as e:
                self.db.rollback()
                print('save failed', repr(e))


    def main(self):
        self.create_table()
        threads = []
        # t_ip = threading.Thread(target=self.get_ip)
        # threads.append(t_ip)
        t_classification = threading.Thread(target=self.get_secondary_classification)
        threads.append(t_classification)
        for i in range(6):
            t_goods = threading.Thread(target=self.get_goods_info)
            threads.append(t_goods)
        for i in range(6):
            t_good = threading.Thread(target=self.get_good_data)
            threads.append(t_good)
        t_save = threading.Thread(target=self.save_data)
        threads.append(t_save)
        for t in threads:
            t.daemon = True  # setDaemon() is deprecated
            t.start()
        time.sleep(2)
        # join every queue in the pipeline (the original joined good_info_queue twice
        # and never joined classification_info_queue)
        for q in [self.classification_info_queue, self.good_info_queue, self.save_queue]:
            q.join()
        self.db.close()


if __name__ == '__main__':
    amazon = Amazon()
    amazon.main()
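
main() leans on the Queue coordination protocol: every put() must eventually be matched by exactly one task_done(), and q.join() blocks until the counts balance, which is what lets the daemon worker threads be abandoned safely when the process exits. A minimal standalone sketch of that pattern:

import threading
from queue import Queue

q = Queue()


def worker():
    while True:
        item = q.get()
        print('processed', item)
        q.task_done()  # exactly one task_done per get, or join() never returns


threading.Thread(target=worker, daemon=True).start()

for i in range(5):
    q.put(i)

q.join()  # returns once all five items are marked done; the daemon thread dies with the process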