1. How to set an IP proxy in requests
The most direct way is to pass a proxies dict to get():
proxies = {
    'http': 'http://183.148.153.147:9999',
    'https': 'http://183.148.153.147:9999',
}
requests.get(url=url, headers=headers, proxies=proxies)
When a site bans our IP, we need a large pool of IPs to rotate through as replacements, which leads to the next part: scraping the free IPs that Xici (西刺) provides.
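As a minimal sketch of the rotation idea (the addresses in candidate_proxies are placeholders for illustration, not verified working proxies): walk through a list of candidates and fall back to the next one whenever a request fails.

import requests

# Placeholder proxies for illustration only
candidate_proxies = [
    'http://183.148.153.147:9999',
    'http://113.124.86.24:9999',
]

def get_with_rotation(url, headers):
    for proxy in candidate_proxies:
        try:
            resp = requests.get(url, headers=headers,
                                proxies={'http': proxy, 'https': proxy},
                                timeout=5)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            # This proxy is dead or banned, move on to the next one
            continue
    return None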
2. Scrape Xici's free IP proxies and store them in MySQL
A detailed breakdown of the page fields will have to wait for now, since my own IP just got banned by Xici...
Straight to the code:
import time
from random import random
import requests
from scrapy.selector import Selector
import pymysql
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER"
}

# Table columns: ip (varchar, primary key), port (varchar), proxy_type (varchar), speed (float).
# Database name: ips, table name: ip_pond. Fill in your own host/user/passwd.
# (A DDL sketch for this table follows the listing below.)
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', db='ips', charset='utf8')
cursor = conn.cursor()

# A random delay, because Xici bans IPs very easily
def rand_sleep_time():
    sleep_time = random() * 100
    time.sleep(sleep_time)
# Refill the IP pool
def update_ip_pond():
    # The site currently has 3637 pages in total; only the first 10 are fetched here
    for i in range(1, 11):
        resp = requests.get('https://www.xicidaili.com/nn/%s' % i, headers=headers)
        if resp.status_code != 200:
            print('Failed to fetch page %s' % i)
        else:
            print('Fetched page %s' % i)
            selector = Selector(text=resp.text)
            # Use XPath to locate the table with id="ip_list"
            all_items = selector.xpath('//*[@id="ip_list"]//tr')
            ip_list = []
            # The first row is the table header, skip it
            for item in all_items[1:]:
                # Extract each field from the row with XPath
                speed_str = item.xpath('td[7]/div/@title').get()
                if speed_str:
                    speed = float(speed_str.split('秒')[0])
                    ip = item.xpath('td[2]/text()').get()
                    port = item.xpath('td[3]/text()').get()
                    proxy_type = item.xpath('td[6]/text()').get().lower()
                    ip_list.append((ip, port, proxy_type, speed))
            for ip_info in ip_list:
                # Insert the row, or update its fields if the ip already exists
                cursor.execute(
                    "insert into ip_pond(ip,port,proxy_type,speed) values ('{0}','{1}','{2}','{3}') ON DUPLICATE KEY UPDATE ip=VALUES(ip),port=VALUES(port),proxy_type=VALUES(proxy_type),speed=VALUES(speed)"
                    .format(ip_info[0], ip_info[1], ip_info[2], ip_info[3])
                )
        # Delay between pages to avoid being blacklisted
        rand_sleep_time()
        conn.commit()
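For reference, the ip_pond table used above can be created with something along these lines, reusing the conn and cursor from the listing. This is only a sketch based on the column types mentioned in the comment; the varchar lengths are my own assumption.

# Sketch of the table creation; only the column names and rough types come
# from the comment above, the varchar lengths are assumptions.
create_sql = """
CREATE TABLE IF NOT EXISTS ip_pond (
    ip VARCHAR(32) NOT NULL,
    port VARCHAR(16) NOT NULL,
    proxy_type VARCHAR(16) NOT NULL,
    speed FLOAT,
    PRIMARY KEY (ip)
) DEFAULT CHARSET=utf8
"""
cursor.execute(create_sql)
conn.commit()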
3. Define a GetIp class to fetch IPs from MySQL
class GetIp(object):
    # Delete an unusable IP from the pool
    def delete_ip(self, ip):
        delete_sql = """
            DELETE FROM ip_pond WHERE ip='{0}'
        """.format(ip)
        cursor.execute(delete_sql)
        conn.commit()
        return True
    # Check whether an IP is usable
    def judge_ip(self, ip, port, proxy_type):
        # Validate against Baidu; the test URL scheme has to match the proxy
        # type, otherwise requests will not route the request through the proxy
        http_url = '{0}://www.baidu.com'.format(proxy_type)
        proxy_url = '{0}://{1}:{2}'.format(proxy_type, ip, port)
        try:
            # Handle http and https separately
            if proxy_type == 'http':
                proxy_dict = {
                    'http': proxy_url,
                }
                response = requests.get(http_url, proxies=proxy_dict, timeout=10)
            else:
                proxy_dict = {
                    'https': proxy_url,
                }
                response = requests.get(http_url, proxies=proxy_dict, verify=False, timeout=10)
        except Exception:
            print('invalid ip and port')
            self.delete_ip(ip)
            return False
        else:
            code = response.status_code
            if 200 <= code < 300:
                print('effective ip')
                return True
            else:
                print('invalid ip and port')
                self.delete_ip(ip)
                return False
    # Pick a random entry from the database
    def get_random_ip(self):
        random_sql = """
            SELECT ip,port,proxy_type,speed FROM ip_pond ORDER BY RAND() LIMIT 1
        """
        cursor.execute(random_sql)
        for ip_info in cursor.fetchall():
            ip = ip_info[0]
            port = ip_info[1]
            proxy_type = ip_info[2]
            judge_re = self.judge_ip(ip, port, proxy_type)
            if judge_re:
                return '{0}://{1}:{2}'.format(proxy_type, ip, port)
            else:
                # Keep drawing until a working IP turns up
                return self.get_random_ip()
    # Pick the fastest IP from the database (same as above, only the SQL differs)
    def get_optimum_ip(self):
        optimum_sql = """
            SELECT ip,port,proxy_type,speed FROM ip_pond ORDER BY speed LIMIT 1
        """
        cursor.execute(optimum_sql)
        for ip_info in cursor.fetchall():
            ip = ip_info[0]
            port = ip_info[1]
            proxy_type = ip_info[2]
            judge_re = self.judge_ip(ip, port, proxy_type)
            if judge_re:
                return '{0}://{1}:{2}'.format(proxy_type, ip, port)
            else:
                return self.get_optimum_ip()
    # A small convenience wrapper that packages the fetched IP into a proxies dict
    def get_proxies(self):
        ip = self.get_random_ip()
        print(ip)
        proxy_type = ip.split(':')[0]
        proxies = {
            proxy_type: ip
        }
        return proxies
4. How to use it
if __name__ == '__main__':
    # When the chosen IP is an https proxy, requests can be a bit slow
    # First check whether ip_pond already contains data; refill it if it is empty
    sql = """
        SELECT * FROM ip_pond
    """
    check_table = cursor.execute(sql)
    if check_table:
        # Test URL, anything works here
        url = 'https://www.baidu.com'
        headers = {
            "User-Agent": "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
        }
        # The wrapper from the previous step returns a proxies dict directly
        proxies = GetIp().get_proxies()
        res = requests.get(url=url, headers=headers, proxies=proxies)
    else:
        update_ip_pond()
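In practice a request can still fail even after judge_ip has passed, so it helps to wrap the call in a small retry loop that discards the failing proxy and draws a fresh one. A sketch reusing the GetIp class above; max_retries and the timeout are my own choices, not values from the original code.

def fetch_with_proxy(url, headers, max_retries=3):
    # max_retries and timeout are arbitrary choices for this sketch
    getip = GetIp()
    for _ in range(max_retries):
        proxies = getip.get_proxies()
        try:
            return requests.get(url=url, headers=headers, proxies=proxies, timeout=10)
        except requests.RequestException:
            # Drop the dead proxy from the pool and try again with a new one
            bad_ip = list(proxies.values())[0].split('//')[1].split(':')[0]
            getip.delete_ip(bad_ip)
    return None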