Python怎么自建ip代理池

先找一个提供免费代理的网站(https://www.kuaidaili.com/free/inha/),爬取其上所有的代理 ip。由于该网站本身有反爬虫机制,因此用刚获取到的 ip 逐个尝试是否可用;找到可用 ip 后,就通过这个 ip 代理继续爬取下一页。

#!/usr/bin/env python
#-*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import pymysql
import time

# Module-level state shared by proxies_switch() / reptile_ip().
# NOTE: a `global` statement at module level is a no-op, so it was removed;
# functions that rebind ip_num declare `global ip_num` themselves.
ip_num = 1  # id of the next candidate row to try in the ip_proxy table
base_url = "https://www.kuaidaili.com/free/inha/"  # free-proxy listing, paged as /inha/<n>
headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}
# Currently active HTTP proxy; mutated in place by proxies_switch().
proxies={'http':'110.243.20.23:9999'}
def proxies_switch(url):
    """Rotate to the next working proxy from the ip_proxy table.

    Walks candidate rows starting at the global position ``ip_num``, testing
    each proxy against *url*, and stores the first one that answers
    successfully into the module-level ``proxies`` dict.

    Relies on module globals: ``cursor`` (open pymysql cursor), ``ip_num``
    (next candidate row id) and ``proxies`` (active proxy mapping).

    Args:
        url: the page used to probe whether a candidate proxy works.

    Raises:
        RuntimeError: when the ip_proxy table has no more candidate rows.
    """
    print("正在进行ip的切换...")
    global ip_num
    while True:
        print("正在验证第%s个ip地址"%(ip_num))
        # Parameterized query instead of %-formatting: avoids SQL injection
        # and quoting bugs.
        cursor.execute("select ip,post from ip_proxy where id = %s", (ip_num,))
        row = cursor.fetchone()
        if row is None:
            # Original crashed with IndexError on fetchall()[0] here.
            raise RuntimeError("ip_proxy table exhausted: no working proxy found")
        ip_post = "%s:%s" % (row[0], row[1])
        ip_num = ip_num + 1
        try:
            # A dead proxy typically raises (ConnectionError/Timeout) rather
            # than returning a bad status, so the exception path must count
            # as a failed candidate too; the original crashed instead.
            response = requests.get(url, proxies={'http': ip_post}, timeout=5)
        except requests.RequestException:
            continue
        if response.ok:
            break
    proxies['http'] = ip_post
    # ip_num was already advanced past the winner: report ip_num - 1
    # (the original printed the wrong, incremented index).
    print("第%d个ip地址测试成功"%(ip_num - 1))
    time.sleep(1) #切换成功之后sleep一秒 防止新的ip_post被封
def reptile_ip(url):
    """Scrape one listing page of free proxies and return them as dicts.

    Fetches *url* through the current module-level ``proxies`` (rotating to a
    working proxy via ``proxies_switch`` when the request fails), then parses
    the proxy table on the page.

    Args:
        url: full URL of one listing page (base_url + page number).

    Returns:
        list[dict]: one dict per proxy row with keys ``ip``, ``post`` (port),
        ``type``, ``place`` and ``response_time``; empty list when no proxy
        table is found on the page.
    """
    try:
        html = requests.get(url, headers=headers, proxies=proxies, timeout=10)
    except requests.RequestException:
        # A dead proxy raises instead of returning ok=False; treat it the
        # same way and rotate (the original crashed here).
        html = None
    print("连接情况为", html.ok if html is not None else False)
    if html is None or not html.ok:
        proxies_switch(url)
        html = requests.get(url, headers=headers, proxies=proxies, timeout=10)
    soup = BeautifulSoup(html.content, "html.parser")
    table = soup.find("table", class_="table table-bordered table-striped")
    if table is None:
        # Layout changed or an anti-bot page came back: nothing to parse
        # (original raised AttributeError on None).
        return []
    records = []  # renamed from `list`: don't shadow the builtin
    for tr in table.find_all("tr")[1:]:  # [1:] skips the header row
        tds = tr.find_all("td")  # hoisted: original re-parsed the row 5 times
        records.append({
            "ip": tds[0].text,
            "post": tds[1].text,
            "type": tds[3].text,
            "place": tds[4].text,
            "response_time": tds[5].text,
        })
    return records

if __name__ == "__main__":
    url = base_url
    # NOTE(review): placeholder credentials — fill in real host/user/password/db.
    conn = pymysql.connect(host="数据库ip", user="用户名", passwd="密码", db="数据名")
    # Module-level name: proxies_switch() reads this cursor.
    cursor = conn.cursor()
    try:
        # Rebuild the ip_proxy table from scratch on every run.
        cursor.execute("drop table if exists ip_proxy")
        createtab = """create table ip_proxy(
        id integer NOT NULL auto_increment PRIMARY KEY,
        ip char(50) not null ,
        post char(20) not null ,
        type char(20) not null,
        place char(50)not null,
        response_time char(20) not null)"""
        cursor.execute(createtab)
        insert_sql = "insert into ip_proxy (ip,post,type,place,response_time) values (%s,%s,%s,%s,%s)"
        for page in range(1, 70):
            print(page)
            # Loop variable renamed from `list`: don't shadow the builtin.
            for record in reptile_ip(url + str(page)):
                try:
                    cursor.execute(insert_sql, (record["ip"], record["post"],
                                                record["type"], record["place"],
                                                record["response_time"]))
                    conn.commit()
                    print(str(record["ip"]) + " has been kept")
                # Narrowed from bare `except:` — only DB errors roll back;
                # programming errors now surface instead of being swallowed.
                except pymysql.MySQLError:
                    conn.rollback()
    finally:
        # Always release the DB resources (original leaked them).
        cursor.close()
        conn.close()

 

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

昵称6550523

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值