python爬取和筛选ip代理,建立ip池

这段时间的学习,如有雷同,肯定不是抄袭 。hhhh......

建立IP池前需要把IP存储到本地

这里爬取的是西刺IP代理:

class GetForignProxy(object):
    """Scrape one proxy-list page and store every parsed row in MySQL.

    Instantiating the class performs the whole job: the page at ``url`` is
    downloaded, each matching table row is parsed, and one record per row is
    inserted into the ``xici_proxy`` table.
    """

    # Browser-like User-Agent sent with the page request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36"
    }
    # Default field values for one record; copied for every row so that a
    # value parsed from one row can never leak into the next one.
    items = {
        "ip": '',
        "post": '',
        "address": '',
        "speed": 0,
        "conn_time": 1000
    }
    # NOTE: the connection is opened when the class body is executed and is
    # shared by every instance of the class.
    sql_con = MySQLdb.connect(host="localhost",
                              user="root",
                              passwd="12345",
                              db="spider_article",
                              charset="utf8"
                              )
    # Cursor used for all inserts.
    cursor = sql_con.cursor()

    def __init__(self, url):
        """Download ``url`` and insert every parsed proxy row into MySQL."""
        self.resp = self.get_response(url)
        # process_response() is a generator; draining it is what actually
        # runs the per-row inserts.
        for _ in self.process_response(self.resp):
            pass

    def get_response(self, url, timeout=10):
        """Return the HTTP response for ``url``.

        ``timeout`` (seconds) keeps an unresponsive server from blocking the
        scraper forever; requests applies no timeout unless one is given.
        """
        return requests.get(url, headers=self.headers, timeout=timeout)

    def process_response(self, resp):
        """Parse the proxy table in ``resp`` and insert each row.

        This is a generator: for every usable row it yields the return value
        of ``insert_to_mysql``. Rows with fewer than four text cells are
        skipped instead of raising ``IndexError``.
        """
        self.selet = Selector(text=resp.text)
        self.all_trs = self.selet.xpath("//table[@id= 'ip_list']//tr[@class= 'odd']")

        for tr in self.all_trs:
            # Extract the cell texts once instead of once per field.
            cells = tr.xpath(".//td//text()").extract()
            if len(cells) < 4:
                continue

            item = dict(self.items)
            item["ip"] = cells[0]
            item["post"] = cells[1]
            item["address"] = cells[3]

            # The first two div titles are read as speed and connection time;
            # the part before '秒' is the number. Defaults are kept when the
            # row does not carry both values.
            titles = tr.xpath(".//div/@title").extract()
            if len(titles) >= 2 and titles[0]:
                item["speed"] = float(titles[0].split('秒')[0])
                item["conn_time"] = float(titles[1].split('秒')[0])

            yield self.insert_to_mysql(item)

    def insert_to_mysql(self, items):
        """Insert one record (a dict shaped like ``items``) and commit."""
        sql_insert = """
                insert into xici_proxy
                values
                (%s,%s,%s,%s,%s)
        """
        row = (
            items["ip"],
            items["post"],
            items["address"],
            items["speed"],
            items["conn_time"],
        )
        self.cursor.execute(sql_insert, row)
        self.sql_con.commit()

取的时候,取随机IP:

class GetRandomIp(object):
    """Hand out a random, working proxy from the ``xici_proxy`` table.

    Call ``get_random_ip()`` to obtain a proxy URL.
    """

    def __init__(self):
        """Open the MySQL connection and cursor used for all queries."""
        self.sql_con = MySQLdb.connect(host="localhost",
                                       user="root",
                                       passwd="12345",
                                       db="spider_article",
                                       charset="utf8")
        self.cursor = self.sql_con.cursor()

    def get_random_ip(self):
        """Return a usable proxy URL, or ``None`` when the pool is empty.

        A random row is drawn and checked with ``self.JudgeIp``; the draw is
        repeated until a proxy passes the check. Because failed proxies are
        removed from the table by the check, the table can run empty; in that
        case ``None`` is returned instead of querying the database forever.
        """
        sql_select = """select ip,post 
                                        from xici_proxy
                                        ORDER BY RAND()
                                        LIMIT 1
                                        """

        while True:
            self.cursor.execute(sql_select)
            rows = self.cursor.fetchall()
            if not rows:
                # Nothing left to try.
                return None

            ip, post = rows[0][0], rows[0][1]
            # f-string formatting also works when the port column is numeric.
            proxy_ip = f"https://{ip}:{post}"
            # Return the proxy if it works; otherwise draw another row.
            if self.JudgeIp(proxy_ip, ip):
                return proxy_ip

 判断IP是否可用:

# 通过参数proxies=字典,来设置代理,发送请求给百度

def JudgeIp(self, proxy_ip, ip, timeout=5):
        """Return True if a request to Baidu succeeds through ``proxy_ip``.

        Args:
            proxy_ip: Full proxy URL, used as the ``https`` proxy.
            ip: Bare IP address; passed to ``self.DeleteIp`` when the proxy
                turns out to be unusable.
            timeout: Seconds to wait for the proxy before giving up. Without
                a timeout a dead proxy can block the caller indefinitely.

        Returns:
            True for a 2xx response. False otherwise, in which case the IP
            has also been removed via ``self.DeleteIp``.
        """
        post_url = "https://www.baidu.com/"
        header = {
            "user-agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36"
        }
        proxy_dict = {"https": proxy_ip}
        print("...try to connect...")
        try:
            # The proxies= mapping routes the request through the proxy.
            resp = requests.get(post_url, proxies=proxy_dict, headers=header,
                                timeout=timeout)
        except Exception:
            # Deliberately broad: any failure (timeout, refused connection,
            # malformed proxy URL, ...) means the proxy is unusable.
            print("invalid proxy_dict")
            self.DeleteIp(ip)
            return False

        if 200 <= resp.status_code < 300:
            return True

        # The proxy answered, but not with a success status: drop it.
        self.DeleteIp(ip)
        return False

    # 从数据库删除掉不可用IP
    def DeleteIp(self, ip):
        """Delete every row for ``ip`` from ``xici_proxy`` and commit.

        The address is passed as a query parameter rather than formatted into
        the SQL text, so a scraped value containing quotes cannot alter the
        statement.

        Returns:
            Always True.
        """
        sql_delete = """delete from xici_proxy
                      where ip=%s"""
        print("删除ip:" + ip+'\n')
        self.cursor.execute(sql_delete, (ip,))
        self.sql_con.commit()
        return True

最后调用时需要使用 if __name__ == "__main__": 作为入口,这样其中的代码只会在该文件被直接运行时执行,被其他模块导入时不会执行,这是 Python 的一种惯用写法


if __name__ == "__main__":
    # Scraper calls, left disabled; enable one to fill the table first:
    #GetXiCiProxy("https://www.xicidaili.com/wn/")
    #GetForignProxy("http://www.data5u.com/free/gwpt/index.shtml")
    # Draw one proxy from the pool and show it.
    print(GetRandomIp().get_random_ip())

 

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值