这段时间的学习总结,如有雷同,肯定不是抄袭。hhhh……
建立IP池前需要把IP存储到本地。
这里爬取的是西刺IP代理:
class GetForignProxy(object):
    """Scrape free proxies from the xici proxy list and store them in MySQL.

    Instantiating the class with a page URL fetches the page, parses every
    proxy row and inserts each one into the ``xici_proxy`` table.

    NOTE(review): the DB connection below is opened at class-definition time
    (an import side effect) with hard-coded credentials — consider moving it
    into ``__init__`` and reading credentials from config.
    """
    # Disguise the request as a regular browser so the site does not block us.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36"
    }
    # Scratch buffer holding one proxy record before it is inserted.
    # NOTE(review): this is a class attribute shared by all instances.
    items = {
        "ip": '',
        "post": '',
        "address": '',
        "speed": 0,
        "conn_time": 1000
    }
    # Database connection and cursor, shared by all instances.
    sql_con = MySQLdb.connect(host="localhost",
                              user="root",
                              passwd="12345",
                              db="spider_article",
                              charset="utf8"
                              )
    cursor = sql_con.cursor()

    def __init__(self, url):
        """Fetch *url* and persist every proxy row found on the page."""
        self.resp = self.get_response(url)
        # process_response() is a generator: exhaust it so that the insert
        # for every row is actually executed.
        for _ in self.process_response(self.resp):
            pass

    def get_response(self, url):
        """Return the HTTP response for *url*, sent with the fake header."""
        return requests.get(url, headers=self.headers)

    def process_response(self, resp):
        """Parse the xici proxy table and yield one DB insert per row."""
        # BUG FIX: the original read "sself.selet = ..." — a NameError
        # at runtime.
        self.selet = Selector(text=resp.text)
        self.all_trs = self.selet.xpath("//table[@id= 'ip_list']//tr[@class= 'odd']")
        for tr in self.all_trs:
            # Hoist the repeated xpath extraction: the original called
            # tr.xpath(".//td//text()").extract() once per field.
            tds = tr.xpath(".//td//text()").extract()
            self.items["ip"] = tds[0]
            self.items["post"] = tds[1]
            self.items["address"] = tds[3]
            titles = tr.xpath(".//div/@title").extract()
            speed = titles[0]
            conn_time = titles[1]
            if speed:
                # Titles look like "0.123秒" — strip the unit and parse.
                self.items["speed"] = float(speed.split('秒')[0])
                self.items["conn_time"] = float(conn_time.split('秒')[0])
            # Yield so the caller's loop drives one insert per row.
            yield self.insert_to_mysql(self.items)

    def insert_to_mysql(self, items):
        """Insert one proxy record (parameterized — safe from injection)."""
        sql_insert = """
        insert into xici_proxy
        values
        (%s,%s,%s,%s,%s)
        """
        self.cursor.execute(sql_insert, (items["ip"], items["post"],
                                         items["address"], items["speed"],
                                         items["conn_time"]))
        self.sql_con.commit()
取用的时候,从库中随机取一个IP:
class GetRandomIp(object):
    """Pick a random working proxy from the ``xici_proxy`` table.

    Call :meth:`get_random_ip` to obtain a ``"https://ip:port"`` string.
    """

    def __init__(self):
        # NOTE(review): hard-coded credentials — consider reading from config.
        self.sql_con = MySQLdb.connect(host="localhost",
                                       user="root",
                                       passwd="12345",
                                       db="spider_article",
                                       charset="utf8")
        self.cursor = self.sql_con.cursor()

    def get_random_ip(self):
        """Return a random usable proxy URL, or None when the pool is empty.

        Draws random rows until one passes JudgeIp(). Rows that fail the
        check are deleted inside JudgeIp(), so the pool shrinks and the
        loop terminates.
        """
        sql_select = """select ip,post
        from xici_proxy
        ORDER BY RAND()
        LIMIT 1
        """
        while True:
            self.cursor.execute(sql_select)
            rows = self.cursor.fetchall()
            if not rows:
                # BUG FIX: the original re-queried forever when the table
                # was empty; report "no proxy available" instead.
                return None
            ip, post = rows[0][0], rows[0][1]
            proxy_ip = "https://" + ip + ':' + post
            # Keep the proxy only if it actually works.
            if self.JudgeIp(proxy_ip, ip):
                return proxy_ip
判断IP是否可用:
# Validate a proxy by routing a request to Baidu through it (proxies= dict).
def JudgeIp(self, proxy_ip, ip):
    """Return True if *proxy_ip* works; otherwise delete *ip* and return False.

    proxy_ip -- full proxy URL ("https://ip:port") passed to requests
    ip       -- bare IP string, used as the DB key when deleting a dead proxy
    """
    # Any page works for the check; Baidu is a cheap, always-up target.
    post_url = "https://www.baidu.com/"
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36"
    }
    try:
        proxy_dict = {"https": proxy_ip}
        print("...try to connect...")
        # BUG FIX: without a timeout a dead proxy hangs this call forever;
        # bound the wait so bad proxies fail fast.
        resp = requests.get(post_url, proxies=proxy_dict,
                            headers=header, timeout=10)
    except Exception:
        # Connection/timeout failure: the proxy is dead — purge it.
        print("invalid proxy_dict")
        self.DeleteIp(ip)
        return False
    else:
        # Any 2xx status counts as a working proxy.
        if 200 <= resp.status_code < 300:
            return True
        else:
            # Proxy answered but not successfully — purge it as well.
            self.DeleteIp(ip)
            return False
# 从数据库删除掉不可用IP
# Remove an unusable IP from the database.
def DeleteIp(self, ip):
    """Delete *ip* from ``xici_proxy``; always returns True.

    BUG FIX: the original interpolated *ip* into the SQL text with
    str.format(), an SQL-injection vector; use a parameterized query so the
    driver escapes the value.
    """
    sql_delete = """delete from xici_proxy
    where ip=%s"""
    print("删除ip:" + ip + '\n')
    self.cursor.execute(sql_delete, (ip,))
    self.sql_con.commit()
    return True
最后调用时需要用 if __name__ == "__main__": 保护入口,使代码只在当前文件被直接运行时执行,这是 Python 的惯用写法:
# Entry point: only runs when this file is executed directly, not on import.
if __name__ == "__main__":
    # Alternative scrapers, kept for reference:
    #GetXiCiProxy("https://www.xicidaili.com/wn/")
    #GetForignProxy("http://www.data5u.com/free/gwpt/index.shtml")
    proxy_picker = GetRandomIp()
    print(proxy_picker.get_random_ip())