# coding:utf-8
"""Scrape free proxies from kuaidaili.com and keep the ones that work.

Workflow:
  1. Scrape pages 52-74 of the free-proxy list and save every row to ip.csv
     (one "protocol ip port" line per proxy, under an 'ip' header).
  2. Re-read ip.csv, try each proxy against httpbin.org, and write the
     proxies that answer with HTTP 200 to 2.csv as {'http': 'ip:port'} lines.
"""
import requests
from lxml import etree
from time import sleep

import pandas as pd

# Browser-like User-Agent so the site does not reject the request outright.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36 '
                  'Edg/91.0.864.54'
}
PROXY_FILE = './ip.csv'   # raw scrape results; also the input of the filter step
GOOD_FILE = './2.csv'     # proxies that passed the liveness check
CHECK_URL = 'http://httpbin.org/ip'


def scrape_proxies(first_page=52, last_page=74):
    """Scrape kuaidaili free-proxy list pages [first_page, last_page].

    Returns a list of "protocol ip port" strings, one per table row.
    NOTE(review): the XPath assumes the 2021-era page layout — confirm the
    table is still at //*[@id="list"]/table before relying on this.
    """
    rows = []
    for page in range(first_page, last_page + 1):
        url = 'https://www.kuaidaili.com/free/inha/{}/'.format(page)
        # timeout so one stalled page cannot hang the whole scrape
        html = requests.get(url=url, headers=HEADERS, timeout=10).text
        tree = etree.HTML(html)
        for tr in tree.xpath('//*[@id="list"]/table/tbody/tr'):
            ip = tr.xpath('./td[1]/text()')[0]
            port = tr.xpath('./td[2]/text()')[0]
            protocol = tr.xpath('./td[4]/text()')[0]
            rows.append('{} {} {}'.format(protocol, ip, port))
        print(rows)
        sleep(2)  # be polite: the site rate-limits aggressive scrapers
    return rows


def save_proxies(rows, path=PROXY_FILE):
    """Write scraped rows to *path* under an 'ip' header line.

    The header makes pandas.read_csv expose the rows as data['ip']
    (the original wrote no header and then read a different file, '1.csv').
    """
    with open(path, 'w', encoding='utf-8') as fp:
        fp.write('ip\n')
        fp.writelines(row + '\n' for row in rows)


def filter_proxies(path=PROXY_FILE):
    """Return the "ip:port" of every proxy in *path* that is alive.

    A proxy is alive when a GET to CHECK_URL routed through it returns 200.
    """
    data = pd.read_csv(path)
    good = []
    for row in data['ip']:
        parts = row.split(' ')  # "protocol ip port"
        proxy = {'http': parts[1] + ':' + parts[2]}
        try:
            # Short timeout: dead proxies would otherwise block indefinitely.
            code = requests.get(CHECK_URL, headers=HEADERS,
                                proxies=proxy, timeout=5).status_code
        except requests.RequestException:
            # Narrow except: connection/timeout/proxy errors mean "dead",
            # but KeyboardInterrupt etc. still propagate.
            print(proxy, 'NO')
            continue
        if code == 200:
            good.append(proxy['http'])
            print(proxy, 'yes')
        else:
            print(proxy, 'NO')
    return good


def save_good_proxies(proxies, path=GOOD_FILE):
    """Write each working proxy as a valid {'http': 'ip:port'} literal line.

    (The original string template dropped the opening quote of the address.)
    """
    with open(path, 'w', encoding='utf-8') as fp:
        for addr in proxies:
            fp.write("{'http': '" + addr + "'}\n")


if __name__ == '__main__':
    save_proxies(scrape_proxies())
    save_good_proxies(filter_proxies())
requests爬取快代理ip及筛选
最新推荐文章于 2024-05-29 14:13:29 发布