from lxml import etree
from fake_useragent import UserAgent
import time
import json
import requests
import csv
def check_ip(li, test_url='https://xa.anjuke.com/sale/weiyangq/?from=navigation', timeout=0.1):
    """Return the entries of ``li`` whose proxy can fetch ``test_url``.

    Args:
        li: Iterable of dicts mapping a proxy type to a proxy address,
            e.g. ``{'HTTP': '1.2.3.4:8080'}``.
        test_url: URL requested through each proxy to probe it.
        timeout: Seconds to wait for the probe request before giving up.

    Returns:
        A list with the entries of ``li`` (unchanged objects) for which the
        probe request answered with HTTP status 200.
    """
    if not li:
        return []

    headers = {'User-Agent': UserAgent().random}
    ip_ok = []
    for entry in li:
        try:
            # requests looks proxies up by the *lowercase* URL scheme and
            # expects a proxy URL, so normalise keys and add a scheme when
            # the address has none.
            proxies = {
                str(kind).lower(): addr if '://' in addr else 'http://' + addr
                for kind, addr in entry.items()
            }
            if not proxies:
                # Nothing to probe; without a proxy the request would go out
                # directly and report a false positive.
                continue
            # Route both schemes through the proxy so the probe really uses
            # it, whatever the scheme of test_url is.
            fallback = next(iter(proxies.values()))
            proxies.setdefault('http', fallback)
            proxies.setdefault('https', fallback)

            response = requests.get(
                test_url, headers=headers, proxies=proxies, timeout=timeout
            )
            if response.status_code == 200:
                ip_ok.append(entry)
        except Exception as e:
            # Best-effort check: a dead or malformed proxy is reported and skipped.
            print(e)
    return ip_ok
# Scrape free proxy listings page by page, collect them in `li`, then keep
# only the ones that pass check_ip().
li = []
ua = UserAgent()  # built once; a User-Agent string is drawn from it per page
for i in range(150, 170):
    print("==========正在爬取第{}页===========".format(i))
    url = 'https://www.kuaidaili.com/free/inha/{}/'.format(i)
    headers = {
        'User-Agent': ua.random
    }
    try:
        # A timeout keeps one unresponsive page from hanging the whole run.
        page_text = requests.get(url=url, headers=headers, timeout=10).text
    except requests.RequestException as e:
        # Report the failed page and move on instead of aborting the crawl.
        print(e)
        time.sleep(1)
        continue
    tree = etree.HTML(page_text)
    # etree.HTML may yield None for an unparsable/empty document.
    div_list = tree.xpath('//*[@id="list"]/table//tr') if tree is not None else []
    for div in div_list:
        # Evaluate the cell query once per row; rows without enough <td>
        # cells (e.g. a header row) are skipped explicitly.
        cells = div.xpath('./td/text()')
        if len(cells) < 4:
            continue
        # Cells 0 and 1 are joined as host:port; cell 3 is the proxy type.
        ip = cells[0].strip() + ":" + cells[1].strip()
        proxy_type = cells[3].strip()
        # requests expects lowercase scheme keys and a proxy URL as value.
        proxies_dict = {proxy_type.lower(): 'http://' + ip}
        print(proxies_dict)
        li.append(proxies_dict)
    # Pause between pages to avoid hammering the listing site.
    time.sleep(1)
print("ip数量:", len(li))
ok_ip = check_ip(li)
print("ok的ip:", ok_ip)
print("数目:", len(ok_ip))
# 构建ip池
# 最新推荐文章于 2024-02-04 10:34:17 发布