# Scrape free proxy IPs from 89ip.cn, validate them against httpbin.org,
# and append the working ones to ip.csv.
import csv
import random
import time

import requests
from fake_useragent import UserAgent
from lxml import etree
def get_random_ua():
    """Return a random User-Agent string for each request."""
    ua = UserAgent()
    return ua.random
def get_ip_list(url):
    """Fetch one listing page and return the proxies it advertises."""
    headers = {'User-Agent': get_random_ua()}
    html = requests.get(url=url, headers=headers).text
    # Parse the page
    parse_html = etree.HTML(html)
    r_list = parse_html.xpath('//tr')
    proxy_list = []
    # Walk the table rows, skipping the header row
    for r in r_list[1:]:
        ip = r.xpath('./td[1]/text()')[0].strip()
        port = r.xpath('./td[2]/text()')[0].strip()
        proxy_list.append(
            {
                'http': 'http://{}:{}'.format(ip, port),
                'https': 'https://{}:{}'.format(ip, port),
            }
        )
    return proxy_list
def proxy_pool(url):
    """Test each scraped proxy against httpbin.org and save the usable ones."""
    proxy_list = get_ip_list(url)
    useful_proxy = []
    for proxy in proxy_list:
        headers = {'User-Agent': get_random_ua()}
        try:
            res = requests.get(
                url='http://httpbin.org/get',
                headers=headers,
                proxies=proxy,
                timeout=5,
            )
            print(res.text)
            useful_proxy.append(proxy)
            print('{} is usable'.format(proxy))
        except Exception as e:
            print('{} is unusable: {}'.format(proxy, e))
            continue
    # newline='' avoids blank rows on Windows; create the writer once, not per row
    with open('ip.csv', 'a', newline='') as f:
        writer = csv.writer(f)
        for p in useful_proxy:
            writer.writerow([p['http'], p['https']])
if __name__ == '__main__':
    base_url = 'http://www.89ip.cn/index_{}.html'
    for i in range(2, 100):
        # Format a fresh URL each pass; reassigning the template string
        # itself would destroy the '{}' placeholder after the first iteration.
        page_url = base_url.format(i)
        proxy_pool(page_url)
        # Random pause between pages to avoid hammering the site
        time.sleep(random.randint(1, 3))
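
# ---------------------------------------------------------------------------
# Usage sketch (hypothetical separate file, e.g. use_pool.py): reload the
# proxies saved in ip.csv and route one request through a randomly chosen
# proxy. This is a minimal sketch assuming ip.csv was produced by the script
# above; httpbin echoes the origin IP, which confirms the proxy took effect.
# ---------------------------------------------------------------------------
import csv
import random

import requests

def load_proxy_pool(path='ip.csv'):
    # Each row was written as [http_proxy, https_proxy]
    with open(path, newline='') as f:
        return [{'http': row[0], 'https': row[1]} for row in csv.reader(f) if row]

pool = load_proxy_pool()
proxy = random.choice(pool)  # pick one proxy at random
res = requests.get('http://httpbin.org/get', proxies=proxy, timeout=5)
print(res.json()['origin'])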