from lxml import etree
# url = "http://www.ip3366.net/free/?stype=2&page=1"
#
# headers = {
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
# response = requests.get(url, headers=headers)
# select = etree.HTML(response.text)
# ip_msg = select.xpath(r'//tbody//tr')
# for i in ip_msg:
#
# ip = i.xpath('//td/text()')
#
# break
# last_index = 0
# for e in range(7,len(ip),7):
# new_ip=ip[last_index:e]
# last_index=e
# if new_ip[3] =='HTTPS':
# https_ip = f'https://{new_ip[0]}:{new_ip[1]}'
# print(https_ip)
import requests
from concurrent.futures import ThreadPoolExecutor


def download_page(url):
    """Fetch one page and return its HTML text."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=headers)
    return response.text
def parse_page(html):
    """Pull the HTTPS proxies out of one listing page and append them to https_.txt."""
    select = etree.HTML(html)
    ip_msg = select.xpath(r'//tbody//tr')
    for row in ip_msg:
        # Relative XPath: only this row's cells, not every <td> on the page.
        cells = row.xpath('./td/text()')
        if len(cells) < 4:
            continue
        # Per the table layout the code relies on: index 0 is the IP, 1 the port, 3 the protocol type.
        if cells[3].strip() == 'HTTPS':
            https_ip = f'https://{cells[0].strip()}:{cells[1].strip()}'
            with open('https_.txt', 'a') as f:
                f.write(https_ip + '\n')
    # Add code here to extract anything else you need from the page, e.g. titles or links
    # ...
def crawl(url):
    html = download_page(url)
    parse_page(html)


if __name__ == '__main__':
    # Replace with the list of URLs you want to crawl.
    urls = ['http://www.ip3366.net/free/?stype=2&page={}'.format(i) for i in range(1, 101)]
    with ThreadPoolExecutor(max_workers=20) as executor:
        executor.map(crawl, urls)
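Once the crawl finishes, https_.txt holds one proxy URL per line. Below is a minimal sketch, not part of the original script, of how those proxies could be checked before use; the test URL (https://httpbin.org/ip), the timeout, and the check_proxy helper name are illustrative assumptions you can swap for your own target.

# Proxy-check sketch (assumption: test URL, timeout and helper name are illustrative).
import requests

def check_proxy(proxy_url, test_url='https://httpbin.org/ip', timeout=5):
    """Return True if a request routed through proxy_url succeeds."""
    try:
        resp = requests.get(test_url,
                            proxies={'http': proxy_url, 'https': proxy_url},
                            timeout=timeout)
        return resp.status_code == 200
    except requests.RequestException:
        return False

if __name__ == '__main__':
    with open('https_.txt') as f:
        proxies = [line.strip() for line in f if line.strip()]
    working = [p for p in proxies if check_proxy(p)]
    print(f'{len(working)} of {len(proxies)} proxies responded')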