使用python爬取代理 (Scraping free HTTP/HTTPS proxies with Python)

from lxml import etree
list2=[]  # NOTE(review): unused module-level list — never referenced in this file; looks like leftover scratch, safe to remove.
# url = "http://www.ip3366.net/free/?stype=2&page=1"
#
# headers = {
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
# response = requests.get(url, headers=headers)
# select = etree.HTML(response.text)
# ip_msg = select.xpath(r'//tbody//tr')
# for i in ip_msg:
#
#         ip = i.xpath('//td/text()')
#
#         break
# last_index = 0
# for e in range(7,len(ip),7):
#     new_ip=ip[last_index:e]
#     last_index=e
#     if new_ip[3] =='HTTPS':
#         https_ip = f'https://{new_ip[0]}:{new_ip[1]}'
#         print(https_ip)

import requests
from concurrent.futures import ThreadPoolExecutor


def download_page(url):
    """Fetch *url* and return the response body as text.

    A browser-like User-Agent is sent because the proxy-list site rejects
    default client UAs. A timeout is set so a stalled server cannot hang
    a worker thread in the pool forever.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    # timeout: (connect, read) seconds — without it requests.get can block indefinitely.
    response = requests.get(url, headers=headers, timeout=(5, 15))
    return response.text

def parse_page(html):
    """Extract HTTPS proxies from a proxy-list page and append them to https_.txt.

    Each table row is expected to hold 7 cells:
    ip, port, anonymity, type (HTTP/HTTPS), location, speed, check-time
    — TODO confirm against the live site layout.

    Fixes over the previous version:
    - uses a row-relative XPath ('./td/text()') instead of the absolute
      '//td/text()' which scanned the entire document from inside the loop;
    - no NameError when the page has no table rows;
    - no longer drops the last row of every page (the old
      range(7, len(ip), 7) chunking skipped the final chunk);
    - opens the output file once, with an explicit encoding.
    """
    select = etree.HTML(html)
    if select is None:  # etree.HTML returns None for empty/unparseable input
        return
    found = []
    for row in select.xpath('//tbody//tr'):
        cells = row.xpath('./td/text()')
        if len(cells) >= 4 and cells[3] == 'HTTPS':
            found.append(f'https://{cells[0]}:{cells[1]}')
    if found:
        # Append mode: crawl() runs in many threads, each contributing results.
        with open('https_.txt', 'a', encoding='utf-8') as f:
            f.writelines(proxy + '\n' for proxy in found)

def crawl(url):
    """Download one proxy-list page and persist the HTTPS proxies it contains."""
    parse_page(download_page(url))

if __name__ == '__main__':
    # Pages 1-100 of the free-proxy listing; adjust the range to crawl more/less.
    base_url = 'http://www.ip3366.net/free/?stype=2&page={}'
    targets = [base_url.format(page) for page in range(1, 101)]

    # The context manager waits for all submitted pages before exiting.
    with ThreadPoolExecutor(max_workers=20) as pool:
        pool.map(crawl, targets)
  • 2
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值