from lxml import etree
# url = "http://www.ip3366.net/free/?stype=2&page=1"
#
# headers = {
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
# response = requests.get(url, headers=headers)
# select = etree.HTML(response.text)
# ip_msg = select.xpath(r'//tbody//tr')
# for i in ip_msg:
#
# ip = i.xpath('//td/text()')
#
# break
# last_index = 0
# for e in range(7,len(ip),7):
# new_ip=ip[last_index:e]
# last_index=e
# if new_ip[3] =='HTTPS':
# https_ip = f'https://{new_ip[0]}:{new_ip[1]}'
# print(https_ip)
import requests
from concurrent.futures import ThreadPoolExecutor


def download_page(url):
    """Fetch one page and return its HTML text."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=headers)
    return response.text
def parse_page(html):
    """Pull the HTTPS proxies out of one listing page and append them to https_.txt."""
    select = etree.HTML(html)
    ip_msg = select.xpath(r'//tbody//tr')
    for row in ip_msg:
        # Relative XPath: only this row's cells, not every <td> on the page.
        cells = row.xpath('./td/text()')
        if len(cells) < 4:
            continue
        # Per the table layout the code relies on: index 0 is the IP, 1 the port, 3 the protocol type.
        if cells[3].strip() == 'HTTPS':
            https_ip = f'https://{cells[0].strip()}:{cells[1].strip()}'
            with open('https_.txt', 'a') as f:
                f.write(https_ip + '\n')
    # Add code here to extract anything else you need from the page, e.g. titles or links
    # ...
def crawl(url):
    html = download_page(url)
    parse_page(html)


if __name__ == '__main__':
    # Replace with the list of URLs you want to crawl.
    urls = ['http://www.ip3366.net/free/?stype=2&page={}'.format(i) for i in range(1, 101)]
    with ThreadPoolExecutor(max_workers=20) as executor:
        executor.map(crawl, urls)
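Once the crawl finishes, https_.txt holds one proxy URL per line. Below is a minimal sketch, not part of the original script, of how those proxies could be checked before use; the test URL (https://httpbin.org/ip), the timeout, and the check_proxy helper name are illustrative assumptions you can swap for your own target.

# Proxy-check sketch (assumption: test URL, timeout and helper name are illustrative).
import requests

def check_proxy(proxy_url, test_url='https://httpbin.org/ip', timeout=5):
    """Return True if a request routed through proxy_url succeeds."""
    try:
        resp = requests.get(test_url,
                            proxies={'http': proxy_url, 'https': proxy_url},
                            timeout=timeout)
        return resp.status_code == 200
    except requests.RequestException:
        return False

if __name__ == '__main__':
    with open('https_.txt') as f:
        proxies = [line.strip() for line in f if line.strip()]
    working = [p for p in proxies if check_proxy(p)]
    print(f'{len(working)} of {len(proxies)} proxies responded')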