25python成功输出第一个66免费ip代理

最新推荐文章于 2023-09-05 20:01:01 发布

jidawanghao

最新推荐文章于 2023-09-05 20:01:01 发布

阅读量376

点赞数 1

本文链接：https://blog.csdn.net/jidawanghao/article/details/108668998

版权

# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
# from selenium import webdriver
import subprocess as sp
from lxml import etree
import requests
import random
import re

# A requests Session keeps cookies across requests automatically, so they do not have to be managed by hand.
S = requests.Session()

# Listing page on 66ip.cn that is scraped for proxy entries.
target_url = 'http://www.66ip.cn/areaindex_15/1.html'

# Browser-like request headers sent along with the page request.
target_headers = {
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Referer': 'http://www.66ip.cn/areaindex_15/1.html',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
}

# Fetch the page. The timeout keeps the script from hanging forever on an
# unresponsive server, and raise_for_status() stops on an HTTP error instead
# of going on to parse an error page.
target_response = S.get(url=target_url, headers=target_headers, timeout=10)
target_response.raise_for_status()
# Decode the response body as UTF-8.
target_response.encoding = 'utf-8'
target_html = target_response.text

# Parse the page, then re-parse only the element(s) with id="footer" and take
# the first <table> found there as the proxy table.
bf1_ip_list = BeautifulSoup(target_html, 'lxml')
bf2_ip_list = BeautifulSoup(str(bf1_ip_list.find_all(id='footer')), 'lxml')
proxy_table = bf2_ip_list.table
if proxy_table is None:
    # Fail with a clear message rather than an AttributeError on None.
    raise RuntimeError('no <table> found inside id="footer" on ' + target_url)
ip_list_info = proxy_table.contents

# Keep only the <tr> children of the table. Selecting by tag name (instead of
# by odd/even position) does not depend on whitespace text nodes sitting
# between the rows. Text nodes have no tag name, hence the getattr default.
table_rows = [node for node in ip_list_info
              if getattr(node, 'name', None) == 'tr']

# Collected proxy entries, one string per table row.
proxys_list = []
# The first row is skipped, as in the original index-based selection
# (presumably the header row -- confirm against the live page).
for row in table_rows[1:]:
    dom = etree.HTML(str(row))
    ip = dom.xpath('//td[1]')
    port = dom.xpath('//td[2]')
    if not ip or not port:
        # Row without at least two cells: nothing to extract.
        continue
    ip_text = (ip[0].text or '').strip()
    port_text = (port[0].text or '').strip()
    if not ip_text or not port_text:
        # Empty cell text would produce an unusable entry.
        continue
    # Entry format is 'http://#<ip>#<port>'.
    proxys_list.append('http://' + '#' + ip_text + '#' + port_text)
    # Prints the whole list collected so far after every added entry.
    print(proxys_list)
# proxys_list now holds every proxy entry extracted from the page.