Scraping dynamic proxy IPs from the kuaidaili free-proxy site
import requests
import time
import random
from lxml import etree
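# NOTE: judging by fields such as __jdu and pin=jd_..., the cookie below is a
# jd.com session cookie carried over from another project; kuaidaili's free
# pages can usually be fetched without it.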
cookie = """shshshfpa=baf64610-d2a6-0761-dd41-dd2abc541c0a-1602122238; __jdu=160212223903369816534; shshshfpb=yVpBghADH9esETOim4DLz2A%3D%3D; areaId=13; ipLoc-djd=13-1000-1002-0; TrackID=1mCGXQ_KDXhEAIfY7UTde_zOtAen6GoxzGZcMyBLRssMzPp_vDvgWkC7Kte0ayvtvvYNSwS7VUZ6jwYeVDwKbGOn33gEFv16Tt8xPIc-ivwo; pinId=N3uZ6aFqmKttEU4tp0fVXbV9-x-f3wj7; pin=jd_5fa1c56f34819; unick=%E9%83%AD%E5%BA%86%E6%B1%9D; ceshi3.com=000; _tp=fySfDglDqxh9lnk3%2FjqMKipqRmWr%2BuRWRKYMyntrQy4%3D; _pst=jd_5fa1c56f34819; jwotest_product=99; unpl=V2_ZzNtbUMERBQgAU9TeBxZAmJUEltLBxZGdVtGUS5NCANhBBINclRCFnUUR1NnGVUUZwUZXENcQxFFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZHseXQZnABFeQF9CF3cPTl17HlgHYjMiWnJncxN2C0BQeSldNWYzUAkeVEYVdwpOGXseXQZnABFeQF9CF3cPTl17HlgHYjMTbUE%3d; __jdv=76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_0b60d88624464f178adb1b14dee7761a|1624006835098; user-key=7174d1cb-8908-4901-836d-63d1d1f7c712; __jdc=122270672; __jda=122270672.160212223903369816534.1602122239.1624021969.1624069754.39; JSESSIONID=CFA1610910BB0B51F37AF7CEAA734B41.s1; 3AB9D23F7A4B3C9B=LKOARNNSX36NKIRVUCN4FCFKQIJXETIXQCC5MXMYCSJRVSTV6DQBISY6BVWS7V6G3PRKAQ7SPN64BML4WMZUYFJAA4; shshshfp=a25e89bc2bc70d081ba26f4a3a6fd2e5; shshshsID=b98e34beb41756511992266c5feae031_5_1624069855785; __jdb=122270672.5.160212223903369816534|39.1624069754"""
class Get_ip(object):
    def __init__(self, page_num):
        self.url = "https://www.kuaidaili.com/free/inha/{}/"
        self.page_num = page_num
        self.ip_list = self.get_ip(self.page_num)

    def get_ip(self, page_num):
        """
        Fetch the list of dynamic proxy IPs.
        :param page_num: number of pages to crawl
        :return: list of scraped proxies
        """
        proxies = list()  # list of proxy IPs
        # crawl pages 1..page_num inclusive
        for page in range(1, page_num + 1):
            # request headers
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36",
                "Cookie": cookie
            }
            time.sleep(0.5)  # throttle requests so the site is less likely to block us
            # url = "https://www.kuaidaili.com/free/"
            url = self.url.format(str(page))
            response = requests.get(url, headers=headers)
            response.encoding = "utf-8"
            text = response.text
            selector = etree.HTML(text)
            # td_list = selector.xpath('/
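The listing breaks off at the XPath step. For reference, here is a minimal sketch of how the rest of get_ip might parse the rows and return the list, assuming the free list marks its cells with td[@data-title="IP"] and td[@data-title="PORT"] (an assumption about the page markup, not the original code). The row loop would sit inside the page loop above, and the return closes out get_ip:

            # assumed markup: <td data-title="IP">...</td> <td data-title="PORT">...</td>
            tr_list = selector.xpath('//table//tbody/tr')
            for tr in tr_list:
                ip = tr.xpath('./td[@data-title="IP"]/text()')[0]
                port = tr.xpath('./td[@data-title="PORT"]/text()')[0]
                # store as "ip:port" so entries drop straight into requests' proxies dict
                proxies.append("{}:{}".format(ip, port))
        return proxies

A scraped entry can then be plugged into requests' proxies parameter; a hypothetical usage example (ip_getter is an illustrative name, not from the original):

ip_getter = Get_ip(3)
proxy = {"http": "http://" + random.choice(ip_getter.ip_list)}
print(requests.get("http://httpbin.org/ip", proxies=proxy, timeout=5).text)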