from urllib import request
import re
# Cookie management (kept for reference; not currently used):
# from http import cookiejar
# # CookieJar object that stores cookies
# cookie = cookiejar.CookieJar()
# # Handler that manages cookies for requests
# cookie_handler = request.HTTPCookieProcessor(cookie)
# # Opener that routes requests through the cookie handler
# opener = request.build_opener(cookie_handler)
# Scrape the free-proxy index pages of www.66ip.cn (region pages 1-34) and
# append every proxy table row to a per-region CSV file named
# "IP<region-names>.csv", encoded in gb2312 like the site itself.
base_url = "http://www.66ip.cn/areaindex_{}/1.html"
head = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
"Cookie": "yd_cookie=f4f22cbd-08e8-4138eea6bfc1b8486f9466de3c547a0a8469; _ydclearance=77176a0de1034bf0a69ec632-693d-4013-880b-ba88e92b0124-1544185457; Hm_lvt_1761fabf3c988e7f04bec51acd4073f4=1544170688,1544178225; Hm_lpvt_1761fabf3c988e7f04bec51acd4073f4=1544178225"}

# The table-row pattern is identical on every page; compile it once outside
# the loop instead of re-matching a non-raw string literal each iteration.
row_re = re.compile(
    r"<tr><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td></tr>"
)

for n in range(1, 35):
    url = base_url.format(n)
    req = request.Request(url=url, headers=head)
    # Close the HTTP response deterministically; errors="replace" keeps one
    # page with stray non-gb2312 bytes from aborting the whole crawl.
    with request.urlopen(req) as resp:
        index_html = resp.read().decode("gb2312", errors="replace")

    # Region name(s) linked from this index page; joined to build the
    # output filename (same concatenation the original += loop produced).
    names = re.findall(r'areaindex_{}/1.html">(.*?)</a> </li>'.format(n), index_html)
    gxs = "".join(names)

    rows = row_re.findall(index_html)
    # Open the CSV once per page instead of reopening it for every row.
    with open("./IP" + gxs + ".csv", "a+", encoding="gb2312") as f:
        for row in rows:
            # Original line format: five fields, each followed by a comma.
            f.write(",".join(row) + ",\n")
# NOTE(review): the two lines below are blog-page artifacts accidentally
# pasted into the source (post title and publish date); commented out so
# the file parses.
# python爬虫爬取66代理  ("Python crawler scraping the 66 proxy site")
# 最新推荐文章于 2023-09-05 20:01:01 发布  (blog publish-date metadata)