from urllib import request
import re
# Cookie management (kept for reference; not currently used):
# from http import cookiejar
# # CookieJar object that stores cookies
# cookie = cookiejar.CookieJar()
# # Handler that manages cookies for requests
# cookie_handler = request.HTTPCookieProcessor(cookie)
# # Opener that routes requests through the cookie handler
# opener = request.build_opener(cookie_handler)
# Scrape the free-proxy index pages of www.66ip.cn (region pages 1-34) and
# append every proxy table row to a per-region CSV file named
# "IP<region-names>.csv", encoded in gb2312 like the site itself.
base_url = "http://www.66ip.cn/areaindex_{}/1.html"
head = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
"Cookie": "yd_cookie=f4f22cbd-08e8-4138eea6bfc1b8486f9466de3c547a0a8469; _ydclearance=77176a0de1034bf0a69ec632-693d-4013-880b-ba88e92b0124-1544185457; Hm_lvt_1761fabf3c988e7f04bec51acd4073f4=1544170688,1544178225; Hm_lpvt_1761fabf3c988e7f04bec51acd4073f4=1544178225"}

# The table-row pattern is identical on every page; compile it once outside
# the loop instead of re-matching a non-raw string literal each iteration.
row_re = re.compile(
    r"<tr><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td></tr>"
)

for n in range(1, 35):
    url = base_url.format(n)
    req = request.Request(url=url, headers=head)
    # Close the HTTP response deterministically; errors="replace" keeps one
    # page with stray non-gb2312 bytes from aborting the whole crawl.
    with request.urlopen(req) as resp:
        index_html = resp.read().decode("gb2312", errors="replace")

    # Region name(s) linked from this index page; joined to build the
    # output filename (same concatenation the original += loop produced).
    names = re.findall(r'areaindex_{}/1.html">(.*?)</a> </li>'.format(n), index_html)
    gxs = "".join(names)

    rows = row_re.findall(index_html)
    # Open the CSV once per page instead of reopening it for every row.
    with open("./IP" + gxs + ".csv", "a+", encoding="gb2312") as f:
        for row in rows:
            # Original line format: five fields, each followed by a comma.
            f.write(",".join(row) + ",\n")
# NOTE(review): the two lines below are blog-page artifacts accidentally
# pasted into the source (post title and publish date); commented out so
# the file parses.
# python爬虫爬取66代理  ("Python crawler scraping the 66 proxy site")
# 最新推荐文章于 2023-09-05 20:01:01 发布  (blog publish-date metadata)