Crawler Proxy Pool (IP:PORT)

1. Code

from urllib import request
import re
import requests

# Scrapes proxies that are reachable from inside China

def spider_66():
    """Generator: yield proxies scraped from www.66ip.cn, one dict at a time."""
    base_url = "http://www.66ip.cn/areaindex_{}/1.html"
    # The Cookie below was captured from a browser session and will eventually expire.
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
            "Cookie": "yd_cookie=f4f22cbd-08e8-4138eea6bfc1b8486f9466de3c547a0a8469; _ydclearance=77176a0de1034bf0a69ec632-693d-4013-880b-ba88e92b0124-1544185457; Hm_lvt_1761fabf3c988e7f04bec51acd4073f4=1544170688,1544178225; Hm_lpvt_1761fabf3c988e7f04bec51acd4073f4=1544178225"}

    for n in range(1, 4):  # first three area index pages
        url = base_url.format(n)
        req = request.urlopen(request.Request(url=url, headers=head))
        # The page declares gb2312; gbk is a superset and decodes it more robustly.
        index_html = req.read().decode("gbk")
        pattern = re.findall("<tr><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td></tr>", index_html)
        for row in pattern[1:]:  # pattern[0] is the table's header row
            result = {}
            result['ip'] = row[0]
            result['port'] = row[1]
            result['area'] = row[2]
            result['type'] = row[3]
            result['time'] = row[4]
            yield result


def test_66(ip='221.122.91.64', port='80', test_url='https://www.cnblogs.com/YinJay/p/10909442.html'):
    """Check whether ip:port works as a proxy by fetching test_url through it."""
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }
    ip_url_next = '://' + ip + ':' + port
    proxies = {'http': 'http' + ip_url_next, 'https': 'https' + ip_url_next}
    try:
        r = requests.get(test_url, headers=header, proxies=proxies, timeout=3)
    except requests.RequestException as e:  # dead or slow proxies raise Timeout, ProxyError, etc.
        print('proxy failed:', e)
        return False

    print(r.status_code)
    print(r.encoding)
    print(r.text)  # decoded with the detected encoding; safer than r.content.decode('utf-8')
    print(r.request.headers)
    return r.status_code == 200

# Smoke test against a single known proxy
test_66()

# To print every proxy scraped from 66ip.cn (a for loop handles StopIteration itself):
# for proxy in spider_66():
#     print(proxy)
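The two pieces combine naturally into a small pool builder: scrape candidates with spider_66() and keep only those that pass test_66(). This is a minimal sketch; the build_pool name and the 20-proxy cap are illustrative choices, not part of the original code.

def build_pool(limit=20):
    """Collect up to `limit` proxies that survive the liveness check."""
    pool = []
    for proxy in spider_66():
        if test_66(ip=proxy['ip'], port=proxy['port']):
            pool.append(proxy)
        if len(pool) >= limit:
            break
    return pool

# pool = build_pool()
# print(pool)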




2. Further Topics

(1) requests library review (a short refresher follows this list)
requests.get()
requests.post()
(2) Python fundamentals review
generators
decorators
iterators
multithreading (see the concurrent validation sketch after this list)
multiprocessing
Scrapy
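As a quick refresher on the two requests calls above, here is a minimal example; httpbin.org is used only as a convenient echo service and has nothing to do with the proxy site.

import requests

# GET with query parameters; the server echoes them back under 'args'
r = requests.get('https://httpbin.org/get', params={'q': 'proxy'}, timeout=5)
print(r.status_code, r.json()['args'])

# POST with a form body; echoed back under 'form'
r = requests.post('https://httpbin.org/post', data={'ip': '1.2.3.4', 'port': '80'}, timeout=5)
print(r.json()['form'])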
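And a taste of the multithreading topic, applied back to the proxy pool: checking proxies is I/O-bound, so validating them concurrently with concurrent.futures gives a large speedup. A sketch follows; the is_alive/validate_pool helpers are illustrative names, not from the original post.

from concurrent.futures import ThreadPoolExecutor
import requests

def is_alive(proxy, test_url='https://www.baidu.com', timeout=3):
    """Return the proxy dict if a request succeeds through it, else None."""
    addr = 'http://{}:{}'.format(proxy['ip'], proxy['port'])
    try:
        r = requests.get(test_url, proxies={'http': addr, 'https': addr}, timeout=timeout)
        return proxy if r.status_code == 200 else None
    except requests.RequestException:
        return None

def validate_pool(proxies, workers=10):
    """Check many proxies in parallel, keeping only the live ones."""
    with ThreadPoolExecutor(max_workers=workers) as executor:
        return [p for p in executor.map(is_alive, proxies) if p]

# live = validate_pool(list(spider_66()))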
