1. Code
from urllib import request
import re
import sys
from http import client
import requests
# Crawls proxies that are reachable from inside China
def spider_66():
    base_url = "http://www.66ip.cn/areaindex_{}/1.html"
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
            "Cookie": "yd_cookie=f4f22cbd-08e8-4138eea6bfc1b8486f9466de3c547a0a8469; _ydclearance=77176a0de1034bf0a69ec632-693d-4013-880b-ba88e92b0124-1544185457; Hm_lvt_1761fabf3c988e7f04bec51acd4073f4=1544170688,1544178225; Hm_lpvt_1761fabf3c988e7f04bec51acd4073f4=1544178225"}
    for n in range(1, 4):
        gxs = ""  # accumulates the region names linked from this area page
        url = base_url.format(n)
        req1 = request.Request(url=url, headers=head)
        req = request.urlopen(req1)
        index_html = req.read().decode("gb2312")  # 66ip.cn serves gb2312-encoded pages
        dizhi = re.findall("areaindex_{}/1.html\">(.*?)</a> </li>".format(n), index_html)
        for d in dizhi:
            gxs += d
        # Each table row holds: ip, port, area, anonymity type, verification time
        pattern = re.findall("<tr><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td></tr>", index_html)
        for i in pattern[1:]:  # pattern[0] is the table's header row
            result = {}
            result['ip'] = i[0]
            result['port'] = i[1]
            result['area'] = i[2]
            result['type'] = i[3]
            result['time'] = i[4]
            yield result
        print('\n')
def test_66(ip='221.122.91.64', port='80', test_url='https://www.cnblogs.com/YinJay/p/10909442.html'):
    # Alternative via http.client, kept for reference. Note the original call was
    # buggy: getresponse() takes no arguments; conn.request('GET', test_url)
    # must be issued first.
    # conn = client.HTTPConnection(ip, port, timeout=5)
    # conn.request('GET', test_url)
    # response = conn.getresponse()
    # print(response.status, response.reason)
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }
    ip_url_next = '://' + ip + ':' + port
    proxies = {'http': 'http' + ip_url_next, 'https': 'https' + ip_url_next}
    r = requests.get(test_url, headers=header, proxies=proxies, timeout=3)
    print(r.status_code)
    print(r.encoding)
    print(r.content.decode('utf-8'))
    print(r.request.headers)
test_66()
# result_66 = spider_66()
# while True:
#     try:
#         print(next(result_66))
#     except StopIteration:
#         sys.exit()
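The commented-out driver above walks the generator by hand with next(). A minimal sketch of wiring the two functions together; the __main__ guard and the exception handling are additions for illustration, not part of the original post:

if __name__ == '__main__':
    # Pull proxies from the spider_66 generator and try each one;
    # proxies that fail or time out are simply skipped.
    for proxy in spider_66():
        try:
            test_66(ip=proxy['ip'], port=proxy['port'])
        except requests.exceptions.RequestException:
            continue  # dead or slow proxy, move on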
2. Extensions
(1) Review of the requests library (a short sketch follows below)
requests.get()
requests.post()
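As a quick refresher, here is a small sketch of both calls; the httpbin.org endpoints and payloads are illustrative, not from the original post:

import requests

# GET: query parameters go in params= and are encoded into the URL
r = requests.get('https://httpbin.org/get', params={'q': 'proxy'}, timeout=5)
print(r.status_code, r.json()['args'])

# POST: a dict passed as data= is sent as a form body
# (use json=... instead to send a JSON body)
r = requests.post('https://httpbin.org/post', data={'ip': '1.2.3.4'}, timeout=5)
print(r.json()['form'])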
(2) Python basics review (each topic in this list is sketched briefly after it)
generators
decorators
iterators
multithreading
multiprocessing
Scrapy
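Generators: spider_66 above is already a generator; yield suspends the function and next() resumes it. A stripped-down illustration (count_up is a made-up example, not from the post):

def count_up(limit):
    """Lazily yield 0 .. limit-1; local state survives between next() calls."""
    n = 0
    while n < limit:
        yield n  # execution pauses here until the caller asks for the next value
        n += 1

gen = count_up(3)
print(next(gen))  # 0
print(next(gen))  # 1
print(list(gen))  # [2]; one more next(gen) would raise StopIteration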
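Decorators: a function that wraps another function to add behavior. A sketch of a timing decorator, which could wrap something like test_66 (the timed and slow_add names are illustrative):

import functools
import time

def timed(func):
    """Print how long the wrapped function takes to run."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        print('{} took {:.2f}s'.format(func.__name__, time.time() - start))
        return result
    return wrapper

@timed
def slow_add(a, b):
    time.sleep(0.1)
    return a + b

print(slow_add(1, 2))  # prints the timing line, then 3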
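Iterators: any object with __iter__ and __next__. An illustrative ProxyPool class (hypothetical, not in the post) that hands out proxy addresses one at a time:

class ProxyPool:
    def __init__(self, proxies):
        self._proxies = list(proxies)
        self._index = 0

    def __iter__(self):
        return self  # an iterator returns itself

    def __next__(self):
        if self._index >= len(self._proxies):
            raise StopIteration  # signals the for loop to stop
        proxy = self._proxies[self._index]
        self._index += 1
        return proxy

for p in ProxyPool(['1.2.3.4:80', '5.6.7.8:8080']):
    print(p)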
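Multithreading: proxy checking is I/O-bound, so threads speed it up despite the GIL. A sketch using concurrent.futures; check_proxy, the target URL and the candidate addresses are assumptions for illustration:

import requests
from concurrent.futures import ThreadPoolExecutor

def check_proxy(addr):
    """Return addr if the proxy answers within 3 seconds, else None."""
    proxies = {'http': 'http://' + addr, 'https': 'https://' + addr}
    try:
        requests.get('http://www.baidu.com', proxies=proxies, timeout=3)
        return addr
    except requests.exceptions.RequestException:
        return None

candidates = ['221.122.91.64:80', '1.2.3.4:8080']  # illustrative addresses
with ThreadPoolExecutor(max_workers=10) as pool:
    alive = [a for a in pool.map(check_proxy, candidates) if a]
print(alive)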
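Multiprocessing: separate processes sidestep the GIL for CPU-bound work such as parsing large pages. A minimal sketch with multiprocessing.Pool (square is a stand-in task):

from multiprocessing import Pool

def square(n):
    return n * n  # stand-in for real CPU-bound work

if __name__ == '__main__':  # the guard is required on Windows
    with Pool(processes=4) as pool:
        print(pool.map(square, range(10)))  # [0, 1, 4, ..., 81]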
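Scrapy: a crawling framework that replaces the hand-rolled urllib + re loop above. A minimal spider sketch; the CSS selectors are assumptions and would have to be adapted to 66ip.cn's real markup:

import scrapy

class ProxySpider(scrapy.Spider):
    name = 'proxy66'
    start_urls = ['http://www.66ip.cn/areaindex_1/1.html']

    def parse(self, response):
        for row in response.css('table tr')[1:]:  # skip the header row
            cells = row.css('td::text').getall()
            if len(cells) >= 2:
                yield {'ip': cells[0], 'port': cells[1]}

Run it with scrapy runspider spider.py -o proxies.json; Scrapy then handles the scheduling, retries and throttling that the urllib version does by hand.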