To reduce the chance of the server banning your IP for sending requests too frequently, you can route requests through an IP proxy.
Usage: just add a proxies parameter to the requests call:
page_text = requests.get(url=url, headers=headers, proxies={'http': '60.167.132.19:9999'} )
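If the target page is served over HTTPS, the proxies dictionary also needs an 'https' entry, because requests picks the proxy whose key matches the scheme of the requested URL. A minimal sketch, reusing the placeholder proxy address above with an arbitrary test URL:
import requests

url = 'https://www.example.com'          # placeholder target URL
headers = {'User-Agent': 'Mozilla/5.0'}  # placeholder User-Agent
proxies = {
    'http': '60.167.132.19:9999',   # used when the requested URL is http://
    'https': '60.167.132.19:9999',  # used when the requested URL is https://
}
page_text = requests.get(url=url, headers=headers, proxies=proxies, timeout=10).text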
Free proxy IPs can be obtained from sites such as:
1. Kuaidaili: https://www.kuaidaili.com/free/
2. Xici proxy: https://www.xicidaili.com/nn/
There are other sources online as well.
Template:
# At module level: load the proxy pool and the configuration
proxies = getIpPool.getproxies()
MAX_num = 20   # upper bound on the proxy index to sample from
openFlag = 1   # 0 = disable the IP proxy, 1 = enable it
outTime = 10   # request timeout in seconds

# Inside the function that sends the request:
global proxies
global openFlag
# Pick a random proxy from the pool
ip = proxies[random.randint(0, MAX_num if len(proxies) > MAX_num else len(proxies) - 1)]
if openFlag == 1:
    response = requests.get(url, timeout=outTime, headers=headers, proxies={ip[0]: ip[1]}, verify=False).text
    # print(ip)
else:
    response = requests.get(url, timeout=outTime, headers=headers, verify=False).text
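A common refinement of this template is to switch to a different proxy when a request fails. The sketch below is just one way to do that, assuming getIpPool.getproxies() returns a non-empty list of (scheme, 'ip:port') tuples as in the module shown further down; fetch is a hypothetical helper name:
import random
import requests
import getIpPool

proxies = getIpPool.getproxies()

def fetch(url, headers, retries=3, timeout=10):
    # Try a randomly chosen proxy; switch to another one if the request fails
    for _ in range(retries):
        ip = random.choice(proxies)
        try:
            return requests.get(url, headers=headers, timeout=timeout,
                                proxies={ip[0]: ip[1]}, verify=False).text
        except requests.RequestException as ex:
            print('proxy', ip, 'failed:', ex)
    # Fall back to a direct request if every proxied attempt failed
    return requests.get(url, headers=headers, timeout=timeout, verify=False).text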
Below is the code for scraping free proxies; it saves the IPs from the Kuaidaili site into a data folder in the same directory as the script.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/2/5 10:42
# @Author : ystraw
# @Site :
# @File : getIpPool.py
# @Software: PyCharm Community Edition
# @function: fetch usable proxy IPs from Kuaidaili
# https://www.kuaidaili.com/free/inha/2/
import os
import requests
import time
import random
from bs4 import BeautifulSoup
# Write a string to a file:
def writeFile(filename, file):
    # Make sure the target directory exists before writing
    os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(file)
        print(filename, 'written!')
# Read a file back into a string:
def readFile(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        content = f.read()
        print(filename, 'read!')
    return content
# Fetch the IP pool from Kuaidaili and save it to a file
def download_IP():
    ua_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    ]
    agent = random.choice(ua_list)
    headers = {
        "Connection": "keep-alive",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": "https://piaofang.maoyan.com/dashboard?movieId=1211270&date=2011-01-02",
        "User-Agent": agent,
    }
    # Example proxy entry (not actually used for the requests below):
    proxies = {'http': '223.199.30.236:9999'}
    # sess = requests.session()
    # Collected data:
    IPpool = ''
    # Range of pages to scrape:
    for i in range(1, 5):
        try:
            time.sleep(random.randint(1, 2))
            url = 'https://www.kuaidaili.com/free/inha/' + str(i) + '/'
            print('Requesting:', url)
            response = requests.get(url, headers=headers).text
            # print(response)
            # Parse the page:
            bs = BeautifulSoup(response, 'lxml')
            # Extract the table of proxies:
            tbody = bs.findAll('tbody')[0]
            trList = tbody.findAll('tr')
            for tr in trList:
                # print(tr)
                tdList = tr.findAll('td')
                for td in tdList:
                    # print(td.string, end=' ')
                    IPpool += td.string + ','
                IPpool += '\n'
        except Exception as ex:
            print('Failed to scrape this page of proxies:', ex)
    # print(IPpool)
    # Only write the pool out if enough entries were collected
    # (3328 characters corresponds to roughly 50 rows):
    if len(IPpool) > 3328:
        writeFile('./data/IPpool.txt', IPpool)
    else:
        print('Fewer than about 50 proxies obtained, nothing written!')
# Read the IP file and turn it into a list of (scheme, 'ip:port') pairs
def getIP():
    ipstring = readFile('./data/IPpool.txt')
    ipList = ipstring.split('\n')
    proxies = []  # e.g. ('http', '223.199.27.122:9999')
    for ip in ipList:
        if not ip:
            continue
        ip = ip.split(',')
        try:
            # Column 3 is the protocol type, columns 0 and 1 are the address and port;
            # lower-case the scheme so it matches the keys requests looks up
            proxies.append((ip[3].lower(), ip[0] + ':' + ip[1]))
        except Exception as ex:
            print('Failed to build the ip pool!', ex)
    return proxies
# Return the IP pool
def getproxies():
    # Scrape fresh IPs from the website:
    # download_IP()
    # Read the saved IPs:
    proxies = getIP()
    # e.g. [('http', '60.167.132.19:9999'), ('http', '60.167.132.19:9999')]
    return proxies
if __name__ == '__main__':
    # Scrape fresh IPs from the website
    download_IP()
    # Read the saved IPs back in
    proxies = getIP()
    # Test each proxy against a simple page
    url = 'https://www.baidu.com'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3730.400 QQBrowser/10.5.3805.400'}
    for i in range(len(proxies)):
        ip = proxies[i]
        try:
            page_text = requests.get(url=url, headers=headers, proxies={ip[0]: ip[1]}, timeout=10)
            print(i, page_text)
        except requests.RequestException as ex:
            print(i, 'proxy failed:', ex)
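Free proxies go stale quickly, so before handing the pool to a crawler it can help to filter it down to addresses that still respond. The sketch below packages the test loop above into a reusable helper; filter_alive is a hypothetical name, and the test URL and timeout are arbitrary choices:
import requests

def filter_alive(proxies, test_url='https://www.baidu.com', timeout=5):
    # Keep only the proxies that manage to fetch the test page without an error
    headers = {'User-Agent': 'Mozilla/5.0'}
    alive = []
    for scheme, addr in proxies:
        try:
            requests.get(test_url, headers=headers, timeout=timeout,
                         proxies={scheme: addr}, verify=False)
            alive.append((scheme, addr))
        except requests.RequestException:
            pass
    return alive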