1. How to set an IP proxy in requests
The most direct way is to pass a proxies dict to get():
proxies = {
    'http': 'http://183.148.153.147:9999',
    'https': 'http://183.148.153.147:9999',
}
requests.get(url=url, headers=headers, proxies=proxies)
When a site bans our IP, we need a large pool of IPs to rotate through as replacements, which leads to the next part: scraping the free IPs that Xici (西刺) provides.
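As a minimal sketch of the rotation idea (the addresses in candidate_proxies are placeholders for illustration, not verified working proxies): walk through a list of candidates and fall back to the next one whenever a request fails.

import requests

# Placeholder proxies for illustration only
candidate_proxies = [
    'http://183.148.153.147:9999',
    'http://113.124.86.24:9999',
]

def get_with_rotation(url, headers):
    for proxy in candidate_proxies:
        try:
            resp = requests.get(url, headers=headers,
                                proxies={'http': proxy, 'https': proxy},
                                timeout=5)
            if resp.status_code == 200:
                return resp
        except requests.RequestException:
            # This proxy is dead or banned, move on to the next one
            continue
    return None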
2. Scrape Xici's free IP proxies and store them in MySQL
A detailed breakdown of the page fields will have to wait for now, since my own IP just got banned by Xici...
Straight to the code:
import time
from random import random
import requests
from scrapy.selector import Selector
import pymysql
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER"
}

# Table columns: ip (varchar, primary key), port (varchar), proxy_type (varchar), speed (float).
# Database name: ips, table name: ip_pond. Fill in your own host/user/passwd.
# (A DDL sketch for this table follows the listing below.)
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', db='ips', charset='utf8')
cursor = conn.cursor()

# A random delay, because Xici bans IPs very easily
def rand_sleep_time():
    sleep_time = random() * 100
    time.sleep(sleep_time)
# Refill the IP pool
def update_ip_pond():
    # The site currently has 3637 pages in total; only the first 10 are fetched here
    for i in range(1, 11):
        resp = requests.get('https://www.xicidaili.com/nn/%s' % i, headers=headers)
        if resp.status_code != 200:
            print('Failed to fetch page %s' % i)
        else:
            print('Fetched page %s' % i)
            selector = Selector(text=resp.text)
            # Use XPath to locate the table with id="ip_list"
            all_items = selector.xpath('//*[@id="ip_list"]//tr')
            ip_list = []
            # The first row is the table header, skip it
            for item in all_items[1:]:
                # Extract each field from the row with XPath
                speed_str = item.xpath('td[7]/div/@title').get()
                if speed_str:
                    speed = float(speed_str.split('秒')[0])
                    ip = item.xpath('td[2]/text()').get()
                    port = item.xpath('td[3]/text()').get()
                    proxy_type = item.xpath('td[6]/text()').get().lower()
                    ip_list.append((ip, port, proxy_type, speed))
            for ip_info in ip_list:
                # Insert the row, or update its fields if the ip already exists
                cursor.execute(
                    "insert into ip_pond(ip,port,proxy_type,speed) values ('{0}','{1}','{2}','{3}') ON DUPLICATE KEY UPDATE ip=VALUES(ip),port=VALUES(port),proxy_type=VALUES(proxy_type),speed=VALUES(speed)"
                    .format(ip_info[0], ip_info[1], ip_info[2], ip_info[3])
                )
        # Delay between pages to avoid being blacklisted
        rand_sleep_time()
        conn.commit()
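For reference, the ip_pond table used above can be created with something along these lines, reusing the conn and cursor from the listing. This is only a sketch based on the column types mentioned in the comment; the varchar lengths are my own assumption.

# Sketch of the table creation; only the column names and rough types come
# from the comment above, the varchar lengths are assumptions.
create_sql = """
CREATE TABLE IF NOT EXISTS ip_pond (
    ip VARCHAR(32) NOT NULL,
    port VARCHAR(16) NOT NULL,
    proxy_type VARCHAR(16) NOT NULL,
    speed FLOAT,
    PRIMARY KEY (ip)
) DEFAULT CHARSET=utf8
"""
cursor.execute(create_sql)
conn.commit()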
3. Define a GetIp class to fetch IPs from MySQL
class GetIp(object):
    # Delete an unusable IP from the pool
    def delete_ip(self, ip):
        delete_sql = """
            DELETE FROM ip_pond WHERE ip='{0}'
        """.format(ip)
        cursor.execute(delete_sql)
        conn.commit()
        return True
    # Check whether an IP is usable
    def judge_ip(self, ip, port, proxy_type):
        # Validate against Baidu; the test URL scheme has to match the proxy
        # type, otherwise requests will not route the request through the proxy
        http_url = '{0}://www.baidu.com'.format(proxy_type)
        proxy_url = '{0}://{1}:{2}'.format(proxy_type, ip, port)
        try:
            # Handle http and https separately
            if proxy_type == 'http':
                proxy_dict = {
                    'http': proxy_url,
                }
                response = requests.get(http_url, proxies=proxy_dict, timeout=10)
            else:
                proxy_dict = {
                    'https': proxy_url,
                }
                response = requests.get(http_url, proxies=proxy_dict, verify=False, timeout=10)
        except Exception:
            print('invalid ip and port')
            self.delete_ip(ip)
            return False
        else:
            code = response.status_code
            if 200 <= code < 300:
                print('effective ip')
                return True
            else:
                print('invalid ip and port')
                self.delete_ip(ip)
                return False
    # Pick a random entry from the database
    def get_random_ip(self):
        random_sql = """
            SELECT ip,port,proxy_type,speed FROM ip_pond ORDER BY RAND() LIMIT 1
        """
        cursor.execute(random_sql)
        for ip_info in cursor.fetchall():
            ip = ip_info[0]
            port = ip_info[1]
            proxy_type = ip_info[2]
            judge_re = self.judge_ip(ip, port, proxy_type)
            if judge_re:
                return '{0}://{1}:{2}'.format(proxy_type, ip, port)
            else:
                # Keep drawing until a working IP turns up
                return self.get_random_ip()
    # Pick the fastest IP from the database (same as above, only the SQL differs)
    def get_optimum_ip(self):
        optimum_sql = """
            SELECT ip,port,proxy_type,speed FROM ip_pond ORDER BY speed LIMIT 1
        """
        cursor.execute(optimum_sql)
        for ip_info in cursor.fetchall():
            ip = ip_info[0]
            port = ip_info[1]
            proxy_type = ip_info[2]
            judge_re = self.judge_ip(ip, port, proxy_type)
            if judge_re:
                return '{0}://{1}:{2}'.format(proxy_type, ip, port)
            else:
                return self.get_optimum_ip()
    # A small convenience wrapper that packages the fetched IP into a proxies dict
    def get_proxies(self):
        ip = self.get_random_ip()
        print(ip)
        proxy_type = ip.split(':')[0]
        proxies = {
            proxy_type: ip
        }
        return proxies
4. How to use it
if __name__ == '__main__':
    # When the chosen IP is an https proxy, requests can be a bit slow
    # First check whether ip_pond already contains data; refill it if it is empty
    sql = """
        SELECT * FROM ip_pond
    """
    check_table = cursor.execute(sql)
    if check_table:
        # Test URL, anything works here
        url = 'https://www.baidu.com'
        headers = {
            "User-Agent": "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10"
        }
        # The wrapper from the previous step returns a proxies dict directly
        proxies = GetIp().get_proxies()
        res = requests.get(url=url, headers=headers, proxies=proxies)
    else:
        update_ip_pond()
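In practice a request can still fail even after judge_ip has passed, so it helps to wrap the call in a small retry loop that discards the failing proxy and draws a fresh one. A sketch reusing the GetIp class above; max_retries and the timeout are my own choices, not values from the original code.

def fetch_with_proxy(url, headers, max_retries=3):
    # max_retries and timeout are arbitrary choices for this sketch
    getip = GetIp()
    for _ in range(max_retries):
        proxies = getip.get_proxies()
        try:
            return requests.get(url=url, headers=headers, proxies=proxies, timeout=10)
        except requests.RequestException:
            # Drop the dead proxy from the pool and try again with a new one
            bad_ip = list(proxies.values())[0].split('//')[1].split(':')[0]
            getip.delete_ip(bad_ip)
    return None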