在用python爬取网页时总是需要大量访问网站,但是有些网站会用javascript代码防止python大量访问,这时我们就需要代理IP了。但是如何才能自动获取代理IP,省去手动收集的麻烦呢?
还是BoxTool
from bs4 import BeautifulSoup
import requests
def GetsAListOfProxyIPAndPorts(PageNumber: int = 1):
    """Scrape one page of the kuaidaili.com free proxy list.

    Args:
        PageNumber: page to fetch, 1..4956 inclusive. Int-like strings
            (e.g. "3") are accepted as well.

    Returns:
        A list of {"IP": str, "PORT": str} dicts scraped from the page,
        or the string "Page number error" for an out-of-range / non-int
        page number.
    """
    def _is_int(value):
        # True when value converts cleanly to int (the original helper
        # accepted int-like strings, so keep that behavior).
        try:
            int(value)
        except (TypeError, ValueError):
            return False
        return True

    if not _is_int(PageNumber):
        return "Page number error"
    # Convert once so an int-like string ("3") doesn't crash the range
    # comparison below (str <= int raises TypeError in Python 3).
    page = int(PageNumber)
    if not 1 <= page <= 4956:
        return "Page number error"

    # BUG FIX: the original hard-coded '/inha/1/' and ignored PageNumber,
    # so every call scraped the same page. Use the requested page.
    url = 'https://www.kuaidaili.com/free/inha/{}/'.format(page)
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'}
    # timeout so a stalled server can't hang the caller forever.
    response = requests.get(url, headers=headers, timeout=10)
    response.encoding = 'utf-8'  # force utf-8 so Chinese text isn't garbled

    soup = BeautifulSoup(response.text, "lxml")
    ips = [td.text for td in soup.find_all('td', attrs={"data-title": "IP"})]
    ports = [td.text for td in soup.find_all('td', attrs={"data-title": "PORT"})]

    # BUG FIX: the original paired columns via ip_list.index(i), which
    # returns the FIRST occurrence — duplicate IPs would all be paired
    # with the first duplicate's port. zip pairs rows positionally.
    return [{"IP": ip, "PORT": port} for ip, port in zip(ips, ports)]
使用GetsAListOfProxyIPAndPorts函数(函数名的意思是"获取代理IP和端口的列表")
只需要输入1~4956数字,就能获取一堆代理IP,比如输入1
返回:[{'IP': '183.236.232.160', 'PORT': '8080'}, {'IP': '202.109.157.61', 'PORT': '9000'}, {'IP': '210.5.10.87', 'PORT': '53281'}, {'IP': '117.114.149.66', 'PORT': '55443'}, {'IP': '121.13.252.61', 'PORT': '41564'}, {'IP': '61.216.156.222', 'PORT': '60808'}, {'IP': '112.14.47.6', 'PORT': '52024'}, {'IP': '117.94.119.75', 'PORT': '9000'}, {'IP': '121.13.252.62', 'PORT': '41564'}, {'IP': '222.74.73.202', 'PORT': '42055'}, {'IP': '27.42.168.46', 'PORT': '55481'}, {'IP': '202.109.157.60', 'PORT': '9000'}, {'IP': '121.13.252.58', 'PORT': '41564'}, {'IP': '182.139.111.68', 'PORT': '9000'}, {'IP': '61.216.185.88', 'PORT': '60808'}]
注意:要安装bs4,requests,lxml哦
明天见~