使用Python爬取网站内容的时候,容易受反爬虫的限制,所以我们使用IP代理。稳定IP代理的都花钱,所以用免费代理构建自己的代理池。
免费的IP代理
http://www.xicidaili.com
使用的模块
- import requests
- import threading
- import random
- from pyquery import PyQuery as pq
爬取的网站:西刺免费代理 http://www.xicidaili.com
主要分为4个步骤
- 获取ip(使用多线程)
- 爬取网站上的ip
- 检验ip的有效性
- 创建多个headers
完整代码
import requests
import threading
import random
from pyquery import PyQuery as pq
IP_list = []  # shared list of validated "ip:port" strings, appended by findip() worker threads
Target_url = 'https://ip.cn'  # URL fetched through each candidate proxy to verify it works
def getheaders():
    """Build a request-header dict with a randomly picked User-Agent.

    Rotating the User-Agent across requests makes the crawler look like
    many different browsers, which helps dodge simple anti-scraping rules.

    :return: dict with a single 'User-Agent' entry
    """
    agents = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]
    return {'User-Agent': random.choice(agents)}
def findip(type, pagenum):
    """Scrape one xicidaili listing page and collect proxies that pass checkip().

    Validated proxies are printed and appended to the shared IP_list.

    :param type: proxy category code, 1-4 (see the URL map below)
    :param pagenum: 1-based page number of the listing to fetch
    :return: None
    """
    # Category code -> listing base URL.  Named ``base_urls`` instead of the
    # original ``list``, which shadowed the builtin.
    base_urls = {
        '1': 'http://www.xicidaili.com/nt/',  # domestic plain-HTTP proxies
        '2': 'http://www.xicidaili.com/nn/',  # domestic high-anonymity proxies
        '3': 'http://www.xicidaili.com/wn/',  # domestic HTTPS proxies
        '4': 'http://www.xicidaili.com/wt/',  # foreign HTTP proxies
    }
    url = base_urls[str(type)] + str(pagenum)
    headers = getheaders()
    try:
        html = requests.get(url=url, headers=headers, timeout=5).text
    except requests.RequestException:
        # This runs inside a worker thread; an unreachable page should not
        # crash the thread (the original had no error handling here).
        return
    doc = pq(html)
    rows = doc('#ip_list tr')  # renamed from ``all``, which shadowed the builtin
    for row in rows.items():
        # The <td> texts of one row joined by spaces; index 1 is the IP and
        # index 2 the port on xicidaili's layout.
        fields = row.find('td').text().split(' ')
        if len(fields) > 1:
            candidate = fields[1] + ':' + fields[2]
            if checkip(candidate):  # truthiness instead of ``== True``
                print(candidate)
                IP_list.append(candidate)
def getip():
    """Fetch proxies concurrently: one thread per (category, page) pair.

    Covers the 4 proxy categories x the first 3 pages of each = 12 worker
    threads, all appending into the shared IP_list via findip().

    :return: None
    """
    threads = []
    for category in range(1, 5):  # renamed from ``type`` (shadowed builtin)
        for page in range(1, 4):
            threads.append(threading.Thread(target=findip, args=(category, page)))
    print('开始爬取代理ip')
    for worker in threads:
        worker.start()
    # BUG FIX: the original second loop iterated ``e`` but called ``s.join()``,
    # so it repeatedly joined only the last-started thread and could fall
    # through while most workers were still running.
    for worker in threads:
        worker.join()
    print('爬取完成')
def checkip(ip):
    """Check whether a candidate proxy actually works.

    The proxy is considered valid when fetching Target_url through it
    returns HTTP 200 within one second.

    :param ip: candidate proxy as an "ip:port" string
    :return: True if the proxy responded with status 200, else False
    """
    headers = getheaders()  # random User-Agent for the probe request
    # Route both http and https traffic through the plain-HTTP proxy.
    proxies = {'http': f'http://{ip}', 'https': f'http://{ip}'}
    try:
        status = requests.get(url=Target_url, proxies=proxies,
                              headers=headers, timeout=1).status_code
    except requests.RequestException:
        # Narrowed from the original bare ``except:``, which also swallowed
        # KeyboardInterrupt/SystemExit; only network/proxy failures mean
        # "this proxy is dead".
        return False
    return status == 200
if __name__ == '__main__':
    # Crawl and validate proxies, then persist one "ip:port" per line.
    getip()
    with open('result.txt', 'w') as out:
        for proxy in IP_list:
            out.write(proxy + '\n')