- 学习什么是IP
IP(Internet Protocol,网际协议)地址是计算机在网络中相互通信时使用的地址
- 为什么会出现IP被封
访问频次过高,被服务器判定为恶意攻击
- 如何应对IP被封的问题
伪造User-Agent
爬取时注意时间间隔
- 抓取西刺代理,并构建自己的代理池
import requests
import traceback
import re
def get_ip_list(resp):
    """Extract proxy URLs from the HTML text of a proxy-list page.

    A row is the text between an ``alt="Cn" /></td>`` marker and the next
    ``</tr>``.  Inside a row every attribute-less ``<td>...</td>`` cell is
    collected, and cell 3 (lower-cased), cell 0 and cell 1 are joined as
    ``<cell3>://<cell0>:<cell1>``.
    NOTE(review): presumably these cells are protocol, IP and port -- confirm
    against the page layout.

    Args:
        resp: HTML source of the page, as a string.

    Returns:
        A list of proxy URL strings in page order.  Rows with fewer than
        four plain cells are skipped.  The list is empty when nothing
        matches or when ``resp`` is not a string (in which case the error
        is printed together with a traceback).
    """
    # Raw strings keep ``\d`` / ``\D`` as regex classes instead of invalid
    # string escapes; ``[\d\D]`` matches any character including newlines.
    row_pattern = r'alt="Cn" /></td>([\d\D]*?)</tr>'
    cell_pattern = r'<td>([\d\D]*?)</td>'

    proxies = []
    try:
        rows = re.findall(row_pattern, resp)
    except TypeError:
        # Non-string input: report it, but still hand back a list so that
        # callers can iterate the result unconditionally.
        print('解析IP地址出错')
        traceback.print_exc()
        return proxies

    for row in rows:
        cells = re.findall(cell_pattern, row)
        if len(cells) < 4:
            # A malformed row must not discard the rows parsed so far.
            continue
        proxies.append(cells[3].lower() + '://' + cells[0] + ':' + cells[1])
    return proxies
def main():
    """Fetch the xicidaili home page and print every proxy URL found on it.

    The page is requested with a browser-like User-Agent header.  If the
    request fails (connection error, timeout or HTTP error status) the
    error is printed with a traceback and nothing else is output.
    """
    url = 'https://www.xicidaili.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3521.2 Safari/537.36'
    }
    try:
        # requests has no default timeout; without one a stalled server
        # would block this call forever.
        resp = requests.get(url, headers=headers, timeout=10)
        # Do not try to parse an error page as if it were the proxy list.
        resp.raise_for_status()
    except requests.RequestException:
        print('请求代理页面出错')
        traceback.print_exc()
        return

    # Guard against a None result so a parse failure cannot crash the loop.
    for proxy in get_ip_list(resp.text) or []:
        print(proxy)
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()