# Batch-scrape free proxy IPs.
# Target URL: https://www.freeip.top/?page=1
# Tools: Chrome, PyCharm, Python 3.8
# Note: this script does not verify that a proxy is alive. To check, issue a GET
# against a large site (e.g. Baidu) through the proxy and treat HTTP 200 as usable.
import requests
import re
import time
import random
from bs4 import BeautifulSoup
# Scrape proxy IPs and ports from https://www.freeip.top/?page=1
# Output file where scraped "ip:port" lines are saved, one per line.
txtPath ="AgencyIP.txt"
def getHTML(url):
    """Fetch *url* and return its decoded body text, or "" on any request failure.

    Sleeps 5 seconds before each request as a crude rate limit so the
    proxy site does not block the scraper.
    """
    time.sleep(5)  # outside the try: it cannot raise a request error
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn 4xx/5xx responses into exceptions
        r.encoding = r.apparent_encoding  # decode with the detected charset
        return r.text
    except requests.RequestException as exc:
        # The original bare `except` printed r.status_code, which raises
        # NameError when requests.get() itself fails (r never bound).
        # Report the actual error and fall back to an empty string.
        print(exc)
        return ""
# Re-scrape the proxy list and save it to disk.
def RestGetIP():
    """Scrape pages 1-9 of freeip.top and rewrite ``txtPath`` with "ip:port" lines.

    Pages that fail to download or contain no table are skipped instead of
    crashing the whole run.
    """
    url = 'https://www.freeip.top/?page='
    # Truncate the output file once up front; rows are appended per page below.
    with open(txtPath, "w") as file:
        file.write("")
    # Hoisted: compile each pattern once, not once per page.
    ip_reg = re.compile('ip="(.*?)"')
    # \d+ (not \d*) so an empty <td></td> cannot contribute a '' port and
    # shift the ip/port pairing.
    port_reg = re.compile(r'<td>(\d+)</td>')
    for page in range(1, 10):
        nowUrl = url + str(page)
        print(nowUrl)
        html = getHTML(nowUrl)
        soup = BeautifulSoup(html, 'lxml')
        body = soup.find('body')
        table = body.find('table') if body else None
        if table is None:
            # Download failed or page layout changed — skip rather than
            # raise AttributeError on None.
            continue
        table_text = str(table)
        ip_arry = ip_reg.findall(table_text)       # list of IPs
        port_array = port_reg.findall(table_text)  # matching port numbers
        print(ip_arry)
        # zip() stops at the shorter list, preventing the original
        # port_array[i] IndexError; open the file once per page, not per row.
        with open(txtPath, "a") as ipfile:
            for ip, port in zip(ip_arry, port_array):
                s = ip + ':' + port + "\n"
                print(s)
                ipfile.write(s)
# Read the saved proxy file back into memory.
def redDataTolist(path=None):
    """Read proxies (one "ip:port" per line) from *path* (default: ``txtPath``).

    Returns a list of dicts shaped for requests' ``proxies=`` argument:
    ``[{'http': 'ip:port'}, ...]``. Blank lines are skipped.
    """
    if path is None:
        path = txtPath  # module-level default written by RestGetIP()
    data = []
    # `with` guarantees the handle is closed even if reading raises.
    with open(path, "r") as f:
        for line in f:
            # rstrip("\n") instead of line[:-1]: the old slice chopped a real
            # character off the last line when the file lacked a trailing newline.
            line = line.rstrip("\n")
            if line:  # skip blanks instead of emitting a useless {'http': ''}
                data.append({'http': line})
    return data
if __name__ == '__main__':
    # Refresh the on-disk proxy list, then load it back as requests-style dicts.
    RestGetIP()
    proxies = redDataTolist()
    print(proxies)