# Scrape proxy IPs with Python and store them in a local database
# Import the required modules
import requests
from bs4 import BeautifulSoup
from dboper.mysqloper import MysqlOper
# Define the necessary module-level constants
# First page of the "high anonymity" (nn) proxy list on xicidaili.
url = 'https://www.xicidaili.com/nn/'
headers = {
    # Browser-like User-Agent — presumably required because the site
    # rejects the default `requests` UA; confirm against the live site.
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
}
# Analyze the page structure and write the scraping function
def get_proxy_list():
    """Fetch the first page of proxies from xicidaili and persist them.

    Downloads `url`, parses the ``#ip_list`` table, clears the stored
    proxies via ``del_ip_list()``, then saves each (protocol, ip, port)
    row via ``save_ip()``. Returns None. On a request failure the error
    is printed and the function returns without touching the database.
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        # Bug fix: the original caught only Timeout and left `response`
        # as '' — the subsequent `response.text` then raised
        # AttributeError. Catch any request failure and bail out.
        print(e.args)
        return
    soup = BeautifulSoup(response.text, 'html.parser')
    content = soup.find('table', {'id': 'ip_list'})
    if content is None:
        # Layout changed or the request was blocked; nothing to parse.
        return
    ip_list = content.find_all('tr')
    del_ip_list()
    # Skip the header row; each remaining <tr> is one proxy entry.
    for item in ip_list[1:]:
        # Bug fix: find_all_next('td') scanned every <td> in the rest of
        # the document on each iteration (O(n^2), fragile); find_all('td')
        # restricts the lookup to this row's own cells.
        cells = item.find_all('td')
        if len(cells) < 6:
            continue  # ad rows / malformed rows lack the expected cells
        ip = cells[1].text
        port = cells[2].text
        protocol = cells[5].text
        save_ip(protocol, ip, port)
# Call the function
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    get_proxy_list()