1.爬取分析
打开西刺代理,可以看到代理列表。
打开浏览器开发者工具,我们要提取IP地址、端口和类型。
2.代码实现
直接上代码,需要注意的是第一个 <tr> 标签是表头,解析时需要跳过。
import csv
import time
import requests
from lxml import etree
from bs4 import BeautifulSoup
from pyquery import PyQuery as pq
from requests.exceptions import RequestException
class XiCiSpyder(object):
    """Scrape free proxy entries (IP address, port, type) from xicidaili.com.

    Three interchangeable parsers (lxml/XPath, BeautifulSoup, PyQuery) extract
    the same three fields per table row; results accumulate in ``self.ip_lists``
    and are finally appended to ``proxy.csv``.
    """

    def __init__(self):
        # {} is the page-number placeholder filled in by run().
        self.url = "https://www.xicidaili.com/nn/{}"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36"
        }
        # Accumulates [ip_address, ip_port, ip_type] rows across all pages.
        self.ip_lists = []

    def get_one_page(self, url):
        """Fetch one listing page.

        Returns the HTML text on HTTP 200, or None on any failure
        (non-200 status or a requests-level exception).
        """
        try:
            # timeout prevents one hung connection from stalling the crawl.
            response = requests.get(url, headers=self.headers, timeout=10)
            if response.status_code == 200:
                return response.text
            return None
        except RequestException:
            return None

    def parse_data_xpath(self, html):
        """Parse proxy rows with lxml/XPath; return the cumulative list."""
        xpath_html = etree.HTML(html)
        # '//tr[td]' matches only rows that contain <td> cells, which
        # naturally skips the <th>-only header row.
        for each_proxy in xpath_html.xpath('//tr[td]'):
            cells = each_proxy.xpath('./td')
            # Column layout: cells[1]=IP, cells[2]=port, cells[5]=type
            # (cells[0] holds the country-flag image).
            ip_address = cells[1].text.strip()
            ip_port = cells[2].text.strip()
            ip_type = cells[5].text.strip()
            self.ip_lists.append([ip_address, ip_port, ip_type])
        return self.ip_lists

    def parse_data_bs4(self, html):
        """Parse proxy rows with BeautifulSoup; return the cumulative list."""
        soup = BeautifulSoup(html, 'lxml')
        all_proxies = soup.find('table', id='ip_list')
        # [1:] skips the header <tr>.
        for each_proxy in all_proxies.find_all('tr')[1:]:
            all_td = each_proxy.find_all('td')
            ip_address = all_td[1].text
            ip_port = all_td[2].text
            ip_type = all_td[5].text
            self.ip_lists.append([ip_address, ip_port, ip_type])
        return self.ip_lists

    def parse_data_pyquery(self, html):
        """Parse proxy rows with PyQuery; return the cumulative list."""
        pq_html = pq(html)
        head = True
        for each_proxy in pq_html('tr').items():
            if head:
                # The first <tr> is the table header — skip it once.
                head = False
                continue
            # CSS nth-child is 1-based: 2nd=IP, 3rd=port, 6th=type.
            ip_address = each_proxy.find('td:nth-child(2)').text()
            ip_port = each_proxy.find('td:nth-child(3)').text()
            ip_type = each_proxy.find('td:nth-child(6)').text()
            self.ip_lists.append([ip_address, ip_port, ip_type])
        return self.ip_lists

    def save_data(self):
        """Append every collected row to proxy.csv, preceded by a header row."""
        # newline='' as required by the csv docs; explicit utf-8 keeps the
        # output portable across platforms.
        with open('proxy.csv', 'a', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['ip_address', 'ip_port', 'ip_type']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for item in self.ip_lists:
                writer.writerow({'ip_address': item[0], 'ip_port': item[1], 'ip_type': item[2]})
        print("Save data successfully")

    def run(self):
        """Crawl pages 1-10, parse each one, then persist everything to CSV."""
        for i in range(1, 11):
            url = self.url.format(i)
            print("正在爬取第{}页".format(i))
            html = self.get_one_page(url)
            if html is None:
                # A failed download used to crash the parser with html=None;
                # skip the page instead and keep crawling.
                continue
            #data = self.parse_data_xpath(html)
            #data = self.parse_data_bs4(html)
            data = self.parse_data_pyquery(html)
            # Be polite to the server between page requests.
            time.sleep(1)
            # NOTE: ip_lists accumulates across pages, so this count is cumulative.
            print('共爬取到{}条数据'.format(len(data)))
        self.save_data()
XiCiSpyder().run()
爬取到的数据:
3.解析库的使用方法
留坑