import requests
from bs4 import BeautifulSoup
import xlwt
from xlrd import open_workbook
from telnetlib import Telnet # 这是用来验证IP是否可用
class XiciProxy():
    """Scrape proxy entries from xicidaili.com and save the reachable ones to an .xls file.

    After ``getDataList`` completes, the instance exposes:
        totalList:  every proxy row that was parsed (list of dicts).
        ableList:   rows whose host:port accepted a TCP connection.
        unableList: rows whose connection attempt failed.
    """

    # Workbook written by writeXls() and read back by readXls().
    xlsName = '西刺代理ip.xls'

    def __init__(self):
        self.baseUrl = 'https://www.xicidaili.com/nn/'

    def getDataList(self, num=10):
        """Collect proxies page by page until ``num`` reachable ones are found.

        Each table row is turned into a dict with the keys ``ip``,
        ``address``, ``anonymous``, ``type`` and ``date`` and probed with a
        one-second connection attempt. Scraping stops as soon as ``num``
        reachable proxies are collected, or when a page yields no rows.

        Args:
            num: number of reachable proxies wanted; values <= 0 fetch nothing.

        Raises:
            requests.RequestException: if a listing page cannot be fetched.
        """
        print('爬取中...')
        headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537. (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36",
            'Host': 'www.xicidaili.com',
            'Referer': 'https://www.xicidaili.com/nn/'
        }
        totalList = []
        ableList = []
        unableList = []
        page = 1
        while len(ableList) < num:
            self.url = self.baseUrl + str(page)
            # A timeout keeps an unresponsive server from hanging the scraper.
            req = requests.get(self.url, headers=headers, timeout=10)
            html_doc = BeautifulSoup(req.text, 'html.parser')
            # Skip only the first <tr> (presumably the table header row).
            lists = html_doc.select('#ip_list tr')[1:]
            if not lists:
                # No rows on this page (end of listing, block page, ...):
                # stop instead of requesting further pages forever.
                break
            print('数据分析中,请稍等...')
            for li in lists:
                self.li = li  # getText() reads the current row from self.li
                if len(li.select('td')) < 10:
                    # getText(9) needs ten cells; ignore malformed rows.
                    continue
                host = self.getText(1)
                port = self.getText(2)
                obj = {
                    'ip': host + ':' + port,
                    'address': self.getText(3),
                    'anonymous': self.getText(4),
                    'type': self.getText(5),
                    'date': self.getText(9)
                }
                totalList.append(obj)
                if self._isReachable(host, port):
                    ableList.append(obj)
                    if len(ableList) >= num:
                        break
                else:
                    unableList.append(obj)
            page += 1
        self.totalList = totalList
        self.ableList = ableList
        self.unableList = unableList

    @staticmethod
    def _isReachable(host, port, timeout=1):
        """Return True if a connection to ``host:port`` opens within ``timeout`` seconds."""
        try:
            # The context manager closes the socket once the probe succeeds.
            with Telnet(host, port, timeout=timeout):
                return True
        except (OSError, OverflowError, ValueError):
            # Refused, timed out, unresolvable host or unusable port value.
            return False

    def getText(self, index):
        """Return the stripped text of the ``index``-th ``<td>`` of the current row (``self.li``)."""
        return self.li.select('td')[index].text.strip()

    def writeXls(self, num=60):
        """Scrape ``num`` reachable proxies and write them to ``xlsName``.

        Row 0 of the sheet holds the column titles; each following row holds
        one reachable proxy.

        Args:
            num: number of reachable proxies to collect (passed to getDataList).
        """
        wb = xlwt.Workbook(encoding='ascii')
        ws = wb.add_sheet('ip列表')
        titles = ('IP地址', '服务器地址', '是否匿名', '类型', '验证时间')
        for col, title in enumerate(titles):
            ws.write(0, col, title)
        self.getDataList(num)
        print('爬取总数据{}条'.format(len(self.totalList)))
        print('{}条可用'.format(len(self.ableList)))
        print('{}条不可用'.format(len(self.unableList)))
        # Dict keys in the same order as the column titles above.
        fields = ('ip', 'address', 'anonymous', 'type', 'date')
        for row, data in enumerate(self.ableList, start=1):
            for col, key in enumerate(fields):
                ws.write(row, col, data[key])
        wb.save(self.xlsName)
        print('录入{}-成功'.format(self.xlsName))

    def readXls(self, index):
        """Return the cell values of row ``index`` from the first sheet of ``xlsName``."""
        book = open_workbook(self.xlsName)
        sheet = book.sheet_by_index(0)
        return sheet.row_values(index)
if __name__ == '__main__':
    # Script entry point: scrape the proxy list and save the usable entries to the workbook.
    XiciProxy().writeXls()
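# Article title: "Python 爬取西刺代理IP并写入xls文件" (scraping Xici proxy IPs with Python and writing them to an .xls file).
# Page footer: "最新推荐文章于 2022-10-18 18:07:12 发布" (latest recommended article published 2022-10-18 18:07:12).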