Python 爬取西刺代理IP并写入xls文件

import requests
from bs4 import BeautifulSoup
import xlwt
from xlrd import open_workbook
from telnetlib import Telnet  # 这是用来验证IP是否可用


class XiciProxy():
    """Scrape proxy entries from xicidaili.com and store reachable ones in an .xls file.

    After ``getDataList`` runs, three attributes hold the results as lists of
    dicts with the keys ``ip``, ``address``, ``anonymous``, ``type`` and
    ``date``:

    * ``totalList``  - every row that was scraped
    * ``ableList``   - rows whose host:port accepted a TCP connection
    * ``unableList`` - rows whose host:port could not be reached
    """

    # Spreadsheet file name shared by writeXls (output) and readXls (input).
    xlsName = '西刺代理ip.xls'

    def __init__(self):
        self.baseUrl = 'https://www.xicidaili.com/nn/'
        # Start with empty results so the attributes exist before any crawl.
        self.totalList = []
        self.ableList = []
        self.unableList = []

    def getDataList(self, num=10):
        """Crawl listing pages until at least ``num`` reachable proxies are found.

        Pages are processed whole, so ``ableList`` may end up longer than
        ``num``. Crawling also stops as soon as a page yields no rows, which
        prevents an endless request loop when the listing is exhausted or the
        site returns a page without the proxy table.

        Args:
            num: minimum number of reachable proxies to collect.

        Returns:
            None. Results are stored on ``self.totalList``, ``self.ableList``
            and ``self.unableList``.
        """
        print('爬取中...')
        headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36",
            'Host': 'www.xicidaili.com',
            'Referer': 'https://www.xicidaili.com/nn/'
        }
        totalList = []
        ableList = []
        unableList = []
        page = 1
        # Strict '<': stop once the requested amount has been reached.
        while len(ableList) < num:
            self.url = self.baseUrl + str(page)
            rows = self._fetchRows(self.url, headers)
            if not rows:
                # Nothing to parse on this page; further pages will not help.
                break
            print('数据分析中,请稍等...')
            for li in rows:
                # getText() reads the current row from self.li.
                self.li = li
                host = self.getText(1)
                port = self.getText(2)
                obj = {
                    'ip': host + ':' + port,
                    'address': self.getText(3),
                    'anonymous': self.getText(4),
                    'type': self.getText(5),
                    'date': self.getText(9)
                }
                totalList.append(obj)
                if self._isReachable(host, port):
                    ableList.append(obj)
                else:
                    unableList.append(obj)
            page += 1
        self.totalList = totalList
        self.ableList = ableList
        self.unableList = unableList

    def _fetchRows(self, url, headers):
        """Download one listing page and return its data rows (header row excluded)."""
        # A timeout keeps an unresponsive server from blocking the crawl forever.
        req = requests.get(url, headers=headers, timeout=10)
        html_doc = BeautifulSoup(req.text, 'html.parser')
        # Drop only the first <tr>, which is the table header.
        return html_doc.select('#ip_list tr')[1:]

    def _isReachable(self, host, port, timeout=1):
        """Return True if a TCP connection to host:port opens within ``timeout`` seconds."""
        try:
            # The context manager closes the socket right after the probe.
            with Telnet(host, port, timeout=timeout):
                return True
        except (OSError, ValueError, OverflowError):
            # Refused, timed out, unresolvable host or malformed port.
            return False

    def getText(self, index):
        """Return the stripped text of the ``index``-th <td> of the current row (``self.li``)."""
        return self.li.select('td')[index].text.strip()

    def writeXls(self, num=60):
        """Crawl proxies and write the reachable ones to the spreadsheet ``xlsName``.

        Args:
            num: minimum number of reachable proxies to collect before writing
                (passed to ``getDataList``).
        """
        wb = xlwt.Workbook(encoding='ascii')
        ws = wb.add_sheet('ip列表')
        # Row 0 is the header; the order matches the dict keys written below.
        titles = ('IP地址', '服务器地址', '是否匿名', '类型', '验证时间')
        for col, title in enumerate(titles):
            ws.write(0, col, title)
        self.getDataList(num)
        print('爬取总数据{}条'.format(len(self.totalList)))
        print('{}条可用'.format(len(self.ableList)))
        print('{}条不可用'.format(len(self.unableList)))
        keys = ('ip', 'address', 'anonymous', 'type', 'date')
        for i, data in enumerate(self.ableList):
            for col, key in enumerate(keys):
                ws.write(i + 1, col, data[key])
        wb.save(self.xlsName)
        print('录入西刺代理ip.xls-成功')

    def readXls(self, index):
        """Return the cell values of row ``index`` from the first sheet of ``xlsName``."""
        book = open_workbook(self.xlsName)
        sheet = book.sheet_by_index(0)
        return sheet.row_values(index)


if __name__ == '__main__':
    # Script entry point: crawl the proxy list and save the reachable
    # entries to the spreadsheet.
    XiciProxy().writeXls()
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值