爬取西*网的ip 并验证其有效性

# encoding: utf-8
from bs4 import BeautifulSoup
import requests as req
import http.client as httplib
import threading
import sys
import random
# Module-level shared state, created as a side effect of importing/running
# this file.
# NOTE: 'r' mode requires 'proxy.txt' to exist already; open() raises
# FileNotFoundError otherwise.
inFile = open('proxy.txt','r')
# 'w' mode creates 'verified.txt' or truncates an existing one.
outFile = open('verified.txt', 'w')
# NOTE(review): presumably serializes access to inFile/outFile across worker
# threads; the code that acquires it is not visible here — confirm.
lock = threading.Lock()

def _format_proxy_row(tds):
    """Build one pipe-separated proxy.txt record from a table row's cells.

    Args:
        tds: the ``<td>`` elements of one row of the ``ip_list`` table.

    Returns:
        The record line (terminated by a newline), or None when the row lacks
        the cells this parser indexes (fewer than 10 cells, or no
        ``<div title=...>`` in the speed cell).
    """
    if len(tds) < 10:
        return None
    speed_div = tds[7].find('div')
    if speed_div is None or speed_div.get('title') is None:
        return None

    # Country: taken from the flag image's alt text; rows without a flag image
    # get the placeholder for both country and location.
    flag = tds[1].find('img')
    if flag is None:
        nation = '未知'
        locate = '未知'
    else:
        nation = flag['alt'].strip()
        locate = tds[4].text.strip()
    ip = tds[2].text.strip()
    port = tds[3].text.strip()
    anony = tds[5].text.strip()
    protocol = tds[6].text.strip()
    speed = speed_div['title'].strip()
    checked = tds[9].text.strip()

    return '%s|%s|%s|%s|%s|%s|%s|%s\n' % (
        nation, ip, port, locate, anony, protocol, speed, checked)


def getProxyList(targeturl="http://www.xicidaili.com/nn/", start_page=5,
                 end_page=15, timeout=10):
    """Scrape proxy listing pages and append the parsed rows to 'proxy.txt'.

    For each page number in ``range(start_page, end_page)`` the page at
    ``targeturl + str(page)`` is fetched, the table with id ``ip_list`` is
    parsed, and every data row (the first row is skipped as the header) is
    appended to 'proxy.txt' as
    ``nation|ip|port|locate|anony|protocol|speed|time``.

    Pages without an ``ip_list`` table and rows missing expected cells are
    skipped instead of aborting the whole run.

    Args:
        targeturl: URL prefix; the page number is appended to it.
        start_page: first page number to fetch (inclusive).
        end_page: page number to stop at (exclusive).
        timeout: seconds passed to the HTTP request as its timeout.

    Returns:
        The number of records written to 'proxy.txt'.

    Raises:
        Whatever the HTTP request raises on connection failure or timeout,
        and UnicodeDecodeError if a response body is not valid UTF-8.
    """
    # Local import: the file's top-level imports do not include ``time``.
    import time as tim_e

    Header = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"}

    countNum = 0
    # ``with`` guarantees the file is closed even if a request or parse raises.
    with open('proxy.txt', 'a') as proxyFile:
        for page in range(start_page, end_page):
            url = targeturl + str(page)
            html_doc = req.get(url, headers=Header, timeout=timeout).content.decode()
            # Random sub-second pause between page requests.
            tim_e.sleep(random.random())

            soup = BeautifulSoup(html_doc, "html.parser")
            table = soup.find('table', id='ip_list')
            if table is None:
                # No listing table on this page; nothing to parse.
                continue

            # The first <tr> is the header row.
            for tr in table.find_all('tr')[1:]:
                record = _format_proxy_row(tr.find_all('td'))
                if record is None:
                    continue
                proxyFile.write(record)
                countNum += 1
    return countNum
    
def verifyProxyList():
    '''
    验证代理的有效性
    '''
    requestHeader = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值