# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests as req
import http.client as httplib
import threading
import sys
import random
# Module-level handles shared by the scraper/verifier functions below.
# NOTE(review): both files are opened at import time and never closed here;
# presumably the verification code (truncated below) reads inFile and writes
# outFile under the lock — confirm before refactoring to context managers.
inFile = open('proxy.txt','r')  # source list of scraped proxies
outFile = open('verified.txt', 'w')  # destination for proxies that pass verification
lock = threading.Lock()  # NOTE(review): likely serializes writes to outFile across threads — confirm
def getProxyList(targeturl="http://www.xicidaili.com/nn/", start_page=5, end_page=15):
    """Scrape proxy rows from xicidaili listing pages and append them to proxy.txt.

    Each row is written as a '|'-separated line:
    nation|ip|port|location|anonymity|protocol|speed|last-checked.

    Args:
        targeturl: Base listing URL; the page number is appended to it.
        start_page: First page number to fetch (inclusive). Default keeps the
            original hard-coded behavior (pages 5..14).
        end_page: End of the page range (exclusive, range() semantics).

    Returns:
        int: Number of proxy rows written.
    """
    import time  # local import: module scope is kept minimal, matching original style
    count = 0
    header = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"}
    # Context manager guarantees the file is closed even if a request raises
    # (the original leaked the handle on any exception before close()).
    with open('proxy.txt', 'a') as proxy_file:
        for page in range(start_page, end_page):
            url = targeturl + str(page)
            html_doc = req.get(url, headers=header).content.decode()
            # Randomized pause between requests to avoid hammering the site.
            time.sleep(random.random())
            soup = BeautifulSoup(html_doc, "html.parser")
            table = soup.find('table', id='ip_list')
            if table is None:
                # Layout changed or the request was blocked; skip this page
                # instead of crashing on NoneType.find_all.
                continue
            for tr in table.find_all('tr')[1:]:  # [1:] skips the header row
                tds = tr.find_all('td')
                # Country cell: a missing flag <img> means nation/location unknown.
                if tds[1].find('img') is None:
                    nation = '未知'
                    locate = '未知'
                else:
                    nation = tds[1].find('img')['alt'].strip()
                    locate = tds[4].text.strip()
                ip = tds[2].text.strip()
                port = tds[3].text.strip()
                anony = tds[5].text.strip()
                protocol = tds[6].text.strip()
                speed = tds[7].find('div')['title'].strip()
                # 'checked' avoids shadowing the time module (original shadowed
                # it and worked around that with 'import time as tim_e').
                checked = tds[9].text.strip()
                proxy_file.write('%s|%s|%s|%s|%s|%s|%s|%s\n' % (nation, ip, port, locate, anony, protocol, speed, checked))
                count += 1
    return count
def verifyProxyList():
'''
验证代理的有效性
'''
requestHeader = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36
爬取西*网的ip 并验证其有效性
最新推荐文章于 2021-08-18 19:35:46 发布