# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests as req
import http.client as httplib
import threading
import sys
import random
# Module-level handles shared by the scraper/verifier functions below.
# NOTE(review): both files are opened at import time and never closed here;
# presumably the verification code (truncated below) reads inFile and writes
# outFile under the lock — confirm before refactoring to context managers.
inFile = open('proxy.txt','r')  # source list of scraped proxies
outFile = open('verified.txt', 'w')  # destination for proxies that pass verification
lock = threading.Lock()  # NOTE(review): likely serializes writes to outFile across threads — confirm
def getProxyList(targeturl="http://www.xicidaili.com/nn/", start_page=5, end_page=15):
    """Scrape proxy rows from xicidaili listing pages and append them to proxy.txt.

    Each row is written as a '|'-separated line:
    nation|ip|port|location|anonymity|protocol|speed|last-checked.

    Args:
        targeturl: Base listing URL; the page number is appended to it.
        start_page: First page number to fetch (inclusive). Default keeps the
            original hard-coded behavior (pages 5..14).
        end_page: End of the page range (exclusive, range() semantics).

    Returns:
        int: Number of proxy rows written.
    """
    import time  # local import: module scope is kept minimal, matching original style
    count = 0
    header = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"}
    # Context manager guarantees the file is closed even if a request raises
    # (the original leaked the handle on any exception before close()).
    with open('proxy.txt', 'a') as proxy_file:
        for page in range(start_page, end_page):
            url = targeturl + str(page)
            html_doc = req.get(url, headers=header).content.decode()
            # Randomized pause between requests to avoid hammering the site.
            time.sleep(random.random())
            soup = BeautifulSoup(html_doc, "html.parser")
            table = soup.find('table', id='ip_list')
            if table is None:
                # Layout changed or the request was blocked; skip this page
                # instead of crashing on NoneType.find_all.
                continue
            for tr in table.find_all('tr')[1:]:  # [1:] skips the header row
                tds = tr.find_all('td')
                # Country cell: a missing flag <img> means nation/location unknown.
                if tds[1].find('img') is None:
                    nation = '未知'
                    locate = '未知'
                else:
                    nation = tds[1].find('img')['alt'].strip()
                    locate = tds[4].text.strip()
                ip = tds[2].text.strip()
                port = tds[3].text.strip()
                anony = tds[5].text.strip()
                protocol = tds[6].text.strip()
                speed = tds[7].find('div')['title'].strip()
                # 'checked' avoids shadowing the time module (original shadowed
                # it and worked around that with 'import time as tim_e').
                checked = tds[9].text.strip()
                proxy_file.write('%s|%s|%s|%s|%s|%s|%s|%s\n' % (nation, ip, port, locate, anony, protocol, speed, checked))
                count += 1
    return count
def verifyProxyList():
'''
验证代理的有效性
'''
requestHeader = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36
爬取西*网的ip 并验证其有效性
最新推荐文章于 2021-08-18 19:35:46 发布