背景:无论是自己用爬虫抓取的IP代理,还是购买的IP代理服务,都会存在IP不稳定的情况,因此需要通过不断地校验来判断IP是否可用。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
|
#!/usr/bin/env python2.7
#coding:utf8
import
os,time,sys
#import mymodul
#from lxml import etree
import
re
import
random
import
requests,os
import
sys
# Verify the availability of the proxy IPs once every 10 minutes
def __extract_ipaddress(text_content):
    """Extract ``(ip, port)`` pairs from free-form text.

    The text is scanned line by line.  For each line the first IPv4 address
    is located, then the first 2-5 digit number that follows it (after at
    least one non-digit separator) is taken as the port.  Lines without an
    address, or with an address but no port, contribute nothing.

    Args:
        text_content: the text to scan, lines separated by ``'\\n'``.

    Returns:
        A list of ``(ip, port)`` tuples, both elements strings, in the order
        the lines appear.
    """
    # One decimal octet, 0-255.
    octet = r'(?:25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)'
    # The digit look-arounds stop the pattern from matching inside a longer
    # run of digits: without them "1.2.3.256" is reported as "1.2.3.25" and
    # "1234.5.6.7" as "234.5.6.7".
    ip_pattern = re.compile(
        r'(?<!\d)(?:' + octet + r'\.){3}' + octet + r'(?!\d)'
    )
    # At least one separator character, then the port.  The look-ahead keeps
    # a number with more than five digits from being cut down to its first
    # five and reported as a port.
    port_pattern = re.compile(r'\D(\d{2,5})(?!\d)')

    result_list = []
    for line in text_content.split('\n'):
        ip_match = ip_pattern.search(line)
        if not ip_match:
            continue
        # Only look for the port in the part of the line after the address.
        port_match = port_pattern.search(line, ip_match.end())
        if port_match:
            result_list.append((ip_match.group(0), port_match.group(1)))
    return result_list
# Test an extracted proxy by fetching a yellow-pages site through it
def test_poxy(proxy, test_url='http://www.yp.net.cn/schinese//about/AboutCnYPol.asp'):
    """Fetch ``test_url`` through one HTTP proxy and report the outcome.

    Args:
        proxy: an ``(ip, port)`` pair passed as a single positional argument,
            which is how ``pool.map`` hands over each list element.
        test_url: the page requested through the proxy.

    Returns:
        ``(ip, port, elapsed_seconds, body_length)`` when the request
        succeeds, where ``elapsed_seconds`` is ``resp.elapsed.seconds`` and
        ``body_length`` is ``len(resp.content)``; ``(ip, port, -1, -1)`` when
        the request raises.
    """
    import requests

    # Unpack in the body rather than in the signature: tuple parameters are
    # Python 2-only syntax.  Callers still pass one (ip, port) tuple.
    _ip, _port = proxy

    session = requests.session()
    session.proxies = {'http': '{}:{}'.format(_ip, _port)}
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0'}
    try:
        resp = session.get(test_url, timeout=10, headers=headers)
    except Exception:
        # Best effort: any request failure (refused, timed out, bad reply)
        # just marks the proxy as unusable.  Exception rather than
        # BaseException so KeyboardInterrupt/SystemExit are not swallowed.
        return (_ip, _port, -1, -1)
    else:
        return (_ip, _port, resp.elapsed.seconds, len(resp.content))
    finally:
        # Release the session's pooled connections on every path.
        session.close()
if __name__ == '__main__':
    from multiprocessing.dummy import Pool as ThreadPool

    # Read the candidate proxies from Proxy.txt.
    with open('/root/scrit/Proxy.txt', 'r') as f:
        ip_list = __extract_ipaddress(f.read())

    # Drop repeated (ip, port) pairs before testing, keeping first-seen
    # order.  Deduplicating the results instead would miss repeats, because
    # each result tuple also carries the measured time and body size.
    seen = set()
    candidates = []
    for proxy in ip_list:
        if proxy not in seen:
            seen.add(proxy)
            candidates.append(proxy)

    # Test the candidates concurrently on 30 worker threads.
    pool = ThreadPool(30)
    try:
        result = pool.map(test_poxy, candidates)
    finally:
        pool.close()
        pool.join()

    # Largest response body first.  No set() afterwards, so this order is
    # the order the lines are written in.
    result = sorted(result, key=lambda d: d[3], reverse=True)

    # Keep a proxy only when the fetched body is longer than 7000 bytes;
    # failed requests carry -1 and are dropped here as well.
    # NOTE(review): presumably 7000 separates the real page from error or
    # placeholder pages - confirm against the test URL.
    usable = [item for item in result if item[3] > 7000]

    # Write the usable proxies to proxy_ip.txt, one "ip:port" per line.
    # Text mode, because the lines written are str.
    with open('/root/scrit/proxy_ip.txt', 'w') as fout:
        for item in usable:
            fout.write('{}:{}\n'.format(item[0], item[1]))

    if not usable:
        # The output file is empty: no proxy passed the check.
        # NOTE(review): get_IP is not defined in the visible part of this
        # file; unless it is provided elsewhere this call raises NameError.
        get_IP()
    else:
        sys.exit(0)
|
本文转自YU文武貝的51CTO博客,原文链接:http://blog.51cto.com/linuxerxy/1893906 ,如需转载请自行联系原作者。