制作ip地址池

从网上的免费代理列表(http://www.xicidaili.com/nn/1)中筛选出可用、存活时间长、速度快的代理。

 1 from bs4 import BeautifulSoup
 2 import re,time,requests
 3 from requests.exceptions import ReadTimeout,HTTPError,RequestException,ConnectionError
 4 from selenium import webdriver
 5 from selenium.common.exceptions import TimeoutException
 6 def estimate_time(ip_test_date,ip_rest_time):#rest_time+test_date-current_time
 7     new_ip_test_date='20'+ip_test_date+':00'
 8     time_stamp = time.mktime(time.strptime(new_ip_test_date, '%Y-%m-%d %H:%M:%S'))
 9     d_ip_rest_time=re.findall('\d',ip_rest_time)
10     a_ip_rest_time=re.findall('\D',ip_rest_time)
11     if a_ip_rest_time[0]=='分钟':
12         da_ip_rest_time=int(d_ip_rest_time[0])*60
13     elif a_ip_rest_time[0]=='小时':
14         da_ip_rest_time = int(d_ip_rest_time[0]) * 3600
15     else:
16         da_ip_rest_time = int(d_ip_rest_time[0]) * 86400
17     result_time=time_stamp+da_ip_rest_time-time.time()
18     return result_time
def status_code(ip_type, ip_add, ip_port):
    """Fetch Baidu through the given proxy and report the outcome.

    Args:
        ip_type: Proxy type, ``'http'`` or ``'https'`` (lower case).
        ip_add: Proxy IP address.
        ip_port: Proxy port.

    Returns:
        The integer HTTP status code of the response, or the string
        ``'Timeout'`` when the request fails for any reason or when
        ``ip_type`` is not a supported type.
    """
    if ip_type not in ('http', 'https'):
        return 'Timeout'

    # requests picks a proxy by the scheme of the URL being fetched, so the
    # probe URL must use the same scheme as the key in ``proxies``;
    # otherwise the request silently bypasses the proxy.
    # NOTE(review): the proxy itself is addressed over plain HTTP (the form
    # shown in the requests documentation); this assumes the listed
    # "https" proxies are HTTP proxies that tunnel via CONNECT -- confirm.
    proxies = {ip_type: "http://%s:%s" % (ip_add, ip_port)}
    try:
        response = requests.get("%s://www.baidu.com" % ip_type,
                                proxies=proxies, timeout=2)
    except RequestException:
        # RequestException is the base class of ReadTimeout, HTTPError and
        # requests' ConnectionError, so one handler covers all of them.
        return 'Timeout'
    return response.status_code
# Script body: download the proxy table with headless Chrome, keep only the
# rows that pass the speed, remaining-lifetime and liveness checks, and print
# the surviving proxies grouped by type.

# A row is kept only when both "bar_inner fast" widths exceed this percent.
MIN_BAR_PERCENT = 79
# A row is kept only when estimate_time() reports more than this many seconds.
MIN_REMAINING_SECONDS = 1020

# Compiled once; the patterns do not change between rows.
TD_PATTERN = re.compile('<td>(.*?)</td>', re.S)
BAR_PATTERN = re.compile(
    '<div class="bar_inner fast" style="width:(.*?)%">', re.S)
PLACE_PATTERN = re.compile('<a href.*?">(.*?)</a>')

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# ``options=`` replaces the deprecated ``chrome_options=`` keyword.
browser = webdriver.Chrome(options=chrome_options)
try:
    browser.get("http://www.xicidaili.com/nn")
    html = browser.page_source
except TimeoutException:
    print('Time Out')
    # Without the page there is nothing to parse; stop instead of failing
    # later on an undefined ``html``.
    raise SystemExit(1)
finally:
    # Always release the browser process, including on failure.
    browser.quit()

dict_http = {}
dict_https = {}
soup = BeautifulSoup(html, 'lxml')
table_body = soup.tbody
# Iterate over the rows actually present instead of a fixed row count.
rows = table_body.find_all('tr', recursive=False) if table_body else []

for row in rows:
    need_jiexi = str(row)
    items = TD_PATTERN.findall(need_jiexi)
    if len(items) < 6:
        # Header row or a row that does not have the expected plain cells.
        continue

    ip_place_list = PLACE_PATTERN.findall(items[2])
    if len(ip_place_list) != 1:
        continue
    ip_place = ip_place_list[0]

    bar_widths = BAR_PATTERN.findall(need_jiexi)
    if len(bar_widths) != 2:
        continue
    ip_speed, ip_connect_time = bar_widths
    # Both bars must clear the threshold (the earlier
    # ``int(a) and int(b) > 79`` only tested the second one).
    if int(ip_speed) <= MIN_BAR_PERCENT \
            or int(ip_connect_time) <= MIN_BAR_PERCENT:
        continue

    ip_rest_time = items[4]
    ip_test_date = items[5]
    try:
        remaining = estimate_time(ip_test_date, ip_rest_time)
    except (ValueError, IndexError):
        # Unparseable date or duration: skip this row, keep the others.
        continue
    if remaining <= MIN_REMAINING_SECONDS:
        continue

    ip_type = items[3].lower()
    ip_add = items[0]
    ip_port = items[1]
    if ip_type == 'http':
        target = dict_http
    elif ip_type == 'https':
        target = dict_https
    else:
        # Any other proxy type must not be filed under https.
        continue

    if status_code(ip_type, ip_add, ip_port) != 200:
        continue

    name = 'ip_address_%d' % (len(target) + 1)
    target[name] = [ip_add, ip_port, ip_place]

print('http:', dict_http)
print('https:', dict_https)
以上为代码
1 http: {'ip_address_1': ['27.209.19.71', '61202', '山东淄博'], 'ip_address_2': ['14.112.76.68', '61234', '广东惠州市惠东县'], 'ip_address_3': ['122.114.31.177', '808', '河南郑州'], 'ip_address_4': ['61.135.217.7', '80', '北京'], 'ip_address_5': ['116.55.77.81', '61202', '云南丽江'], 'ip_address_6': ['223.246.238.147', '61202', '安徽宿州'], 'ip_address_7': ['58.216.202.149', '8118', '江苏常州'], 'ip_address_8': ['182.247.75.106', '61202', '云南'], 'ip_address_9': ['39.78.30.207', '61202', '山东'], 'ip_address_10': ['119.191.31.22', '61202', '山东潍坊'], 'ip_address_11': ['125.127.79.4', '61202', '浙江台州市温岭'], 'ip_address_12': ['123.161.153.59', '40435', '河南许昌'], 'ip_address_13': ['121.237.53.10', '61202', '江苏南京'], 'ip_address_14': ['218.66.149.224', '8118', '福建厦门'], 'ip_address_15': ['218.4.46.45', '61202', '江苏苏州'], 'ip_address_16': ['113.128.10.120', '61234', '山东济南'], 'ip_address_17': ['14.112.76.201', '61234', '广东惠州市惠东县'], 'ip_address_18': ['117.64.238.30', '61202', '安徽合肥'], 'ip_address_19': ['113.121.240.77', '61234', '山东德州'], 'ip_address_20': ['180.136.56.33', '61202', '广西桂林'], 'ip_address_21': ['113.122.34.190', '61234', '山东菏泽'], 'ip_address_22': ['125.121.118.86', '6666', '浙江杭州'], 'ip_address_23': ['112.248.7.102', '61234', '山东枣庄'], 'ip_address_24': ['58.209.38.75', '8118', '江苏苏州']}
2 https: {}
输出形式如上

纯100%自己做的,累死我了

转载于:https://www.cnblogs.com/pakhm/p/8572039.html

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值