Python基础爬虫之抓取可用的IP
url_open
def url_open(url):
    """Fetch *url* with a desktop-Chrome User-Agent and return the page text.

    The page is decoded as GBK (the target proxy-list site uses that encoding).
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
    # BUG FIX: the original called urlopen(url), which silently discarded
    # the Request object and its User-Agent header. Pass `req` instead.
    response = urllib.request.urlopen(req)
    html = response.read().decode('gbk')
    return html
伪装成浏览器访问
req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
得到IP
def get_ip(html):
    """Extract proxy addresses from the proxy-list page *html*.

    Matches the site's ``<td class="style1">`` (IP) and ``<td class="style2">``
    (port) cells and pairs them up.

    Returns a list of 'ip:port' strings; returns an empty list (after printing
    a warning) when the number of IPs and ports does not match.
    """
    iplist = re.findall(r'<td class="style1">([^<]+)', html)
    portlist = re.findall(r'<td class="style2">([^<]+)', html)
    ipaddrs = []
    if len(iplist) == len(portlist):  # IPs must pair one-to-one with ports
        for ip, port in zip(iplist, portlist):
            ipaddrs.append(ip + ':' + port)
    else:
        print('数据收集有误!')
    return ipaddrs
p = r'<td class="style1">([^<]+)'
使用正则表达式匹配IP
p1 = r'<td class="style2">([^<]+)'
使用正则表达式匹配端口号
检验IP是否可用
def proxy_check(url, ipaddrs):
    """Probe the proxy *ipaddrs* ('ip:port') by fetching *url* through it.

    Returns *ipaddrs* (and prints it) when the proxy answered within
    3 seconds; returns None when it did not.

    NOTE(review): install_opener() replaces the process-wide default
    opener, so every later urlopen() call also goes through this proxy.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
    proxy_support = urllib.request.ProxyHandler({'http': ipaddrs})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)
    try:
        urllib.request.urlopen(req, timeout=3)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; any timeout/connection/HTTP failure means the
        # proxy is unusable. Return None explicitly (was implicit before).
        return None
    else:
        print(ipaddrs)
        return ipaddrs
检验IP是否可用,设定参数 timeout 检查异常
try:
response = urllib.request.urlopen(req,timeout=3)
所有代码
import urllib.request
import re
import urllib.error
def url_open(url):
    """Fetch *url* with a desktop-Chrome User-Agent and return the page text.

    The page is decoded as GBK (the target proxy-list site uses that encoding).
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
    # BUG FIX: the original called urlopen(url), which silently discarded
    # the Request object and its User-Agent header. Pass `req` instead.
    response = urllib.request.urlopen(req)
    html = response.read().decode('gbk')
    return html
def get_ip(html):
    """Extract proxy addresses from the proxy-list page *html*.

    Matches the site's ``<td class="style1">`` (IP) and ``<td class="style2">``
    (port) cells and pairs them up.

    Returns a list of 'ip:port' strings; returns an empty list (after printing
    a warning) when the number of IPs and ports does not match.
    """
    iplist = re.findall(r'<td class="style1">([^<]+)', html)
    portlist = re.findall(r'<td class="style2">([^<]+)', html)
    ipaddrs = []
    if len(iplist) == len(portlist):  # IPs must pair one-to-one with ports
        for ip, port in zip(iplist, portlist):
            ipaddrs.append(ip + ':' + port)
    else:
        print('数据收集有误!')
    return ipaddrs
def proxy_check(url, ipaddrs):
    """Probe the proxy *ipaddrs* ('ip:port') by fetching *url* through it.

    Returns *ipaddrs* (and prints it) when the proxy answered within
    3 seconds; returns None when it did not.

    NOTE(review): install_opener() replaces the process-wide default
    opener, so every later urlopen() call also goes through this proxy.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
    proxy_support = urllib.request.ProxyHandler({'http': ipaddrs})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)
    try:
        urllib.request.urlopen(req, timeout=3)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; any timeout/connection/HTTP failure means the
        # proxy is unusable. Return None explicitly (was implicit before).
        return None
    else:
        print(ipaddrs)
        return ipaddrs
if __name__ == '__main__':
    url = 'http://www.yun-daili.com/free.asp'
    # Scrape candidate proxies, then keep only the ones that respond.
    ipaddrs = get_ip(url_open(url))
    useipaddrs = []
    for addr in ipaddrs:
        result = proxy_check(url, addr)
        # proxy_check returns None for dead proxies; filter them out here
        # instead of the original count(None)/repeated-remove(None) dance.
        if result is not None:
            useipaddrs.append(result)
    print(useipaddrs)