Python基础爬虫之抓取可用的IP

Python基础爬虫之抓取可用的IP

url_open

def url_open(url):
    """Fetch *url* and return the response body decoded as GBK text.

    A browser User-Agent header is attached so the target site does not
    reject the request as coming from a script.

    Note: the target page is assumed to be GBK-encoded -- verify against
    the site's actual charset if reused elsewhere.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
    # BUG FIX: open the prepared Request object, not the bare url string --
    # the original called urlopen(url) and silently discarded the header.
    response = urllib.request.urlopen(req)
    html = response.read().decode('gbk')
    return html

伪装成浏览器访问

req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')

得到IP

def get_ip(html):
    """Extract proxy addresses from the page HTML.

    IPs live in ``<td class="style1">`` cells and ports in
    ``<td class="style2">`` cells; the two match lists are paired
    positionally into ``'ip:port'`` strings.

    Returns a list of ``'ip:port'`` strings; empty when nothing matched
    or when the IP and port counts disagree (a message is printed in the
    mismatch case).
    """
    iplist = re.findall(r'<td class="style1">([^<]+)', html)
    portlist = re.findall(r'<td class="style2">([^<]+)', html)
    ipaddrs = []
    if len(iplist) == len(portlist):  # IP count must match port count
        # zip pairs each IP with its port instead of indexing by range(len()).
        ipaddrs = [ip + ':' + port for ip, port in zip(iplist, portlist)]
    else:
        print('数据收集有误!')
    return ipaddrs

这里写图片描述
这里写图片描述

p = r'<td class="style1">([^<]+)'  

使用正则表达式匹配IP

 p1 = r'<td class="style2">([^<]+)'

使用正则表达式匹配端口号

检验IP是否可用

def proxy_check(url, ipaddrs):
    """Probe one proxy (an ``'ip:port'`` string) by fetching *url* through it.

    Installs a global opener routed through the proxy (module-level side
    effect kept from the original design), then attempts the request with
    a 3-second timeout.

    Returns the proxy string when the request succeeds, ``None`` otherwise.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
    proxy_support = urllib.request.ProxyHandler({'http': ipaddrs})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)
    try:
        urllib.request.urlopen(req, timeout=3)
    except Exception:
        # Narrowed from a bare `except:` -- a bare except also swallows
        # KeyboardInterrupt/SystemExit. Any network/HTTP failure means the
        # proxy is unusable; report that with an explicit None.
        return None
    else:
        print(ipaddrs)
        return ipaddrs

检验IP是否可用,设定参数 timeout 检查异常

    try:
        response = urllib.request.urlopen(req,timeout=3)

所有代码

import urllib.request
import re
import urllib.error

def url_open(url):
    """Fetch *url* and return the response body decoded as GBK text.

    A browser User-Agent header is attached so the target site does not
    reject the request as coming from a script.

    Note: the target page is assumed to be GBK-encoded -- verify against
    the site's actual charset if reused elsewhere.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
    # BUG FIX: open the prepared Request object, not the bare url string --
    # the original called urlopen(url) and silently discarded the header.
    response = urllib.request.urlopen(req)
    html = response.read().decode('gbk')
    return html

def get_ip(html):
    """Extract proxy addresses from the page HTML.

    IPs live in ``<td class="style1">`` cells and ports in
    ``<td class="style2">`` cells; the two match lists are paired
    positionally into ``'ip:port'`` strings.

    Returns a list of ``'ip:port'`` strings; empty when nothing matched
    or when the IP and port counts disagree (a message is printed in the
    mismatch case).
    """
    iplist = re.findall(r'<td class="style1">([^<]+)', html)
    portlist = re.findall(r'<td class="style2">([^<]+)', html)
    ipaddrs = []
    if len(iplist) == len(portlist):  # IP count must match port count
        # zip pairs each IP with its port instead of indexing by range(len()).
        ipaddrs = [ip + ':' + port for ip, port in zip(iplist, portlist)]
    else:
        print('数据收集有误!')
    return ipaddrs

def proxy_check(url, ipaddrs):
    """Probe one proxy (an ``'ip:port'`` string) by fetching *url* through it.

    Installs a global opener routed through the proxy (module-level side
    effect kept from the original design), then attempts the request with
    a 3-second timeout.

    Returns the proxy string when the request succeeds, ``None`` otherwise.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
    proxy_support = urllib.request.ProxyHandler({'http': ipaddrs})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)
    try:
        urllib.request.urlopen(req, timeout=3)
    except Exception:
        # Narrowed from a bare `except:` -- a bare except also swallows
        # KeyboardInterrupt/SystemExit. Any network/HTTP failure means the
        # proxy is unusable; report that with an explicit None.
        return None
    else:
        print(ipaddrs)
        return ipaddrs

if __name__ == '__main__':
    # Scrape the free-proxy listing page, then keep only proxies that
    # pass the liveness check.
    url = 'http://www.yun-daili.com/free.asp'
    ipaddrs = get_ip(url_open(url))
    # proxy_check returns None for dead proxies; filter those out directly
    # instead of the original count(None)/remove(None) loop.
    useipaddrs = [result
                  for result in (proxy_check(url, addr) for addr in ipaddrs)
                  if result is not None]
    print(useipaddrs)

结果

这里写图片描述

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值