Python基础爬虫之抓取可用的IP
url_open
def url_open(url):
    """Fetch *url* with a desktop-Chrome User-Agent and return the page text.

    The page is decoded as GBK (the target proxy-list site uses that encoding).
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
    # BUG FIX: the original called urlopen(url), which silently discarded
    # the Request object and its User-Agent header. Pass `req` instead.
    response = urllib.request.urlopen(req)
    html = response.read().decode('gbk')
    return html
伪装成浏览器访问
req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
得到IP
def get_ip(html):
    """Extract proxy addresses from the proxy-list page *html*.

    Matches the site's ``<td class="style1">`` (IP) and ``<td class="style2">``
    (port) cells and pairs them up.

    Returns a list of 'ip:port' strings; returns an empty list (after printing
    a warning) when the number of IPs and ports does not match.
    """
    iplist = re.findall(r'<td class="style1">([^<]+)', html)
    portlist = re.findall(r'<td class="style2">([^<]+)', html)
    ipaddrs = []
    if len(iplist) == len(portlist):  # IPs must pair one-to-one with ports
        for ip, port in zip(iplist, portlist):
            ipaddrs.append(ip + ':' + port)
    else:
        print('数据收集有误!')
    return ipaddrs
p = r'<td class="style1">([^<]+)'
使用正则表达式匹配IP
p1 = r'<td class="style2">([^<]+)'
使用正则表达式匹配端口号
检验IP是否可用
def proxy_check(url, ipaddrs):
    """Probe the proxy *ipaddrs* ('ip:port') by fetching *url* through it.

    Returns *ipaddrs* (and prints it) when the proxy answered within
    3 seconds; returns None when it did not.

    NOTE(review): install_opener() replaces the process-wide default
    opener, so every later urlopen() call also goes through this proxy.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
    proxy_support = urllib.request.ProxyHandler({'http': ipaddrs})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)
    try:
        urllib.request.urlopen(req, timeout=3)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; any timeout/connection/HTTP failure means the
        # proxy is unusable. Return None explicitly (was implicit before).
        return None
    else:
        print(ipaddrs)
        return ipaddrs
检验IP是否可用,设定参数 timeout 检查异常
try:
response = urllib.request.urlopen(req,timeout=3)
所有代码
import urllib.request
import re
import urllib.error
def url_open(url):
    """Fetch *url* with a desktop-Chrome User-Agent and return the page text.

    The page is decoded as GBK (the target proxy-list site uses that encoding).
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
    # BUG FIX: the original called urlopen(url), which silently discarded
    # the Request object and its User-Agent header. Pass `req` instead.
    response = urllib.request.urlopen(req)
    html = response.read().decode('gbk')
    return html
def get_ip(html):
    """Extract proxy addresses from the proxy-list page *html*.

    Matches the site's ``<td class="style1">`` (IP) and ``<td class="style2">``
    (port) cells and pairs them up.

    Returns a list of 'ip:port' strings; returns an empty list (after printing
    a warning) when the number of IPs and ports does not match.
    """
    iplist = re.findall(r'<td class="style1">([^<]+)', html)
    portlist = re.findall(r'<td class="style2">([^<]+)', html)
    ipaddrs = []
    if len(iplist) == len(portlist):  # IPs must pair one-to-one with ports
        for ip, port in zip(iplist, portlist):
            ipaddrs.append(ip + ':' + port)
    else:
        print('数据收集有误!')
    return ipaddrs
def proxy_check(url, ipaddrs):
    """Probe the proxy *ipaddrs* ('ip:port') by fetching *url* through it.

    Returns *ipaddrs* (and prints it) when the proxy answered within
    3 seconds; returns None when it did not.

    NOTE(review): install_opener() replaces the process-wide default
    opener, so every later urlopen() call also goes through this proxy.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
    proxy_support = urllib.request.ProxyHandler({'http': ipaddrs})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)
    try:
        urllib.request.urlopen(req, timeout=3)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; any timeout/connection/HTTP failure means the
        # proxy is unusable. Return None explicitly (was implicit before).
        return None
    else:
        print(ipaddrs)
        return ipaddrs
if __name__ == '__main__':
    url = 'http://www.yun-daili.com/free.asp'
    # Scrape candidate proxies, then keep only the ones that respond.
    ipaddrs = get_ip(url_open(url))
    useipaddrs = []
    for addr in ipaddrs:
        result = proxy_check(url, addr)
        # proxy_check returns None for dead proxies; filter them out here
        # instead of the original count(None)/repeated-remove(None) dance.
        if result is not None:
            useipaddrs.append(result)
    print(useipaddrs)