Series
[Practical Tools Series: Crawlers] Crawling proxy IPs with Python (beating 'anti-crawler' defenses)
[Practical Tools Series: Crawlers] Fast crawling of financial news with Python (beating 'anti-crawler' defenses)
This article uses Python to crawl proxy IPs while working around the target site's 'anti-crawler' defenses.
Environment
- Ubuntu 16.04
- python3
Crawling method
Proxy IP site: https://www.xicidaili.com
Steps
1. Crawl each page's content in order of page id
2. Parse the IPs and ports with a regular expression (a minimal sketch follows this list)
3. Save the IP and port information
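For step 2, the idea is that once whitespace is stripped, each row of the proxy table renders as <td>IP</td><td>PORT</td>. A minimal sketch using re.findall (the full script below removes matches one by one instead, but extracts the same pairs):

import re

def extract_ip_pairs(html):
    # Collapse whitespace so each IP cell and port cell sit side by side, then grab every pair
    html = re.sub(r'\s+', '', html)
    pattern = r'<td>(\d{1,3}(?:\.\d{1,3}){3})</td><td>(\d{1,5})</td>'
    return re.findall(pattern, html)  # list of (ip, port) tuples

print(extract_ip_pairs('<td>222.89.32.150</td>\n<td>9999</td>'))  # [('222.89.32.150', '9999')]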
Anti-'anti-crawler' method
Since https://www.xicidaili.com uses anti-crawler measures, the steps above are refined as follows:
1. Crawl page 1 first and extract its IPs and ports
2. Use the IPs and ports from step 1 as proxies (an optional validation sketch follows this list)
3. Crawl the IPs and ports on the remaining pages through those proxies
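This only works if the page-1 proxies are actually alive. The script below simply retries a failed page with another random proxy; as an optional extra (not part of the original code), you could pre-filter dead proxies with a quick reachability check. The test URL http://httpbin.org/ip is just an assumption here, any stable page would do:

import urllib.request

def proxy_works(proxy_ip_dict, test_url='http://httpbin.org/ip', timeout=10):
    # Return True if a request routed through this proxy succeeds within the timeout
    opener = urllib.request.build_opener(urllib.request.ProxyHandler(proxy_ip_dict))
    try:
        opener.open(test_url, timeout=timeout)
        return True
    except Exception:
        return False

# e.g. keep only live proxies: proxy_ip_list = [p for p in proxy_ip_list if proxy_works(p)]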
Code in action
crawl_proxy_ip.py
import sys, os
import urllib.request
import time, random
import re
from urllib import request, parse
import pickle
def crawl_proxy_ip(url, proxy_ip_dict=None):
    # Add a header to mimic a browser
    if proxy_ip_dict is None:
        headers = {
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        }
        req = request.Request(url=url, data=None, headers=headers, method='GET')  # note: the method may be POST or GET, try both
        response = request.urlopen(req)
        html = response.read().decode('utf-8')
    else:
        html = download_by_proxy(url, proxy_ip_dict)
    ip_prot_list = extract_ip(html)
    #print(ip_prot_list)
    print(url, len(ip_prot_list))
    return ip_prot_list
def download_by_proxy(url, proxy_ip_dict):
    headers = {
        'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
        'Connection': 'close'
    }
    proxy_handler = urllib.request.ProxyHandler(proxy_ip_dict)
    opener = urllib.request.build_opener(proxy_handler)
    req = urllib.request.Request(url, headers=headers)
    response = opener.open(req, timeout=60)
    html = response.read().decode('utf-8')
    #print(len(html))
    return html
def extract_ip(html):
    # Strip whitespace so the IP cell and port cell end up adjacent
    html = html.replace(' ', '')
    html = html.replace('\r', '')
    html = html.replace('\n', '')
    # Rows look like: <td>222.89.32.150</td><td>9999</td>
    pattern = r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td><td>(\d{1,5})</td>'
    ip_prot_list = []
    res = re.search(pattern, html)
    while res is not None:
        ip_prot_list.append(res.groups())
        # Remove the matched row so the next search finds the following entry
        html = html.replace('<td>%s</td><td>%s</td>' % (res.groups()[0], res.groups()[1]), '')
        res = re.search(pattern, html)
    return ip_prot_list
def crawl_web(first_url, max_number, proxy_ip_list):
    data = []
    i = 1
    while i <= max_number:
        try:
            # Pick a random proxy for every page request
            proxy_ip_dict = random.choice(proxy_ip_list)
            ip_prot_list = crawl_proxy_ip(first_url + str(i), proxy_ip_dict)
            data += ip_prot_list
        except Exception:
            # On failure, retry the same page with another random proxy
            print('error:', first_url + str(i))
            continue
        i += 1
    with open('proxy_ip.pkl', 'wb') as f:
        pickle.dump(data, f)
    print('done!')
def load_proxy_ip(path):
    with open(path, 'rb') as f:
        data = pickle.load(f)
    proxy_ip_list = []
    for item in data:
        proxy_ip_dict = {}
        proxy_ip_dict['http'] = 'http://%s:%s' % (item[0], item[1])
        proxy_ip_list.append(proxy_ip_dict)
    return proxy_ip_list
if __name__ == '__main__':
    # Crawl page 1 without a proxy; its IPs serve as the proxy pool
    if not os.path.exists('proxy_ip-1.pkl'):
        ip_prot_list = crawl_proxy_ip('https://www.xicidaili.com/nn/1')
        with open('proxy_ip-1.pkl', 'wb') as f:
            pickle.dump(ip_prot_list, f)
    # Use page 1's proxies to crawl the rest
    proxy_ip_list = load_proxy_ip('proxy_ip-1.pkl')
    crawl_web('https://www.xicidaili.com/nn/', 50, proxy_ip_list)  # crawl the first 50 pages
Code notes
1. The code above crawls the first 50 pages of the proxy list
2. Page 1 is crawled first, and its IPs and ports are used as proxies for the rest
3. The final result is saved to proxy_ip.pkl
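Once proxy_ip.pkl exists, a quick usage sketch (reusing load_proxy_ip and download_by_proxy from crawl_proxy_ip.py above; http://example.com just stands in for whatever page you actually want to fetch):

import random
from crawl_proxy_ip import load_proxy_ip, download_by_proxy

# Load the crawled proxies and fetch an arbitrary page through one of them
proxy_ip_list = load_proxy_ip('proxy_ip.pkl')
print('loaded %d proxies' % len(proxy_ip_list))

html = download_by_proxy('http://example.com', random.choice(proxy_ip_list))
print(len(html))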