get_ip.py: fetching proxy IPs
import requests
from lxml import etree
import random
"""
Fetch some usable proxy IPs from the xicidaili proxy site.
"""
url = r'http://www.xicidaili.com/wn'
ip_list = []  # holds the scraped proxies
def get_ip_list(url=url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    # requests.get() fetches the page; a successful request returns status 200
    response = requests.get(url, headers=headers)
    # parse the page source with lxml's etree.HTML()
    html_str = response.content.decode()
    html = etree.HTML(html_str)
"""
判断网页的设计,得出需要取得ip内容的位置,保存至ip_list
"""
# ip,port tr改变:odd: 1,2,3,4 没有odd:3,5,7,9
# ip1 = html.xpath("//table[@id='ip_list']/tr[@class='odd'][1]/td[2]")
# ip2 = html.xpath("//table[@id='ip_list']/tr[3]/td[2]")
# port tr改变:同上
# port1 = html.xpath("//table[@id='ip_list']/tr[@class='odd'][1]/td[3]")
# potr2 = html.xpath("//table[@id='ip_list']/tr[3]/td[3]")
# 使用xpath helper工具得到xpath
    i = 1
    while True:
        ip = html.xpath("//table[@id='ip_list']/tr[@class='odd'][{}]/td[2]/text()".format(i))
        port = html.xpath("//table[@id='ip_list']/tr[@class='odd'][{}]/td[3]/text()".format(i))
        if ip:
            ip_list.append(ip[0] + ':' + port[0])
            i += 1
        else:
            break
    i = 3
    while True:
        ip = html.xpath("//table[@id='ip_list']/tr[{}]/td[2]/text()".format(i))
        port = html.xpath("//table[@id='ip_list']/tr[{}]/td[3]/text()".format(i))
        if ip:
            ip_list.append(ip[0] + ':' + port[0])
            i += 2
        else:
            break
    # print(ip_list)
"""验证Ip的可用性"""
for ip in ip_list:
try:
proxy_host = "https://" + ip
proxy_temp = {"https": proxy_host}
res = urllib.urlopen(url, proxies=proxy_temp).read()
except Exception as e:
ip_list.remove(ip)
continue
# print(ip_list)
return ip_list
"""
随机从代理池中取ip
"""
def get_ip():
    proxy_list = []
    for ip in ip_list:
        proxy_list.append('http://' + ip)
    proxy_ip = random.choice(proxy_list)
    proxies = {'http': proxy_ip}
    return proxies
"""测试"""
if __name__ == '__main__':
url = 'http://www.xicidaili.com/wn/'
ip_list = get_ip_list(url)
proxies = get_ip()
print(proxies)
my_parser.py: the parsing function
import requests
from bs4 import BeautifulSoup
"""解析函数,返回数据"""
def parser(url, headers , proxies, timeout = 2):
info = {} #保存数据
res = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
# res = requests.get(url,headers=headers)
# print(res.status_code)
# 获取数据
if res.status_code == 200:
soup = BeautifulSoup(res.text, 'lxml')
# codl.class 下 dd dt标签的文本
key = [b.text for b in soup.select('.codl dd')]
value = [p.text for p in soup.select('.codl dt')]
# 存入 info字典
for k, v in zip(value, key):
info[k.strip(':')] = v
return info
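A quick usage sketch for the parser. The sample URL is one of the company pages crawled by the main program below; passing proxies=None just makes requests use a direct connection, and the keys in the printed dict depend on what fields the page actually lists:

from my_parser import parser

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
info = parser('https://m.11467.com/jinan/co/2.htm', headers=headers, proxies=None)
print(info)  # a dict of field-name -> value pairs scraped from the page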
shunqi_spider.py: the main program
import random
import pymongo
from multiprocessing.dummy import Pool as ThreadPool
import get_ip
import my_parser
# Pool of 'User-Agent' strings to rotate through
user_agent_list = [
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
]
# Database setup
client = pymongo.MongoClient('localhost', 27017)
company_info = client['company_info']  # name the database
sheet_table = company_info['sheet_table']  # create the collection
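For reference, a minimal read-back sketch to check what a run actually stored, using the same database and collection names as above (count_documents needs pymongo 3.7+):

import pymongo

client = pymongo.MongoClient('localhost', 27017)
table = client['company_info']['sheet_table']
print(table.count_documents({}))  # number of company records stored so far
for doc in table.find().limit(3):  # peek at a few documents
    print(doc)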
def get_data(url):
    # Try each page up to 10 times
    for i in range(10):
        try:
            proxies = get_ip.get_ip()
            headers = {
                'User-Agent': random.choice(user_agent_list)
            }
            # Allow 1 second of timeout for network jitter
            info = my_parser.parser(url=url, headers=headers, proxies=proxies, timeout=1)
            if info:
                # Store the record in the database
                sheet_table.insert_one(info)
                # infos[url] = info
                print("data captured")
                break
        except Exception as e:
            print(e, url)
if __name__ == "__main__":
# infos = {} # 要得到的信息
# 待爬取的网页,range(2,160998))
urls = ('https://m.11467.com/jinan/co/{}.htm'.format(str(i)) for i in range(2,30))
# 代理池页面
ip_url = 'http://www.xicidaili.com/wn/'
ip_list = get_ip.get_ip_list(ip_url) # 得到ip代理列表
# # 存储数据
# path = r'D:\code\shunqi_spider\shunqi.txt'
# with open(path,'w') as file:
# for key,value in infos.items():
# file.write(key + ' : ' + str(value) + '\n')
# print("程序运行结束")
# 存储数据到数据库
# 创建线程池
pool = ThreadPool(20)
results = pool.map(get_data,urls)
pool.close()
pool.join()
Summary:
1. xpath-helper to pin down where the data sits in the page
2. Rotating several User-Agents and proxy IPs to get around anti-scraping measures
3. Setting wait times / request timeouts
4. Several ways of parsing a page (lxml XPath, BeautifulSoup)
5. The MongoDB database
6. Thread pools
7. random.choice for random selection
8. Still to study: cookies, sessions, JavaScript (see the sketch below)
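For item 8, a minimal sketch of what requests.Session adds over bare requests.get: cookies set by the server are stored and sent back automatically on later requests (httpbin.org is used here purely as a demo endpoint):

import requests

session = requests.Session()
# The server sets a cookie via this endpoint; the session keeps it
session.get('https://httpbin.org/cookies/set/token/abc123')
# A later request through the same session sends the cookie back
resp = session.get('https://httpbin.org/cookies')
print(resp.json())  # {'cookies': {'token': 'abc123'}}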