My first anti-anti-crawler, built with Python, an IP proxy pool, multithreading, and MongoDB

get_ip.py — fetch proxy IPs

import requests
from lxml import etree
import random


"""
从西刺Ip网站,得到一些可用的代理Ip 
"""
url = r'http://www.xicidaili.com/wn'
ip_list = []  # holds the scraped proxies

def get_ip_list(url=url):

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    # requests.get() fetches the page; a successful request returns status 200
    response = requests.get(url, headers=headers)
    # decode the raw bytes, then parse them with lxml's etree.HTML()
    html_str = response.content.decode()
    html = etree.HTML(html_str)


    """
    判断网页的设计,得出需要取得ip内容的位置,保存至ip_list
    """
    # ip,port  tr改变:odd: 1,2,3,4      没有odd:3,5,7,9
    # ip1 = html.xpath("//table[@id='ip_list']/tr[@class='odd'][1]/td[2]")
    # ip2 = html.xpath("//table[@id='ip_list']/tr[3]/td[2]")
    # port  tr改变:同上
    # port1 = html.xpath("//table[@id='ip_list']/tr[@class='odd'][1]/td[3]")
    # potr2 = html.xpath("//table[@id='ip_list']/tr[3]/td[3]")
    # 使用xpath helper工具得到xpath
    i = 1
    while True:
        ip = html.xpath("//table[@id='ip_list']/tr[@class='odd'][{}]/td[2]/text()".format(i))
        port = html.xpath("//table[@id='ip_list']/tr[@class='odd'][{}]/td[3]/text()".format(i))
        if ip:
            ip_list.append(ip[0]+':'+port[0])
            i += 1
        else:
            break

    i = 3
    while True:
        ip = html.xpath("//table[@id='ip_list']/tr[{}]/td[2]/text()".format(i))
        port = html.xpath("//table[@id='ip_list']/tr[{}]/td[3]/text()".format(i))
        if ip:
            ip_list.append(ip[0]+':'+port[0])
            i += 2
        else:
            break
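    # Note: the two loops above could arguably be collapsed into a single pass
    # that grabs every data row at once -- an untested sketch:
    # for row in html.xpath("//table[@id='ip_list']/tr[position()>1]"):
    #     cells = row.xpath("./td//text()")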

    # print(ip_list)

    """验证Ip的可用性"""
    for ip in ip_list:
            try:
              proxy_host = "https://" + ip
              proxy_temp = {"https": proxy_host}
              res = urllib.urlopen(url, proxies=proxy_temp).read()
            except Exception as e:
              ip_list.remove(ip)
              continue
    # print(ip_list)
    return ip_list

"""
随机从代理池中取ip
"""
def get_ip():
    proxy_list = []
    for ip in ip_list:
        proxy_list.append('http://' + ip)
    proxy_ip = random.choice(proxy_list)
    proxies = {'http': proxy_ip}
    return proxies

"""测试"""
if __name__ == '__main__':
    url = 'http://www.xicidaili.com/wn/'
    ip_list = get_ip_list(url)
    proxies = get_ip()
    print(proxies)
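
Checking the proxies one by one is slow. Since the main program already uses a thread pool from multiprocessing.dummy, the same idea could speed up validation; a minimal sketch (the check_proxy helper and the pool size of 8 are my own choices, not part of the original code):

import requests
from multiprocessing.dummy import Pool as ThreadPool

url = 'http://www.xicidaili.com/wn/'  # same test target as get_ip_list

def check_proxy(ip):
    """Return the proxy if it answers within 5 seconds, else None."""
    try:
        requests.get(url, proxies={"https": "https://" + ip}, timeout=5)
        return ip
    except Exception:
        return None

def validate_concurrently(candidates):
    pool = ThreadPool(8)                 # 8 worker threads; adjust to taste
    results = pool.map(check_proxy, candidates)
    pool.close()
    pool.join()
    return [ip for ip in results if ip]  # keep only the proxies that responded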

my_parser.py — the parsing function

import requests
from bs4 import BeautifulSoup

"""解析函数,返回数据"""
def parser(url, headers , proxies, timeout = 2):
    info = {}   #保存数据
    res = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
    # res = requests.get(url,headers=headers)
    # print(res.status_code)

    # 获取数据
    if res.status_code == 200:
        soup = BeautifulSoup(res.text, 'lxml')
        # codl.class 下 dd dt标签的文本
        key = [b.text for b in soup.select('.codl dd')]
        value = [p.text for p in soup.select('.codl dt')]
        # 存入 info字典
        for k, v in zip(value, key):
            info[k.strip(':')] = v

    return info
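
A quick way to sanity-check the parser on its own, without the proxy pool (the example URL is one page out of the range crawled by the main program; passing proxies=None just makes requests go direct for this test):

if __name__ == '__main__':
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    # one page out of the range crawled by shunqi_spider.py
    info = parser('https://m.11467.com/jinan/co/2.htm', headers=headers, proxies=None, timeout=5)
    print(info)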

shunqi_spider.py — the main program

import random
import pymongo
from multiprocessing.dummy import Pool as ThreadPool
import get_ip
import my_parser

# pool of 'User-Agent' strings to rotate through
user_agent_list = [
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
]

# set up the database
client = pymongo.MongoClient('localhost', 27017)
conpany_info = client['conpany_info']  # name the database
sheet_table = conpany_info['sheet_table']  # the collection that holds the results

def get_data(url):
    # number of attempts per page
    for i in range(10):
        try:
            # pick a random proxy and User-Agent, then call the parser
            proxies = get_ip.get_ip()
            headers = {
                'User-Agent': random.choice(user_agent_list)
            }
            # allow 1 second for network jitter before giving up on the request
            info = my_parser.parser(url=url, headers=headers, proxies=proxies, timeout=1)
            if info:
                # store in the database (insert() is deprecated in pymongo 3; use insert_one())
                sheet_table.insert_one(info)
                # infos[url] = info
                print("page captured")
                break
        except Exception as e:
            print(e, url)



if __name__ == "__main__":

    # infos = {}  # accumulator used by the commented-out file-based variant below

    # pages to crawl; the full site would be range(2, 160998)
    urls = ('https://m.11467.com/jinan/co/{}.htm'.format(i) for i in range(2, 30))
    # page that lists the proxies
    ip_url = 'http://www.xicidaili.com/wn/'
    ip_list = get_ip.get_ip_list(ip_url)    # build the proxy list

    # # file-based storage variant
    # path = r'D:\code\shunqi_spider\shunqi.txt'
    # with open(path, 'w') as file:
    #     for key, value in infos.items():
    #         file.write(key + ' : ' + str(value) + '\n')
    # print("run finished")
    # results go to the database instead

    # create a pool of 20 worker threads
    pool = ThreadPool(20)
    results = pool.map(get_data, urls)
    pool.close()
    pool.join()
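
Once the crawl finishes, the stored records can be read back to verify the run. A minimal sketch, reusing the client and collection names defined above:

import pymongo

client = pymongo.MongoClient('localhost', 27017)
sheet_table = client['conpany_info']['sheet_table']

# how many company pages made it into the collection
print(sheet_table.count_documents({}), 'documents stored')
# peek at the first few records
for doc in sheet_table.find().limit(5):
    print(doc)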

Takeaways:

1. XPath Helper for locating data in a page

2. Rotating random User-Agents and proxy IPs to get past anti-crawler checks

3. Setting request timeouts and wait times

4. Several ways to parse a page (lxml XPath vs. BeautifulSoup)

5. The MongoDB database

6. Thread pools

7. random.choice for random selection

8. Next up: cookies, sessions, and JavaScript (see the session sketch below)
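
On point 8: a requests.Session keeps cookies across requests automatically, which becomes necessary once a site ties access to a login or session cookie. A minimal sketch (httpbin.org is used purely as a demo endpoint, not part of this project):

import requests

session = requests.Session()
# the server sets a cookie; the session object stores it
session.get('https://httpbin.org/cookies/set/token/abc123')
# later requests through the same session send the cookie back automatically
res = session.get('https://httpbin.org/cookies')
print(res.json())  # -> {'cookies': {'token': 'abc123'}}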
