Keeping the crawler from getting banned, with a direct database connection

This morning I spent some time looking into how to keep the crawler's IP from getting blocked.
The approach mainly involves Selenium's Proxy and ProxyType.
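
If you are on a newer Selenium release (4.x), the Proxy/DesiredCapabilities route used in the script below is deprecated and eventually removed, and the usual way to get the same effect is through ChromeOptions. Here is a minimal sketch of that variant, assuming a placeholder proxy address:

from selenium import webdriver

# A sketch of routing Chrome through a proxy via ChromeOptions.
# The address below is a placeholder, not one of the proxies used later.
options = webdriver.ChromeOptions()
options.add_argument('--proxy-server=http://10.0.0.1:808')
driver = webdriver.Chrome(options=options)
driver.get('https://www.sogou.com')
driver.quit()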
With that in mind, here is the full script:

#-*- coding:utf-8 -*-
#Filename: final version of the Sogou data scraper
#Author:Guan
#Datetime:2018/12/8

from selenium.webdriver.common.proxy import Proxy
from selenium.webdriver.common.proxy import ProxyType
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium import webdriver
import random
import requests
from bs4 import BeautifulSoup
import time

import re


import happybase
from _md5 import md5
import datetime

# # Configure the HBase connection
# HBASE_HOST = '10.8.23.6'
# HBASE_PORT = 9090
# HBASE_TABLE = 'bt_t77_sougou'
#
# host = HBASE_HOST
# port = HBASE_PORT
# table_name = HBASE_TABLE
#
# connection = happybase.Connection(host=host, port=port)


# Get a WebDriver instance that routes traffic through a random proxy IP
def get_ip():

    # Pool of proxy IPs to rotate through
    proxie = ['10.6.198.114:808', '10.6.198.115:808', '10.6.198.117:808', '10.6.198.118:808', '10.6.198.172:808']
    # Keep trying until one proxy yields a working driver
    while True:
        # Pick a proxy at random
        ip = random.choice(proxie)
        print("Proxy IP:", ip)

        # Use the manual proxy type
        print("Proxy type:", ProxyType.MANUAL)

        try:

            # Build a Proxy object
            proxy = Proxy({
                # Proxy mode
                'proxyType': ProxyType.MANUAL,
                # HTTP proxy (the IP chosen above)
                'httpProxy': ip
            })
            # Start from Chrome's default desired capabilities
            desired_capabilities = DesiredCapabilities.CHROME.copy()
            # Attach the proxy settings to the capabilities
            proxy.add_to_capabilities(desired_capabilities)

            # Instantiate a browser with those capabilities
            driver1 = webdriver.Chrome(
                desired_capabilities=desired_capabilities
            )

            return driver1

        except Exception as e:
            # This proxy failed; fall through and try another one
            print("Proxy IP request failed: " + ip)

def get_url(url):
    return url

# Set the request headers and fetch the page HTML over requests
# (the Cookie below is a captured session value and will eventually expire)
def get_html(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36",
        "Cookie": "CXID=2C7D3DCAAA31333F0CA6B9F1D42B448E; SUID=07E1EB7C5B68860A5BEA43270009A690; ad=oujQFyllll2bf6GXlllllVs$yDolllllKnsPxZllllylllllRv7ll5@@@@@@@@@@; ABTEST=7|1543394206|v17; SUV=1543394206395727; browerV=3; osV=1; pgv_pvi=6458014720; SUIR=A2444FD8A4A1D8F49FA7A706A50AC1B8; sst0=544; sct=219; PHPSESSID=psls2mcv8gs1r4h3grcnpct066; UM_distinctid=1678c33873e0-05c303d0d58b1e-75133b4f-144000-1678c338740405; CNZZDATA1271442956=429218166-1544242285-%7C1544242285; Hm_lvt_f5df380d5163c1cc4823c8d33ec5fa49=1544100309,1544245446; Hm_lpvt_f5df380d5163c1cc4823c8d33ec5fa49=1544245770; IPLOC=CN1100; ld=tkllllllll2b@On0lllllVZazJZlllllKnsPxZlllxlllllljllll5@@@@@@@@@@; SNUID=FB2B97C1F9FF8687B5AEF045FAAAF705; seccodeRight=success; successCount=1|Sat, 08 Dec 2018 07:07:10 GMT"
    }
    response = requests.get(url=url,headers=headers).content.decode()
    return response

# Parse the search-result HTML
def get_cont(html):
    soup = BeautifulSoup(html,'lxml')

    # Rule 1: standard result blocks (div.rb)
    regulation1 = soup.select('div[class="rb"]')
    reg_list=[]
    cx_url = get_url(url)
    for i in regulation1:
        reg_cont = {}
        reg_cont['chexing'] = re.sub('&from','',(re.split('=', cx_url, maxsplit=2))[1])
        reg_cont['title'] = i.find_all('h3')[0].text.strip()
        reg_cont['cont'] = i.select('div[class="ft"]')[0].text.strip()
        reg_cont['source'] = re.split('-', (i.find_all('cite')[0].text.strip()), maxsplit=1)[0]
        try:
            reg_cont['pub_date'] = re.sub('翻译此页', '', (re.split('-', (re.split('-', (i.find_all('cite')[0].text.strip()), maxsplit=1)[1]), maxsplit=1)[1]))
        except Exception:
            reg_cont['pub_date'] =  re.sub('翻译此页', '', (re.split('-', (i.find_all('cite')[0].text.strip()), maxsplit=1)[1]))
        reg_list.append(reg_cont)

    # Rule 2: fixed-width result blocks (div[style="width:548px"])
    regulation2 = soup.select('div[style="width:548px"]')
    reg_list2 = []
    for j in regulation2:
        reg_cont2 = {}
        reg_cont2['chexing'] = re.sub('&from', '', (re.split('=', cx_url, maxsplit=2))[1])
        reg_cont2['title'] = j.find_all('h3')[0].text.strip()
        reg_cont2['cont'] = j.find_all('p')[0].text.strip()
        reg_cont2['source'] = re.split('-',j.find_all('cite')[0].text.strip(),maxsplit=1)[0]
        try:
            reg_cont2['pub_date'] = re.split('-',(re.split('-',j.find_all('cite')[0].text.strip(),maxsplit=1)[1]),maxsplit=1)[1].strip()
        except Exception:
            reg_cont2['pub_date'] = re.split('-',j.find_all('cite')[0].text.strip(),maxsplit=1)[1].strip()
        reg_list2.append(reg_cont2)

    # Merge the results from both rules
    news = reg_list+reg_list2

    # Debug output:
    for new in news:
        print(new)

    # Write to HBase (currently commented out)
    # for new in news:
    #     # print(new)
    #     rowkey = datetime.datetime.now().strftime('%Y%m%d')
    #     connection.open()
    #     table = connection.table(table_name)
    #     table.put(md5(rowkey.encode('utf-8')).hexdigest(),
    #             {
    #                 'cf1:chexing':new['chexing'],
    #                 'cf1:title': new['title'],
    #                 'cf1:cont': new['cont'],
    #                 'cf1:source': new['source'],
    #                 'cf1:pub_date': new['pub_date'],
    #             }
    #             )
    #     connection.close()
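    # Note: the rowkey above is derived only from the current date, so every
    # record written on the same day shares one rowkey and overwrites the
    # previous put. A sketch of one workaround (an assumption, not part of the
    # original pipeline) is to mix the title into the key:
    # rowkey = datetime.datetime.now().strftime('%Y%m%d') + new['title']
    # table.put(md5(rowkey.encode('utf-8')).hexdigest(), {...})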

if __name__ == '__main__':
    # Read the list of car models from the config file
    file = open('D:\公司文件\.PyCharmCE2018.2\config\scratches\拓展\搜狗数据\搜狗配置车型', encoding='utf-8')
    cont = file.readlines()
    chexing1 = []
    for i in cont:
        new_chexing = i.split(',')
        for j in new_chexing:
            chexing1.append(j)
    print(chexing1)

    for cx in chexing1:

        url = 'https://www.sogou.com/web?query=%s&from=index-nologin&sugsuv=1543394206395727&tsn=1'%cx
        driver = get_ip()

        driver.get(url)

        get_url(url)

        # Fetch the raw HTML with requests
        html = get_html(url)

        # Parse the page content

        get_cont(html)
        while True:
            time.sleep(1)
            driver.implicitly_wait(3)
            try:
                # Click through to the next page
                driver.find_element_by_xpath('//*[@id="sogou_next"]').click()
                # Re-fetch and parse the newly loaded page
                url = driver.current_url
                get_url(url)
                html = get_html(url)
                get_cont(html)

            except Exception:
                print('Next car model')
                break
        driver.quit()
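
One caveat worth noting: the proxy only covers the Selenium driver above; the requests.get call inside get_html still goes out over the machine's own IP. If that traffic also needs to be masked, requests accepts a proxies mapping. A minimal sketch, assuming ip is one of the 'host:port' strings picked in get_ip:

import requests

# Send the plain-requests traffic through the same proxy as the browser.
def get_html_via_proxy(url, headers, ip):
    proxies = {'http': 'http://' + ip, 'https': 'http://' + ip}
    response = requests.get(url, headers=headers, proxies=proxies, timeout=10)
    return response.content.decode()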