Baidu click simulation with selenium + python + scrapy

A python + selenium + scrapy approach to simulating Baidu clicks.

It can currently perform simulated clicks and paginated searches. A 32 GB host tops out at roughly 5,000 clicks per day; a 64 GB host at about 10,000.

 

import json
import random
import re
import socket
import time
from urllib.parse import urlparse

import requests
import scrapy
from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.spidermiddlewares.httperror import HttpError
from scrapy.utils.project import get_project_settings
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from twisted.internet.error import DNSLookupError, TCPTimedOutError, TimeoutError

from domainPro.items import ProItem

class SopcSpider(scrapy.Spider):
    name = 'baidupc'
    # allowed_domains = ['www.xxx.com']
    # Baidu PC search URL template; _q is the keyword to search for
    so_pc = 'https://www.baidu.com/s?wd={_q}'
    input_return_url = ''
    input_upload_url = ''
    proxy_url = ''
    page = ''
    server = ''
    is_snapshoot = ''
    page_num = {}

    def __init__(self, name):
        # "name" is the task-settings dict passed in when the spider is started
        self.server = name['server']              # server number
        self.proxy_url = name['proxy_url']        # proxy-pool API endpoint
        self.data = name['data']                  # tasks: Wordone (keyword), url, id, page range
        self.page = name['page']
        self.rw = name['rw']
        self.is_snapshoot = name['is_snapshoot']
        self.input_return_url = name['input_return_url']
        self.input_upload_url = name['input_upload_url']
        self.rand_time = random.uniform(4, 7)     # random delay (seconds) between actions
    def start_requests(self):
        if len(self.data) > 0:
            print('Starting run')
            for val in self.data:
                try:
                    print(val['Wordone'])
                    print(val['url'])
                    print(val['id'])
                    # Build the search URL; the extra &id= parameter only carries the task id
                    new_url = self.so_pc.format(_q=val['Wordone']) + '&id=' + str(val['id'])
                    print(new_url)
                except KeyError as e:
                    # Skip tasks with missing fields instead of yielding an undefined URL
                    print(e)
                    continue
                self.page_num[val['id']] = 1
                yield scrapy.Request(
                    url=new_url,
                    meta={'retry_times': 1, 'proxy_url': self.proxy_url, 'url': val['url'],
                          'id': val['id'], 'Wordone': val['Wordone'], 'page': val['page']},
                    callback=self.parse,
                    errback=self.errback_httpbin)


    def parse(self, response):
        # url = response.meta['url']
        # if url.startswith('www.'):
        #     url =url.lstrip('www.')
        # new_url = response.meta['new_url']
        id = response.meta['id']
        print(response.meta)
        # Browser configuration: headless Firefox behind a proxy picked from the proxy pool
        firefox_options = webdriver.FirefoxOptions()
        firefox_options.add_argument('--headless')
        result = requests.get(response.meta['proxy_url']).json().get('data').get('proxy_list')
        proxy = random.choice(result)
        print(proxy)
        firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX
        firefox_capabilities['marionette'] = True
        firefox_capabilities['proxy'] = {'proxyType': 'MANUAL', 'httpProxy': proxy,
                                         'sslProxy': proxy}
        try:
            driver = webdriver.Firefox(capabilities=firefox_capabilities, options=firefox_options)
        except Exception:
            print('Failed to start the browser')
            return

        try:
            # Implicit wait, then open the Baidu home page
            driver.implicitly_wait(5)
            driver.get('https://baidu.com/')
        except Exception:
            print('Failed to open Baidu')
            driver.quit()
            print('Browser closed')
            return
        # Locate the Baidu search button
        try:
            btn1 = driver.find_element_by_id('su')
        except Exception:
            print('Baidu page did not load')
            driver.quit()
            print('Browser closed')
            return
        # btn1.screenshot(r'/Users/guanfangbaidu.png')
        time.sleep(self.rand_time)
        try:
            input1 = driver.find_element_by_id('kw')
        except Exception:
            print('Search input box not found')
            driver.quit()
            print('Browser closed')
            return
        # Type the keyword one character at a time to mimic a human
        for key in response.meta['Wordone']:
            input1.send_keys(key)
            time.sleep(1)

        btn = driver.find_element_by_id('su')
        btn.click()
        # Target domain to match in the results
        key_seo = response.meta['url']
        # Secondary keyword
        # key_seo_two = '注册'
        result = 0
        # Page range is passed as "start|end"
        pages = response.meta['page'].split('|')
        page = int(pages[1]) - int(pages[0])
        # Current SERP URL
        url = driver.current_url
        if int(pages[0]) > 1:
            try:
                # Jump straight to the starting page via the pn offset (10 results per page)
                url_new = url + '&pn=' + str(int(pages[0]) * 10 - 10)
                driver.get(url_new)
            except Exception:
                print('URL does not exist')
        # print(page)
        rank = 0
        pms = int(pages[0])
        status = 0
        for i in range(page + 1):
            # Current page number
            print(str(pms) + " current page")
            pms = pms + 1
            try:
                item = driver.find_elements_by_xpath('//*[@id="content_left"]/div[@class="result c-container new-pmd"]')
            except Exception:
                print('No search results found')
                break
            for val in item:
                driver.implicitly_wait(3)
                try:
                    # The footer of each organic result shows the displayed site URL
                    click_title = val.find_element_by_xpath('.//div[@class="f13 c-gap-top-xsmall se_st_footer user-avatar"]/a')
                    pattern = re.compile(r'(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
                    url = ''
                    if pattern.match(click_title.text):
                        host = urlparse("https://" + click_title.text)
                        url = host.netloc
                except Exception:
                    # Results without this footer (ads, special cards) are skipped rather than aborting the page
                    print('No result link found')
                    continue
                if key_seo == url:
                    print("Matched " + click_title.text)
                    rank = val.get_attribute('id')
                    print('Current rank ' + rank)
                    item = ProItem()
                    item['rw'] = self.rw
                    # Status flag: 1 = found and clicked
                    item['zt'] = 1
                    # Page position where the match was found
                    item['pm'] = pms
                    # id
                    item['id'] = response.meta['id']
                    # Server number
                    item['bh'] = self.server
                    status = 1
                    click_title.send_keys(Keys.ENTER)
                    # Log the successful click
                    ip = proxy
                    with open('click.log', 'a+') as log:
                        log.write('\nClick succeeded: %s' % key_seo + ', ip: %s' % ip)
                    yield item
                    break
            if status == 1:
                break
            driver.implicitly_wait(5)
            try:
                driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
                time.sleep(self.rand_time)
                # "下一页" is the text of Baidu's next-page link
                driver.find_element_by_partial_link_text("下一页").click()
            except Exception:
                print('Next-page link not found')
                # Close the browser
                try:
                    time.sleep(self.rand_time)
                    driver.quit()
                    print('Browser closed')
                except Exception:
                    print('Unable to close the browser')
                break
            # item
        if rank == 0:
            # No match found: report a zero result
            item = ProItem()
            item['rw'] = self.rw
            # Status flag
            item['zt'] = 0
            # Rank
            item['pm'] = 0
            # id
            item['id'] = response.meta['id']
            # Server number
            item['bh'] = self.server
            yield item
        else:
            windows = driver.window_handles
            # Switch to the most recently opened window
            driver.switch_to.window(windows[-1])
            driver.implicitly_wait(5)
            time.sleep(self.rand_time)
            # Common nav link texts on Chinese sites: About Us, Products, News, Contact Us
            keys = ['关于我们', '产品中心', '新闻资讯', '联系我们']
            try:
                key_click = random.choice(keys)
                a_link = driver.find_element_by_link_text(key_click)
                a_link.click()
                print("%s clicked successfully" % key_click)
            except Exception:
                print('%s inner page not found' % key_click)
            # Random scroll
            # self.moveDown(driver)
            # links = driver.find_elements_by_tag_name('a')
            # link_count = 0
            # for val in links:
            #     # val.click()
            #     # break
            #     link_count += 1
            # click_link = random.uniform(1, link_count)
            # print("number of links %s" % link_count)
            # st = 0
            # for val in links:
            #     st += 1
            #     if click_link == st:
            #         print("clicking %s" % val)
            #         # val.click()
            #         val.send_keys(Keys.ENTER)
            #         break
        # time.sleep(3)
        # Close the browser
        try:
            time.sleep(self.rand_time)
            driver.quit()
            print('Browser closed')
        except Exception:
            print('Unable to close the browser')
    def errback_httpbin(self, failure):
        # Log all failures
        self.logger.info(repr(failure))
        # In case you want to do something special for some errors,
        # you may need the failure's type:

        if failure.check(HttpError):
            # These exceptions come from the HttpError spider middleware;
            # you can get the non-200 response here
            response = failure.value.response
            self.logger.info('HttpError on %s', response.url)

        elif failure.check(DNSLookupError):
            # This is the original request
            request = failure.request
            self.logger.info('DNSLookupError on %s', request.url)

        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            self.logger.info('TimeoutError on %s', request.url)
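
A minimal sketch of how this spider could be launched, since its __init__ expects a single dict argument named "name". The keys below all come from __init__ and start_requests; the values and the proxy-pool URL are illustrative only:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    # Assumes this lives alongside SopcSpider (or imports it from the spiders module)
    task = {
        'server': 1,                                        # server number
        'proxy_url': 'http://proxy.example.com/api/get',    # hypothetical proxy-pool API
        'data': [{'Wordone': 'example keyword', 'url': 'www.example.com', 'id': 1, 'page': '1|5'}],
        'page': '1|5',
        'rw': 1,
        'is_snapshoot': 0,
        'input_return_url': '',
        'input_upload_url': '',
    }
    process = CrawlerProcess(get_project_settings())
    process.crawl(SopcSpider, name=task)
    process.start()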

 

With Firefox, even though the driver is quit after every run, memory keeps climbing until it passes 90%, at which point the browser can no longer be launched and the spider ends up in an endless loop. Right now all the dead VMs have to be rebooted every other day and the processes restarted, which makes maintenance far too expensive.
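
One stopgap that could help with the leak is to sweep up any firefox/geckodriver processes that survive driver.quit() between tasks. A minimal sketch, assuming psutil is installed (process names vary by platform, so treat the list as an example):

import psutil

def kill_stray_browsers(names=('firefox', 'firefox-bin', 'geckodriver')):
    # Kill leftover browser/driver processes so their memory is reclaimed
    killed = 0
    for proc in psutil.process_iter(['name']):
        try:
            if (proc.info['name'] or '').lower() in names:
                proc.kill()
                killed += 1
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass
    return killed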

I had originally wanted to use QtWebEngineWidgets as an embedded browser, but it cannot be driven by selenium, which makes that route awkward; doing it with mouse-and-keyboard simulation instead would mean fairly poor performance in Python, and the stability is an unknown.

Another option is to simulate the search requests directly with requests: it is efficient and more stable, but the drawback is that it gets blocked easily. Is there a better approach?
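
For reference, a rough sketch of what that requests-based variant could look like, assuming lxml is available and that Baidu's markup still uses the se_st_footer containers targeted by the Selenium spider above; the selectors and headers here are assumptions, not verified against the live page:

import requests
from lxml import html
from urllib.parse import urlparse

def baidu_rank(keyword, target_host, pages=5, proxy=None):
    """Return the 1-based page on which target_host first shows up, or 0 if not found."""
    session = requests.Session()
    session.headers['User-Agent'] = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                     'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0 Safari/537.36')
    proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy} if proxy else None
    for page in range(pages):
        resp = session.get('https://www.baidu.com/s',
                           params={'wd': keyword, 'pn': page * 10},
                           proxies=proxies, timeout=10)
        tree = html.fromstring(resp.text)
        # The visible site URL sits in the footer link of each organic result
        for footer in tree.xpath('//div[@id="content_left"]'
                                 '//div[contains(@class, "se_st_footer")]/a/text()'):
            if urlparse('https://' + footer.strip()).netloc == target_host:
                return page + 1
    return 0

The blocking problem mentioned above still applies: once Baidu flags the traffic it starts returning a verification page instead of results, so the proxy rotation from the Selenium version would still be needed.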
