百度模拟点击 python + selenium + scrapy 方案。
目前可以完成模拟点击、翻页搜索;32G 主机极限一天大概 5000 次,64G 主机极限约 1 万次。
import random
import re
import socket
from selenium.webdriver.support.wait import WebDriverWait
from urllib.parse import urlparse
import scrapy
import requests,json
import time
from scrapy.spidermiddlewares.httperror import HttpError
from scrapy.utils.project import get_project_settings
from scrapy.downloadermiddlewares.retry import RetryMiddleware
from twisted.internet.error import DNSLookupError, TCPTimedOutError
from domainPro.items import ProItem
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
class SopcSpider(scrapy.Spider):
    """Baidu PC-search "simulated click" spider.

    For each task dict in ``self.data`` it issues a Baidu search request via
    Scrapy, then (in ``parse``) drives a headless Firefox through a rotating
    HTTP proxy to re-run the search, types the keyword character by character,
    pages through the results looking for the task's target domain, clicks the
    matching result plus a random inner page, and yields a ``ProItem`` with the
    rank found (or rank 0 when the domain never appears).

    NOTE(review): this source arrived with all indentation stripped; the block
    structure below is reconstructed from control-flow keywords and may differ
    from the original in the ambiguous spots flagged inline.
    """
    name = 'baidupc'
    # allowed_domains = ['www.xxx.com']
    # Baidu PC search URL template; {_q} is replaced with the keyword.
    so_pc ='https://www.baidu.com/s?wd={_q}'
    input_return_url =''
    input_upload_url= ''
    # endpoint returning a JSON body with data.proxy_list (list of proxies)
    proxy_url =''
    page =''
    # server number, copied onto every emitted item as item['bh']
    server =''
    is_snapshoot =''
    # NOTE(review): mutable class attribute — shared across all spider
    # instances in the same process; written in start_requests.
    page_num ={}

    def __init__(self,name):
        # NOTE(review): despite its name, ``name`` is a configuration dict,
        # not the spider name; a missing key raises KeyError here.
        self.server =name['server']
        self.proxy_url =name['proxy_url']
        # list of task dicts, each with at least Wordone / url / id / page
        self.data =name['data']
        self.page =name['page']
        self.rw =name['rw']
        self.is_snapshoot =name['is_snapshoot']
        self.input_return_url =name['input_return_url']
        self.input_upload_url =name['input_upload_url']
        # human-like delay (seconds) reused between browser actions; drawn
        # once per spider instance, not per action
        self.rand_time = random.uniform(4,7)

    def start_requests(self):
        """Yield one Scrapy request per task; task fields travel in ``meta``."""
        if len(self.data)>0:
            print('开始执行')
            for val in self.data:
                try:
                    print(val['Wordone'])
                    print(val['url'])
                    print(val['id'])
                    # search URL for this keyword; the task id is appended as a
                    # query parameter purely for traceability
                    new_url = self.so_pc.format(_q=val['Wordone'])+'&id='+str(val['id'])
                    print(new_url)
                except ZeroDivisionError as e:
                    # NOTE(review): ZeroDivisionError cannot occur in the lines
                    # above — a missing dict key would raise KeyError and
                    # propagate uncaught, leaving new_url unbound for the
                    # yield below.
                    print(e)
                self.page_num[val['id']] =1
                yield scrapy.Request(url=new_url, meta={'retry_times':1,'proxy_url':self.proxy_url,'url': val['url'], 'id': val['id'],'Wordone':val['Wordone'],'page':val['page']}, callback=self.parse,errback = self.errback_httpbin)

    def parse(self, response):
        """Drive headless Firefox to search Baidu, find the target domain in
        the results, click it, and yield a ranking ``ProItem``.

        The Scrapy response body itself is ignored; only ``response.meta`` is
        used. Returns False (ending the generator) on any browser failure.
        """
        # url = response.meta['url']
        # if url.startswith('www.'):
        # url =url.lstrip('www.')
        #new_url = response.meta['new_url']
        # NOTE(review): shadows the builtin ``id``; value is unused below
        # (response.meta['id'] is re-read when building items).
        id = response.meta['id']
        print(response.meta)
        # Browser configuration: headless Firefox with a manual proxy picked
        # at random from the proxy-pool service.
        firefox_options = webdriver.FirefoxOptions()
        firefox_options.add_argument('--headless')
        result = requests.get(response.meta['proxy_url']).json().get('data').get('proxy_list')
        proxy = random.choice(result)
        print(proxy)
        firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX
        firefox_capabilities['marionette'] = True
        firefox_capabilities['proxy'] = {'proxyType': 'MANUAL', 'httpProxy': proxy,
                                         'sslProxy': proxy}
        try:
            driver = webdriver.Firefox(capabilities=firefox_capabilities,options=firefox_options)
        except:
            # browser failed to start
            print('浏览器启动失败')
            return False
        try:
            # implicit wait for element lookups
            driver.implicitly_wait(5)
            driver.get('https://baidu.com/')
        except:
            # failed to open Baidu; close the browser and bail out
            print('访问百度失败')
            driver.quit()
            print('关闭浏览器')
            return False
        # Locate the Baidu search button to confirm the page really loaded.
        try:
            btn1 = driver.find_element_by_id('su')
        except:
            print('未打开百度页面')
            driver.quit()
            print('关闭浏览器')
            return False
        # btn1.screenshot(r'/Users/guanfangbaidu.png')
        time.sleep(self.rand_time)
        try:
            input1 = driver.find_element_by_id('kw')
        except:
            print('未接受到输入信息')
            driver.quit()
            print('关闭浏览器')
            return False
        # Type the keyword one character at a time with a 1 s pause to mimic
        # human typing. NOTE(review): the manual s counter duplicates i.
        lens = len(response.meta['Wordone'])
        s = 0;
        for i in range(0,lens):
            key = response.meta['Wordone'][s:s+1]
            input1.send_keys(key)
            s +=1
            time.sleep(1)
        btn = driver.find_element_by_id('su')
        btn.click()
        # Target domain to look for in the result list.
        key_seo = response.meta['url']
        # secondary keyword (unused)
        # key_seo_two = '注册'
        # NOTE(review): this ``result`` is never read again — it clobbers the
        # proxy list fetched above, which is no longer needed.
        result = 0
        # page range "start|end", e.g. "1|5"
        pages = response.meta['page'].split('|')
        page = int(pages[1]) - int(pages[0])
        # current results-page URL
        url = driver.current_url
        if int(pages[0]) > 1:
            try:
                # jump straight to the start page via Baidu's pn= offset
                url_new = url + '&pn=' + str(int(pages[0]) * 10 - 10)
                driver.get(url_new)
            except NameError:
                print('链接不存在')
        # print(page)
        rank =0
        pms = int(pages[0]);
        status = 0
        for i in range(page+1):
            # current page number
            print(str(pms)+"当前页数")
            pms = pms + 1
            try:
                item = driver.find_elements_by_xpath('//*[@id="content_left"]/div[@class="result c-container new-pmd"]')
            except:
                print('未找到搜索结果')
                break
            for val in item:
                driver.implicitly_wait(3)
                try:
                    # footer link of the result; its visible text holds the
                    # result's display URL/domain
                    click_title = val.find_element_by_xpath('.//div[@class="f13 c-gap-top-xsmall se_st_footer user-avatar"]/a')
                    pattern = re.compile(r'(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
                    url = ''
                    if pattern.match(click_title.text):
                        # parse the displayed text as a URL to extract its host
                        host = urlparse("https://"+click_title.text)
                        url = host.netloc
                except:
                    print('未找到a链接')
                    break
                if key_seo == url:
                    print("匹配到" + click_title.text)
                    rank = val.get_attribute('id')
                    print('当前排名' + rank)
                    # NOTE(review): rebinds ``item`` (the result-element list)
                    # to the Scrapy item — harmless only because the inner
                    # loop breaks immediately after.
                    item = ProItem()
                    # status flag: 1 = found and clicked
                    item['rw'] = self.rw
                    item['zt'] = 1
                    # rank page number
                    item['pm'] = pms
                    # task id
                    item['id'] = response.meta['id']
                    # server number
                    item['bh'] = self.server
                    status = 1
                    click_title.send_keys(Keys.ENTER)
                    # append a click log entry
                    # NOTE(review): the file handle is never closed or flushed.
                    ip = proxy
                    log = open('click.log','a+')
                    log.write('\n点击成功:%s'%key_seo+',ip:%s'%ip)
                    yield item
                    break
            if status ==1:
                break
            driver.implicitly_wait(5)
            try:
                # scroll to the bottom and click "next page"
                driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
                time.sleep(self.rand_time)
                driver.find_element_by_partial_link_text("下一页").click()
            except:
                print('未找到下一页')
                # no next page: close the browser and stop paging
                try:
                    time.sleep(self.rand_time)
                    driver.quit()
                    print('关闭浏览器')
                except NameError:
                    print('无法关闭浏览器')
                break
        # Emit the result item.
        if rank == 0:
            # target never found: report rank 0 / status 0
            item = ProItem()
            item['rw'] = self.rw
            item['zt'] = 0
            item['pm'] = 0
            item['id'] = response.meta['id']
            item['bh'] = self.server
            yield item
        else:
            # target was clicked: switch to the newest window and click one
            # random inner-page link to simulate browsing
            windows = driver.window_handles
            driver.switch_to.window(windows[-1])
            driver.implicitly_wait(5)
            time.sleep(self.rand_time)
            keys = ['关于我们','产品中心','新闻资讯','联系我们']
            try:
                key_click = random.choice(keys)
                a_link = driver.find_element_by_link_text(key_click)
                a_link.click()
                print("%s点击成功"%key_click)
            except:
                print('%s未找到内页'%key_click)
            # (removed a large commented-out experiment that clicked a random
            # <a> element from driver.find_elements_by_tag_name('a'))
        # Final cleanup. NOTE(review): placement is ambiguous in the original
        # flat source — if this sat inside the else-branch instead, the
        # rank==0 path would leak the Firefox process, which would explain the
        # steady memory growth described at the bottom of the file. Placed at
        # method level here so the browser is always closed.
        try:
            time.sleep(self.rand_time)
            driver.quit()
            print('关闭浏览器1')
        except BaseException:
            print('无法关闭浏览器')

    def errback_httpbin(self,failure):
        """Log request failures, distinguishing HTTP, DNS and timeout errors."""
        # log all failures
        self.logger.info(repr(failure))
        # in case you want to do something special for some errors,
        # you may need the failure's type:
        if failure.check(HttpError):
            # these exceptions come from HttpError spider middleware
            # you can get the non-200 response
            response = failure.value.response
            self.logger.info('HttpError错误 on %s', response.url)
        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            self.logger.info('DNSLookupError错误 on %s', request.url)
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            self.logger.info('TimeoutError错误 on %s', request.url)
火狐浏览器明明每次都 quit 了,但内存仍会不断增高,一直升到 90% 以上之后浏览器就无法再打开,进而陷入无限循环。目前的状况是每隔一天就要把挂掉的虚拟机全部重启,再重新开启进程,维护成本太高。本来想用
QtWebEngineWidgets 做内置浏览器,但它无法用 selenium 驱动,会很麻烦;如果改成鼠标键盘模拟,python 的性能会比较差,而且稳定性也不得而知。
还有一种方案:用 requests 模拟请求,效率高、稳定性强,缺点是容易被封。有没有更好的方案?