利用 Python + Selenium + requests 爬取 Cortellis 上的所有 deals 记录

爬取cortellis上的所有deals记录

需求:
登录 Cortellis 账号,按照日期从 1980-1-1 到 2019-1-1 爬取所有的 deals 文档,以及每个 deal 对应的 principal company 与 partner company 的文档,保存至本地。

思路:利用 Selenium 模拟浏览器实现登录后,用 Select 创建下拉菜单对象,然后按照输入的日期进行筛选。修改日期利用了 Selenium 执行 js 的函数 execute_script。另外注意,为了能修改日历的 from-to 日期,必须先通过 js 删除输入框的 'readonly' 属性。
然后就是每个deals的下载,用requests请求下载,form是固定格式的,另外requests需要有效的cookie,由get_current_cookie函数返回。

下面是代码:

from selenium import webdriver
import sys,io
from selenium.webdriver.common.by import By
import time
import xlrd
import json
import re,os
import requests
from selenium.webdriver.support.ui import Select

# from openpyxl import Workbook as wb

# Re-wrap stdout so printed text is encoded as gb18030, to keep the command line from failing on characters it cannot recognise
sys.stdout = io.TextIOWrapper(buffer=sys.stdout.buffer, encoding='gb18030')

# Request headers used for every report-export POST.
# The 'Cookie' entry starts empty and is overwritten with the live browser
# cookies before each download.
headers = {
	# NOTE(review): the next four entries look like HTTP/2 pseudo-header values
	# copied from the browser dev tools; here they are sent as ordinary headers.
	'authority': 'www.cortellis.com',
	'method': 'POST',
	'path': '/intelligence/exportReport.do',
	'scheme': 'https',
	'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
	# 'br' is deliberately not advertised: requests/urllib3 only decode Brotli
	# when an optional brotli package is installed, and an undecoded body would
	# be written to disk as a broken PDF.
	'accept-encoding': 'gzip, deflate',
	'accept-language': 'zh-CN,zh;q=0.9',
	'cache-control': 'max-age=0',
	# NOTE(review): fixed value presumably captured from a single request;
	# requests is expected to recompute Content-Length from the encoded form
	# body - confirm before relying on it.
	'content-length': '358',
	'content-type': 'application/x-www-form-urlencoded',
	'Cookie': '',
	'origin': 'https://www.cortellis.com',
	'upgrade-insecure-requests': '1',
	'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
}

# Build the Cookie header value from the browser's current cookies
def get_current_cookie(driver=None):
	"""Return the session cookies as one ``Cookie`` header string.

	Args:
		driver: Object exposing ``get_cookies()`` that returns a list of
			dicts with ``'name'`` and ``'value'`` keys. When omitted, the
			module-level ``browser`` is used, as before.

	Returns:
		The cookies rendered as ``name=value`` pairs joined with ``"; "``;
		an empty string when there are no cookies.
	"""
	if driver is None:
		driver = browser
	# Every field must be separated by "; ", otherwise the header is not valid
	return '; '.join(
		item['name'] + '=' + item['value'] for item in driver.get_cookies()
	)

# Start the Chrome session that carries the whole crawl and open the login page.
# Elements are located with find_element(By...) because the
# find_element_by_* helpers were removed in Selenium 4.3.
browser = webdriver.Chrome()
browser.get("https://www.cortellis.com/intelligence/login.do")
# Switch into the 'loginIframe' frame before looking up the form inputs
browser.switch_to.frame('loginIframe')

# Locate the login input wrappers: index 0 is used for the user name, index 1 for the password
login_block = browser.find_elements(By.CLASS_NAME, 'wui-material-input')
# User name field (the literal below is the author's placeholder for the account name)
user_name_block = login_block[0].find_element(By.TAG_NAME, 'input')
user_name_block.send_keys('我的账号')
time.sleep(1)
# Password field (the literal below is the author's placeholder for the password)
password_block = login_block[1].find_element(By.TAG_NAME, 'input')
password_block.send_keys('我的密码')
time.sleep(1)

# Locate the login button inside the 'pull-right' container and submit
login_button = browser.find_element(By.CLASS_NAME, 'pull-right')
login_button = login_button.find_element(By.TAG_NAME, 'button')
login_button.click()
# After the click, switch_to.window is required before the new page can be located
browser.switch_to.window(browser.window_handles[0])
# Fixed wait for the post-login page to finish loading
time.sleep(15)

# Locate the search bar by walking down from the home-page header container.
# find_element(By...) is used because the find_element_by_* helpers were
# removed in Selenium 4.3.
search_block = browser.find_element(By.CLASS_NAME, 'homepage-header-container')
search_block = search_block.find_element(By.TAG_NAME, 'header')
search_block = search_block.find_element(By.CLASS_NAME, 'cortellis-header-top')
# Locate and click the advanced-search link underneath the quick-search bar
advanced_block = search_block.find_element(By.CLASS_NAME, 'quick-search-bar-bottom').find_element(By.TAG_NAME, 'a')
advanced_block.click()
browser.switch_to.window(browser.window_handles[0])
time.sleep(5)

# On the advanced-search page, pick the deal tab (third <li> of the tab strip)
deal_block = browser.find_element(By.ID, 'advancedSearch').find_element(By.CLASS_NAME, 'tabs').find_elements(By.TAG_NAME, 'li')[2]
deal_block.click()
time.sleep(3)

# Choose 'date' in the first search-type drop-down.
# find_element(By...) is used because the find_element_by_* helpers were
# removed in Selenium 4.3.
date_select = Select(browser.find_elements(By.CLASS_NAME, "advSrchType")[0])
date_select.select_by_value('date')
time.sleep(0.5)

# Locate the "from" and "to" inputs of the date range (first and second input of 'dateFields')
date_inputs = browser.find_element(By.CLASS_NAME, 'dateFields').find_elements(By.TAG_NAME, 'input')
from_block = date_inputs[0]
to_block = date_inputs[1]

# Strip the readonly attribute so the field can be modified, then set the date
# directly through JavaScript. The element and the value are passed as script
# arguments instead of being formatted into the script text.
for date_input, date_text in ((from_block, "01-Jan-1980"), (to_block, "01-Jan-2019")):
	browser.execute_script(
		"arguments[0].removeAttribute('readonly'); arguments[0].value = arguments[1];",
		date_input,
		date_text,
	)


# Each field has to be clicked once more for the modified value to take effect
from_block.click()
time.sleep(1)
to_block.click()
time.sleep(1)

# Scroll the window so that the search button can be located.
# NOTE(review): scrollTo takes (x, y), so this scrolls horizontally to 1500 - confirm that is intended.
js="window.scrollTo(1500,0)"
browser.execute_script(js)
time.sleep(1)

# Click the search button (third link of the utility bar).
# find_element(By...) is used because the find_element_by_* helpers were
# removed in Selenium 4.3.
search_block = browser.find_element(By.ID, 'advSearchUtilityBar').find_elements(By.TAG_NAME, 'a')[2]
search_block.click()
browser.switch_to.window(browser.window_handles[0])
# Fixed wait for the result page to load
time.sleep(10)


# Now on the deal search-result page.
# Sort the results by deal start date (option value '3' of the sort drop-down).
# find_element(By...) is used because the find_element_by_* helpers were
# removed in Selenium 4.3.
deal_start_date_block = Select(
	browser.find_element(By.CLASS_NAME, 'dataTableActions')
	.find_element(By.CLASS_NAME, 'sortByPnl')
	.find_element(By.ID, 'formfieldSortBy')
)
deal_start_date_block.select_by_value('3')
time.sleep(5)


# Click the title link of the first result to open that deal's detail page
ele = browser.find_element(By.ID, 'resultsTableFixed').find_elements(By.CLASS_NAME, 'ellipse')[0]
open_deal_title = ele.find_element(By.TAG_NAME, 'a')
open_deal_title.click()
browser.switch_to.window(browser.window_handles[0])
time.sleep(5)
# Export endpoint for all reports; it never changes, so it is defined once outside the loop
download_url = 'https://www.cortellis.com/intelligence/exportReport.do'

# 'selectedFields' value shared by the principal and partner company export forms
_COMPANY_SELECTED_FIELDS = 'companySnapShot,companyFinancials[],companyContacts,companyDrugs[companyDrugCountsByIndications&companyDrugCountsByIndicationsChart&companyDrugCountsByPhasesHighest&companyDrugCountsByPhasesHighestChart&],deals,companyTrials[companyTrialCountsByIndications&companyTrialCountsByPhases&],patents,companyChangeHistory'


def _read_company(rows, index):
	"""Return ``(name, id)`` for the company shown in ``rows[index]``.

	The id is the last path segment of the row's link. When the row or its
	link is missing, ``('None', 0)`` is returned.
	"""
	try:
		row = rows[index]
		name = row.find_element(By.CLASS_NAME, 'ng-binding').text
		href = row.find_element(By.TAG_NAME, 'a').get_attribute('href')
		return name, href.split('/')[-1]
	except Exception:
		# Best effort: any lookup failure is treated as "no such company".
		# Exception (not a bare except) so Ctrl-C still stops the crawl.
		return 'None', 0


def _company_form(company_id, company_name):
	"""Build the export form for one company report."""
	return {
		'id': company_id,
		'exportFormat': 'PDF',
		'isCustomized': 'false',
		'exportReportName': company_name + '.pdf',
		'selectedFields': _COMPANY_SELECTED_FIELDS,
		'entityType': 'company',
		'exportSubmit': '提交'
	}


def _download_report(form, target_path):
	"""POST ``form`` to the export endpoint and save the body to ``target_path``.

	A non-success HTTP status is reported and nothing is written, so an error
	page is never stored under a ``.pdf`` name.
	"""
	response = requests.post(download_url, headers=headers, data=form)
	if not response.ok:
		print('download failed (HTTP {}): {}'.format(response.status_code, target_path))
		return
	with open(target_path, 'wb') as out_file:
		out_file.write(response.content)


# Walk through the deals one detail page at a time until there is no "next" link.
# find_element(By...) is used because the find_element_by_* helpers were
# removed in Selenium 4.3.
while True:
	# The deal id is the last path segment of the current URL
	deal_id = browser.current_url.split('/')[-1]

	# Read the deal title from the first row of the definition table
	table_content = browser.find_element(By.CLASS_NAME, 'definitiontable')
	tr = table_content.find_elements(By.TAG_NAME, 'tr')
	deal_title = tr[0].find_element(By.CLASS_NAME, 'ng-binding').text

	# Principal company (second row) and partner company (third row); either may be empty
	principal_company, principal_company_id = _read_company(tr, 1)
	partner_company, partner_company_id = _read_company(tr, 2)

	# Add date is read from the last row, start date from the fifth row from the end
	deal_add_date = tr[-1].find_element(By.CLASS_NAME, 'ng-binding').text

	deal_start_date = tr[-5].find_element(By.CLASS_NAME, 'ng-binding').text
	print(deal_start_date)

	# Folder name: the last two '-'-separated pieces of the add date, last piece first
	# (presumably year then month - confirm against the page's date format)
	date_parts = deal_add_date.split('-')
	deal_add_date = date_parts[-1] + '-' + date_parts[-2]

	# Export form for the deal report itself
	deal_form = {
		'id': deal_id,
		'exportFormat': 'PDF',
		'isCustomized': 'false',
		'exportReportName': deal_title + '.pdf',
		'selectedFields': 'dealSnapShot,drugs,events,financial[dealFinanceSummary&principal&partner&],',
		'entityType': 'nextgendealall',
		'exportSubmit': '提交'
	}

	# Refresh the cookie once per deal; the export requests need the browser's current cookies
	headers['Cookie'] = get_current_cookie()

	# One folder per deal; exist_ok covers a folder that already exists
	new_dir = './result/' + deal_add_date + '/' + str(deal_id) + '-deal'
	os.makedirs(new_dir, exist_ok=True)

	# Download the deal report, then the company reports that actually exist
	_download_report(deal_form, new_dir + '/' + str(deal_id) + '-' + 'deal.pdf')
	if principal_company != 'None':
		_download_report(
			_company_form(principal_company_id, principal_company),
			new_dir + '/' + str(principal_company_id) + '-' + 'principal.pdf',
		)
	if partner_company != 'None':
		_download_report(
			_company_form(partner_company_id, partner_company),
			new_dir + '/' + str(partner_company_id) + '-' + 'partner.pdf',
		)

	# Click the next-deal link (second link of the pointer); stop when it is absent
	pointer_links = browser.find_element(By.CLASS_NAME, 'reportPointer').find_elements(By.TAG_NAME, 'a')
	if len(pointer_links) < 2:
		break
	pointer_links[1].click()
	browser.switch_to.window(browser.window_handles[0])
	time.sleep(5)

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值