爬取cortellis上的所有deals记录
需求:
登录cortellis账号,按照日期从1980-1-1到2019-1-1爬取所有的deals文档以及deals对应的principal company与partner company的文档,保存至本地。
思路:利用selenium模拟浏览器实现登录后,用Select创建下拉菜单对象,然后按照输入的日期进行筛选。修改日期利用了Selenium执行js的函数execute_script;另外注意,为了能修改日历的from-to日期,必须先用js删除日期输入框的'readonly'属性。
然后就是每个deals的下载,用requests请求下载,form是固定格式的,另外requests需要有效的cookie,由get_current_cookie函数返回。
下面是代码:
from selenium import webdriver
import sys,io
from selenium.webdriver.common.by import By
import time
import xlrd
import json
import re,os
import requests
from selenium.webdriver.support.ui import Select
# from openpyxl import Workbook as wb
# Re-wrap standard output as GB18030-encoded text. Per the original author's
# note this is meant to stop the command line from choking on printed text.
_stdout_bytes = sys.stdout.buffer
sys.stdout = io.TextIOWrapper(_stdout_bytes, encoding='gb18030')
# Request headers sent with every report-export POST. The 'Cookie' entry starts
# empty and is overwritten with the live browser cookies before each download.
# NOTE(review): the values look like they were copied from a captured browser
# request (including a fixed 'content-length') -- confirm they are all needed.
headers = {
    "authority": "www.cortellis.com",
    "method": "POST",
    "path": "/intelligence/exportReport.do",
    "scheme": "https",
    "accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3"
    ),
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9",
    "cache-control": "max-age=0",
    "content-length": "358",
    "content-type": "application/x-www-form-urlencoded",
    "Cookie": "",
    "origin": "https://www.cortellis.com",
    "upgrade-insecure-requests": "1",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/75.0.3770.142 Safari/537.36"
    ),
}
# Build the Cookie header value from the cookies of the current browser page.
def get_current_cookie(driver=None):
    """Return the session cookies formatted as a single ``Cookie`` header value.

    Args:
        driver: Any object exposing ``get_cookies()`` that returns a list of
            dicts with ``'name'`` and ``'value'`` keys. When omitted, the
            module-level ``browser`` is used.

    Returns:
        A string of ``name=value`` pairs joined by ``'; '``, or an empty
        string when there are no cookies.
    """
    if driver is None:
        driver = browser
    # Pairs must be separated by "; " -- the original author notes that the
    # cookie string is rejected otherwise.
    return '; '.join(
        item['name'] + '=' + item['value'] for item in driver.get_cookies()
    )
# Start the Chrome session shared by every later step of the script.
# Locators use find_element(By.X, ...) because the find_element_by_* helpers
# were removed in Selenium 4.3; the By form also works on older releases.
browser = webdriver.Chrome()
browser.get("https://www.cortellis.com/intelligence/login.do")
# The login form is looked up inside the 'loginIframe' frame.
browser.switch_to.frame('loginIframe')

# Input wrappers of the login form: index 0 is used for the user name and
# index 1 for the password.
login_block = browser.find_elements(By.CLASS_NAME, 'wui-material-input')

# Type the user name.
user_name_block = login_block[0].find_element(By.TAG_NAME, 'input')
user_name_block.send_keys('我的账号')
time.sleep(1)

# Type the password.
password_block = login_block[1].find_element(By.TAG_NAME, 'input')
password_block.send_keys('我的密码')
time.sleep(1)

# The submit button is the <button> inside the first 'pull-right' element.
login_button = browser.find_element(By.CLASS_NAME, 'pull-right')
login_button = login_button.find_element(By.TAG_NAME, 'button')
login_button.click()

# Per the original author's note, switching to the window after the click is
# required before elements of the new page can be located.
browser.switch_to.window(browser.window_handles[0])
# Fixed pause to give the post-login page time to load.
time.sleep(15)
# Date range applied to the deal search, in the format the date inputs are
# filled with.
SEARCH_FROM_DATE = "01-Jan-1980"
SEARCH_TO_DATE = "01-Jan-2019"

# Script that lifts the read-only restriction from a date input and then sets
# its value. The element and the date are passed as arguments, so the page
# does not have to be re-queried from JavaScript.
SET_DATE_JS = (
    "arguments[0].removeAttribute('readonly');"
    "arguments[0].value = arguments[1];"
)

# Locators use find_element(By.X, ...) because the find_element_by_* helpers
# were removed in Selenium 4.3.
# Walk from the home-page header container down to the top search bar.
search_block = browser.find_element(By.CLASS_NAME, 'homepage-header-container')
search_block = search_block.find_element(By.TAG_NAME, 'header')
search_block = search_block.find_element(By.CLASS_NAME, 'cortellis-header-top')

# Open the advanced search page through the link below the quick-search bar.
advanced_block = search_block.find_element(
    By.CLASS_NAME, 'quick-search-bar-bottom'
).find_element(By.TAG_NAME, 'a')
advanced_block.click()
browser.switch_to.window(browser.window_handles[0])
time.sleep(5)

# On the advanced search page, pick the third tab (the deal tab according to
# the original author).
deal_block = browser.find_element(By.ID, 'advancedSearch').find_element(
    By.CLASS_NAME, 'tabs'
).find_elements(By.TAG_NAME, 'li')[2]
deal_block.click()
time.sleep(3)

# Choose 'date' as the search type in the first criteria drop-down.
date_select = Select(browser.find_elements(By.CLASS_NAME, "advSrchType")[0])
date_select.select_by_value('date')
time.sleep(0.5)

# First input is the "from" date, second input is the "to" date.
date_inputs = browser.find_element(By.CLASS_NAME, 'dateFields').find_elements(
    By.TAG_NAME, 'input'
)
from_block = date_inputs[0]
to_block = date_inputs[1]

# The inputs are read-only on the page, so write the dates through JavaScript.
browser.execute_script(SET_DATE_JS, from_block, SEARCH_FROM_DATE)
browser.execute_script(SET_DATE_JS, to_block, SEARCH_TO_DATE)

# Per the original author's note, each field has to be clicked once more for
# the new value to take effect.
from_block.click()
time.sleep(1)
to_block.click()
time.sleep(1)

# Scroll so that the search button can be located (original author's
# workaround).
browser.execute_script("window.scrollTo(1500,0)")
time.sleep(1)

# The third link in the utility bar triggers the search.
search_block = browser.find_element(By.ID, 'advSearchUtilityBar').find_elements(
    By.TAG_NAME, 'a'
)[2]
search_block.click()
browser.switch_to.window(browser.window_handles[0])
time.sleep(10)
# Now on the deal search-result page.
# Locators use find_element(By.X, ...) because the find_element_by_* helpers
# were removed in Selenium 4.3.
# Sort the results; option value '3' is "deal start date" according to the
# original author's note.
deal_start_date_block = Select(
    browser.find_element(By.CLASS_NAME, 'dataTableActions')
    .find_element(By.CLASS_NAME, 'sortByPnl')
    .find_element(By.ID, 'formfieldSortBy')
)
deal_start_date_block.select_by_value('3')
time.sleep(5)

# Open the detail page of the first deal in the result table. (The original
# also ran an identical lookup whose result was never used; it is dropped.)
ele = browser.find_element(By.ID, 'resultsTableFixed').find_elements(
    By.CLASS_NAME, 'ellipse'
)[0]
open_deal_title = ele.find_element(By.TAG_NAME, 'a')
open_deal_title.click()
browser.switch_to.window(browser.window_handles[0])
time.sleep(5)
# Endpoint that every export form (deal and company) is POSTed to.
download_url = 'https://www.cortellis.com/intelligence/exportReport.do'

# 'selectedFields' value sent for a company export; the principal and the
# partner company use the same value.
_COMPANY_FIELDS = 'companySnapShot,companyFinancials[],companyContacts,companyDrugs[companyDrugCountsByIndications&companyDrugCountsByIndicationsChart&companyDrugCountsByPhasesHighest&companyDrugCountsByPhasesHighestChart&],deals,companyTrials[companyTrialCountsByIndications&companyTrialCountsByPhases&],patents,companyChangeHistory'


def _read_company(rows, index):
    """Return ``(name, company_id)`` read from ``rows[index]`` of the deal table.

    The name is the text of the row's first 'ng-binding' element and the id is
    the last path segment of the row's first link. ``('None', 0)`` is returned
    when the row, the name or the link cannot be read.
    """
    try:
        name = rows[index].find_element(By.CLASS_NAME, 'ng-binding').text
        href = rows[index].find_element(By.TAG_NAME, 'a').get_attribute('href')
        return name, href.split('/')[-1]
    except Exception:
        # Deliberately broad so any lookup failure means "no company", but
        # unlike a bare except it lets KeyboardInterrupt/SystemExit through.
        return 'None', 0


def _company_form(company_id, company_name):
    """Build the export form for one company report."""
    return {
        'id': company_id,
        'exportFormat': 'PDF',
        'isCustomized': 'false',
        'exportReportName': company_name + '.pdf',
        'selectedFields': _COMPANY_FIELDS,
        'entityType': 'company',
        'exportSubmit': '提交'
    }


def _save_report(response, path):
    """Write ``response.content`` to ``path`` unless the request failed."""
    if not response.ok:
        # Do not store an error page under a .pdf name.
        print('download failed ({}): {}'.format(response.status_code, path))
        return
    with open(path, 'wb') as report_file:
        report_file.write(response.content)


# Download the current deal and its companies, then step to the next deal,
# until no "next" link can be found.
while True:
    # The deal id is the last path segment of the detail-page URL.
    deal_id = browser.current_url.split('/')[-1]

    # Rows of the definition table: row 0 holds the deal title, rows 1 and 2
    # are read as the principal and partner companies.
    table_content = browser.find_element(By.CLASS_NAME, 'definitiontable')
    tr = table_content.find_elements(By.TAG_NAME, 'tr')
    deal_title = tr[0].find_element(By.CLASS_NAME, 'ng-binding').text
    principal_company, principal_company_id = _read_company(tr, 1)
    partner_company, partner_company_id = _read_company(tr, 2)

    # The last row is read as the date the deal was added and the fifth row
    # from the end as its start date (per the original author's comments).
    deal_add_date = tr[-1].find_element(By.CLASS_NAME, 'ng-binding').text
    deal_start_date = tr[-5].find_element(By.CLASS_NAME, 'ng-binding').text
    print(deal_start_date)
    # Keep only the last two '-'-separated parts, swapped; presumably this
    # turns 'DD-Mon-YYYY' into 'YYYY-Mon' -- confirm against real page data.
    date_parts = deal_add_date.split('-')
    deal_add_date = date_parts[-1] + '-' + date_parts[-2]

    deal_form = {
        'id': deal_id,
        'exportFormat': 'PDF',
        'isCustomized': 'false',
        'exportReportName': deal_title + '.pdf',
        'selectedFields': 'dealSnapShot,drugs,events,financial[dealFinanceSummary&principal&partner&],',
        'entityType': 'nextgendealall',
        'exportSubmit': '提交'
    }

    # Refresh the Cookie header from the live browser session for this deal.
    headers['Cookie'] = get_current_cookie()

    # One folder per deal, grouped by the add-date key computed above.
    new_dir = './result/' + deal_add_date + '/' + str(deal_id) + '-deal'
    os.makedirs(new_dir, exist_ok=True)

    deal_film = requests.post(download_url, headers=headers, data=deal_form)
    _save_report(deal_film, new_dir + '/' + str(deal_id) + '-' + 'deal.pdf')

    # Company reports are requested only when the company is present on the
    # page; the original requested them regardless and discarded the result.
    if principal_company != 'None':
        principle_film = requests.post(
            download_url,
            headers=headers,
            data=_company_form(principal_company_id, principal_company),
        )
        _save_report(
            principle_film,
            new_dir + '/' + str(principal_company_id) + '-' + 'principal.pdf',
        )
    if partner_company != 'None':
        partner_film = requests.post(
            download_url,
            headers=headers,
            data=_company_form(partner_company_id, partner_company),
        )
        _save_report(
            partner_film,
            new_dir + '/' + str(partner_company_id) + '-' + 'partner.pdf',
        )

    # Click the second link of the first 'reportPointer' element to move to
    # the next deal. Stop when that link is missing instead of raising.
    # NOTE(review): assumes the link is absent on the last deal -- confirm.
    pointers = browser.find_elements(By.CLASS_NAME, 'reportPointer')
    nav_links = pointers[0].find_elements(By.TAG_NAME, 'a') if pointers else []
    if len(nav_links) < 2:
        break
    nav_links[1].click()
    browser.switch_to.window(browser.window_handles[0])
    time.sleep(5)