写这篇是想请教会实现模拟登录的大佬:我已经实现了第一页的爬取,但翻到第二页时网站要求登录,这一步我不会处理。有想法的朋友也欢迎一起探讨;如果能写成"先登录、再爬取"的流程也可以。
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import openpyxl as op
import csv
from urllib.parse import urljoin
# Output workbook. Note the naming used throughout this script:
# `ws` is the openpyxl Workbook, `wb` is the worksheet inserted at index 0.
ws = op.Workbook()
wb = ws.create_sheet(index=0)

# Header captions for row 1, in column order (columns 1..25).
# NOTE(review): the scrape loop stores the shareholder name in column 22,
# while the caption here reads '主管部门名称' -- confirm which one is intended.
header_titles = (
    '企业名称',
    '统一社会信用代码',
    '法定代表人',
    '经营状态',
    '成立日期',
    '行政区划',
    '注册资本',
    '实缴资本',
    '企业类型',
    '所属行业',
    '工商注册号',
    '组织机构代码',
    '纳税人识别号',
    '纳税人资质',
    '营业期限',
    '核准日期',
    '登记机关',
    '参保人数',
    '曾用名',
    '注册地址',
    '经营范围',
    '主管部门名称',
    '持股比例',
    '认缴出资额',
    '认缴出资日期',
)
for column_index, header in enumerate(header_titles, start=1):
    wb.cell(row=1, column=column_index, value=header)
# Imported here so this section stays self-contained within the script.
from urllib.parse import quote

# Make driver.get() return immediately instead of waiting for the full page
# load; the explicit WebDriverWait calls below do the waiting.
# A private copy is used so the shared DesiredCapabilities.CHROME dict is not
# mutated, and the strategy is set on the options object that is actually
# handed to the driver rather than relying on that global side effect.
desired_capabilities = DesiredCapabilities.CHROME.copy()
desired_capabilities["pageLoadStrategy"] = "none"

# Chrome options for this session.
options = webdriver.ChromeOptions()
options.set_capability("pageLoadStrategy", desired_capabilities["pageLoadStrategy"])
# Skip image loading to speed things up (currently disabled):
#options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
# Run without a visible browser window (currently disabled):
#options.add_argument('--headless')

# Start the Chrome driver.
driver = webdriver.Chrome(options=options)

# Search keyword.
theme = "施工图审查"
# Number of company records to collect.
papers_need = 20

# Open the search page. The query string is derived from `theme` so that
# changing the keyword above also changes the URL.
#driver.get("https://aiqicha.baidu.com/s?q=%E6%96%BD%E5%B7%A5%E5%9B%BE%E5%AE%A1%E6%9F%A5&t=0")
driver.get("https://aiqicha.baidu.com/s?q=" + quote(theme))

# Type the keyword into the header search box. The box is cleared first in
# case the site pre-fills it from the URL query (otherwise the keyword would
# be appended to the existing text).
search_box = WebDriverWait(driver, 100).until(
    EC.presence_of_element_located((By.XPATH, '''//*[@id="aqc-header-search-input"]'''))
)
search_box.clear()
search_box.send_keys(theme)

# Click the search button.
WebDriverWait(driver, 100).until(
    EC.presence_of_element_located((By.XPATH, "/html/body/div[1]/div[1]/header/div/div[2]/button"))
).click()
time.sleep(3)
# 点击切换中文文献
# 获取总文献数和页数
# 赋值序号, 控制爬取的文章数量
# Imported here so this section stays self-contained within the script.
from selenium.common.exceptions import TimeoutException

# openpyxl writes the xlsx format, so the file gets a matching extension.
OUTPUT_FILE = '施工图企业.xlsx'
# Placeholder stored when an optional field is absent on a detail page.
MISSING = '无'
# Seconds to wait for a field that must exist / a field that may be absent.
REQUIRED_TIMEOUT = 10
OPTIONAL_TIMEOUT = 3
# Seconds to wait for the result list of a search page.
LIST_TIMEOUT = 10

# CSS selectors on the search result page.
RESULT_LINKS = "body > div.base.page-search.has-search-tab > div.aqc-content-wrapper.has-footer > div > div.main > div.list-wrap > div.company-list > div > div:nth-child(-n+10) > div.info > div > h3 > a"
NEXT_PAGE = "body > div.base.page-search.has-search-tab > div.aqc-content-wrapper.has-footer > div > div.main > div.list-wrap > div.company-list > ul > li.ivu-page-next > a > i"

# Absolute XPath prefixes of the two tables read on a company detail page.
INFO_TABLE = "/html/body/div[1]/div[2]/div/div[7]/div[1]/div[3]/table/tbody"
HOLDER_ROW = "/html/body/div[1]/div[2]/div/div[7]/div[1]/div[4]/div[1]/table/tbody/tr"

# (console label, xpath, required) in worksheet column order 1..25.
# Fields flagged as not required are the ones the original script intended
# to fall back to the placeholder for.
DETAIL_FIELDS = [
    ("企业名称", INFO_TABLE + "/tr[1]/td[2]/span", True),
    ("统一社会信用代码", INFO_TABLE + "/tr[1]/td[4]", True),
    ("法定代表人", INFO_TABLE + "/tr[2]/td[2]/div[2]/a[1]", True),
    ("经营状态", INFO_TABLE + "/tr[2]/td[4]", True),
    ("成立日期", INFO_TABLE + "/tr[3]/td[2]", True),
    ("行政区划", INFO_TABLE + "/tr[3]/td[4]", True),
    ("注册资本", INFO_TABLE + "/tr[4]/td[2]", False),
    ("实缴资本", INFO_TABLE + "/tr[4]/td[4]", False),
    ("企业类型", INFO_TABLE + "/tr[5]/td[2]", True),
    ("所属行业", INFO_TABLE + "/tr[5]/td[4]", True),
    ("工商注册号", INFO_TABLE + "/tr[6]/td[2]/span", True),
    ("组织机构代码", INFO_TABLE + "/tr[6]/td[4]/span", True),
    ("纳税人识别号", INFO_TABLE + "/tr[7]/td[2]/span", False),
    ("纳税人资质", INFO_TABLE + "/tr[7]/td[4]", False),
    ("营业期限", INFO_TABLE + "/tr[8]/td[2]", True),
    ("核准日期", INFO_TABLE + "/tr[8]/td[4]", False),
    ("登记机关", INFO_TABLE + "/tr[9]/td[2]", False),
    ("参保人数", INFO_TABLE + "/tr[9]/td[4]", True),
    ("曾用名", INFO_TABLE + "/tr[10]/td[2]/p", False),
    ("注册地址", INFO_TABLE + "/tr[11]/td[2]/span[1]", True),
    ("经营范围", INFO_TABLE + "/tr[12]/td[2]/div", True),
    ("股东", HOLDER_ROW + "/td[2]/div/div/div[2]/div[1]/div/div[1]/a", False),
    ("持股比列", HOLDER_ROW + "/td[3]/div/div/span", False),
    ("认缴出资额", HOLDER_ROW + "/td[4]/div/span", False),
    ("认缴出资日期", HOLDER_ROW + "/td[5]/div/span", False),
]


def _read_text(browser, xpath, required):
    """Return the text of the element at *xpath* on the current page.

    A required field that does not show up within REQUIRED_TIMEOUT raises
    TimeoutException; an optional one yields MISSING after OPTIONAL_TIMEOUT.
    """
    timeout = REQUIRED_TIMEOUT if required else OPTIONAL_TIMEOUT
    try:
        element = WebDriverWait(browser, timeout).until(
            EC.presence_of_element_located((By.XPATH, xpath))
        )
    except TimeoutException:
        if required:
            raise
        return MISSING
    return element.text


def _scrape_detail(browser):
    """Read every DETAIL_FIELDS entry from the open detail page, in column order."""
    return [_read_text(browser, xpath, required) for _, xpath, required in DETAIL_FIELDS]


# 1-based number of the record being collected next; only successes advance it.
count = 1
# Handle of the search result tab, so detail tabs can be closed reliably.
main_window = driver.current_window_handle
page = 1
try:
    # Walk through result pages until enough records have been collected.
    while count <= papers_need:
        # Give the result list time to render.
        time.sleep(3)
        try:
            title_list = WebDriverWait(driver, LIST_TIMEOUT).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, RESULT_LINKS))
            )
        except TimeoutException:
            # No result list on this page (the site may demand a login for
            # further pages); stop and keep what was collected so far.
            print(f"第{page}页未找到企业列表(可能需要登录),停止爬取")
            break

        # Visit each company listed on the current page.
        for link in title_list:
            try:
                # Clicking an entry opens the detail page in a new tab.
                link.click()
                driver.switch_to.window(driver.window_handles[-1])
                values = _scrape_detail(driver)
                print("".join(
                    label + ":" + str(value)
                    for (label, _, _), value in zip(DETAIL_FIELDS, values)
                ))
                # Row 1 holds the header, so record N goes to row N + 1.
                for column, value in enumerate(values, start=1):
                    wb.cell(row=count + 1, column=column, value=value)
            except Exception as exc:
                # Report the failure and move on to the next entry.
                print(count)
                print("失败")
                print(repr(exc))
                continue
            finally:
                # Close every tab except the result list and return to it.
                for handle in driver.window_handles:
                    if handle != main_window:
                        driver.switch_to.window(handle)
                        driver.close()
                driver.switch_to.window(main_window)
            count += 1
            if count > papers_need:
                break

        # Checkpoint after every page.
        ws.save(OUTPUT_FILE)
        if count > papers_need:
            break

        # Go to the next result page.
        # NOTE: a manual-login pause could be added here if the site asks for
        # credentials before showing further pages; that is not automated.
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, NEXT_PAGE))
            ).click()
        except TimeoutException:
            print(f"第{page}页之后没有找到下一页按钮,停止爬取")
            break
        page += 1
finally:
    # Always persist collected rows and shut the browser session down, even
    # when the run is interrupted or an unexpected error escapes.
    try:
        ws.save(OUTPUT_FILE)
    finally:
        driver.quit()
第一页爬取成功后,翻到第二页时就会出现这个问题。
急需大佬帮忙解决,希望能在我现有代码的基础上补充登录这一部分。