爬数需谨慎,违法就芭比Q了。
准备工作需要用到selenium框架,同时也得有chromedriver(因为我用的是chrome浏览器,根据自己的浏览器版本下载对应的浏览器驱动,这里不做详解)。
# Launch Chrome via Selenium (requires a chromedriver matching the installed browser).
browser = webdriver.Chrome()
# Open the CNKI default search page (used here only as a demo target).
browser.get('https://kns.cnki.net/kns8/defaultresult/index')
time.sleep(0.3)

def _parse_result_count(text):
    """Extract the integer hit count from text like '共找到1,234条结果'.

    Keeps only the digit characters, so it tolerates thousands separators
    and minor wording/whitespace changes around the number; returns 0 when
    no digits are present.
    """
    digits = ''.join(ch for ch in text if ch.isdigit())
    return int(digits) if digits else 0

# Locate the search box by its ID and type in the query keyword.
browser.find_element(By.ID, 'txt_search').send_keys('图书馆管理系统')
# Find the search button by class name and trigger its click event.
browser.find_element(By.CLASS_NAME, 'search-btn').click()
# Wait for the result page to render, otherwise the lookups below may not find their elements.
time.sleep(1)
# Read the "N results found" label and parse the total number of hits.
num = _parse_result_count(browser.find_element(By.CLASS_NAME, 'pagerTitleCell').text)
# Switch the result list to detail view.
browser.find_element(By.CLASS_NAME, 'icon-detail').click()
time.sleep(1)
# Accumulators for every scraped field: serial number, author, author's
# affiliation, title, journal, publish date, citation count, download count,
# abstract and keywords. Post-processing of these lists is up to the reader.
(xuhao, author, authorcompany, title, qikan,
 shijian, yinyongcount, downloadcount, zhaiyao, guanjianci) = ([] for _ in range(10))
# Page counter: drives the outer loop and restarts the row scan after each page switch.
n = 0
# CNKI shows 20 hits per result page; walk every page of the result list.
def _cell(xpath, convert=None, default='', label=''):
    """Return the text of the element at *xpath*, optionally converted.

    Some cells are missing for some records, so any lookup or conversion
    failure prints '<label>获取失败' and yields *default* instead of raising.
    """
    try:
        value = browser.find_element(By.XPATH, xpath).text
        return convert(value) if convert else value
    except Exception:
        print(label + '获取失败')
        return default

# ceil(num / 20) pages in total. NOTE: the original `while n <= num/20`
# ran one page too many whenever num was an exact multiple of 20.
total_pages = (num + 19) // 20
while n < total_pages:
    # Rows of one page differ only in the dd[i] index, i = 1..20.
    for i in range(1, 21):
        # XPath prefix shared by every cell of row i (obtained via the
        # DevTools "Copy XPath" trick described after the code).
        base = '//*[@id="gridTable"]/dl/dd[' + str(i) + ']'
        ddXuhao = _cell(base + '/div[1]/i', int, 0, '序号')
        ddTitle = _cell(base + '/div[2]/h6/a', None, '', '标题')
        ddQikan = _cell(base + '/div[2]/p[1]/span[1]/a', None, '', '期刊')
        ddDate = _cell(base + '/div[2]/p[1]/span[2]', None, '0000-00-00 00:00:00', '发表时间')
        ddYinyongcount = _cell(base + '/div[2]/p[1]/span[4]/a', int, 0, '引用次数')
        ddDownloadcount = _cell(base + '/div[2]/p[1]/span[3]/a', int, 0, '下载次数')
        ddZhaiyao = _cell(base + '/div[2]/p[2]', None, '', '摘要')
        ddGuanjianci = _cell(base + '/div[2]/p[3]', None, '', '关键词')
        ddAuthor = _cell(base + '/div[2]/div/p/a', None, '', '作者')
        ddAuthorCompany = _cell(base + '/div[2]/div/p/span/a', None, '', '作者单位')
        print(ddXuhao)
        print(ddTitle)
        print(ddAuthor)
        print(ddAuthorCompany)
        print(ddQikan)
        print(ddDate)
        print(ddYinyongcount)
        print(ddDownloadcount)
        print(ddZhaiyao)
        print(ddGuanjianci)
        # Stash every field in its accumulator; downstream processing is
        # left to the reader's own needs.
        xuhao.append(ddXuhao)
        title.append(ddTitle)
        author.append(ddAuthor)
        authorcompany.append(ddAuthorCompany)
        qikan.append(ddQikan)
        shijian.append(ddDate)
        yinyongcount.append(ddYinyongcount)
        downloadcount.append(ddDownloadcount)
        zhaiyao.append(ddZhaiyao)
        guanjianci.append(ddGuanjianci)
    # Advance via the "next page" button — but not past the last page
    # (the original clicked unconditionally, which fails on the final page).
    if n + 1 < total_pages:
        browser.find_element(By.ID, 'PageNext').click()
        time.sleep(1)
    n = n + 1
获取XPATH路径:在开发者工具里,右键需要的标签,选择 Copy -> Copy XPath。
如果期间发现页面中间有iframe标签,通过id、classname或者xpath找不到对应的标签了,还得注意切iframe
# Locate the result iframe by ID and move the driver's context into it,
# so that elements inside the frame become reachable by ID/class/XPath.
browser.switch_to.frame(browser.find_element(By.ID, "iframeResult"))
还可能用到新打开页面,切换到打开页面,关闭页面的功能
# Click the row's link, which opens the detail page in a new browser tab.
link_xpath = '//*[@id="ctl00"]/table/tbody/tr[2]/td/table/tbody/tr['+str(i)+']/td[2]/a'
browser.find_element(By.XPATH, link_xpath).click()
time.sleep(3)
handles = browser.window_handles  # handles of all currently open tabs
browser.switch_to.window(handles[1])  # focus the newly opened tab
# Close the tab that currently has focus (the one just opened).
browser.close()
哇咔咔,搞定了,有其他问题可以留言讨论。