Yuanjian (远见) 🔍: stand higher, see farther! This program is a CNKI paper crawler built on Yuanjian Search; its purpose and functionality are introduced below.
1. Yuanjian Search
Take "twin support vector machine" (双支持向量机) as an example: searching for this keyword on Yuanjian returns a results page whose source you can inspect directly. The page markup is simple, which makes it easy to crawl.
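Before writing the full crawler, it is worth confirming that a single results page can be fetched and parsed. The sketch below requests one page from the ListResult endpoint and prints the title of each hit; the endpoint and parameter names are simply the ones used by the full script in the next section, and the live site may have changed since this was written.

import requests
from bs4 import BeautifulSoup

# Minimal sketch: fetch page 1 of the results for '双支持向量机' and print titles.
# Endpoint and parameter names are taken from the crawler below; treat them as
# assumptions, since the site may have changed.
res = requests.get(
    'http://yuanjian.cnki.net/Search/ListResult',
    params={'searchType': 'MulityTermsSearch', 'Keyword': '双支持向量机',
            'Type': '1', 'Order': '1', 'Page': '1'},
    timeout=30,
)
soup = BeautifulSoup(res.text, 'html.parser')
for item in soup.find_all('div', class_='list-item'):
    print(item.find('a')['title'])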
2. Program Code
The Yuanjian-based CNKI crawler code is as follows:
import re
import time

import openpyxl
import requests
from bs4 import BeautifulSoup


def CNKI(filename, title_input, keyword_input, author_input, unit_input, content_input):
    url = 'http://yuanjian.cnki.net/Search/ListResult'
    # Send a browser User-Agent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/77.0.3865.90 Safari/537.36'
    }
    html = ''
    results = []
    time_start = time.time()

    # One workbook collects every record; the first row is the header.
    title_link_result = openpyxl.Workbook()
    sheet = title_link_result.active
    sheet.title = 'result'
    col_name = ['title', 'keywords', 'authors', 'journal', 'journal_type',
                'year', 'issue', 'download_num', 'cited_num', 'link']
    sheet.append(col_name)

    # Crawl result pages 274..279; change the range to cover other pages.
    for page in range(274, 280):
        try:
            params = {
                'searchType': 'MulityTermsSearch',
                'ArticleType': '0',
                'ParamIsNullOrEmpty': 'true',
                'Islegal': 'false',
                'Content': content_input,
                'Title': title_input,
                'Author': author_input,
                'Unit': unit_input,
                'Keyword': keyword_input,
                'Type': '1',
                'Order': '1',
                'Page': str(page)
            }
            # A timeout is set so the TimeoutError handler below can actually fire.
            res = requests.get(url, params=params, headers=headers, timeout=30)
            soup = BeautifulSoup(res.text, 'html.parser')
            items = soup.find_all('div', class_='list-item')
            for item in items:
                title = item.find('a')['title']
                try:
                    keywords = item.find('div', class_='info').find(
                        'p', class_='info_left left').find_all('a')[0]['data-key']
                    # The source line reads like "作者1 作者2 期刊名 2019年05期 期刊".
                    article_info = item.find('p', class_='source').text.replace('\n', ' ')
                    article_info = article_info.replace('...', '等')
                    fields = [field for field in article_info.split(' ') if field]
                    article_type = fields[-1]
                    year_issue = fields[-2].replace('年', ' ').replace('期', '').split(' ')
                    article_year = year_issue[0]
                    article_issue = year_issue[1] if len(year_issue) == 2 else ''
                    article_journal = fields[-3]
                    article_authors = ', '.join(fields[:-3])
                    download_num = re.findall(r'\d{1,10}', item.find(
                        'div', class_='info').find('span', class_='time1').text)[0]
                    cited_num = re.findall(r'\d{1,10}', item.find(
                        'div', class_='info').find('span', class_='time2').text)[0]
                    CNKI_link = item.find('a')['href']
                except (IndexError, AttributeError):
                    # Any item missing an expected field is recorded as 'None'.
                    keywords = article_journal = article_authors = 'None'
                    article_year = article_issue = article_type = 'None'
                    download_num = cited_num = CNKI_link = 'None'
                # Skip titles that already appeared on an earlier page.
                if html.find(title) == -1:
                    html = html + res.text
                    log = [title, keywords, article_authors, article_journal,
                           article_type, article_year, article_issue,
                           download_num, cited_num, CNKI_link]
                    if log not in results:
                        results.append(log)
                        sheet.append(log)
                        print(log)
            time_end = time.time()
            print('Crawled', len(results), 'records in', str(time_end - time_start), 'seconds')
            time.sleep(0)  # raise this delay to throttle requests
        except (TimeoutError, requests.exceptions.RequestException):
            # Save a checkpoint and move on to the next page.
            title_link_result.save(filename + '_break.xlsx')
            continue
    title_link_result.save(filename + '_all.xlsx')
    with open(filename + '.html', 'w', encoding='utf-8') as file:
        file.write(html)
    return html
if __name__ == '__main__':
    filename = 'result'
    title = ''
    keyword = '双支持向量机'
    content = ''
    unit = ''
    author = ''
    results = CNKI(filename, title, keyword, author, unit, content)
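A run with the settings above writes result_all.xlsx (plus result.html with the raw pages, and result_break.xlsx if a request fails). As a quick sanity check, the workbook can be read back with openpyxl; a small sketch, assuming the default filename above:

import openpyxl

# Read back the crawl output; 'result_all.xlsx' and the 'result' sheet are the
# names produced by the example run above.
wb = openpyxl.load_workbook('result_all.xlsx')
sheet = wb['result']
for row in sheet.iter_rows(min_row=2, values_only=True):
    title, authors, link = row[0], row[2], row[9]
    print(title, '|', authors, '|', link)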