# -*- coding: utf-8 -*-
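# CNKI search crawler, main driver.
# Reads the search settings from Config.conf, walks the result pages of
# http://search.cnki.com.cn, collects article URLs via spider_search_page,
# then calls spider_paper to scrape the details of every article.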
from configparser import ConfigParser
from urllib.parse import quote
import socket
import os
import math
import urllib.request
import urllib.error
from bs4 import BeautifulSoup
import time
import spider_search_page
import spider_paper
if __name__ == '__main__':
print("123")
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
cf = ConfigParser()
cf.read("Config.conf", encoding='utf-8')
    keyword = cf.get('base', 'keyword')  # search keyword
    maxpage = cf.getint('base', 'maxpage')  # page count (recomputed below)
    searchlocation = cf.get('base', 'searchlocation')  # field to search in
    currentpage = cf.getint('base', 'currentpage')  # page to resume from
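    # A minimal Config.conf in the layout this script expects (the values are
    # illustrative, not from the original project):
    #   [base]
    #   keyword = 深度学习
    #   searchlocation = 主题
    #   maxpage = 0
    #   currentpage = 0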
    # A fresh run starts from page 0; drop any stale output first.
    if os.path.exists('data-detail.txt') and currentpage == 0:
        print('Output file already exists; deleting it')
        os.remove('data-detail.txt')
    # Map the human-readable search field to CNKI's query prefix.
    values = {
        '全文': 'qw',        # full text
        '主题': 'theme',     # subject
        '篇名': 'title',     # article title
        '作者': 'author',    # author
        '摘要': 'abstract',  # abstract
    }
    keywordval = values[searchlocation] + ':' + keyword
    # quote() percent-encodes the Chinese characters so the URL is valid; the
    # page offset is appended to the trailing '&p=' below.
    index_url = 'http://search.cnki.com.cn/Search.aspx?q=' + quote(keywordval) + '&rank=&cluster=&val=&p='
print(index_url)
    # Determine the total number of result pages (15 hits per page); the hit
    # count is sliced out of the page-sum label.
    html = urllib.request.urlopen(index_url).read()
    soup = BeautifulSoup(html, 'html.parser')
    pagesum_text = soup.find('span', class_='page-sum').get_text()
    maxpage = math.ceil(int(pagesum_text[7:-1]) / 15)
    cf.set('base', 'maxpage', str(maxpage))
    with open('Config.conf', 'w', encoding='utf-8') as conf_file:
        cf.write(conf_file)
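    # Crawl each result page, persisting currentpage after every page so an
    # interrupted run can resume where it left off.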
for i in range(currentpage, maxpage):
        page_str_num = i * 15  # offset of the first hit on page i (15 hits per page)
        page_url = index_url + str(page_str_num)
print(page_url)
        attempts = 0
        success = False
        socket.setdefaulttimeout(10)  # time out hung connections after 10 seconds
        while attempts < 50 and not success:
            try:
                spider_search_page.get_paper_url(page_url)
                success = True
            except (socket.error, urllib.error.URLError):
                attempts += 1
                print('Retry #' + str(attempts))
        cf.set('base', 'currentpage', str(i))
        with open('Config.conf', 'w', encoding='utf-8') as conf_file:
            cf.write(conf_file)
    spider_paper.spider_paper()  # fill in the details for every collected article
    end = time.perf_counter()
    print('Running time: %s Seconds' % (end - start))
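
# ---------------------------------------------------------------------------
# Module imported above as spider_paper (assumed filename: spider_paper.py).
# Reads the URLs collected in data-detail.txt, scrapes each article page for
# authors, affiliation, keywords, abstract and co-cited references, and
# writes everything to an Excel workbook.
# ---------------------------------------------------------------------------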
# -*- coding: utf-8 -*-
import socket
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import time
import xlwt
from configparser import ConfigParser
def spider_paper():
    start = time.perf_counter()
file = open("data-detail.txt", encoding='utf8')
cf = ConfigParser()
cf.read("Config.conf", encoding='utf-8')
    keyword = cf.get('base', 'keyword')  # search keyword, used in the output filename
    # Write the results to an Excel workbook.
    wb = xlwt.Workbook(encoding='utf-8')  # xlwt.Workbook() takes an encoding, not a filename
    sheet = wb.add_sheet("data-out")
    sheet.write(0, 0, '下载网址')  # download URL
    sheet.write(0, 1, '标题')      # title
    sheet.write(0, 2, '来源')      # source
    sheet.write(0, 3, '引用')      # citation count
    sheet.write(0, 4, '作者')      # authors
    sheet.write(0, 5, '作者单位')  # author affiliation
    sheet.write(0, 6, '关键词')    # keywords
    sheet.write(0, 7, '摘要')      # abstract
    sheet.write(0, 8, '共引文献')  # co-cited references
lines = file.readlines()
lin_num = 1
paper_list = []
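    # Each input line was written by spider_search_page as:
    #   url \t title \t source \t citation-count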
    for line in lines:
        fields = line.split('\t')
        paper_url = fields[0]
        if paper_url in paper_list:  # skip URLs already processed
            continue
        paper_list.append(paper_url)
        attempts = 0
        success = False
        socket.setdefaulttimeout(10)  # time out hung connections after 10 seconds
        while attempts < 50 and not success:
            try:
                html = urllib.request.urlopen(paper_url).read()
                soup = BeautifulSoup(html, 'html.parser')
                success = True
            except (socket.error, urllib.error.URLError):
                attempts += 1
                print('Retry #' + str(attempts))
        if not success:
            continue  # give up on this article rather than parse a stale page
        # The CNKI article page identifies these blocks only by their inline
        # styles, so match on the exact style strings.
        title = soup.find_all('div', style="text-align:center; width:740px; font-size: 28px;color: #0000a0; font-weight:bold; font-family:'宋体';")
        abstract = soup.find_all('div', style='text-align:left;word-break:break-all')
        author = soup.find_all('div', style='text-align:center; width:740px; height:30px;')
        # Extract the author names.
        for item in author:
            author = item.get_text()  # after the loop, author holds the last match's text
        # Concatenate every string fragment inside the abstract block.
        tmp = ''
        for thing in abstract:
            for string in thing.strings:
                tmp = tmp + string
        # Collapse all whitespace out of the abstract text. (The original
        # nested split loops did the same but dropped single-character
        # fragments; str.split() with no argument keeps them.)
        tstr = ''.join(tmp.split())
        # Collect the co-cited references ('共引文献') if the section exists.
        ifreferen = soup.find_all('td', class_='b14', rowspan='2')
        ref = ''
        for i in range(len(ifreferen)):
            if '【共引文献】' in ifreferen[i].get_text():
                referenceList = soup.find_all('div', id='div_Ref')  # co-cited reference list
                if len(referenceList) == 0:
                    referenceList = soup.find_all('div', class_='div_Ref')
                referenceList = referenceList[i]
                for tdref in referenceList.find_all('td', width='676'):
                    refitem = tdref.a.get("href").strip()
                    print(refitem)
                    ref = ref + refitem + ' ,'
        # Extract the author affiliation by scanning the info block from its
        # 【作者单位】 marker (or 【学位授予单位】 for theses).
        authorUnitScope = soup.find('div', style='text-align:left;', class_='xx_font')
        author_unit = ''
        author_unit_text = authorUnitScope.get_text()
        if '【作者单位】:' in author_unit_text:
            auindex = author_unit_text.find('【作者单位】:', 0)
        else:
            auindex = author_unit_text.find('【学位授予单位】:', 0)
        for k in range(auindex, len(author_unit_text)):
            if author_unit_text[k] in ('\n', '\t', '\r', '】'):
                continue
            if author_unit_text[k] == ' ' and k + 1 < len(author_unit_text) and author_unit_text[k + 1] == ' ':
                continue
            if author_unit_text[k] != '【':
                author_unit = author_unit + author_unit_text[k]
            if author_unit_text[k] == '【' and k != auindex:  # next 【...】 block starts
                break
        # Extract the keywords the same way, scanning from the 【关键词】 marker.
        key_word = ''
        kwindex = author_unit_text.find('【关键词】:', 0)
        for k in range(kwindex, len(author_unit_text)):
            if author_unit_text[k] in ('\n', '\t', '\r', '】'):
                continue
            if author_unit_text[k] == ' ' and k + 1 < len(author_unit_text) and author_unit_text[k + 1] == ' ':
                continue
            if author_unit_text[k] != '【':
                key_word = key_word + author_unit_text[k]
            if author_unit_text[k] == '【' and k != kwindex:
                break
        # Append the scraped fields to the original line and write one Excel row.
        line = line.strip('\n')
        line = line + '\t' + str(author) + '\t' + str(author_unit) + '\t' + str(key_word) + '\t' + str(tstr) + '\t' + str(ref) + '\n'
        outstring = line.split('\t')
        for i in range(len(outstring)):
            sheet.write(lin_num, i, outstring[i])
        print('Writing row ' + str(lin_num))
        lin_num += 1
wb.save('data_out_'+str(keyword)+'.xls')
file.close()
    end = time.perf_counter()
    print('Running time: %s Seconds' % (end - start))
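
# ---------------------------------------------------------------------------
# Module imported above as spider_search_page (assumed filename:
# spider_search_page.py). Scrapes one search-result page and appends each
# article's URL, title, source and citation count to data-detail.txt.
# ---------------------------------------------------------------------------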
from bs4 import BeautifulSoup
import urllib.request
import sys
import io
# Force UTF-8 stdout so Chinese titles print correctly regardless of the
# console's default encoding.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
page_num = 15  # number of hits CNKI shows per result page
def get_paper_url(page_url):
    html = urllib.request.urlopen(page_url).read()
    soup = BeautifulSoup(html, 'html.parser')
    f = open('data-detail.txt', 'a+', encoding='utf-8')
    results = soup.find_all('div', class_='wz_content')  # one div per search hit
    for result in results:
        item = result.find('a', target='_blank')  # link wrapping the article title
        href = item.get('href')  # article URL
        title = item.get_text()  # article title
        year_count = result.find('span', class_='year-count')  # source and citation count
        publish = ''
        reference = ''
        for child in year_count:
            text = child.string
            if text is None:  # skip children without a single string
                continue
            text = text.replace('\n', '').replace('\r', '')
            if '被引次数' in text:
                reference = text  # citation count
            elif '年' in text:
                publish = text  # publication source (the entry containing a year)
        f.write(href + '\t' + title + '\t' + publish + '\t' + reference + '\n')
    f.close()