# Springer Link article crawler (header originally said 中国知网/CNKI, but the URLs target link.springer.com)
import requests
from bs4 import BeautifulSoup
import time
from urllib import parse
import bs4
# Springer search endpoint; the results-page number is appended to this URL.
search_url = 'https://link.springer.com/search/page/'
# Site root used to resolve the relative article links found on list pages.
base_url = 'https://link.springer.com'
# Sent with every request; asks the server to close the connection after each
# response (presumably to avoid stale keep-alive sockets — TODO confirm).
header_info={'Connection': 'close'}
# Interactive inputs: search keyword and results-page number (both kept as str;
# `page` is concatenated onto search_url by get_list()).
value = input('请输入关键词:')
page = input('请输入页码:')
# Query-string payload for the search request.
data={
'query':value
}
# Pre-encoded query string, e.g. "query=deep+learning".
param = parse.urlencode(data)
def get_list(url, param='', page=1):
    """Fetch one page of Springer search results.

    Retries forever on network failures (best-effort, matching the original
    design), sleeping 5 seconds between attempts.

    Args:
        url: search URL prefix; the page number is appended to it.
        param: urlencoded query string, passed as ``params``.
        page: results-page number (int or str).

    Returns:
        ``requests.Response`` with ``encoding`` set from ``apparent_encoding``.
    """
    while True:
        try:
            # str(page): the module-level caller passes input()'s string, but
            # the default is the int 1 — `url + page` then raised TypeError,
            # which the old bare `except:` swallowed, spinning forever.
            # timeout: without it a hung socket blocks the crawler for good.
            res = requests.get(url=url + str(page), params=param,
                               headers=header_info, timeout=30)
            res.encoding = res.apparent_encoding
            return res
        # Narrowed from a bare `except:`, which also trapped KeyboardInterrupt
        # and programming errors. Only genuine network failures are retried.
        except requests.exceptions.RequestException:
            print("连接被拒绝..")
            print("休息五秒钟zzZ")
            time.sleep(5)
            print("休息好了开始吧...")
def get_detail(url):
    """Fetch an article detail page, retrying forever on network failures.

    Args:
        url: absolute URL of the article page.

    Returns:
        ``requests.Response`` with ``encoding`` set from ``apparent_encoding``.
    """
    while True:
        try:
            # timeout added: without it a hung socket blocks indefinitely.
            res = requests.get(url=url, headers=header_info, timeout=30)
            res.encoding = res.apparent_encoding
            return res
        # Narrowed from a bare `except:` (which also trapped KeyboardInterrupt
        # and programming errors) to real network failures only.
        except requests.exceptions.RequestException:
            print("连接被拒绝..")
            print("休息五秒钟zzZ")
            time.sleep(5)
            print("休息好了开始吧...")
def start(url, param, page):
    """Crawl one page of Springer search results and save each article.

    For every result on the list page, fetches the article detail page and
    appends title, abstract, section text, and references to a timestamped
    text file in the current directory.

    Args:
        url: search URL prefix (``search_url``).
        param: urlencoded query string.
        page: results-page number (str from ``input()`` or int).
    """
    print('程序开始...')
    res = get_list(url, param, page)
    soup = BeautifulSoup(res.text, 'html.parser')
    ol = soup.find(name='ol', attrs={"id": "results-list", "class": "content-item-list"})
    # find() returns None when the result list is missing (layout change or
    # empty result set); the original crashed on ol.find_all.
    if ol is None:
        print('\r任务已完成')
        return
    for li in ol.find_all(name='li'):
        # BUG fix: the original iterated `enumerate(li_list)`, so `li` was an
        # (index, tag) tuple and `li.find(...)` raised AttributeError.
        a = li.find('a')
        if a is None or not a.get('href'):
            continue  # skip list items without an article link
        detail_res = get_detail(base_url + a.get('href'))
        detail_soup = BeautifulSoup(detail_res.text, 'html.parser')
        # Article title. (The original looked this up twice into h1/h1_p with
        # identical selectors, so the "fallback" branch could only crash on
        # None — the dead duplicate is removed.)
        h1 = detail_soup.find(name='h1', attrs={'class': 'c-article-title u-h1'})
        # Abstract block.
        h2 = detail_soup.find(name='div', attrs={'id': 'Abs1-content'})
        # Main body section (headings + paragraphs).
        div = detail_soup.find(name='div', attrs={'id': 'Abs2-content', 'class': 'c-article-section__content'})
        # References heading and reference list.
        ref = detail_soup.find(name='h2', attrs={'id': 'Bib1'})
        ref_div = detail_soup.find(name='div', attrs={'id': 'Bib1-content', 'class': 'c-article-section__content'})
        # Initialize up front: the original referenced these even when `div`
        # was not found, raising NameError.
        h3_list, p_list = [], []
        if isinstance(div, bs4.element.Tag):
            h3_list = div.find_all(name='h3')
            p_list = div.find_all(name='p')
        # with-statement + explicit encoding: the original leaked the handle on
        # any exception and wrote with the platform default encoding.
        with open('./论文' + str(time.time())[:10] + '.txt', "a", encoding='utf-8') as f:
            if h1:
                f.write(h1.get_text() + '\n\n')
            if h2:
                f.write(h2.get_text() + '\n\n')
            if len(p_list) == 1:
                f.write(p_list[0].get_text() + '\n\n')
            else:
                # Pair each section heading with its paragraph where possible.
                for inx, val in enumerate(p_list):
                    if inx < len(h3_list):
                        f.write(h3_list[inx].get_text() + '\n')
                    if val:
                        f.write(val.get_text() + '\n\n')
            if ref:
                f.write('参考文献:\n')
            if isinstance(ref_div, bs4.element.Tag):
                for ind, item in enumerate(ref_div.find_all(name='p')):
                    f.write('【' + str(ind + 1) + '】' + item.get_text() + '\n')
        print('\r已抓取 -> 论文' + str(time.time())[:10])
    print('\r任务已完成')
# Script entry point: crawl the results page chosen by the interactive
# module-level inputs (keyword -> param, page number -> page).
if __name__ == '__main__':
    start(search_url,param,page)