# Springer Link article crawler (header originally said 中国知网/CNKI, but the URLs target link.springer.com)
import requests
from bs4 import BeautifulSoup
import time
from urllib import parse
import bs4
# Springer search endpoint; the results-page number is appended to this URL.
search_url = 'https://link.springer.com/search/page/'
# Site root used to resolve the relative article links found on list pages.
base_url = 'https://link.springer.com'
# Sent with every request; asks the server to close the connection after each
# response (presumably to avoid stale keep-alive sockets — TODO confirm).
header_info={'Connection': 'close'}
# Interactive inputs: search keyword and results-page number (both kept as str;
# `page` is concatenated onto search_url by get_list()).
value = input('请输入关键词:')
page = input('请输入页码:')
# Query-string payload for the search request.
data={
'query':value
}
# Pre-encoded query string, e.g. "query=deep+learning".
param = parse.urlencode(data)
def get_list(url, param='', page=1):
    """Fetch one page of Springer search results.

    Retries forever on network failures (best-effort, matching the original
    design), sleeping 5 seconds between attempts.

    Args:
        url: search URL prefix; the page number is appended to it.
        param: urlencoded query string, passed as ``params``.
        page: results-page number (int or str).

    Returns:
        ``requests.Response`` with ``encoding`` set from ``apparent_encoding``.
    """
    while True:
        try:
            # str(page): the module-level caller passes input()'s string, but
            # the default is the int 1 — `url + page` then raised TypeError,
            # which the old bare `except:` swallowed, spinning forever.
            # timeout: without it a hung socket blocks the crawler for good.
            res = requests.get(url=url + str(page), params=param,
                               headers=header_info, timeout=30)
            res.encoding = res.apparent_encoding
            return res
        # Narrowed from a bare `except:`, which also trapped KeyboardInterrupt
        # and programming errors. Only genuine network failures are retried.
        except requests.exceptions.RequestException:
            print("连接被拒绝..")
            print("休息五秒钟zzZ")
            time.sleep(5)
            print("休息好了开始吧...")
def get_detail(url):
    """Fetch an article detail page, retrying forever on network failures.

    Args:
        url: absolute URL of the article page.

    Returns:
        ``requests.Response`` with ``encoding`` set from ``apparent_encoding``.
    """
    while True:
        try:
            # timeout added: without it a hung socket blocks indefinitely.
            res = requests.get(url=url, headers=header_info, timeout=30)
            res.encoding = res.apparent_encoding
            return res
        # Narrowed from a bare `except:` (which also trapped KeyboardInterrupt
        # and programming errors) to real network failures only.
        except requests.exceptions.RequestException:
            print("连接被拒绝..")
            print("休息五秒钟zzZ")
            time.sleep(5)
            print("休息好了开始吧...")
def start(url, param, page):
    """Crawl one page of Springer search results and save each article.

    For every result on the list page, fetches the article detail page and
    appends title, abstract, section text, and references to a timestamped
    text file in the current directory.

    Args:
        url: search URL prefix (``search_url``).
        param: urlencoded query string.
        page: results-page number (str from ``input()`` or int).
    """
    print('程序开始...')
    res = get_list(url, param, page)
    soup = BeautifulSoup(res.text, 'html.parser')
    ol = soup.find(name='ol', attrs={"id": "results-list", "class": "content-item-list"})
    # find() returns None when the result list is missing (layout change or
    # empty result set); the original crashed on ol.find_all.
    if ol is None:
        print('\r任务已完成')
        return
    for li in ol.find_all(name='li'):
        # BUG fix: the original iterated `enumerate(li_list)`, so `li` was an
        # (index, tag) tuple and `li.find(...)` raised AttributeError.
        a = li.find('a')
        if a is None or not a.get('href'):
            continue  # skip list items without an article link
        detail_res = get_detail(base_url + a.get('href'))
        detail_soup = BeautifulSoup(detail_res.text, 'html.parser')
        # Article title. (The original looked this up twice into h1/h1_p with
        # identical selectors, so the "fallback" branch could only crash on
        # None — the dead duplicate is removed.)
        h1 = detail_soup.find(name='h1', attrs={'class': 'c-article-title u-h1'})
        # Abstract block.
        h2 = detail_soup.find(name='div', attrs={'id': 'Abs1-content'})
        # Main body section (headings + paragraphs).
        div = detail_soup.find(name='div', attrs={'id': 'Abs2-content', 'class': 'c-article-section__content'})
        # References heading and reference list.
        ref = detail_soup.find(name='h2', attrs={'id': 'Bib1'})
        ref_div = detail_soup.find(name='div', attrs={'id': 'Bib1-content', 'class': 'c-article-section__content'})
        # Initialize up front: the original referenced these even when `div`
        # was not found, raising NameError.
        h3_list, p_list = [], []
        if isinstance(div, bs4.element.Tag):
            h3_list = div.find_all(name='h3')
            p_list = div.find_all(name='p')
        # with-statement + explicit encoding: the original leaked the handle on
        # any exception and wrote with the platform default encoding.
        with open('./论文' + str(time.time())[:10] + '.txt', "a", encoding='utf-8') as f:
            if h1:
                f.write(h1.get_text() + '\n\n')
            if h2:
                f.write(h2.get_text() + '\n\n')
            if len(p_list) == 1:
                f.write(p_list[0].get_text() + '\n\n')
            else:
                # Pair each section heading with its paragraph where possible.
                for inx, val in enumerate(p_list):
                    if inx < len(h3_list):
                        f.write(h3_list[inx].get_text() + '\n')
                    if val:
                        f.write(val.get_text() + '\n\n')
            if ref:
                f.write('参考文献:\n')
            if isinstance(ref_div, bs4.element.Tag):
                for ind, item in enumerate(ref_div.find_all(name='p')):
                    f.write('【' + str(ind + 1) + '】' + item.get_text() + '\n')
        print('\r已抓取 -> 论文' + str(time.time())[:10])
    print('\r任务已完成')
# Script entry point: crawl the results page chosen by the interactive
# module-level inputs (keyword -> param, page number -> page).
if __name__ == '__main__':
    start(search_url,param,page)