Scraping literature from Baidu Scholar (xueshu.baidu.com) with a crawler

This post shows how to use Python 3.8 to scrape research papers on transportation carbon emissions from Baidu Scholar, rotating proxy IPs and randomizing request headers to avoid being blocked. It covers extracting each paper's title, abstract, publication year, authors, keywords, and citation count.

For now this only works for Baidu Scholar; crawlers for Google Scholar (and its mirrors), Web of Science, and CNKI cannot connect.

# Python 3.8
import random
import urllib.request

import numpy as np  # needed for np.arange in the page loop at the bottom
from bs4 import BeautifulSoup

#  free proxy IPs from http://www.goubanjia.com/

def get_url(url):  # rotate User-Agent headers and proxy IPs to avoid anti-scraping blocks; free proxies go stale and need replacing regularly (a liveness-check sketch follows this function)
    my_headers = [
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
        'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
        "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
        "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 "
    ]
    proxy_list = [
        "218.64.151.59:9000",
        "110.243.5.230:9999",
        "175.42.123.67:9999",
        "27.206.180.250:9000",
        "27.220.160.58:9000",
        "117.69.151.80:9999"
    ]
    proxy = random.choice(proxy_list)
    header = random.choice(my_headers)

    # install a global opener so every request from this script goes through the chosen proxy
    proxy_handler = urllib.request.ProxyHandler({'http': proxy})
    opener = urllib.request.build_opener(proxy_handler)
    urllib.request.install_opener(opener)

    req = urllib.request.Request(url)
    req.add_header('User-Agent', header)

    return urllib.request.urlopen(req, timeout=120)
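

# --- Not part of the original post: a minimal liveness-check sketch. The free
# proxies listed above go stale quickly, so it can help to filter the list down
# to the ones that still answer before scraping. `filter_live_proxies` is a
# hypothetical helper name, and http://httpbin.org/ip is only an example test
# URL; swap in any endpoint you trust.
def filter_live_proxies(proxies, test_url="http://httpbin.org/ip", timeout=5):
    live = []
    for p in proxies:
        try:
            handler = urllib.request.ProxyHandler({'http': p})
            # build a private opener instead of installing it globally, so the
            # check does not disturb the opener that get_url() installs
            opener = urllib.request.build_opener(handler)
            opener.open(test_url, timeout=timeout)
            live.append(p)
        except Exception:
            pass  # dead, slow, or blocked proxy: skip it
    return live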


def handle_txt(url):  # parse one results page and append each paper's details to the global output file f (a selector sanity-check sketch follows this function)
    response = get_url(url)
    bs = BeautifulSoup(response, 'html.parser')
    list_titles = bs.find_all('div', class_="sc_content")

    for i in list_titles:
        title = i.find('h3', class_="t c_font").text
        f.write("题目:" + title.strip() + "\n")
        half_link = i.find('h3', class_="t c_font").find('a')['href']
        wholelink = 'https:' + str(half_link)
        resp = get_url(wholelink)
        if resp.status == 200:
            bs2 = BeautifulSoup(resp, 'html.parser')
            infos = bs2.find('div', class_="main-info").find('div', class_="c_content")

            # abstract
            if infos.find('div', class_="abstract_wr") is not None:
                abstract = infos.find('div', class_="abstract_wr").find('p', class_="abstract").text.strip()
                f.write("摘要:" + abstract + "\n")

            # year
            if infos.find('div', class_="year_wr") is not None:
                year = infos.find('div', class_="year_wr").find('p', class_="kw_main").text.strip()
                f.write("年份:" + year + '\n')

            # author
            if infos.find('div', class_="author_wr") is not None:
                author = infos.find('div', class_="author_wr").find('p', class_="author_text").text.strip()
                f.write('作者:' + author + '\n')

            # referred
            if infos.find('div', class_="ref_wr") is not None:
                referred = infos.find('div', class_="ref_wr").find('a', class_="sc_cite_cont").text.strip()
                f.write('被引量:' + referred + '\n')

            # keywords
            if infos.find('div', class_="kw_wr") is not None:
                try:
                    kw = infos.find('div', class_="kw_wr").find("p", class_="kw_main").find_all("a")
                    # join with commas so the line reads 关键词:kw1,kw2 (no leading comma)
                    f.write("关键词:" + ",".join(each.text for each in kw) + "\n")
                except AttributeError:
                    pass

        f.write("\n\n")
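

# --- Not part of the original post: handle_txt() depends on Baidu Scholar's
# current markup (CSS classes such as "sc_content", "t c_font", "abstract_wr",
# "author_wr"); if Baidu renames those classes the parser silently finds
# nothing. The hypothetical helper below runs the same find()/find_all()
# pattern against an inline HTML fragment, which is a quick way to sanity-check
# the selectors without touching the network.
def _demo_parse():
    sample = (
        '<div class="sc_content">'
        '<h3 class="t c_font"><a href="//example.org/paper">Sample title</a></h3>'
        '</div>'
    )
    demo = BeautifulSoup(sample, 'html.parser')
    for block in demo.find_all('div', class_="sc_content"):
        title = block.find('h3', class_="t c_font").text
        link = 'https:' + block.find('h3', class_="t c_font").find('a')['href']
        print(title.strip(), link)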

f = open('baidu.txt', 'w', encoding='utf-8')
for i in np.arange(0, 100, 10):  # scrape 10 result pages; the pn query parameter advances by 10 per page (see the sketch after this loop)
    url = "https://xueshu.baidu.com/s?wd=transportation%20carbon%20emission&pn={}&tn=SE_baiduxueshu_c1gjeupa&ie=utf-8&f=3&sc_f_para=sc_tasktype%3D%7BfirstSimpleSearch%7D&sc_hit=1".format(
        i)
    handle_txt(url)
f.close()
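
The only thing that changes from page to page in the search URL above is the pn parameter, which advances by 10 results per page; the query itself is percent-encoded into wd. As a minimal sketch (not from the original post, and build_search_url is a hypothetical helper name), the same URL can be assembled with urllib.parse.urlencode instead of hard-coding the encoded string:

from urllib.parse import quote, urlencode

def build_search_url(query, page_index, page_size=10):
    # Baidu Scholar paginates with pn = page_index * page_size; wd holds the
    # percent-encoded query. Only a subset of the original URL's parameters is
    # shown here; the remaining ones (tn, f, sc_f_para, sc_hit) can be added to
    # the dict the same way.
    params = {
        "wd": query,
        "pn": page_index * page_size,
        "ie": "utf-8",
    }
    # quote_via=quote keeps spaces as %20, matching the hand-written URL above
    return "https://xueshu.baidu.com/s?" + urlencode(params, quote_via=quote)

# e.g. build_search_url("transportation carbon emission", 3) ends with ...&pn=30&ie=utf-8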