Scraping papers overnight for a pretty young lady, and handing her an Excel file the next morning

The final scraped results look like this:

One day around noon I got a WeChat message from the pretty young lady; it went something like this:

So after getting off work at 10 p.m. I went home and spent the night writing the crawler script; by around 2 a.m. it was basically usable:

When I woke up in the morning I simply sent the scraped articles over O(∩_∩)O haha~.

The code is as follows:

# Author   : 叨陪鲤
# Date     : 2021/4/10
# Position : Beijing
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from urllib import request
from urllib import error
import xlwt
import re
import time

TotalNum = 0  # next free row in the Excel sheet (doubles as the running row counter)

# Simple container for the fields scraped from one PubMed search result
class Article(object):
    def __init__(self):
        self.title = "New Paper"
        self.link = ""
        self.authors = ""
        self.magz = ""
        self.time = ""
        self.doi = ""
        self.cite = ""
        self.snip = ""

def html_request(url):
    if url is None:
        return
    print("downloading html from: {0}".format(url))
    # If the URL contains Chinese characters, it must be percent-encoded first

    # Pretend to be a regular browser by sending a random User-Agent header
    headers = {'User-Agent': str(UserAgent().random)}
    req = request.Request(url, headers=headers)

    try:
        html = request.urlopen(req).read().decode('utf-8')
    except error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return None
    # print(html)
    return html
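
# A minimal sketch of the URL-encoding step mentioned in the comment above,
# assuming the search term may contain Chinese or other non-ASCII characters.
# The helper name build_pubmed_url is illustrative, not part of the original script.
from urllib import parse

def build_pubmed_url(term, page):
    # quote() percent-encodes the term so urlopen() accepts non-ASCII input
    return "https://pubmed.ncbi.nlm.nih.gov/?term={0}&page={1}".format(parse.quote(term), page)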

def save_xls(sheet, paper):
    # Write one paper per row, one field per column
    global TotalNum
    sheet.write(TotalNum, 0, TotalNum)
    sheet.write(TotalNum, 1, paper.title)
    sheet.write(TotalNum, 2, paper.link)
    sheet.write(TotalNum, 3, paper.authors)
    sheet.write(TotalNum, 4, paper.magz)
    sheet.write(TotalNum, 5, paper.time)
    sheet.write(TotalNum, 6, paper.doi)
    sheet.write(TotalNum, 7, paper.cite)
    sheet.write(TotalNum, 8, paper.snip)
    TotalNum += 1
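
# Note: xlwt writes the legacy .xls format, whose sheets are capped at 65,536 rows,
# so a very long crawl would eventually need extra sheets or the .xlsx format.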

# Originally used only for debugging the page parsing
def html_parser0(url, html):
    if url is None or html is None:
        return

    # Use a regex to grab every <article> block in the result list
    pattern_article = '<article class="full-docsum" data-rel-pos=(.+?)</article>'
    articles = re.compile(pattern_article, re.S).findall(html.replace('\n', ''))

    # Walk through each article and print its key fields
    for article in articles:
        soup = BeautifulSoup(article, 'html.parser')

        title = soup.find('a', attrs={'class': 'docsum-title'})
        print("[Title]:{0}".format(title.text.replace('  ', '')))
        print("[Link]:{0}{1}".format("https://pubmed.ncbi.nlm.nih.gov", title.attrs['href']))
        authors = soup.find('span', attrs={'class': 'docsum-authors full-authors'})
        print("[Author]:{0}".format(authors.text))

        citationInfos = soup.find('span', attrs={'class': 'docsum-journal-citation full-journal-citation'})
        Mtd = "{0}".format(citationInfos.text).split('.')
        print("[MAGZ]:{0}".format(Mtd[0]))
        print("[Time]:{0}".format(Mtd[1].split(';')[0]))
        print("[DOI]:{0}".format(Mtd[2].split(':')[1]))

        citation = soup.find('span', attrs={'class': 'citation-part'})
        print("[Cite]:{0}".format(citation.text.split(':')[1]))

        citation = soup.find('div', attrs={'class': 'full-view-snippet'})
        print("[Snip]:{0}\n".format(citation.text).replace('  ', ''))

def html_parser(sheet, html):
    if html is None:
        return
    # Use a regex to grab every <article> block in the result list
    pattern_article = '<article class="full-docsum" data-rel-pos=(.+?)</article>'
    articles = re.compile(pattern_article, re.S).findall(html.replace('\n', ''))

    # Walk through each article and collect its key fields
    for article in articles:
        paper = Article()  # one object per article to hold its fields

        soup = BeautifulSoup(article, 'html.parser')

        # Locate the individual pieces of information
        title = soup.find('a', attrs={'class': 'docsum-title'})
        authors = soup.find('span', attrs={'class': 'docsum-authors full-authors'})
        citationInfos = soup.find('span', attrs={'class': 'docsum-journal-citation full-journal-citation'})
        Mtd = "{0}".format(citationInfos.text).split('.')
        cite = soup.find('span', attrs={'class': 'citation-part'})
        snip = soup.find('div', attrs={'class': 'full-view-snippet'})

        # Store the information on the paper object
        paper.title = "{0}".format(title.text.replace('  ', ''))
        paper.link = "{0}{1}".format("https://pubmed.ncbi.nlm.nih.gov", title.attrs['href'])
        paper.authors = "{0}".format(authors.text)
        paper.magz = "{0}".format(Mtd[0])
        paper.time = "{0}".format(Mtd[1].split(';')[0])

        # doi = "{0}".format(Mtd[2].replace(' ', '').split(':')[1])
        paper.doi = "omitted"
        paper.cite = "{0}".format(cite.text.replace(' ', '').split(':')[1])
        paper.snip = "{0}".format(snip.text).replace('  ', '')

        save_xls(sheet, paper)

        # (for field-by-field debug output, see html_parser0() above)


if __name__ == '__main__':
    myxls = xlwt.Workbook()
    sheet1 = myxls.add_sheet(u'PaperInfo', True)

    # Header row of the spreadsheet
    column = ['No.', 'Title', 'Link', 'Authors', 'Journal', 'Date', 'DOI', 'Citations', 'Abstract']
    for i in range(0, len(column)):
        sheet1.write(TotalNum, i, column[i])
    TotalNum += 1
    page = 1
    while page <= 1000:
        url = "https://pubmed.ncbi.nlm.nih.gov/?term=genetic%20map&page=" + str(page)

        html = html_request(url)
        html_parser(sheet1, html)
        myxls.save('NCBI文章之geneticMap.xls')  # save after every page so a crash loses little work
        page += 1
    myxls.save('NCBI文章之geneticMap.xls')
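
The main loop above fires off up to 1,000 page requests back to back. A short pause between pages is friendlier to the PubMed server and makes a long run less likely to be interrupted, and the already-imported time module can provide it. Below is a minimal sketch of the same loop with a delay between pages and an early exit when a request fails; the three-second pause and the break are illustrative additions, not part of the original run.

    page = 1
    while page <= 1000:
        url = "https://pubmed.ncbi.nlm.nih.gov/?term=genetic%20map&page=" + str(page)
        html = html_request(url)
        if html is None:  # stop early if the request failed
            break
        html_parser(sheet1, html)
        myxls.save('NCBI文章之geneticMap.xls')
        time.sleep(3)  # be polite: wait a few seconds before the next page
        page += 1
    myxls.save('NCBI文章之geneticMap.xls')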
 
 
 
 
 
 