前言
初学python,写了一个爬虫程序练练手。
这段代码通过手动输入关键词,爬取自然杂志官网文章题目、摘要和下载地址。
由于目标是国外网站,爬虫速度较慢。
代码如下
import requests
import re
import os
# Absolute directory containing this script.
# NOTE(review): `path` is never referenced anywhere below — presumably intended
# as an output directory for the result file; confirm before removing.
path = os.path.abspath(os.path.dirname(__file__))
def getHTMLText(url):
    """Fetch *url* and return its decoded body text, or "" on any failure.

    Sends a minimal User-Agent header (some sites reject the default
    python-requests agent) and re-decodes the response with the encoding
    requests sniffs from the content.

    :param url: absolute URL to fetch
    :return: response body as ``str``, or ``""`` if the request failed
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        # BUG FIX: the original passed `headers` positionally, but the second
        # positional parameter of requests.get() is `params`, so the dict was
        # appended to the query string and no User-Agent header was sent.
        # A timeout is added so a dead connection cannot hang the crawl.
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort contract: callers treat "" as "page unavailable".
        return ""
def main():
    """Interactively crawl nature.com search results.

    Prompts for a keyword and a page count, scrapes each result's rank,
    title, detail-page URL and abstract, echoes everything to stdout, and
    saves it all to ``nature网站爬虫.txt`` (hard-wrapped at 100 chars/line).
    """
    print('爬虫结果为Nature搜索引擎中内容!')
    # BUG FIX: the message previously named 'nature爬虫.txt' while the file
    # actually written below is 'nature网站爬虫.txt'.
    print('爬虫结果存放在 nature网站爬虫.txt 中!')
    keywords = input("请输入关键词:")
    depth = input("爬取的页数:")
    start_url = 'https://www.nature.com/search?q=' + keywords
    result = []
    for page in range(int(depth)):
        try:
            url = start_url + '&page=' + str(page + 1)
            html = getHTMLText(url)
            # Three parallel findall passes over the result list markup:
            # rank labels, detail-page hrefs, and raw (tag-laden) titles.
            papers_rank = re.findall(r'<div\sclass="cleared">.*?data-track-label="(.*?)" >', html, re.DOTALL)
            papers_id = re.findall(r'<div\sclass="cleared">.*?<a href="(.*?)" itemprop=".*?"', html, re.DOTALL)
            papers_name_pre = re.findall(r'<div\sclass="cleared">.*?data-track-label=".*?" >(.*?)</a>', html, re.DOTALL)
            # Strip residual inline HTML tags from the titles.
            papers_name = [re.sub(r'<.*?>', "", raw).strip() for raw in papers_name_pre]
            page_header = (
                '=' * 40,
                '=' * 18 + '第%d页' % (page + 1) + '=' * 18,
                '=' * 40,
            )
            for line in page_header:
                print(line)
                result.append(line)
            for paper_rank, paper_id, paper_name in zip(papers_rank, papers_id, papers_name):
                paper_url = 'https://www.nature.com' + str(paper_id)
                result.append(paper_url)
                print(paper_url)
                # Second request per paper: the abstract lives in the
                # detail page's <meta name="description"> tag.
                html_abstract = getHTMLText(paper_url)
                paper_abstract = re.findall(r'<meta\sname="description"\scontent="(.*?)"/>', html_abstract, re.DOTALL)
                for record in (
                    {'文章序号': paper_rank},
                    {'文章名称': paper_name},
                    {'文章摘要': paper_abstract},
                ):
                    result.append(record)
                    print(record)
                separator = '=' * 40
                print(separator)
                result.append(separator)
        except Exception:
            # Deliberate best-effort crawl: a malformed page or transient
            # network error skips this page rather than aborting the run.
            # (Was a bare `except:`; narrowed so Ctrl-C still works.)
            continue
    _save_results(result)


def _save_results(result, filename='nature网站爬虫.txt'):
    """Write every collected entry to *filename*, one entry per paragraph,
    hard-wrapping each entry at 100 characters per output line.

    :param result: iterable of printable entries (strings or dicts)
    :param filename: output path (kept as the original default)
    """
    with open(filename, 'w', encoding='utf-8') as f:
        for entry in result:
            text = str(entry)
            # Slice into 100-char chunks; `or ['']` keeps the original
            # behaviour of emitting a newline for an empty entry.
            # (Also fixes the original off-by-one that produced a spurious
            # blank line when an entry's length was an exact multiple of 100.)
            chunks = [text[i:i + 100] for i in range(0, len(text), 100)] or ['']
            for chunk in chunks:
                f.write(chunk)
                f.write('\n')
# Guard the entry point so importing this module (e.g. for testing or when
# packaged with pyinstaller) does not immediately start the crawler.
if __name__ == '__main__':
    main()
附录
程序写完后,我用 Python 的 pyinstaller 库将其打包成可以在 Windows 系统下运行的 exe 程序。
以下程序可以直接运行,并将下载结果存入nature网站爬虫.txt文件中。
下载地址如下:
链接:https://pan.baidu.com/s/1LmbXmVuc80oegVdJBD31QA
提取码:eguf