# Just a backup copy of this script.
# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
import requests
import re
from io import StringIO
from io import open
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
#hang=0
# NOTE(review): the original file had lost all indentation; the nesting of the
# sentence-filtering loop was reconstructed from statement order -- confirm.

# Nature article numbers processed when the file is run as a script.
ARTICLE_IDS = range(10006, 10011)

# Seconds to wait for the web server before giving up instead of hanging.
REQUEST_TIMEOUT = 30


def _extract_pdf_text(pdf):
    """Return the text pdfminer extracts from the open binary file *pdf*."""
    # resource manager
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        # device
        device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
        try:
            process_pdf(rsrcmgr, device, pdf)
        finally:
            device.close()
        return retstr.getvalue()
    finally:
        retstr.close()


def _break_after_terminators(text):
    """Return *text* with a newline inserted after every '.', '?' and '!'."""
    for mark in ".?!":
        text = text.replace(mark, mark + "\n")
    return text


def _body_sentences(pdf_text):
    """Split the PDF text into the sentence fragments worth keeping.

    The text is cut at the last occurrence of 'Received' (the whole text is
    kept when the word is absent; the original raised IndexError there),
    newlines become spaces, and the result is split on '.'.  Fragments of
    20 characters or fewer, or whose first or second character is a digit,
    are dropped.
    """
    cut = pdf_text.rfind('Received')
    if cut != -1:
        pdf_text = pdf_text[:cut]
    flattened = pdf_text.replace("\n", " ")
    return [
        fragment
        for fragment in flattened.split(".")
        if len(fragment) > 20
        and not fragment[0].isdigit()
        and not fragment[1].isdigit()
    ]


def _fetch_highlight(article_id):
    """Download the article page and return its 'Abs2-content' text.

    The section is presumably the article abstract (TODO confirm).  Tags are
    removed with a regex and a newline is inserted after every sentence
    terminator.
    """
    target = ('https://www.nature.com/articles/nature' + '%d' % article_id
              + '#accession-codes.html')
    req = requests.get(url=target, timeout=REQUEST_TIMEOUT)
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    bf = BeautifulSoup(req.text, "html.parser")
    texts = bf.find_all("div", class_="c-article-section__content",
                        id="Abs2-content")
    # str() of the result list keeps its surrounding brackets; that matches
    # what the original wrote to the output file.
    stripped = re.sub(r"\<.*?\>", "", str(texts))
    return _break_after_terminators(stripped)


def _article_id_from_name(pdf):
    """Return the article number encoded in the name of the file *pdf*.

    Raises:
        ValueError: if *pdf* has no name ending in '<digits>.pdf'.
    """
    match = re.search(r"(\d+)\.pdf$", str(getattr(pdf, "name", "")))
    if match is None:
        raise ValueError(
            "article_id was not given and cannot be derived from the PDF name")
    return int(match.group(1))


def read_pdf(pdf, article_id=None):
    """Write '<article_id>.txt' from a Nature PDF and its web page.

    The output file (gb18030, unencodable characters ignored) holds the kept
    PDF sentences, then a 'highlight:' marker, then the text scraped from the
    article page.  The first N kept PDF sentences are skipped, where N is the
    number of newlines in the scraped text.

    Args:
        pdf: file object of the article PDF, opened in binary mode.
        article_id: Nature article number.  When omitted it is parsed from
            the trailing digits of ``pdf.name`` (e.g. 'nature10006.pdf').

    Raises:
        ValueError: if *article_id* is omitted and cannot be derived.
    """
    if article_id is None:
        article_id = _article_id_from_name(pdf)

    sentences = _body_sentences(_extract_pdf_text(pdf))

    # get all lines
    highlight = _fetch_highlight(article_id)
    hang = highlight.count("\n")
    print(highlight)
    print(hang)

    # One handle, closed even on error, replaces the original "w+" followed
    # by a second "a+" open of the same file.
    with open("%d.txt" % article_id, 'w', encoding='gb18030',
              errors='ignore') as text:
        for sentence in sentences[hang:]:
            text.write("\n")
            # The original replaced '?' with '.\n' here, losing the question
            # mark; the shared helper keeps the terminator.
            text.write(_break_after_terminators(sentence))
            text.write(".")
        text.write("\n\n\nhighlight:\n")
        text.write(highlight)


def _main():
    """Process 'nature<id>.pdf' for every article number in ARTICLE_IDS."""
    for article_id in ARTICLE_IDS:
        with open('nature' + '%d' % article_id + '.pdf', "rb") as my_pdf:
            read_pdf(my_pdf, article_id)


if __name__ == '__main__':
    _main()