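# Search sites:
#   https://journal.medsci.cn/m/nsfc.do
#   https://www.medsci.cn/sci/nsfc_index
# Save the page source of the search results to c.txt.
# Then extract the search-result information with the script below.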
# -*- coding: utf-8 -*-
import pandas as pd
import re
def ifind(somestr, sub):
    """Return the start offset of every match of ``sub`` in ``somestr``.

    ``sub`` is handed to ``re.finditer`` unchanged, so it is interpreted as a
    regular expression. Literal text that contains regex metacharacters must
    be wrapped in ``re.escape`` by the caller.

    Args:
        somestr: The text to search.
        sub: The regular-expression pattern to look for.

    Returns:
        A list of integer offsets, left to right, one per non-overlapping
        match. The list is empty when nothing matches.
    """
    return [match.start() for match in re.finditer(sub, somestr)]
def find2(txt, str1, str2, window=300):
    """Return the text that sits between the markers ``str1`` and ``str2``.

    The first occurrence of ``str1`` in ``txt`` is located, and the search for
    ``str2`` is limited to a window that ends ``window`` characters after the
    START of ``str1`` (so the marker itself counts towards the window).

    Both markers are matched as literal text.

    Args:
        txt: The text to search.
        str1: Literal start marker; the result begins right after it.
        str2: Literal end marker; the result stops right before it.
        window: Maximum distance, in characters, from the start of ``str1``
            that is inspected. Defaults to 300.

    Returns:
        The text between the two markers. If ``str1`` is not present an empty
        string is returned. If ``str2`` is not found inside the window, the
        whole window after ``str1`` is returned.
    """
    start = txt.find(str1)
    if start == -1:
        # No start marker: report "nothing found" instead of raising.
        return ''
    # Slicing past the end of the string is safe, so short inputs need no
    # special case.
    segment = txt[start + len(str1):start + window]
    end = segment.find(str2)
    if end == -1:
        # Without this guard, segment[:-1] would silently drop one character.
        return segment
    return segment[:end]
# Read the saved page source. Undecodable bytes are dropped rather than
# aborting the run (errors='ignore').
file = 'c.txt'
with open(file, encoding='utf-8', errors='ignore') as f:
    txt = f.read()

# Markers used for the project-title column:
#   str1 / str2 delimit the whole field inside a record,
#   str3 / str4 delimit the link text inside that field.
str1 = '<dd data-label="项目名">'
str2 = '<dd data-label="负责人">'
str3 = 'target="_blank">'
str4 = '</a>'

# One table row per project-title marker found in the page.
s = ifind(txt, str1)
numRows = len(s)
# Only columns 0-6 are filled by the code below; the rest stay empty.
numCols = 10
d = pd.DataFrame(index=range(numRows), columns=range(numCols))
# Column 0: project title.
# `s` holds the offsets of the project-title markers and str1..str4 are the
# title markers set up above. For each record, the title field is isolated
# first, then the link text inside it is taken.
for m, k in enumerate(s):
    txt1 = find2(txt[k:], str1, str2)
    txt2 = find2(txt1, str3, str4)
    print(txt2)
    d.iloc[m, 0] = txt2
# Column 1: host institution.
# `s` is NOT recomputed here: it still holds the offsets of the project-title
# markers, and the search runs forward from each of them to the first
# institution field that follows.
str1 = '<dd data-label="依托单位">'
str2 = '<dd data-label="经费">'
# The value is taken from the link's query string (between 'txtitle=' and the
# closing quote of the href), not from the visible link text.
# NOTE(review): the value may therefore be URL-encoded - confirm against a
# real page.
str3 = 'action=q&txtitle='
str4 = '" target="_blank">'
for m, k in enumerate(s):
    txt1 = find2(txt[k:], str1, str2)
    txt2 = find2(txt1, str3, str4)
    d.iloc[m, 1] = txt2
# Column 2: person in charge.
str1 = '<dd data-label="负责人">'
# The end marker needs its closing double quote to match the page markup.
str2 = '<dd data-label="依托单位">'
str4 = '</dd>'
s = ifind(txt, str1)
print(s)
for m, k in enumerate(s):
    txt1 = find2(txt[k:], str1, str2)
    # Keep everything before the closing tag. partition() returns the whole
    # text when the tag is absent, so no character is dropped and longer
    # names are not cut to a fixed width.
    txt1 = txt1.partition(str4)[0]
    print(txt1)
    d.iloc[m, 2] = txt1
# Column 3: start time.  Column 4: fund.
str1 = '<dd data-label="起始时间">'
# The start-time field is bounded by the opening tag of the next record list.
str2 = '<dl class="dl">'
str3 = '<dd data-label="基金">'
str4 = '</dd>'
s = ifind(txt, str1)
for m, k in enumerate(s):
    txt1 = find2(txt[k:], str1, str2)
    # The fund is looked up forward from the start-time marker.
    # NOTE(review): this assumes the fund field follows the start time inside
    # the same record - confirm against the page layout.
    txt2 = find2(txt[k:], str3, str4)
    # Cut at the closing tag; partition() leaves the text intact when the tag
    # is missing instead of dropping its last character.
    txt1 = txt1.partition(str4)[0]
    d.iloc[m, 3] = txt1
    d.iloc[m, 4] = txt2
# Column 5: code.
str1 = '<dd data-label="代码">'
str4 = '</dd>'
s = ifind(txt, str1)
for m, k in enumerate(s):
    # Cut the field directly at its own closing tag. find2 returns the whole
    # window when the end marker is missing, so no character is dropped and
    # the value is not cut to a fixed width.
    txt1 = find2(txt[k:], str1, str4)
    print(txt1)
    d.iloc[m, 5] = txt1
# Column 6: funding amount.
str1 = '<dd data-label="经费">'
str4 = '</dd>'
s = ifind(txt, str1)
for m, k in enumerate(s):
    # Cut the field directly at its own closing tag. find2 returns the whole
    # window when the end marker is missing, so no character is dropped and
    # the value is not cut to a fixed width.
    txt1 = find2(txt[k:], str1, str4)
    print(txt1)
    d.iloc[m, 6] = txt1

# Write the table next to the input file (c.txt -> c.txtout.csv), keeping the
# row index and the column header. 'UTF-8-sig' prepends a byte-order mark,
# presumably so spreadsheet tools detect the encoding.
d.to_csv(file + 'out.csv', index=True, header=True, encoding='UTF-8-sig')
# The results are written to c.txtout.csv