Link to the Journal of Distance Education (远程教育杂志):
http://dej.zjtvu.edu.cn/
The second article of issue 2, 2018:
http://dej.zjtvu.edu.cn//oa/darticle.aspx?type=view&id=201802002
Inspecting the page... Ctrl+U (view source): there isn't much to analyze.
Different issues and articles are distinguished by a timestamp-like id of the form 201X0YZZZ (year, issue number, article number).
The id is assembled with str.rjust, which right-justifies the article number and pads it with zeros.
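For example, building the id of a (hypothetical) third article of issue 2, 2018:

year, issue, article = 2018, 2, 3
article_id = str(year) + '0' + str(issue) + str(article).rjust(3, '0')
print(article_id)  # 201802003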
It's a static page, so the BeautifulSoup library alone gets the job done; a fairly brute-force approach.
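Every field sits in a plain <span> with a stable id (the ids come straight from the page source), so one find call per field is all it takes. A minimal sketch, fetching the article linked above:

import requests
from bs4 import BeautifulSoup

r = requests.get('http://dej.zjtvu.edu.cn//oa/darticle.aspx?type=view&id=201802002')
r.encoding = r.apparent_encoding
soup = BeautifulSoup(r.text, 'html.parser')
print(soup.find('span', attrs={'id': 'LbTitleC'}).get_text())  # the Chinese title span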
The only pitfall is that the article layout isn't fixed: not every article has a DOI, for example. So only the title lookup is wrapped in a try-except, to detect ids that don't correspond to a real page, something I hadn't anticipated at first.
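The optional fields still need a guard, though, or an article missing its DOI would crash the parse and be silently skipped. The pattern is a find followed by a fallback; the full script below wraps it in a small getField helper (my naming, not from the original page):

tag = soup.find('span', attrs={'id': 'LbDOI'})
doi = tag.get_text() if tag else ''  # '' when the article has no DOI span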
At first, out of overconfidence, I debugged without traceback, which meant debugging by hand, even bisecting the code to locate a bug (lol).
With traceback it took three minutes to squash every remaining bug, orz.
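Concretely, that just means printing the stack in the except branch instead of swallowing the error; a minimal sketch (parsePage, html, and output_file are the names from the full script below):

import traceback

try:
    parsePage(html, output_file)
except Exception:
    traceback.print_exc()  # shows exactly which line and which field blew up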
Another point: write straight to the output file; collecting the dicts in a list first is a redundant step.
import traceback
import requests
from bs4 import BeautifulSoup


def getHTMLText(url):
    """Fetch a page; return its decoded text, or '' on any network/HTTP error."""
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception:
        return ""


def getField(soup, span_id):
    """Return the text of the span with the given id, or '' if it is absent."""
    tag = soup.find('span', attrs={'id': span_id})
    return tag.get_text() if tag else ""


def parsePage(html, fpath):
    soup = BeautifulSoup(html, 'html.parser')
    info = {}  # renamed from `dict`, which shadowed the built-in
    # Only the title is wrapped in try-except: if the title spans are missing,
    # the id does not correspond to a real article page, so skip it entirely.
    try:
        info['论文名称'] = soup.find('span', attrs={'id': 'LbTitleC'}).get_text()
        info['title'] = soup.find('span', attrs={'id': 'LbTitleE'}).get_text()
    except AttributeError:
        return ""
    # Author and affiliation are concatenated into a single field.
    info['作者'] = getField(soup, 'LbAuthorC') + getField(soup, 'LbUnitC')
    info['Author(s)'] = getField(soup, 'LbAuthorE') + getField(soup, 'LbUnitE')
    info['关键字'] = getField(soup, 'LbKeyC')
    info['Keywords'] = getField(soup, 'LbKeyE')
    info['分类号'] = getField(soup, 'LbFLH')
    info['DOI'] = getField(soup, 'LbDOI')  # optional: not every article has one
    info['文献标志码'] = getField(soup, 'TbWXBSM')
    info['摘要'] = getField(soup, 'LbZY')
    info['Abstract'] = getField(soup, 'LbZYE')
    info['相似文献/references'] = getField(soup, 'lbxswx')
    info['备注/Memo'] = getField(soup, 'LbMemory')
    # Append straight to the output file; no list of dicts needed.
    with open(fpath, 'a', encoding='utf-8') as f:
        f.write(str(info) + '\n')


def main():
    output_file = 'D:/PagesDetails.txt'
    start_url = 'http://dej.zjtvu.edu.cn//oa/darticle.aspx?type=view&id='
    for year in range(2013, 2019):
        for m in range(1, 6):        # issue number within the year
            for i in range(1, 100):  # article number within the issue
                try:
                    # Assemble the id 201X0YZZZ: year + '0' + issue + zero-padded article number.
                    url = start_url + str(year) + '0' + str(m) + str(i).rjust(3, '0')
                    html = getHTMLText(url)
                    parsePage(html, output_file)
                except Exception:
                    traceback.print_exc()  # print the full stack instead of failing silently
                    continue


main()
After the run, the output looks roughly like this (one dict per line):
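Each line of D:/PagesDetails.txt is the str() of one dict; the values here are placeholders ('…'), not real output:

{'论文名称': '…', 'title': '…', '作者': '…', 'Author(s)': '…', '关键字': '…', 'Keywords': '…', '分类号': '…', 'DOI': '…', '文献标志码': '…', '摘要': '…', 'Abstract': '…', '相似文献/references': '…', '备注/Memo': '…'}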