# 由于项目的需要,需要检索 NCBI 网页中的相应字段。代码还需优化,欢迎大家指正;相应内容后期补充中。
# 从网页中抽取出所需要的信息:
# 包括标题、作者、摘要、Keywords、Mesh Terms 等信息。
# 若需抽取指定 pubmed id,只需把该 id 添加到 pubmedIdList 中即可。
# 代码中所用示例网页:http://www.ncbi.nlm.nih.gov/pubmed/23931021
# BeautifulSoup 学习资料:http://www.crummy.com/software/BeautifulSoup/bs3/documentation.zh.html
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup, NavigableString, Tag
import urllib2
import re
import time
import socket
timeout=10
socket.setdefaulttimeout(timeout)
def getContent(stuff):
    """Flatten a BeautifulSoup node into plain text.

    Walks every descendant of *stuff*: literal text nodes are appended
    as-is, while <p>, <h1>..<hN> and <li> tags are rendered as layout
    hints (newline, or newline + tab for list items) so the printed
    output keeps a rough visual structure.

    Returns None when *stuff* is None — callers use that to detect a
    missing section — otherwise the accumulated text (possibly empty).
    """
    if stuff is None:
        return None
    # Collect pieces and join once: the original used quadratic
    # string += inside the loop.
    pieces = []
    for node in stuff.descendants:
        # NavigableString is checked first: text nodes carry the actual
        # content, tag nodes only contribute layout characters.
        if isinstance(node, NavigableString):
            pieces.append(node)
        elif node.name == 'p':
            pieces.append('\n')
        elif re.match(r"h[0-9]+", node.name):
            pieces.append('\n')
        elif node.name == 'li':
            pieces.append('\n\t')
    return ''.join(pieces)
def pageScrapy(url):
req = urllib2.Request(url)
webpage = None
flag = 1
while 1 == flag:
pageText = ""
try:
webpage = urllib2.urlopen(req)
pageText = webpage.read()
except socket.timeout as e:
print "Timeout, try again. waiting 3 secs and try again."
time.sleep(3)
continue
except Exception as e:
if hasattr(e, 'reason'):
print 'Reason: %s\n' % e.reason
elif hasattr(e, 'code'):
print 'Error code: %s\n' % e.code
print "Waiting 5 secs to continue."
time.sleep(5)
continue
else:
flag = 0
webpage.close()
soup = BeautifulSoup(pageText)
rprt_abstract = soup.find('div', {'class': "rprt abstract"})
TagElements = ['auths', 'aff', 'abstr', 'aux', 'morecit']
if rprt_abstract is not None:
print getContent(rprt_abstract.find('div', {'class': 'cit'}))
print getContent(rprt_abstract.find("h1"))
for tagElem in TagElements:
text = getContent(rprt_abstract.find('div', {'class': tagElem}))
if (text != None):
print text
def main():
    """Scrape and print every PubMed id listed in pubmedIdList."""
    baseUrl = "http://www.ncbi.nlm.nih.gov/pubmed/"
    pubmedIdList = ['23931021']  # add further pubmed ids here
    for pubmedId in pubmedIdList:
        # Fetch each page exactly once — the original called
        # pageScrapy(url) twice in a row, scraping every page twice.
        pageScrapy(baseUrl + pubmedId)


if __name__ == '__main__':
    main()