python医药数据_python爬虫:爬取医药数据库drugbank

import os

import time

import datetime

import codecs

from lxml import etree

from selenium import webdriver

import csv

'''遇到python不懂的问题,可以加Python学习交流群:1004391443一起学习交流,群文件还有零基础入门的学习资料'''

#控制编码,全英文网页,用不着

# import sys

# reload(sys)

# sys.setdefaultencoding('utf-8')

# # date格式转为string格式

today = datetime.date.today()

today_string = today.strftime('%Y-%m-%d')

#通过浏览器得到网页页面--反反爬虫

def html_getter(site,file_name):

driver = webdriver.Firefox()

# chromedriver = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'

# os.environ['webdriver.chrome.driver'] = chromedriver

# driver = webdriver.Chrome(chromedriver)

driver.get(site)

driver.maximize_window() # 将浏览器最大化显示

time.sleep(5) # 控制间隔时间,等待浏览器反映

# 保存页面

source_code = driver.find_element_by_xpath("//*").get_attribute("outerHTML")

f = codecs.open(file_name, 'w+', 'utf8')

f.write(source_code)

f.close()

#打开保存在本地的html文件

def file_html(file_name):

f = open(file_name,'r')

html = f.read()

f.close()

return html

#写入csv,也可以有其他写入方式,这个地方就csv啦

def csv_writer(ll):

headers = ['drug','inter','snp_rs_id','Allele_name','Defining_change','Adverse_Reaction','ref','href','original_title']

with open('drugbank.csv','a') as f:

f_csv = csv.writer(f)

f_csv.writerow(headers)

f_csv.writerows(ll)

#用xpath解析网页,得到表格数据,我就是这么爱xpath,不喜欢正则表达式

def data_get(html):

selector = etree.HTML(html)

tbody=selector.xpath('/html/body/main/table/tbody/tr')

for each in tbody:

# #1.'drug'

drug_name=each.xpath('td[1]/strong/text()')[0]

drug_sn=each.xpath('td[1]/a/text()')[0]

drug=drug_name+' '+drug_sn

# #print(drug)

# #2.'Interacting Gene/Enzyme'

int=each.xpath('td[2]')[0]

inter=int.xpath('string(.)')

# print(inter)

# #3.'SNP RS ID'

snp=each.xpath('td[3]/a/text()')

if snp:

snp_rs_id=snp[0]

else:

snp_rs_id='Not Available '

#print snp_rs_id

#4.Allele name

Allele=each.xpath('td[4]/text()')

if Allele:

Allele_name=Allele[0]

else:

Allele_name='Not Available '

# #print Allele_name

# #5.'Defining change'

Defining=each.xpath('td[5]/text()')

if Defining:

Defining_change=Defining[0]

else:

Defining_change='Not Available '

# print Defining_change

# 6.'Adverse Reaction'

Adverse=each.xpath('td[6]/text()')

if Adverse:

Adverse_Reaction=Adverse[0]

else:

Adverse_Reaction='Not Available '

# print Adverse_Reaction

#7.'Reference(s)'

ref=each.xpath('td[7]/span/a/text()')[0]

href=each.xpath('td[7]/span/a/@href')[0]

original_title=each.xpath('td[7]/span/a/@data-original-title')[0]

# print ref

# print(href)

# print(original_title)

tt=(drug,inter,snp_rs_id,Allele_name,Defining_change,Adverse_Reaction,ref,href,original_title)

ll.append(tt)

#print ll

if __name__ == '__main__':

ll=[]

for i in range(1,5):

page_num=i

site='http://www.drugbank.ca/genobrowse/snp-adr?page='+str(page_num)

#get the html through webdriver

file_name=unicode(today_string)+u'drugbank_'+unicode(str(page_num))+u'.html'

html_getter(site,file_name)

html=file_html(file_name)

data_get(html)

csv_writer(ll)

  • 0
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值