python如何一键注释_python批量爬取NCBI基因注释并调用谷歌API批量翻译

今天是2020-02-09

作者:沙雕学习小组

这里有视频教程:

https://www.bilibili.com/video/av87724182

今天想实现这个功能:

差异分析得到了200多个基因(甚至更多)

90e3b5b648c3

1

我需要逐个获取这些基因的summary信息。如果手动一个一个去查,可能要查到下个星期,可是周五就要汇报了啊……!

90e3b5b648c3

2

有python怕啥?!不要慌

动手之前先动脑

step1:获取这个基因在NCBI上的summary信息——输入gene.txt得到genesummary.txt

step2:检查输出文件是否有空行,若有则删掉——输入genesummary.txt,得到newsummary.txt

step3:批量翻译——输入newsummary.txt,得到genetrans.txt

step1:获取这个基因在NCBI上的summary信息——输入gene.txt得到genesummary.txt

#!/usr/bin/env python

# -*- coding:utf-8 -*-

# Author:Abao

from Bio import Entrez # pip install biopython

#from translate_api.translate_api import api # pip install translate_api

#from Pytrans import *

import re

# Contact e-mail address set on the Bio.Entrez module before any query.
Entrez.email = "shinningbzw@foxmail.com"  # email

# Edit the file names / paths here (absolute paths recommended).
output_file = 'genesummary.txt'  # output file: gene name followed by its summary
input_file = 'gene.txt'  # input file: de-duplicated gene list (save the gene column as txt, e.g. uniq *.txt > gene_list.txt)

gene_list = []
line_c = []

# Count the lines of the input file inside a context manager so the file
# handle is closed right away instead of being left to the garbage collector.
with open(input_file, 'r') as count_fh:
    count = sum(1 for _ in count_fh)

print("Waiting...")

#from Pytrans import *

import requests

from Pytrans import *

def google_translate(content):
    """Translate English text to Simplified Chinese via the Google web endpoint.

    Args:
        content: The English text to translate.

    Returns:
        The translated text. ``''`` when ``content`` is empty or contains
        only whitespace. ``None`` (after printing a notice) when ``content``
        is longer than 4891 characters.

    Raises:
        requests.RequestException: If the HTTP request fails, times out or
            returns an error status.
    """
    # Nothing to translate: skip the request instead of failing later while
    # parsing the response.
    if not content or not content.strip():
        return ''
    # Check the length before doing any work (token computation, HTTP call).
    if len(content) > 4891:
        print("too long!!!")
        return

    tk = Pytrans().getTk(content)
    # Hand every query field to requests so it assembles the URL itself; the
    # former triple-quoted URL literal carried line breaks inside the query.
    query = [
        ('client', 't'), ('sl', 'en'), ('tl', 'zh-CN'), ('hl', 'zh-CN'),
        ('dt', 'at'), ('dt', 'bd'), ('dt', 'ex'), ('dt', 'ld'), ('dt', 'md'),
        ('dt', 'qca'), ('dt', 'rw'), ('dt', 'rm'), ('dt', 'ss'), ('dt', 't'),
        ('ie', 'UTF-8'), ('oe', 'UTF-8'), ('clearbtn', '1'), ('otf', '1'),
        ('pc', '1'), ('srcrom', '0'), ('ssel', '0'), ('tsel', '0'),
        ('kc', '2'), ('tk', tk), ('q', content),
    ]
    # A timeout keeps one stalled request from hanging a whole batch run.
    result = requests.get("http://translate.google.cn/translate_a/single",
                          params=query, timeout=30)
    result.raise_for_status()

    # The first element of the JSON payload is the list of translated
    # segments; entries whose first item is None carry no translated text.
    trans = result.json()[0] or []
    return ''.join(segment[0] for segment in trans if segment[0] is not None)

#a = google_translate("hello,Input file will be translated, please be patient")

#print(a)

# get gene list
# The first line of the input file is treated as the column header and
# dropped; every other non-blank line is one gene name. (The former
# `line != "基因"` test never matched because lines keep their trailing
# newline, so dropping the first line is what actually removed the header.)
with open(input_file) as gene_fh:
    gene_list = [row.strip() for row in gene_fh][1:]
gene_list = [gene for gene in gene_list if gene]

# Removes every bracketed "[...]" segment from a summary.
rm_pattern = re.compile(r'\[.*?\]')

Entrez.email = "shinningbzw@foxmail.com"
total = len(gene_list)

with open(output_file, 'a+', encoding='utf-8') as f:
    for done, gene in enumerate(gene_list, start=1):
        gene_term = "(" + gene + "[Gene Name]) AND Homo sapiens[Organism]"

        # Look up the Gene database id for this gene name.
        handle = Entrez.esearch(db="gene", term=gene_term)
        id_list = Entrez.read(handle)['IdList']
        handle.close()

        if not id_list:
            # An unknown gene name must not abort the whole batch.
            print("No NCBI Gene record found for " + gene + ", skipped")
        else:
            # The first hit is used, as before.
            sum_handle = Entrez.esummary(db="gene", id=id_list[0])
            sum_record = Entrez.read(sum_handle)
            sum_handle.close()

            r_gene_sum = sum_record['DocumentSummarySet']['DocumentSummary'][0]['Summary']
            gene_sum = rm_pattern.sub('', r_gene_sum)
            f.write(gene + "\n" + gene_sum + "\n")

        # Progress is measured against the number of genes actually queried,
        # so the final message reads 100%.
        perc = (done / total) * 100
        print("Completed " + str(int(perc)) + "%")

step2:检查输出文件是否有空行,若有删掉。输入genesummary.txt得到newsummary.txt

#!/usr/bin/env python

# -*- coding:utf-8 -*-

# Author:cici

# Change the file paths here, and double-check the file names.
with open('genesummary.txt', 'r', encoding='utf-8') as src, \
        open('newsummary.txt', 'w', encoding='utf-8') as dst:
    # Copy only the lines that contain at least one non-whitespace character.
    dst.writelines(row for row in src if row.strip())

print('输出成功....')

step3:批量翻译

这里先写一个类(保存为Pytrans.py),用来计算谷歌翻译请求所需的tk参数

#!/usr/bin/env python

# -*- coding:utf-8 -*-

# Author:Topshi

import execjs

class Pytrans:
    """Compute a token string for a piece of text with embedded JavaScript.

    The JavaScript source below is compiled through ``execjs`` when an
    instance is created; ``getTk`` then calls its ``TL`` function.
    """

    # JavaScript source handed to execjs. ``TL`` encodes the text as UTF-8
    # bytes, mixes them with ``RL`` and returns a "<number>.<number>" string.
    # The literal is kept exactly as it was, blank lines included.
    _TK_SCRIPT = """

function TL(a) {

var k = "";

var b = 406644;

var b1 = 3293161072;

var jd = ".";

var $b = "+-a^+6";

var Zb = "+-3^+b+-f";

for (var e = [], f = 0, g = 0; g < a.length; g++) {

var m = a.charCodeAt(g);

128 > m ? e[f++] = m : (2048 > m ? e[f++] = m >> 6 | 192 : (55296 == (m & 64512) && g + 1 < a.length && 56320 == (a.charCodeAt(g + 1) & 64512) ? (m = 65536 + ((m & 1023) << 10) + (a.charCodeAt(++g) & 1023),

e[f++] = m >> 18 | 240,

e[f++] = m >> 12 & 63 | 128) : e[f++] = m >> 12 | 224,

e[f++] = m >> 6 & 63 | 128),

e[f++] = m & 63 | 128)

}

a = b;

for (f = 0; f < e.length; f++) a += e[f],

a = RL(a, $b);

a = RL(a, Zb);

a ^= b1 || 0;

0 > a && (a = (a & 2147483647) + 2147483648);

a %= 1E6;

return a.toString() + jd + (a ^ b)

};

function RL(a, b) {

var t = "a";

var Yb = "+";

for (var c = 0; c < b.length - 2; c += 3) {

var d = b.charAt(c + 2),

d = d >= t ? d.charCodeAt(0) - 87 : Number(d),

d = b.charAt(c + 1) == Yb ? a >>> d: a << d;

a = b.charAt(c) == Yb ? a + d & 4294967295 : a ^ d

}

return a

}

"""

    def __init__(self):
        """Compile the JavaScript source and keep the context on ``self.ctx``."""
        self.ctx = execjs.compile(self._TK_SCRIPT)

    def getTk(self, text):
        """Return the result of calling the JavaScript ``TL`` function on ``text``."""
        token = self.ctx.call("TL", text)
        return token

调用上面的Pytrans类进行批量翻译——输入newsummary.txt,得到genetrans.txt

#!/usr/bin/env python

# -*- coding:utf-8 -*-

# Author:Topshi

from Pytrans import *

import requests

def google_translate(content):
    """Translate English text to Simplified Chinese via the Google web endpoint.

    Args:
        content: The English text to translate.

    Returns:
        The translated text. ``''`` when ``content`` is empty or contains
        only whitespace. ``None`` (after printing a notice) when ``content``
        is longer than 4891 characters.

    Raises:
        requests.RequestException: If the HTTP request fails, times out or
            returns an error status.
    """
    # Nothing to translate: skip the request instead of failing later while
    # parsing the response.
    if not content or not content.strip():
        return ''
    # Check the length before doing any work (token computation, HTTP call).
    if len(content) > 4891:
        print("too long!!!")
        return

    tk = Pytrans().getTk(content)
    # Hand every query field to requests so it assembles the URL itself; the
    # former triple-quoted URL literal carried line breaks inside the query.
    query = [
        ('client', 't'), ('sl', 'en'), ('tl', 'zh-CN'), ('hl', 'zh-CN'),
        ('dt', 'at'), ('dt', 'bd'), ('dt', 'ex'), ('dt', 'ld'), ('dt', 'md'),
        ('dt', 'qca'), ('dt', 'rw'), ('dt', 'rm'), ('dt', 'ss'), ('dt', 't'),
        ('ie', 'UTF-8'), ('oe', 'UTF-8'), ('clearbtn', '1'), ('otf', '1'),
        ('pc', '1'), ('srcrom', '0'), ('ssel', '0'), ('tsel', '0'),
        ('kc', '2'), ('tk', tk), ('q', content),
    ]
    # A timeout keeps one stalled request from hanging a whole batch run.
    result = requests.get("http://translate.google.cn/translate_a/single",
                          params=query, timeout=30)
    result.raise_for_status()

    # The first element of the JSON payload is the list of translated
    # segments; entries whose first item is None carry no translated text.
    trans = result.json()[0] or []
    return ''.join(segment[0] for segment in trans if segment[0] is not None)

# Translate one fixed sentence first as a quick check that the translator
# works before the batch starts.
a = google_translate("hello,Input file will be translated, please be patient")
print(a)

# newsummary.txt is written as UTF-8 by the previous step, so it is read back
# as UTF-8 here instead of relying on the platform default encoding. Blank
# lines are skipped because they used to make the translation step fail.
with open('newsummary.txt', 'r', encoding='utf-8') as f:
    genotype_annotation_list = [element.strip() for element in f if element.strip()]

# print(genotype_annotation_list)
total = len(genotype_annotation_list)

# The context manager closes genetrans.txt even if a request fails midway.
with open('genetrans.txt', "a+", encoding='utf-8') as translate_file:
    for count, ga in enumerate(genotype_annotation_list, start=1):
        translation = google_translate(ga)
        if translation is None:
            # google_translate returns None for over-long text; keep the
            # source line so the output stays aligned with the input.
            translation = ga
        translate_file.write(translation + '\n')
        print('complete', '%.1f%%' % ((count / total) * 100))

欢迎关注我的公众号:

90e3b5b648c3

天黑请闭眼预言家请睁眼.jpg

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值