爬虫实战:提取人和老鼠中RNA表达性较好的RNA序列

1 篇文章 0 订阅

        最近,一个学医的同学问我,她想找到在人和老鼠身上都表达较好的RNA序列(具体的医学用语我不懂),但是序列太多了,问我能不能编个程序找出表达性较好的序列。然后我就开始试着写了。

      她告诉我她知道phyloNoncode上有所有有关人RNA序列和老鼠RNA序列的信息。于是我去下载了,结果发现,其中很重要的一项指标只能在网页上一条条地查看,并不包括在下载的信息中,几万条啊,简直是坑。于是我先用个爬虫将这一信息爬下来,再综合到信息列表中。

     

import urllib2  
from bs4 import BeautifulSoup  
import re  
import urlparse  
#num=0 
class SpiderMain(object):
    """Drive one download -> parse -> write-out pass for a single page."""

    def __init__(self):
        # One collaborator per pipeline stage.
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self, root_url):
        """Fetch ``root_url``, parse it, and hand the result to the outputer.

        The parsed record is passed to ``collect_data`` and ``output_html`` is
        then called on the outputer.
        """
        page_body = self.downloader.download(root_url)
        record = self.parser.parse(root_url, page_body)

        writer = self.outputer
        writer.collect_data(record)
        writer.output_html()
  
  
class HtmlDownloader(object):
    """Fetch raw page bodies with ``urllib2``."""

    def download(self, url):
        """Return the body of ``url``, or ``None`` when it cannot be fetched.

        ``None`` is returned when ``url`` is ``None``, when ``urlopen`` raises
        ``urllib2.URLError`` (``HTTPError`` is a subclass of it), or when the
        reply's status code is not 200.
        """
        if url is None:
            return None

        try:
            response = urllib2.urlopen(url)
        except urllib2.URLError:
            # urlopen raises for HTTP error replies and unreachable hosts
            # rather than returning them, so the status check below never
            # sees those cases.  Report them the same way: as "no page".
            return None

        try:
            if response.getcode() != 200:
                return None
            return response.read()
        finally:
            # Always release the connection, including on the early return.
            response.close()
  
  
class HtmlParser(object):
    """Extract the record name and gene symbol from a page."""

    def _get_new_data(self, page_url, soup):
        """Build the record dict for one parsed page.

        ``'url'`` is always set.  ``'name'`` is set only when a ``<font>``
        whose style matches ``font-size`` exists; ``'Gene Symbol'`` only when
        a ``<tr>`` whose style matches ``background`` has text starting with
        "Gene Symbol".  If several rows match, the last one wins.
        """
        label = "Gene Symbol"
        record = {'url': page_url}

        heading = soup.find('font', style=re.compile('font-size'))
        if heading is not None:
            # The first 38 characters of the heading text are dropped
            # (presumably a fixed prefix on the page -- TODO confirm).
            record['name'] = heading.get_text()[38:]

        rows = soup.find_all('tr', style=re.compile("background"))
        for row in (rows or []):
            text = row.get_text()
            if text.startswith(label):
                # Keep only what follows the label itself.
                record["Gene Symbol"] = text[len(label):]

        return record

    def parse(self, page_url, html_cont):
        """Parse ``html_cont`` and return its record dict.

        Returns ``None`` when either argument is ``None``.
        """
        if page_url is None or html_cont is None:
            return None

        document = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        return self._get_new_data(page_url, document)
  
class HtmlOutputer(object):
    """Accumulate parsed records and append them to ``mouse.txt``."""

    def __init__(self):
        # Records queued by collect_data, in arrival order.
        self.datas = []

    def collect_data(self, data):
        """Queue ``data`` for output; ``None`` is silently ignored."""
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        """Append one ``"<name> <Gene Symbol> "`` line per queued record.

        A record that lacks ``"name"`` or ``"Gene Symbol"`` gets the
        placeholder ``"NA"`` in that column instead of raising ``KeyError``,
        so a single incomplete page cannot abort a long crawl and every line
        keeps two columns.

        The queue is not cleared afterwards: calling this method again on the
        same instance appends the same records again.
        """
        # The with-block guarantees the file is flushed and closed.
        with open("mouse.txt", 'a') as f:
            for data in self.datas:
                f.write("%s " % data.get("name", "NA"))
                f.write("%s " % data.get("Gene Symbol", "NA"))
                f.write("\n")

if __name__ == "__main__":
    # Visit record numbers 1..42558; the page ID carries the number
    # zero-padded to five digits.
    # For the human pages, use the ID prefix "PNCG_HSA0" instead of "PNCG_MMU0".
    for record_number in range(1, 42559):
        padded = "%05d" % record_number
        root_url = "http://www.bioinfo.org/phyloNoncode/gene.php?ID=PNCG_MMU0%s" % padded
        # A fresh spider per page, so each call writes only that page's record.
        SpiderMain().craw(root_url)
  #  print num
        随后考虑将爬下来的信息与原有的信息融合在一起。开始考虑写程序实现,后来同学说vim直接就可以编辑。然后就愉快地搞定了。

        之后从中筛选出人的RNA序列在小鼠身上也表达性良好、且Gene Symbol有记录的项。程序如下:

       

#include <iostream>
#include <string>
#include <stdlib.h>
#include <fstream>
#include <sstream>
#include <boost/lexical_cast.hpp>

using namespace std;
using namespace boost;
#define BUF_SIZE 256

// Filter mousebase.txt and append the surviving records to mousehuman.txt.
//
// Each input line is split on whitespace and only the first three columns are
// used.  A record is kept when column 2 is not "NA", column 3 is not "-", and
// column 3 parses as a number >= 0.9 (presumably column 2 is the gene symbol
// and column 3 the score -- confirm against the data file).  Kept records are
// written as "<col1>   <col2>   <col3>" and their count is printed at the end.
//
// Reading stops at end of file or at the first empty line.
// Returns 1 if either file cannot be opened, 0 otherwise.
int main()
{
	std::ifstream input("mousebase.txt");
	if(!input)
	{
		std::cout<<"open mousebase.txt error"<<std::endl;
		return 1;
	}
	// Append mode: results of earlier runs are kept.
	std::ofstream output("mousehuman.txt",std::ios::app);
	if(!output)
	{
		std::cout<<"open mousehuman.txt error"<<std::endl;
		return 1;
	}

	int humanratnum=0;
	std::string line;
	// getline has no fixed buffer, so long lines are never split in two.
	while(std::getline(input,line)&&!line.empty())
	{
		// A fresh stream per line: a shared stream would stay in a failed
		// state after one short line (repeating stale fields for every later
		// line) and would carry surplus columns over into the next line.
		std::istringstream fields(line);
		std::string col1,col2,col3;
		if(!(fields>>col1>>col2>>col3))
			continue;	// fewer than three columns: nothing to test

		if(col2=="NA"||col3=="-")
			continue;

		// strtod reports malformed numbers through the end pointer, so a bad
		// value skips the record instead of terminating the program.
		char *end=NULL;
		const double value=strtod(col3.c_str(),&end);
		if(end==col3.c_str()||*end!='\0')
			continue;

		if(value>=0.9)
		{
			output<<col1<<"   "<<col2<<"   "<<col3<<'\n';
			humanratnum++;
		}
	}
	std::cout<<humanratnum<<std::endl;
	return 0;	// both streams are flushed and closed by their destructors
}
        最后对比,找出人和老鼠中RNA Gene Symbol相同的序列。

        

#include <iostream>
#include <string>
#include <stdlib.h>
#include <fstream>
#include <sstream>
#include <map>
#include <vector>

using namespace std;

// Records grouped by their second column; each entry is "<col1>   <col3>".
typedef map<string,vector<string> > SymbolTable;

// Read whitespace-separated lines from `in` and group them by column 2.
// Stops at end of input or at the first empty line; lines with fewer than
// three columns are skipped.
static void loadTable(istream &in,SymbolTable &table)
{
	string line;
	while(getline(in,line)&&!line.empty())
	{
		// A fresh stream per line, so one short or unterminated line cannot
		// leave the parser in a failed state for all following lines.
		istringstream fields(line);
		string col1,col2,col3;
		if(!(fields>>col1>>col2>>col3))
			continue;
		table[col2].push_back(col1+"   "+col3);
	}
}

// Append every entry to `row`, each followed by three spaces.
static void appendEntries(const vector<string> &entries,string &row)
{
	for(size_t i=0;i<entries.size();i++)
	{
		row+=entries[i]+"   ";
	}
}

// Join humanmouse.txt and mousehuman.txt on their second column and append
// one line per shared key to combinehumanmouse.txt: all entries from the
// first file, then all entries from the second.  Prints the number of shared
// keys.  Returns 1 if any file cannot be opened, 0 otherwise.
int main()
{
	ifstream firstFile("humanmouse.txt");
	if(!firstFile)
	{
		cout<<"open humanmouse.txt error"<<endl;
		return 1;
	}
	ifstream secondFile("mousehuman.txt");
	if(!secondFile)
	{
		cout<<"open mousehuman.txt error"<<endl;
		return 1;
	}
	// Append mode: results of earlier runs are kept.
	ofstream combined("combinehumanmouse.txt",ios::app);
	if(!combined)
	{
		cout<<"open combinehumanmouse.txt error"<<endl;
		return 1;
	}

	// The two inputs are kept apart so that a key occurring twice in one file
	// but never in the other is not reported as shared.
	SymbolTable firstTable;
	SymbolTable secondTable;
	loadTable(firstFile,firstTable);
	loadTable(secondFile,secondTable);

	int humanratnum=0;
	for(SymbolTable::const_iterator iter=firstTable.begin();iter!=firstTable.end();++iter)
	{
		SymbolTable::const_iterator match=secondTable.find(iter->first);
		if(match==secondTable.end())
			continue;

		string humanmouse;
		appendEntries(iter->second,humanmouse);
		appendEntries(match->second,humanmouse);
		combined<<humanmouse<<'\n';
		humanratnum++;
	}

	cout<<humanratnum<<endl;
	return 0;
}
        

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值