main.cpp
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <algorithm>
#include <map>
#include <set>
#include <cmath>
#include <inttypes.h>
#include <boost/lexical_cast.hpp>
#include <boost/algorithm/string.hpp>
#include "rpkm.h"
using namespace std;
void usage()
{
cout << "./rpkm <input.sh_sort.sam> <genome.fa.fai> <output> " << endl;
cout << " -h get help information" << endl;
exit (0);
}
int main(int argc, char *argv[])
{
int c;
while ( (c=getopt(argc,argv,"a:p:h")) != -1 )
{
switch(c)
{
case 'h' : usage();break;
default : usage();
}
}
if (argc < 3) usage();
string sam_file = argv[optind++];
string fai_file = argv[optind++];
map<string,int> GenSize;
map<string,int> GenRead;
load_fai(fai_file,GenSize);
ifstream infile;
infile.open(sam_file.c_str());
if (! infile )
{
cerr << "fail to open input file" << sam_file;
}
string lineStr;
int total_line;
while ( getline(infile,lineStr,'\n' ))
{
vector<string> lineVec;
++total_line;
boost::split(lineVec,lineStr,boost::is_any_of("\t"),boost::token_compress_on);
if (lineVec[5] == "*"){
continue;
}
GenRead[lineVec[2]]++;
}
infile.close();
cout << "Gene\trpkm\treads\tGene_length\ttotal_reads" << endl;
cal_rpkm(GenRead,GenSize,total_line);
}
rpkm.h
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <algorithm>
#include <map>
#include <set>
#include <cmath>
#include <inttypes.h>
#include <boost/lexical_cast.hpp>
#include <boost/algorithm/string.hpp>
void load_fai(std::string &matrix_file,std::map<std::string,int> &GenSize);
void cal_rpkm(std::map<std::string,int> &GenRead, std::map<std::string,int> &GenSize,int total);
rpkm.cpp
#include "rpkm.h"
using namespace std;
void load_fai(string &matrix_file, map<string,int> &GenSize){
ifstream infile;
infile.open(matrix_file.c_str());
if (! infile ) {
cerr << "fail to open input file " << matrix_file << endl;
exit(0);
}
string lineStr;
while (getline(infile,lineStr,'\n')){
if (lineStr[0] == ' ' || lineStr[0] == '\n'){
continue;
}
vector<string> lineVec;
boost::split(lineVec,lineStr, boost::is_any_of(":, \t\n"), boost::token_compress_on);
GenSize[lineVec[0]] = boost::lexical_cast<int>(lineVec[1]);
}
infile.close();
}
void cal_rpkm(map<string,int> &GenRead, map<string,int> &GenSize,int total){
map<string,int>::const_iterator map_size;
for (map_size=GenSize.begin();map_size!=GenSize.end();map_size++){
int _size = map_size->second;
int _read = 0;
map<string,int>::iterator map_read;
map_read = GenRead.find(map_size->first) ;
if ( map_read != GenRead.end() ){
_read = GenRead[map_size->first];
}
double rpkm = log( (double)_read ) + log( (double)100000000 ) - log( (double)_size ) - log( (double)total );
cout << map_size->first << "\t" << exp(rpkm) << "\t" << _read << "\t" << _size << "\t" << total << endl;
}
}
Makefile
cc=g++
exe=rpkm_bam
obj=main.o rpkm.o
$(exe):$(obj)
$(cc) -o $(exe) $(obj)
main.o:main.cpp rpkm.h
$(cc) -c main.cpp
MTP.o:rpkm.cpp rpkm.h
$(cc) -c MTP.cpp
clean:
rm -rf *.o $(exe)