输入文件
input_v1.0.txt (三列,分别是 *.1.fastq.gz,*2.fastq.gz , *.sam)
hisat2运行参数与流程(hisat2_IWGSCv1.0.py)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'shengwei ma'
__author_email__ = '[email protected]'
import subprocess
with open('input_v1.0_1.txt', 'r') as f:
for line in f:
lin = line.strip().split()
fq1, fq2, sam = lin[0], lin[1], lin[2]
proc = subprocess.Popen(
['hisat2', '-p', '20', '--dta', '-x', '/data2/Fshare/IWGSCv1.0_hisat2/IWGSCv1.0_hiast2', '--known-splicesite-infile',
'/data2/Fshare/IWGSCv1.0_hisat2/TGACv1.ss', '--novel-splicesite-infile', '/data2/masw_data/rna_seq/all.ss',
'-t', '--no-discordant', '--no-mixed', '-1', fq1, '-2', fq2, '-S', sam], shell=False)
proc.wait()
print sam
new = sam[:-3] + 'unmap.txt'
mat = sam[:-3] + 'match.sam'
mis = sam[:-3] + 'mismatch.sam'
unmap = open(new, 'w')
mat1 = open(mat, 'w')
mis1 = open(mis, 'w')
with open(sam, 'r') as f1:
for (num, value) in enumerate(f1):
lin = value.strip().split()
if value.startswith('@'):
mat1.writelines(value)
mis1.writelines(value)
else:
if '*' in lin[2]:
unmap.writelines(lin[0])
else:
if 'I' not in lin[5] and 'D' not in lin[5] and 'XM:i:0' in value: #筛选完全匹配的reads,但是对于softclip 无效
mat1.writelines(value)
if 'I' not in lin[5] and 'D' not in lin[5] and 'XM:i:0' not in value:
mis1.writelines(value)
mat1.close()
unmap.close()
mis1.close()
proc = subprocess.Popen(['samtools', 'view', '-@', '10', '-b', '-o', mat[:-3] + 'bam', mat], shell=False)
proc.wait()
proc = subprocess.Popen(['samtools', 'sort', '-@', '10', '-o', mat[:-3] + 'sorted.bam', mat[:-3] + 'bam'], shell=False)
proc.wait()
proc = subprocess.Popen(['shred', '-u', '-z', mat, sam, mat[:-3] + 'bam'], shell=False)
proc.wait()
hisat2输出信息,也可见该目录下的mapping_information.txt
Sample | total_reads | unmapped_reads | uniquely-mapped_reads | Multimapped_reads |
---|---|---|---|---|
ATW_AAOSW_6 | 45401788(100.00%) | 1625896(3.58%) | 36054911(79.41%) | 7720981(17.01%) |
ATW_ABOSW_7 | 43317025(100.00%) | 3384472(7.81%) | 32613798(75.29%) | 7318755(16.90%) |
ATW_ACOSW | 51224256(100.00%) | 3682442(7.19%) | 37964008(74.11%) | 9577806(18.70%) |
ATW_ADOSW | 85237299(100.00%) | 5327852(6.25%) | 63649972(74.67%) | 16259475(19.08%) |
ATW_AEOSW | 44470405(100.00%) | 3714550(8.35%) | 33422930(75.16%) | 7332925(16.49%) |
ATW_AFOSW_2 | 38815740(100.00%) | 2344586(6.04%) | 30028428(77.36%) | 6442726(16.60%) |
ATW_AGOSW_2 | 35749803(100.00%) | 3298492(9.23%) | 26017323(72.78%) | 6433988(18.00%) |
ATW_AHOSW_3 | 52146021(100.00%) | 4219037(8.09%) | 39229076(75.23%) | 8697908(16.68%) |
ATW_AIOSW_2 | 67283195(100.00%) | 19946431(29.65%) | 31218902(46.40%) | 16117862(23.96%) |
ATW_AKOSW_2 | 75347431(100.00%) | 24014018(31.87%) | 33383763(44.31%) | 17949650(23.82%) |
ATW_ALOSW_3 | 42039096(100.00%) | 2031197(4.83%) | 32613783(77.58%) | 7394116(17.59%) |
ATW_AMOSW_4 | 38844640(100.00%) | 7661962(19.72%) | 25025383(64.42%) | 6157295(15.85%) |
ATW_ANOSW | 89075171(100.00%) | 11783105(13.23%) | 63333349(71.10%) | 13958717(15.67%) |
ATW_AOSW | 50836846(100.00%) | 1967671(3.87%) | 40375224(79.42%) | 8493951(16.71%) |
ATW_COSW | 45388739(100.00%) | 4336069(9.55%) | 33513277(73.84%) | 7539393(16.61%) |
ATW_DOSW_2 | 48400597(100.00%) | 1782615(3.68%) | 38184280(78.89%) | 8433702(17.42%) |
ATW_FOSW_2 | 47627837(100.00%) | 11084697(23.27%) | 29081034(61.06%) | 7462106(15.67%) |
ATW_GOSW_3 | 47851480(100.00%) | 2025594(4.23%) | 38041640(79.50%) | 7784246(16.27%) |
ATW_HOSW_3 | 46349244(100.00%) | 3243306(7.00%) | 35294358(76.15%) | 7811580(16.85%) |
ATW_IOSW_4 | 53653235(100.00%) | 2427235(4.52%) | 42707453(79.60%) | 8518547(15.88%) |
ATW_KOSW_4 | 39894644(100.00%) | 3043191(7.63%) | 30655324(76.84%) | 6196129(15.53%) |
ATW_LOSW_5 | 40476784(100.00%) | 2375565(5.87%) | 31157278(76.98%) | 6943941(17.16%) |
ATW_MOSW_5 | 49643196(100.00%) | 6008219(12.10%) | 34849848(70.20%) | 8785129(17.70%) |
ATW_NOSW_6 | 45463315(100.00%) | 5519168(12.14%) | 32357850(71.17%) | 7586297(16.69%) |
ATW_POSW_6 | 42820604(100.00%) | 4444437(10.38%) | 31740451(74.12%) | 6635716(15.50%) |
ATW_QOSW_7 | 45189058(100.00%) | 2519770(5.58%) | 35464036(78.48%) | 7205252(15.94%) |
ATW_ROSW_7 | 41964678(100.00%) | 2418292(5.76%) | 32377059(77.15%) | 7169327(17.08%) |
ATW_SOSW_8 | 46010346(100.00%) | 2071554(4.50%) | 36664030(79.69%) | 7274762(15.81%) |
ATW_TOSW_8 | 41117096(100.00%) | 3337900(8.12%) | 30307865(73.71%) | 7471331(18.17%) |
ATW_VOSW_6 | 44532829(100.00%) | 1723299(3.87%) | 34529586(77.54%) | 8279944(18.59%) |
ERR392055 | 26786162(100.00%) | 4047245(15.11%) | 15990303(59.70%) | 6748614(25.19%) |
ERR392056 | 29879250(100.00%) | 4692482(15.70%) | 17363629(58.11%) | 7823139(26.18%) |
ERR392057 | 30226502(100.00%) | 3185007(10.54%) | 19814488(65.55%) | 7227007(23.91%) |
ERR392058 | 18558499(100.00%) | 1998085(10.77%) | 12203151(65.76%) | 4357263(23.48%) |
ERR392059 | 30793173(100.00%) | 2344455(7.61%) | 22029424(71.54%) | 6419294(20.85%) |
ERR392060 | 22417889(100.00%) | 2366252(10.56%) | 14884436(66.40%) | 5167201(23.05%) |
ERR392061 | 20355570(100.00%) | 3719356(18.27%) | 11386270(55.94%) | 5249944(25.79%) |
ERR392062 | 25108363(100.00%) | 3797487(15.12%) | 14548403(57.94%) | 6762473(26.93%) |
ERR392063 | 29965698(100.00%) | 4989790(16.65%) | 16957287(56.59%) | 8018621(26.76%) |
ERR392064 | 34565714(100.00%) | 4811540(13.92%) | 20727298(59.96%) | 9026876(26.12%) |
ERR392064 | 34565714(100.00%) | 4811540(13.92%) | 20727298(59.96%) | 9026876(26.12%) |
ERR392065 | 27177427(100.00%) | 4520724(16.63%) | 15412488(56.71%) | 7244215(26.66%) |
ERR392066 | 37515837(100.00%) | 7292639(19.44%) | 19656276(52.39%) | 10566922(28.17%) |
ERR392067 | 29548095(100.00%) | 5994021(20.29%) | 15715972(53.19%) | 7838102(26.53%) |
ERR392068 | 30476750(100.00%) | 3991164(13.10%) | 18606572(61.05%) | 7879014(25.85%) |
ERR392069 | 31424506(100.00%) | 2679321(8.53%) | 21520594(68.48%) | 7224591(22.99%) |
ERR392070 | 29660313(100.00%) | 4427930(14.93%) | 17592218(59.31%) | 7640165(25.76%) |
ERR392071 | 29373530(100.00%) | 4610961(15.70%) | 17098379(58.21%) | 7664190(26.09%) |
ERR392072 | 33407315(100.00%) | 4025192(12.05%) | 21596921(64.65%) | 7785202(23.30%) |
ERR392073 | 28004777(100.00%) | 3852619(13.76%) | 17378994(62.06%) | 6773164(24.19%) |
ERR392074 | 26658599(100.00%) | 5465146(20.50%) | 13906279(52.16%) | 7287174(27.34%) |
ERR392075 | 27595485(100.00%) | 4936815(17.89%) | 15247427(55.25%) | 7411243(26.86%) |
ERR392076 | 29674240(100.00%) | 5400293(18.20%) | 16416891(55.32%) | 7857056(26.48%) |
ERR392077 | 31078820(100.00%) | 3574955(11.50%) | 20081821(64.62%) | 7422044(23.88%) |
ERR392078 | 32327629(100.00%) | 2421810(7.49%) | 23264665(71.97%) | 6641154(20.54%) |
ERR392079 | 23704655(100.00%) | 3131080(13.21%) | 14112639(59.54%) | 6460936(27.26%) |
ERR392080 | 27746131(100.00%) | 2992584(10.79%) | 18525078(66.77%) | 6228469(22.45%) |
ERR392081 | 31358914(100.00%) | 3746073(11.95%) | 20709901(66.04%) | 6902940(22.01%) |
ERR392082 | 32871524(100.00%) | 2346792(7.14%) | 23727550(72.18%) | 6797182(20.68%) |
ERR392083 | 32240850(100.00%) | 3128224(9.70%) | 21232287(65.86%) | 7880339(24.44%) |
ERR392084 | 32641566(100.00%) | 2632926(8.07%) | 23252740(71.24%) | 6755900(20.70%) |
NG-5789_1A_lib7482 | 34356521(100.00%) | 2743533(7.99%) | 26100505(75.97%) | 5512483(16.04%) |
NG-5789_1B_lib7486 | 42444661(100.00%) | 2276170(5.36%) | 32538608(76.66%) | 7629883(17.98%) |
NG-5789_2A_lib7483 | 58377229(100.00%) | 1980165(3.39%) | 46325570(79.36%) | 10071494(17.25%) |
NG-5789_2B_lib7487 | 45374776(100.00%) | 1459680(3.22%) | 35121823(77.40%) | 8793273(19.38%) |
NG-5789_3A_lib7484 | 39082313(100.00%) | 2105348(5.39%) | 30799509(78.81%) | 6177456(15.81%) |
NG-5789_3B_lib7488 | 39159546(100.00%) | 1543503(3.94%) | 30324689(77.44%) | 7291354(18.62%) |
NG-5789_4A_lib7485 | 28215511(100.00%) | 1319564(4.68%) | 22135951(78.45%) | 4759996(16.87%) |
NG-5789_4B_lib7489 | 34385209(100.00%) | 2265183(6.59%) | 26117900(75.96%) | 6002126(17.46%) |
SRR1175868 | 62225856(100.00%) | 4434070(7.13%) | 46741787(75.12%) | 11049999(17.76%) |
SRR1177760 | 68599252(100.00%) | 4784428(6.97%) | 51327798(74.82%) | 12487026(18.20%) |
SRR1177761 | 77573636(100.00%) | 5418773(6.99%) | 58236525(75.07%) | 13918338(17.94%) |
SRR1460549 | 30618243(100.00%) | 2210006(7.22%) | 20459229(66.82%) | 7949008(25.96%) |
SRR1460550 | 72955159(100.00%) | 5074113(6.96%) | 48678341(66.72%) | 19202705(26.32%) |
SRR1460551 | 27179120(100.00%) | 2914078(10.72%) | 17292077(63.62%) | 6972965(25.66%) |
SRR1460552 | 37693050(100.00%) | 2304774(6.11%) | 25576360(67.85%) | 9811916(26.03%) |
SRR1460553 | 23942473(100.00%) | 1441952(6.02%) | 16313271(68.14%) | 6187250(25.84%) |
SRR1460554 | 17985663(100.00%) | 1158728(6.44%) | 12218492(67.93%) | 4608443(25.62%) |
Wheat_Room1_AL_20DPA_RNA_Extra2 | 32685090(100.00%) | 3056423(9.35%) | 21550138(65.93%) | 8078529(24.72%) |
Wheat_Room_SE_30DPA_RNA | 23711650(100.00%) | 3477993(14.67%) | 13233266(55.81%) | 7000391(29.52%) |
使用featurecount计算reads数
其中-a 是输入文件,-o 是输出结果,每次运行注意修改。
featureCounts -T 20 -t exon -g Name --readExtension5 70 --readExtension3 70 -p -O --donotsort -C -a /data2/masw_data/transcript/TGACv1.cdna.gff3 -o /data2/masw_data/transcript/TGACv1.cdna.reformat_expression_new