首先创建需要的文件夹
mkdir result stringtie bam fastqc-fastp all_data fastqc fastp featureCount
将需要的fq.gz全部转移至对应data文件夹
mv ./fastq/*.fq.gz ./all_data
首先对fq文件进行质控,查看测序质量如何
nohup fastqc -t 20 -o fastqc ./all_data/*fq.gz &
multiqc ./fastqc -o fastqc
接下来针对质控的结果进行低质量reads的去除以及adapter的去除
nohup fastp -i ./all_data/E140-P0-B_1.fq.gz -I ./all_data/E140-P0-B_2.fq.gz -o ./fastp/CE140P0B_Data1.fq.gz --adapter_fasta /data1/guolhh/proj/index/fastp/gene.fna -O ./fastp/CE140P0B_Data2.fq.gz -h ./fastp/CE140P0B_.html -j ./fastp/CE140P0B_.json &
nohup fastp -i ./all_data/E140-P0-L_1.fq.gz -I ./all_data/E140-P0-L_2.fq.gz -o ./fastp/CE140P0L_Data1.fq.gz --adapter_fasta /data1/guolhh/proj/index/fastp/gene.fna -O ./fastp/CE140P0L_Data2.fq.gz -h ./fastp/CE140P0L_.html -j ./fastp/CE140P0L_.json &
去除后重新进行质控,确保结果是可用的
nohup fastqc -t 20 -o fastqc-fastp ./fastp/*fq.gz &
multiqc ./fastqc-fastp -o fastqc-fastp
利用hisat2和对应物种的index对fq文件进行比对定量(index一般选择最新的),得到bam文件
####MacFac_6 mapping
nohup hisat2 -p 20 \
-x /data1/guolhh/proj/index/ensembl_Macaca_fascicularis_6.0/ref_fasci_6_gtf \
-1 ./fastp/CE140P0B_Data1.fq.gz \
-2 ./fastp/CE140P0B_Data2.fq.gz \
-S ./bam/CE140P0B_MacFac_6.sam > ./bam/CE140P0B_MacFac_6_hisat2.log 2>&1 &
nohup hisat2 -p 20 \
-x /data1/guolhh/proj/index/ensembl_Macaca_fascicularis_6.0/ref_fasci_6_gtf \
-1 ./fastp/CE140P0L_Data1.fq.gz \
-2 ./fastp/CE140P0L_Data2.fq.gz \
-S ./bam/CE140P0L_MacFac_6.sam > ./bam/CE140P0L_MacFac_6_hisat2.log 2>&1 &
将bam文件排序得到sort后的bam文件
nohup samtools sort -@ 20 ./bam/CE140P0B_MacFac_6.sam -o ./bam/CE140P0B_MacFac_6_sort.bam &
nohup samtools index ./bam/CE140P0B_MacFac_6_sort.bam &
nohup samtools sort -@ 20 ./bam/CE140P0L_MacFac_6.sam -o ./bam/CE140P0L_MacFac_6_sort.bam &
nohup samtools index ./bam/CE140P0L_MacFac_6_sort.bam &
通过stringtie包对sort后的bam文件定量得到基因和表达量的tab文件
nohup stringtie -t -C ./stringtie/CE140P0B_cov_refs.gtf.txt -e -B -A ./stringtie/CE140P0B_MacFac_6_gene_abund.tab -p 20 \
-G /data1/guolhh/proj/index/ensembl_Macaca_fascicularis_6.0/Macaca_fascicularis.Macaca_fascicularis_6.0.107.gtf \
-o ./stringtie/CE140P0B_Annotated.gtf -l CE140P0B ./bam/CE140P0B_MacFac_6_sort.bam &
nohup stringtie -t -C ./stringtie/CE140P0L_cov_refs.gtf.txt -e -B -A ./stringtie/CE140P0L_MacFac_6_gene_abund.tab -p 20 \
-G /data1/guolhh/proj/index/ensembl_Macaca_fascicularis_6.0/Macaca_fascicularis.Macaca_fascicularis_6.0.107.gtf \
-o ./stringtie/CE140P0L_Annotated.gtf -l CE140P0L ./bam/CE140P0L_MacFac_6_sort.bam &
通过featureCounts包获得基因的counts数
nohup featureCounts -t exon -g gene_id -Q 20 --primary -s 0 -p -T 20 \
-a /data1/guolhh/proj/index/MFA_Macaca_fascicularis_5.0/GCF_000364345.1_Macaca_fascicularis_5.0_genomic.gtf \
-o ./featureCount/raw_counts.csv ./bam/*_sort.bam &
获得数据counts数以及TPM后,RNA-seq数据上游处理就结束了。接下来可以利用基因counts数或者TPM进行下游差异基因富集以及通路富集分析等分析。