1.建立相应目录
为新数据按"实验人员(kongyu)/测序类型(RNA_seq)/日期(2021_02_22)"的层级建立项目目录。
# 建立后如下:
(base) zexing@DNA:~/projects/kongyu/RNA_seq/2021_02_22$
# Create the working sub-directories (-p makes this safe to re-run if some already exist)
mkdir -p raw_data clean_data ballgown bam bam_sort sam fastqc_report GSEA MD5_txt scripts_log
2.检查数据完整性
(base) zexing@DNA:~/projects/kongyu/RNA_seq/2021_02_22$ cat md5.txt > check_md5sum.txt && md5sum -c check_md5sum.txt
./clean_data/DIPG17_1_1_2.clean.fq.gz: OK
./clean_data/DIPG4_3_2_2.clean.fq.gz: OK
./clean_data/DIPG13_4_2_1.clean.fq.gz: OK
./clean_data/DIPG17_1_2_1.clean.fq.gz: OK
./clean_data/DIPG13_1_1_2.clean.fq.gz: OK
./clean_data/DIPG4_2_1_2.clean.fq.gz: OK
./clean_data/DIPG13_4_1_1.clean.fq.gz: OK
./clean_data/DIPG4_2_1_1.clean.fq.gz: OK
./clean_data/DIPG13_4_2_2.clean.fq.gz: OK
./clean_data/DIPG13_2_2_1.clean.fq.gz: OK
./clean_data/DIPG13_1_2_1.clean.fq.gz: OK
./clean_data/DIPG17_4_2_2.clean.fq.gz: OK
./clean_data/DIPG13_3_1_1.clean.fq.gz: OK
./clean_data/DIPG17_1_2_2.clean.fq.gz: OK
./clean_data/DIPG4_1_1_2.clean.fq.gz: OK
./clean_data/DIPG13_3_2_2.clean.fq.gz: OK
./clean_data/DIPG4_3_1_2.clean.fq.gz: OK
./clean_data/DIPG17_3_2_1.clean.fq.gz: OK
./clean_data/DIPG4_1_1_1.clean.fq.gz: OK
./clean_data/DIPG17_2_1_1.clean.fq.gz: OK
./clean_data/DIPG13_3_1_2.clean.fq.gz: OK
./clean_data/DIPG4_3_2_1.clean.fq.gz: OK
./clean_data/DIPG13_2_1_2.clean.fq.gz: OK
./clean_data/DIPG4_2_2_2.clean.fq.gz: OK
./clean_data/DIPG4_4_2_1.clean.fq.gz: OK
./clean_data/DIPG4_4_2_2.clean.fq.gz: OK
./clean_data/DIPG17_4_1_1.clean.fq.gz: OK
./clean_data/DIPG4_4_1_1.clean.fq.gz: OK
./clean_data/DIPG13_2_1_1.clean.fq.gz: OK
./clean_data/DIPG4_1_2_2.clean.fq.gz: OK
./clean_data/DIPG17_2_2_1.clean.fq.gz: OK
./clean_data/DIPG13_1_1_1.clean.fq.gz: OK
./clean_data/DIPG13_2_2_2.clean.fq.gz: OK
./clean_data/DIPG17_4_2_1.clean.fq.gz: OK
./clean_data/DIPG4_1_2_1.clean.fq.gz: OK
./clean_data/DIPG17_3_2_2.clean.fq.gz: OK
./clean_data/DIPG13_3_2_1.clean.fq.gz: OK
./clean_data/DIPG4_3_1_1.clean.fq.gz: OK
./clean_data/DIPG17_2_2_2.clean.fq.gz: OK
./clean_data/DIPG13_4_1_2.clean.fq.gz: OK
./clean_data/DIPG17_3_1_2.clean.fq.gz: OK
./clean_data/DIPG4_4_1_2.clean.fq.gz: OK
./clean_data/DIPG17_1_1_1.clean.fq.gz: OK
./clean_data/DIPG17_3_1_1.clean.fq.gz: OK
./clean_data/DIPG13_1_2_2.clean.fq.gz: OK
./clean_data/DIPG4_2_2_1.clean.fq.gz: OK
./clean_data/DIPG17_2_1_2.clean.fq.gz: OK
./clean_data/DIPG17_4_1_2.clean.fq.gz: OK
3.在Linux服务器中对RNA_seq数据进行处理
用vim新建脚本RNA_seq_script,将数据质控、比对、格式转换、排序、拼接和定量整合到同一个脚本中。
#!/bin/bash
# The shebang above declares that this script is written for bash.
#
# Program
#   RNA-seq data analysis: FastQC quality control on all clean reads, then,
#   for each sample, HISAT2 alignment -> SAM-to-BAM conversion -> BAM sort ->
#   StringTie assembly/quantification (output laid out per sample under
#   ballgown/).
# History
#   2021/02/22 zexing First release

# Stop at the first failing command (and on unset variables) so that a failed
# alignment or conversion is never fed into the next stage.
set -euo pipefail

# Project directory that holds clean_data/, sam/, bam/, bam_sort/, ballgown/ ...
dir=/f/xudonglab/zexing/projects/kongyu/RNA_seq/2021_02_22
# Reference directory (HISAT2 index and gene annotation, UCSC hg19)
ref=/f/xudonglab/zexing/reference/UCSC_hg19
# Number of threads handed to every tool
threads=16

# Sample prefixes; paired-end reads are <sample>_1.clean.fq.gz / <sample>_2.clean.fq.gz
samples=(
  DIPG4_1_1 DIPG4_1_2 DIPG4_2_1 DIPG4_2_2
  DIPG4_3_1 DIPG4_3_2 DIPG4_4_1 DIPG4_4_2
  DIPG13_1_1 DIPG13_1_2 DIPG13_2_1 DIPG13_2_2
  DIPG13_3_1 DIPG13_3_2 DIPG13_4_1 DIPG13_4_2
  DIPG17_1_1 DIPG17_1_2 DIPG17_2_1 DIPG17_2_2
  DIPG17_3_1 DIPG17_3_2 DIPG17_4_1 DIPG17_4_2
)

# Quality control on every clean FASTQ file
fastqc -t "${threads}" -o "${dir}/fastqc_report/" "${dir}"/clean_data/*.fq.gz

# Per-sample processing
for i in "${samples[@]}"; do
  # Alignment of the read pair against the hg19 HISAT2 index
  hisat2 -t -p "${threads}" -x "${ref}/hisat2_index/hisat2_index_hg19" \
    -1 "${dir}/clean_data/${i}_1.clean.fq.gz" \
    -2 "${dir}/clean_data/${i}_2.clean.fq.gz" \
    -S "${dir}/sam/${i}.sam"

  # Format conversion: SAM -> BAM (-1 = fast compression; -S kept for old samtools)
  samtools view -@ "${threads}" -S -1 -b \
    -o "${dir}/bam/${i}.bam" "${dir}/sam/${i}.sam"

  # Sort the BAM (compression level 5)
  samtools sort -@ "${threads}" -l 5 \
    -o "${dir}/bam_sort/${i}.bam.sort" "${dir}/bam/${i}.bam"

  # Assembly and quantification; -p on mkdir so a re-run does not fail on an
  # existing per-sample directory
  mkdir -p "${dir}/ballgown/${i}"
  stringtie "${dir}/bam_sort/${i}.bam.sort" -o "${dir}/ballgown/${i}/${i}.gtf" \
    -p "${threads}" -G "${ref}/hg19_genes.gtf" -e -B \
    -A "${dir}/ballgown/${i}/${i}.gene.tab"
done
后台运行RNA_seq_script:
# Run in the background; 2>&1 also sends stderr (where hisat2 prints its alignment summary) to the log
nohup bash RNA_seq_script > RNA_seq_script_log 2>&1 &
4.使用prepDE.py脚本提取read_counts数值
- 进入ballgown文件夹,将prepDE.py脚本拷贝至当前文件夹
cp /f/xudonglab/zexing/software/prepDE.py ./
- 退出当前conda环境
conda deactivate
- 使用python命令直接运行脚本
python prepDE.py
运行结果中"gene_count_matrix.csv"即是DESeq2的输入文件。