RNA-seq workflow

lwlisme

已于 2023-11-08 13:23:48 修改

阅读量78

点赞数

文章标签： python

于 2023-11-06 00:11:27 首次发布

本文链接：https://blog.csdn.net/lwlisme/article/details/134237255

版权

### Quarlity control step

###对当前目录下单一fq.gz文件进行fastqc
fastqc -t 12 -o ~/xxx/xxx your_rawdata_name_1.fq.gz your_rawdata_name_2.fq.gz

###将当前目录下的所有fq.gz文件进行fastqc

for file in ./*.fq.gz
do
fastqc -t 12 -o ~/xxx/fastqc_result "$file"
done

### Quarlity control step

###对当前目录下单一fq.gz文件进行fastqc
fastqc -t 12 -o ~/xxx/xxx your_rawdata_name_1.fq.gz your_rawdata_name_2.fq.gz

###将当前目录下的所有fq.gz文件进行fastqc

for file in ./*.fq.gz
do
    fastqc -t 12 -o ~/xxx/fastqc_result "$file"
done

### Trimming for adaptors (Not required)

###对当前目录下单一fq.gz文件进行trim_galore
trim_galore -q 20 --phred33 --stringency 3 --length 20 -e 0.1 --paired your_rawdata_name_1.fq.gz your_rawdata_name_2.fq.gz

###将当前目录下的所有fq.gz文件进行trim_galore

declare -a identifiers=("rawdata_name1" "rawdata_name2" "rawdata_namen" )#namen为第n个样品

for id in "${identifiers[@]}"; do

r1_file="${id}.raw.R1.fq.gz"

r2_file="${id}.raw.R2.fq.gz"

trim_galore -q 20 --phred33 --stringency 3 --length 20 -e 0.1 --paired $r1_file $r2_file --o ~/xxx/xxx

done

### Trimming for adaptors (Not required)

###对当前目录下单一fq.gz文件进行trim_galore
trim_galore -q 20 --phred33 --stringency 3 --length 20 -e 0.1 --paired your_rawdata_name_1.fq.gz your_rawdata_name_2.fq.gz

###将当前目录下的所有fq.gz文件进行trim_galore

declare -a identifiers=("rawdata_name1" "rawdata_name2" "rawdata_namen" )#namen为第n个样品

for id in "${identifiers[@]}"; do

  r1_file="${id}.raw.R1.fq.gz"

  r2_file="${id}.raw.R2.fq.gz"

  trim_galore -q 20 --phred33 --stringency 3 --length 20 -e 0.1 --paired $r1_file $r2_file --o ~/xxx/xxx

done

### Alignment step

###对当前目录下单一fq.gz文件进行hisat2及将sam文件转化为bam
hisat2 -p 12 -x <GRCm39.genome> -1 <sample_id>_R1.fastq.gz -2 <sample_id>_R2.fastq.gz | samtools view -Sb > <sample_id>.bam

###将当前目录下的所有fq.gz文件进行hisat2及将sam文件转化为bam

declare -a sample_ids=("1-1a" "1-1b" "1-1c" "1-2c" "1-3a" "1-3b" "1-3c" "1-4a" "1-4b" "1-4c" "1-5a" "1-5b" "1-5c") #括号中为每个样品的文件名

output_dir="/home/myname/xxx/xxx/xxx" # 替换为您实际的输出目录
genome_index="/home/myname/xxx/xxx/xxx/grch38_tran/genome_tran" # 替换为您实际的参考基因组索引路径

# 确保输出目录存在
mkdir -p $output_dir

# 遍历样本 ID 数组
for sample_id in "${sample_ids[@]}"; do
# 生成每一对配对的 FASTQ 文件名
r1_file="${sample_id}.raw.R1_val_1.fq.gz"
r2_file="${sample_id}.raw.R2_val_2.fq.gz"

# 生成输出 BAM 文件名
output_bam="$output_dir/${sample_id}.bam"

# 运行 hisat2 和 samtools
hisat2 -p 12 -x $genome_index -1 $r1_file -2 $r2_file | samtools view -Sb > $output_bam
done

### Alignment step

###对当前目录下单一fq.gz文件进行hisat2及将sam文件转化为bam
hisat2 -p 12 -x <GRCm39.genome> -1 <sample_id>_R1.fastq.gz -2 <sample_id>_R2.fastq.gz | samtools view -Sb > <sample_id>.bam

###将当前目录下的所有fq.gz文件进行hisat2及将sam文件转化为bam

declare -a sample_ids=("1-1a" "1-1b" "1-1c" "1-2c" "1-3a" "1-3b" "1-3c" "1-4a" "1-4b" "1-4c" "1-5a" "1-5b" "1-5c")  #括号中为每个样品的文件名
                                                                       
output_dir="/home/myname/xxx/xxx/xxx"  # 替换为您实际的输出目录
genome_index="/home/myname/xxx/xxx/xxx/grch38_tran/genome_tran"  # 替换 为您实际的参考基因组索引路径
                 
# 确保输出目录存在
mkdir -p $output_dir
                  
# 遍历样本 ID 数组
for sample_id in "${sample_ids[@]}"; do
  # 生成每一对配对的 FASTQ 文件名
  r1_file="${sample_id}.raw.R1_val_1.fq.gz"
  r2_file="${sample_id}.raw.R2_val_2.fq.gz"
                       
  # 生成输出 BAM 文件名
  output_bam="$output_dir/${sample_id}.bam"
                           
  # 运行 hisat2 和 samtools                                                       
  hisat2 -p 12 -x $genome_index -1 $r1_file -2 $r2_file | samtools view -Sb > $output_bam
done

### Counting step
featureCounts -T 12 -p -t exon -g gene_id -a <gtf.gz> -o <sample_id>.txt <sample_id>.bam

#genomic_annotation.gtf.gz文件在当前目录则替换掉<gtf.gz>；不在当前目录则需要输出文件路径，如/home/myname/xxx/xxx/genomic_annotation.gtf.gz

###对当前目录下的单一bam文件进行feaureCounts

featureCounts -T 12 -p -t exon -g gene_id -a genomic_annotation.gtf.gz -o /xxx/xxx/xxx/sample_id.txt sample_id.bam

###将当前目录下的所有bam文件进行featurecounts,合并结果保存一个merged_count.txt中

featureCounts -T 12 -p -t exon -g gene_id -a genomic_annotation.gtf.gz -o /xxx/xxx/xxx/merged_count.txt *.bam

### Counting step
featureCounts -T 12 -p -t exon -g gene_id -a <gtf.gz> -o <sample_id>.txt <sample_id>.bam

#genomic_annotation.gtf.gz文件在当前目录则替换掉<gtf.gz>；不在当前目录则需要输出文件路径，如/home/myname/xxx/xxx/genomic_annotation.gtf.gz

###对当前目录下的单一bam文件进行feaureCounts

featureCounts -T 12 -p -t exon -g gene_id -a genomic_annotation.gtf.gz  -o /xxx/xxx/xxx/sample_id.txt  sample_id.bam

###将当前目录下的所有bam文件进行featurecounts,合并结果保存一个merged_count.txt中

featureCounts -T 12 -p -t exon -g gene_id -a genomic_annotation.gtf.gz  -o /xxx/xxx/xxx/merged_count.txt *.bam