Sequencing Data Analysis for the First Round of VHH Panning

# Quality control
# Trim the reads
# Run igblast
# QC all raw fastq files (reports go to ./Fastqc; the multiqc_fastqc.txt summary
# used later implies MultiQC is then run on this directory)
ls | grep "fq.gz$" |while read id ;do (fastqc -q -t 16 -o ./Fastqc $id );done &


# Test run on one sample. ILLUMINACLIP ...:2:30:10:8:true = up to 2 seed mismatches,
# palindrome clip threshold 30, simple clip threshold 10, minimum adapter length 8,
# keep both reads of a palindrome-trimmed pair
java -jar /public/home/djs/software/Trimmomatic-0.39/trimmomatic-0.39.jar PE -phred33 F-2-test_R1.fastq.gz F-2-test_R2.fastq.gz -baseout F-2.fastq.gz \
ILLUMINACLIP:/public/home/djs/software/Trimmomatic-0.39/adapters/TruSeq3-PE.fa:2:30:10:8:true LEADING:3 TRAILING:3 SLIDINGWINDOW:4:20 MINLEN:150


# cd ~/software
# wget http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-0.39.zip
# unzip Trimmomatic-0.39.zip

cd ~/huiyu/BCR_NGS/trim2
# list.txt holds one pair of R1/R2 file names per line
cat ../list.txt |while read id ;do (arr=($id);java -jar /public/home/djs/software/Trimmomatic-0.39/trimmomatic-0.39.jar PE -phred33 ../${arr[0]} ../${arr[1]} -baseout ${arr[0]}.trim.fastq.gz ILLUMINACLIP:/public/home/djs/software/Trimmomatic-0.39/adapters/TruSeq3-PE.fa:2:30:10:8:true LEADING:3 TRAILING:3 SLIDINGWINDOW:4:20 MINLEN:150 );done

#conda search pandaseq
#conda install pandaseq

ls |grep "1P.fastq.gz" > 1p.txt
ls |grep "2P.fastq.gz" > 2p.txt
paste 1p.txt 2p.txt > list.txt
# -F emit FASTQ, -N drop reads with unresolved bases, -o 12 require at least 12 bp overlap
cat list.txt |while read id ;do arr=($id);pandaseq -f ${arr[0]} -r ${arr[1]} -F -N -o 12 -w ./${arr[0]}.merge;done # -w: uncompressed output
cat list.txt |while read id ;do arr=($id);pandaseq -f ${arr[0]} -r ${arr[1]} -F -N -o 12 -W ./${arr[0]}.merge;done # -W: bzip2-compressed output

# Optional post-processing: decompress the merged files, convert FASTQ to FASTA, remove intermediates
#ls |grep "merge" |while read id ;do bzip2 -d $id ;done
#ls |grep "merge.out" |while read id ;do seqkit fq2fa $id -o ${id}.fa;done
#ls |grep -v "merge" |grep -v "txt" |while read id ;do rm $id ;done

# Quick check: extract the insert between the vector-derived flanks CAGGCGGCC ... GAACCCAAGA
cat NB244-R1-H_contig.fa|grep -o CAGGCGGCC.*GAACCCAAGA |sed -n 's/CAGGCGGCC\(.*\)GAACCCAAGA/\1/p'

# Extract the VHH insert from each merged read (sequence lines only: NR%4==2)
ls |grep "merge.out" | while read id ;do awk '{if(NR%4==2) print $0}' $id |grep -o CAGGCGGCC.*GAACCCAAGA |sed -n 's/CAGGCGGCC\(.*\)GAACCCAAGA/\1/p' > ${id}.trim_vector;done
# Sort by sequence length (sorting also puts identical sequences next to each other for uniq)
ls |grep "trim_vector" | while read id ;do awk '{a=length($0);print $0"\t"a}' $id | sort -n -k2 |cut -f1 > ${id}.sort ;done
# Collapse identical sequences into a FASTA whose headers are ">rank+count"
ls |grep "sort" | while read id ;do cat $id |uniq -c |sort -n -k1 |awk '{print ">"NR"+"$1;print $2}' > ${id}.sort.sort ;done
# ls |grep "sort.sort" |while read id ;do cat $id | sed 's/.*GGTTTCGCTACCGTGGCCCAGGCGGCC/GGTTTCGCTACCGTGGCCCAGGCGGCC/g' |sed 's/GAACCCAAGACA.*//g' > ${id}.trim_vector ;done
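The sort/uniq/awk chain above is compact but opaque. The same collapse-and-count step as a minimal Python sketch (hypothetical helper, not part of the original pipeline; input is one sequence per line, output headers are ">rank+count" with the rarest sequences first, matching the shell version):

#!/usr/bin/python3
# dedup_count.py -- hypothetical equivalent of `sort | uniq -c | sort -n -k1 | awk ...`
import sys
from collections import Counter

# count identical sequences (one per line)
counts = Counter(line.strip() for line in open(sys.argv[1]) if line.strip())

# emit FASTA sorted by ascending abundance, header = ">rank+count"
for rank, (seq, n) in enumerate(sorted(counts.items(), key=lambda kv: kv[1]), start=1):
    print(f">{rank}+{n}")
    print(seq)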


mkdir output_igblast
# -outfmt 19 writes one AIRR-format (TSV) record per query; -num_alignments_* 1
# keeps only the best V/D/J hit
ls /public/home/djs/huiyu/BCR_NGS/trim2 |grep "sort.sort" |while read id ;do
./bin/igblastn -germline_db_V ./database/alpaca_v_igblast.fasta -germline_db_D ./database/alpaca_d_igblast.fasta -germline_db_J ./database/alpaca_j_igblast.fasta \
-query /public/home/djs/huiyu/BCR_NGS/trim2/$id -organism alpaca \
-auxiliary_data ./optional_file/alpaca_gl.aux -show_translation -extend_align5end -extend_align3end \
-num_alignments_V 1 -num_alignments_D 1 -num_alignments_J 1 \
-outfmt 19 > /public/home/djs/huiyu/BCR_NGS/trim2/output_igblast/$id.txt ;done
# Driver shell script
#!/usr/bin/bash
#coding=utf-8

file=${2}   # per-sample AIRR TSV produced by igblastn above
dir=${1}    # working directory containing that file

/usr/bin/python3 /public/home/djs/huiyu/NGS_1.py ${dir} ${file}

# Keep the columns used downstream; field numbers are shifted by one because
# NGS_1.py writes the DataFrame index as the first column
cut -f 2,3,11,13,16,49 ${dir}/test1.txt > ${dir}/test2.txt

# Build a FASTA for clustering: $2 = sequence_id; $49 appears to be the CDR3 column of this table
awk 'BEGIN{FS="\t"} NR > 1 {print ">"$2;print $49}' ${dir}/test1.txt  > ${dir}/cluster.fa

# Cluster the CDR3s: -c 0.79 identity threshold, -n 4 word size, -g 1 assign to the
# best-matching cluster, -d 0 keep full names in .clstr, -l 4 discard sequences of
# length <=4, -S 0 require identical length, -sc 1 sort clusters by size
~/software/cd-hit-v4.8.1-2019-0228/cd-hit -i ${dir}/cluster.fa -o ${dir}/cluster.test -c 0.79 -g 1 -n 4 -d 0 -l 4 -S 0 -sc 1

## Note: adjust the paths below as needed
# Flatten the .clstr file into a two-column table: cluster_id <TAB> sequence_id
cat ${dir}/cluster.test.clstr |sed 's/.*>/>/g'|sed 's/\..*//g' |tr -d '\n' | sed 's/>C/\nC/g' | awk 'BEGIN{FS=">"} {for(i=2;i<=NF;i++){print $1"\t"$i}}' > ${dir}/test.cluster.txt

sed -i '1i cluster_id\tsequence_id' ${dir}/test.cluster.txt
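That one-liner is hard to tweak. For reference, a minimal Python sketch (hypothetical helper, not part of the original pipeline) that builds the same two-column table, header included, directly from cd-hit's .clstr layout:

#!/usr/bin/python3
# parse_clstr.py -- hypothetical stand-in for the sed/tr/awk one-liner above
import re
import sys

print("cluster_id\tsequence_id")
cluster = None
for line in open(sys.argv[1]):                 # e.g. cluster.test.clstr
    if line.startswith(">"):
        cluster = line[1:].strip()             # "Cluster 0", "Cluster 1", ...
    else:
        # member lines look like: 0	8aa, >123+45... at 87.50%
        m = re.search(r">(\S+?)\.\.\.", line)
        if m and cluster is not None:
            print(f"{cluster}\t{m.group(1)}")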

a=$(echo ${file} |cut -d "_" -f1)   # sample name: file name up to the first underscore

# Count reads
echo -e "name\tcount" >> ${file}.count.txt   ## modified 2021-08-28
echo -e "reads\t$(cat ../../Fastqc/multiqc/multiqc_data/multiqc_fastqc.txt |grep "${a}" |cut -f 5 |uniq)" >> ${file}.count.txt

# Count contigs
ls ../ | grep ${a} |grep -v "vector" | while read id ;do echo $id && awk '{if(NR%4 == 2) print $0}' ../$id | wc -l ;done > count.txt
awk 'NR==2 {print "contig\t" $0}' count.txt >> ${file}.count.txt

# Count VHH-containing contigs
echo -e "VHH_contain_contig\t$(cat test1.txt |wc -l)" >>  ${file}.count.txt  ## modified 2021-08-28

# Find the representative CDR3 of each cluster: grab each cluster header in the
# .clstr file plus its first listed member (the cluster representative)
cat cluster.test.clstr | grep -A 1 ">C" > represent.txt
sed -i '/--/d' represent.txt
cat represent.txt |sed 's/.*>/>/g' | sed 's/\..*//g' |awk '{if(NR%2==1) print $0}' > cluster.name.txt   # cluster names
cat represent.txt |sed 's/.*>/>/g' | sed 's/\..*//g' |awk '{if(NR%2==0) print $0}' > index_name.txt    # representative sequence ids
paste cluster.name.txt index_name.txt > represent_cluster.txt
sed -i 's/>//g' index_name.txt
# lt_create_idx / ltr_gwc are site-local tools: index a FASTA, then retrieve records by id
/public/home/qingw/CORE/bin/lt_create_idx cluster.fa
#/public/home/qingw/CORE/bin/ltr_gwc cluster.fa index_name.txt   # modified
/public/home/qingw/CORE/bin/ltr_gwc cluster.fa index_name.txt  > represent_retrive.fa
awk '{if(NR%2==1) print $0}' represent_retrive.fa  > sequence_id.txt
awk '{if(NR%2==0) print $0}' represent_retrive.fa  > sequence_cdr3.txt
paste sequence_id.txt sequence_cdr3.txt > represent_retrieve_1.txt
paste represent_cluster.txt represent_retrieve_1.txt | cut -f 1,4 > ${file}.dictionary.txt  ## modified 2021-08-28
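Because lt_create_idx / ltr_gwc are site-local binaries, here is a minimal Python sketch of the retrieval step they perform (hypothetical replacement; reads cluster.fa plus the id list and prints the named records in order, like represent_retrive.fa):

#!/usr/bin/python3
# retrieve_by_id.py -- hypothetical stand-in for lt_create_idx + ltr_gwc
# Usage: python3 retrieve_by_id.py cluster.fa index_name.txt > represent_retrive.fa
import sys

# load the FASTA into a name -> sequence dictionary
records, name = {}, None
for line in open(sys.argv[1]):
    line = line.strip()
    if line.startswith(">"):
        name = line[1:]
        records[name] = ""
    elif name is not None:
        records[name] += line

# emit the requested records in the order they are listed (ids without ">")
for rid in (l.strip() for l in open(sys.argv[2])):
    if rid in records:
        print(f">{rid}\n{records[rid]}")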

# Count clusters
echo -e "CDR3_cluster\t$(cat ${file}.dictionary.txt |wc -l)" >>  ${file}.count.txt  ## modified 2021-08-28

# Count unique proteins
echo -e "unique_protein\t$(cat test2.txt |cut -f5 |sort |uniq | wc -l)" >>  ${file}.count.txt

/usr/bin/python3 /public/home/djs/huiyu/NGS_2.py ${dir} ${file}
# Python script 1: NGS_1.py
#!/usr/bin/python3
#coding=utf-8

import sys
import os
import numpy as np
import pandas as pd

Dir = sys.argv[1]
File = sys.argv[2]
os.chdir(Dir)
# Load the AIRR TSV from igblastn and keep only complete, productive, in-frame hits
df1 = pd.read_csv(File,sep='\t')
df2 = df1[(df1["stop_codon"] == 'F') & (df1["vj_in_frame"] == 'T') & (df1["v_frameshift"] == 'F') & (df1["productive"] == 'T') & (df1["complete_vdj"] == 'T')]
df3 = df2.sort_values(["v_call","j_call"])
# Note: to_csv keeps the DataFrame index, so the first column of test1.txt is the
# index and downstream cut/awk field numbers are shifted by one
df3.to_csv("test1.txt",sep="\t")


# Python script 2: NGS_2.py
#!/usr/bin/python3
#coding=utf-8

import sys
import os
import numpy as np
import pandas as pd
#from openpyxl import load_workbook

Dir = sys.argv[1]
File = sys.argv[2]
os.chdir(Dir)

df1 = pd.read_csv("test2.txt",sep="\t")
df2 = pd.read_csv("test.cluster.txt",sep="\t")
df3 = pd.merge(df1,df2,on="sequence_id")
df3["cDNA_contig"] = df3["sequence_id"].map(lambda x:x.split('+')[1])
df3.cDNA_contig = df3.cDNA_contig.astype("int64")

# Contigs per clonotype: same V gene, same J gene, same CDR3 cluster
df4 = df3.groupby(["v_call","j_call","cluster_id"]).cDNA_contig.sum()
df4.name = "counts_of_contig_in_the_same_clonotype"
df5 = pd.merge(df3,df4,on=["v_call","j_call","cluster_id"])
df5["clonotype"] = df5["cluster_id"].map(lambda x:x.split(" ")[1])+"@"+df5["v_call"].map(lambda x:x.split("*")[0])+"@"+df5["j_call"].map(lambda x:x.split("*")[0])
# Note that rows sharing a cluster_id do not necessarily share V & J germline references,
# because cluster_id comes from clustering on CDR3 alone, so further grouping is still needed.
# The clonotype count here, however, requires identical V & J germline references plus
# CDR3s of the same length with >80% identity.
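# Worked example (hypothetical gene names): a row with cluster_id "Cluster 12",
# v_call "IGHV3-3*01" and j_call "IGHJ4*01" gets the clonotype "12@IGHV3-3@IGHJ4".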

# Contigs per CDR3 cluster (regardless of V/J calls)
df6 = df5.groupby("cluster_id").cDNA_contig.sum()
df6.name = "counts_of_contig_in_the_same_cluster"
df6 = pd.merge(df5,df6,on="cluster_id")

# Contigs per unique amino-acid sequence
df7 = df3.groupby("sequence_alignment_aa").cDNA_contig.sum()
df7.name = "counts_of_contig_in_the_same_AA"
df7 = pd.merge(df7,df6,on="sequence_alignment_aa")

df7 = df7.loc[:,["clonotype","counts_of_contig_in_the_same_clonotype","cluster_id","counts_of_contig_in_the_same_cluster","cdr3_aa","counts_of_contig_in_the_same_AA","cDNA_contig","v_call","j_call","sequence_alignment_aa","sequence"]]
# Turn "Cluster N" into a 1-based integer id
df7.cluster_id = df7.cluster_id.map(lambda x:x.split(' ')[1])
df7["cluster_id"] = df7.cluster_id.astype("int64")
df7["cluster_id"] = df7.cluster_id + 1
# The master table is complete

# The molecular engineering team wants a table of unique proteins; in the follow-up
# analysis, proteins supported by fewer than 10 contigs are discarded.
# They also want the final analysis table to show how many protein variants each
# cluster contains, which again relies on the master table.

#table of unique protein 
df8 = df7.sort_values(["counts_of_contig_in_the_same_AA","cDNA_contig"],ascending=False) 
df8 = df8.drop_duplicates(subset="sequence_alignment_aa",keep="first")   # 70189 rows at this point

# Start discarding low-abundance sequences
df9 = df8.loc[df8.counts_of_contig_in_the_same_AA >= 10]   # 1250 rows at this point

#table of unique clonotype
df10 = df9.sort_values(["cDNA_contig","counts_of_contig_in_the_same_AA","counts_of_contig_in_the_same_cluster"],ascending=False)
df10 = df10.drop_duplicates(subset="clonotype", keep='first')   # 72 rows
 
#table of unique cluster
df11 = df9.sort_values(["counts_of_contig_in_the_same_cluster","cDNA_contig","counts_of_contig_in_the_same_AA"],ascending=False)
df11 = df11.drop_duplicates(subset="cluster_id", keep='first')   # 66 rows

# Now build the final analysis table, the fiddly part
#table of analysis
df12 = df11.loc[:,["cluster_id","counts_of_contig_in_the_same_cluster","cdr3_aa"]]

df12["frequency_of_total"] = df12.counts_of_contig_in_the_same_cluster/df3.cDNA_contig.sum()

df12["frequency"] = df12.counts_of_contig_in_the_same_cluster/df12.counts_of_contig_in_the_same_cluster.sum()

df12["CDR3_length"] = df12.cdr3_aa.apply(lambda x:len(x))

df12 = df12.loc[:,["cluster_id","counts_of_contig_in_the_same_cluster","frequency","frequency_of_total","cdr3_aa","CDR3_length"]]
df12 = df12.sort_values("cluster_id")

# Pull out the representative sequence of each cluster
df13 = pd.read_csv(File+".dictionary.txt",sep="\t",names=["cluster_id","represent"])
df13["cluster_id"] = df13["cluster_id"].map(lambda x:x.split(' ')[1])
df13.cluster_id = df13.cluster_id.astype("int64")
df13.cluster_id = df13.cluster_id + 1

# cluster_id is int64 in df12; cast both sides to string before merging
df12.cluster_id = df12.cluster_id.astype("string")
df13.cluster_id = df13.cluster_id.astype("string")
df14 = pd.merge(df12,df13,on="cluster_id",how="inner")

# Also pull out the number of protein variants per cluster,
# built from the table of unique proteins
df16 = df8.groupby("cluster_id",as_index=False).agg("count")
df16 = df16.loc[:,["cluster_id","sequence_alignment_aa"]]
df16 = df16.rename(columns={"sequence_alignment_aa":"variety_of_protein"})   # rename must be assigned back
df16.cluster_id = df16.cluster_id.astype("string")
df14 = pd.merge(df14,df16,on="cluster_id",how="inner")

# Put the summary statistics at the head of the analysis sheet
df15 = pd.read_csv(File+".count.txt",sep="\t")
# Replace the VHH_contain_contig value (a unique-sequence count) with the total
# number of contigs those sequences represent
df15.iloc[2,1] = df3.cDNA_contig.sum()

# Write everything to one Excel workbook
with pd.ExcelWriter(File+".result.xlsx") as writer:
    df7.to_excel(writer, sheet_name="all_of_contig",index=False)
    df8.to_excel(writer, sheet_name="table_of_unique_protein",index=False)
    df10.to_excel(writer, sheet_name="table_of_unique_clonotype",index=False)
    df11.to_excel(writer, sheet_name="table_of_unique_cluster",index=False)
    df14.to_excel(writer, sheet_name="table_of_analysis",index=False,startrow=8)
    df15.to_excel(writer,sheet_name="table_of_analysis",index=False,startrow=0)