太久不用了啥都忘了 一步步走..
1.重新下载ubuntu就花了好久....
2.下好了以后把 python 命令软链接到 python3(3.9.6)(习惯)
sudo ln -s /usr/bin/python3 /usr/bin/python
zky@ZKY:/mnt/g/linux_zky/Python-3.9.6$ python3
Python 3.9.6 (default, Mar 10 2024, 08:00:53)
[GCC 9.4.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> quit()
3.下载
apt install hisat2
conda install -c bioconda stringtie
4.建库
hisat2-build --bmax 389309110 --dcv 1024 -p 8 genome.fa genome
5.文件位置
/mnt/e/转录组全部原始数据/DZOE2024011942-b1
#双端文件比对mapping到参考基因组
hisat2 -t -p 20 -x /mnt/g/linux_zky/database/genome -1 H_S_L_l_1_R1.fq -2 H_S_L_l_1_R2.fq -S /mnt/g/linux_zky/H_S_L_l_1.sam
排序
samtools view -@ 16 -S H_S_L_l_1.sam -b > H_S_L_l_1.bam
samtools sort H_S_L_l_1.bam -o H_S_L_l_1_sort.bam -@ 16
单个样本组装
stringtie -p 16 H_S_L_l_1_sort.bam -G genes.gtf -o H_S_L_l_1.gtf
这里通过脚本进行批量单个样本的组装
import os
import subprocess
# 项目位置:/mnt/g/linux_zky/
# 定义目录路径
directory = "/mnt/e/转录组全部原始数据/DZOE2024011942-b1"
unique_list = set()
for filename in os.listdir(directory):
# 构建完整的文件路径
print(filename.split(".")[0])
unique_list.add(filename.split(".")[0])
print(unique_list.__len__())
for i in unique_list:
print("run cmd:", i)
#这里重新加了-e 不加就会获得所有转录本 包括未知的
cmd4 = f'stringtie -e -p 64 /mnt/g/linux_zky/bam_gtfs/bams/{i}_sort.bam -G /mnt/g/linux_zky/genes.gtf -o /mnt/g/linux_zky/test/sort_gtf/{i}_sort.gtf'
subprocess.run(cmd4, shell=True)
获得strimerge.gtf
1.生成 mergeList
import os
# 项目位置:/mnt/g/bam_gtfs/gtfs
# 定义目录路径
directory = "/mnt/g/linux_zky/test/sort_gtf/"
allGtfs = open("mergelist.txt", "w")
unique_list = set()
for filename in os.listdir(directory):
# 构建完整的文件路径
allGtfs.write(directory + filename + "\n")
#获得strimerge.gtf
"""
then=>
stringtie --merge -p 16 -G genes.gtf -o strimerge.gtf mergelist.txt
"""
2.组装
stringtie --merge -p 16 -G genes.gtf -o strimerge.gtf mergelist.txt
Ballgown获得所有转录本及其丰度
import os
import subprocess
# "/mnt/g/linux_zky/test/ballGown/b_gtfs"
# 定义目录路径
directory_o_gtfs = "/mnt/g/linux_zky/test/ballGown/b_gtfs/"
directory_o_tabs = "/mnt/g/linux_zky/test/ballGown/b_tabs/"
directory_in = "/mnt/g/linux_zky/bam_gtfs/bams/"
for filename in os.listdir(directory_in):
# 构建完整的文件路径
sortBam = directory_in + filename
oriName = filename.split("_sort")[0]
ballgown_gtf = directory_o_gtfs + oriName + ".gtf"
gene_abund = directory_o_tabs + oriName + ".tab"
print(sortBam, gene_abund, ballgown_gtf)
cmd = f'stringtie -e -B -p 64 -G strimerge.gtf -o {ballgown_gtf} {sortBam} -A {gene_abund}'
subprocess.run(cmd, shell=True)
命令行:
python prepDE.py3 -i /mnt/g/bam_gtfs/rnaSeqAnalyse/sample_list.txt -g /mnt/g/bam_gtfs/rnaSeqAnalyse/gene_count.csv -t /mnt/g/bam_gtfs/rnaSeqAnalyse/transcript.csv
sample_list
# 代替字典
replace_dict = {
"H": "12h",
"W": "6d",
"M": "13d",
"S": "9782",
"T": "9614",
"N": "np",
"L": "lp",
}
import os
# 定义目录路径
directory = "/mnt/g/linux_zky/mapping后分析/ballGown/b_gtfs/"
all_sample = open("sample_list.txt", "w")
for filename in os.listdir(directory):
# 构建完整的文件路径
if filename.split(".")[1] == "gtf":
file = directory + filename
sample = filename.split(".")[0]
for key in replace_dict:
sample = sample.replace(key, replace_dict[key])
print(sample)
print(file)
all_sample.write(sample + "\t" + directory + filename + "\n")
"""
(zky) zky@ZKY:/mnt/g/linux_zky/bam_gtfs/stringtie-master$
python prepDE.py3 -i /mnt/g/linux_zky/mapping后分析/sample_list.txt -g /mnt/g/linux_zky/mapping后分析/preDEdata/gene_count.csv -t /mnt/g/linux_zky/mapping后分析/preDEdata/transcript.csv"""
sample_list
12h_9782_lp_l_1 /mnt/g/linux_zky/mapping后分析/ballGown/b_gtfs/H_S_L_l_1.gtf
12h_9782_lp_l_2 /mnt/g/linux_zky/mapping后分析/ballGown/b_gtfs/H_S_L_l_2.gtf
12h_9782_lp_l_3 /mnt/g/linux_zky/mapping后分析/ballGown/b_gtfs/H_S_L_l_3.gtf
12h_9782_lp_r_1 /mnt/g/linux_zky/mapping后分析/ballGown/b_gtfs/H_S_L_r_1.gtf
12h_9782_lp_r_2 /mnt/g/linux_zky/mapping后分析/ballGown/b_gtfs/H_S_L_r_2.gtf
12h_9782_lp_r_3 /mnt/g/linux_zky/mapping后分析/ballGown/b_gtfs/H_S_L_r_3.gtf
12h_9782_np_l_1 /mnt/g/linux_zky/mapping后分析/ballGown/b_gtfs/H_S_N_l_1.gtf
12h_9782_np_l_2 /mnt/g/linux_zky/mapping后分析/ballGown/b_gtfs/H_S_N_l_2.gtf
12h_9782_np_l_3 /mnt/g/linux_zky/mapping后分析/ballGown/b_gtfs/H_S_N_l_3.gtf
12h_9782_np_r_1 /mnt/g/linux_zky/mapping后分析/ballGown/b_gtfs/H_S_N_r_1.gtf
............
prepDE:
(zky) zky@ZKY:/mnt/g/linux_zky/bam_gtfs/stringtie-master$
python prepDE.py3
-i /mnt/g/linux_zky/mapping后分析/sample_list.txt
-g /mnt/g/linux_zky/mapping后分析/preDEdata/gene_count.csv
-t /mnt/g/linux_zky/mapping后分析/preDEdata/transcript.csv
重新调整
import pandas as pd
g_c = pd.read_csv("G:\linux_zky\mapping后分析\preDEdata\gene_count.csv")
test = g_c[~g_c['gene_id'].str.contains("\|")]
print(test)
g_c['gene_id'] = g_c['gene_id'].str.split("\|").str[1]
g_c.to_csv("G:\linux_zky\mapping后分析\preDEdata\gene_count_2.csv", index=False) # 写入新的 CSV 文件,不包含索引列
重排序 所有tab
import os
import pandas as pd

# Sort every gene-abundance table by gene name; the sorted copies feed the
# expression-matrix build step.
tab_dir = r"G:\linux_zky\mapping后分析\ballGown\b_tabs"
for name in os.listdir(tab_dir):
    with open(os.path.join(tab_dir, name), "r") as handle:
        table = pd.read_csv(handle, sep='\t')
        print(name)
        ordered = table.sort_values(by="Gene Name")
        # Drop the first column; write without the index column.
        ordered.iloc[:, 1:].to_csv(
            fr"G:\linux_zky\mapping后分析\ballGown\sorted\{name}_sort.csv",
            index=False,
        )
用重排的tabs自己实现表达量矩阵
import os
import pandas as pd

# Decode table: raw file-name codes -> readable sample labels.
replace_dict = {
    "H": "12h",
    "W": "6d",
    "M": "13d",
    "S": "9782",
    "T": "9614",
    "N": "np",
    "L": "lp",
}

sorted_dir = r"G:\linux_zky\mapping后分析\ballGown\sorted"

# Seed the matrix with the first column (gene names) of one sorted table.
# Assumes every sorted table shares the same row order — TODO confirm.
with open(r"G:\linux_zky\mapping后分析\ballGown\sorted\H_S_L_l_1.tab_sort.csv", "r") as head_file:
    head_table = pd.read_csv(head_file, sep=',')
matrix = head_table.iloc[:, :1].copy()

for file_name in os.listdir(sorted_dir):
    if not file_name.endswith(".csv"):
        continue
    with open(os.path.join(sorted_dir, file_name), "r") as handle:
        table = pd.read_csv(handle, sep=',')
        fpkm = table.iloc[:, 7]  # presumably column 7 holds FPKM — verify against tab layout
        label = file_name.split(".")[0]
        for code, readable in replace_dict.items():
            label = label.replace(code, readable)
        matrix[label] = fpkm

# Write the matrix without the index column.
matrix.to_csv("fpkm_matrix.csv", index=False)
g_diff_DEG_group.py
from openpyxl import Workbook

# Build the DEG comparison design sheet: one LP-vs-NP contrast per
# (timepoint, genotype, organ) combination, three biological replicates
# each, analysed with DESeq2.
# Fix: removed the unused `group_` list, whose '12d' labels also
# contradicted the '13d' entry in `days`.

wb = Workbook()
ws = wb.active
ws.append(["case", "case_name", "control", "control_name", "replicate", "paired", "method"])

days = ["12h", "6d", "13d"]
materials = ["_9782", "_9614"]
c_c = ["_lp", "_np"]  # case group (low P) vs control group (normal P)
parts = ["_l", "_r"]  # leaf / root
repetition = ["_1", "_2", "_3"]

for day in days:
    for material in materials:
        for part in parts:
            case_name = day + material + c_c[0] + part
            control_name = day + material + c_c[1] + part
            # Three replicates per group, comma-separated.
            case = ",".join(case_name + rep for rep in repetition)
            control = ",".join(control_name + rep for rep in repetition)
            ws.append([case, case_name, control, control_name, "yes", "yes", "DESeq2"])

wb.save("diff_deg_group.xlsx")
用欧易云做吧 做了degs分析
——————
——
wgcna分析
由于进行WGCNA分析时候组别太多 并且全部的表达量矩阵会导致出来的聚类模块非常错乱
并且在三个样本里面均质性不好
所以就用三个生物学重复在DEGS里面验证过P值Q值的基因直接提出来分别拿到根和叶中的表达量矩阵
重做表达量矩阵....
"""
将把表达量矩阵重做成平均值来分析
将每组三列数据的平均值作为新的一列,且新列的名称保留原始列名的前缀部分。
"""
import pandas as pd
f_ubundant = pd.read_csv("newFpkmMatrix.csv", index_col=0)
# 提取每组三列数据的前缀部分,例如将 '12h_9614_lp_l_1' 转换为 '12h_9614_lp_l'
column_prefixes = set(col.rsplit('_', 1)[0] for col in f_ubundant.columns[:])
print(len(column_prefixes))
# 计算每组三列数据的平均值
averaged_data = pd.DataFrame()
for prefix in column_prefixes:
cols = [col for col in f_ubundant.columns if col.startswith(prefix)]
print(cols)
averaged_data[prefix] = round(f_ubundant[cols].mean(axis=1), 1)
print(prefix)
# 保存结果
averaged_data.to_csv('averaged_gene_FPKM.csv')
获取了表达量均值的矩阵 后面就是把DEGS差异文件里面的每一个-diff-p-val-0.05-FC-2.gene.xls文件的相应组别的基因号提取出来 然后去除表达量太低的 (<1.5)
返回对照到矩阵就好了
import os
import pandas as pd
id_L = set()
id_R = set()
for filename in os.listdir("../DEGS"):
if filename.endswith("0.05-FC-2.gene.xls"):
if "_l-" in filename:
f_l = pd.read_csv("../DEGS/" + filename, sep="\t")
id_L.update(f_l['gene_id']) # 使用 update() 将 'gene_id' 列的值添加到集合中
elif "_r-" in filename:
f_l = pd.read_csv("../DEGS/" + filename, sep="\t")
id_R.update(f_l['gene_id']) # 使用 update() 将 'gene_id' 列的值添加到集合中
else:
pass
# 读取表达量矩阵
matrix = pd.read_csv("averaged_gene_FPKM.csv")
# 获取包含 "_l" 的列名
l_columns = [col for col in matrix.columns if "p_l" in col]
# 获取包含 "_r" 的列名
r_columns = [col for col in matrix.columns if "_r" in col]
# 根据 id_L 筛选行,并选择 "_l" 列创建 f_lMatrix
f_lMatrix = matrix[matrix["gene_id"].isin(id_L)][["gene_id"] + l_columns]
# 去除所有基因表达量都低于1.5的基因
f_lMatrix = f_lMatrix[(f_lMatrix[l_columns] > 1.5).any(axis=1)]
f_lMatrix.drop_duplicates(subset="gene_id",inplace=True)
f_lMatrix.to_csv("f_lMatrix.csv", index=False)
# 根据 id_R 筛选行,并选择 "_r" 列创建 f_rMatrix
f_rMatrix = matrix[matrix["gene_id"].isin(id_R)][["gene_id"] + r_columns]
# 去除所有基因表达量都低于1.5的基因
f_rMatrix = f_rMatrix[(f_rMatrix[r_columns] > 1.5).any(axis=1)]
f_rMatrix.drop_duplicates(subset="gene_id",inplace=True)
f_rMatrix.to_csv("f_rMatrix.csv", index=False)
gene_id,12h_9782_lp_l,13d_9614_lp_l,6d_9782_np_l,6d_9782_lp_r,13d_9614_np_r,6d_9782_np_r,12h_9614_np_r,13d_9782_lp_r,13d_9614_lp_r,6d_9614_lp_r,6d_9614_lp_l,13d_9782_lp_l,12h_9782_lp_r,12h_9782_np_l,12h_9614_np_l,12h_9782_np_r,13d_9782_np_l,6d_9782_lp_l,6d_9614_np_r,12h_9614_lp_l,6d_9614_np_l,13d_9782_np_r,12h_9614_lp_r,13d_9614_np_l
Zm00001d014914,29.3,14.9,26.4,0.8,0.8,0.4,0.7,0.3,1.1,12.2,16.8,21.3,2.1,14.6,22.5,0.1,4.2,22.9,1.9,20.9,3.9,0.4,0.9,6.9
Zm00001d023904,33.4,4.6,36.5,31.9,5.2,35.9,8.0,35.4,4.9,6.3,5.1,34.6,38.0,39.2,5.0,52.0,28.8,40.6,5.9,4.3,5.2,35.0,7.0,5.5
Zm00001d007951,52.1,73.8,82.9,83.1,98.8,84.7,108.4,98.0,111.4,84.7,65.0,78.8,76.2,44.7,36.4,110.2,68.2,83.1,87.8,32.4,66.1,79.1,102.8,71.6
Zm00001d006372,0.1,0.1,0.2,0.9,0.1,0.9,0.7,0.1,0.0,0.5,0.1,0.1,0.3,0.0,0.0,0.2,0.3,0.3,1.5,0.0,0.0,0.3,0.5,0.0
Zm00001d034644,279.8,314.2,381.5,595.8,448.5,581.4,402.1,492.6,485.1,478.5,272.4,345.7,499.2,254.9,283.1,450.9,337.5,349.5,460.0,266.8,280.9,604.6,451.5,310.1
传入相关性状....
————————
分析补充1:玉米的B73数据库+基因注释——匹配整理
要求对照
的基因id,直接在https://www.ncbi.nlm.nih.gov/gene搜了
(b73) AND "Zea mays"[porgn:__txid4577]
把列表信息全下下来了
b73_geneData.xls
Aliases | description | other_designations | map_location | chromosome | genomic_nucleotide_accession.version | start_position_on_the_genomic_accession | end_position_on_the_genomic_accession | orientation | exon_count | OMIM |
ZEAMMB73_Zm00001d001812, GRMZM2G457621, RAF1 | Rubisco accumulation factor 1, chloroplastic | rubisco accumulation factor 1, chloroplastic|Rubisco Assembly Factor 1 | 2 | NC_050097.1 | 1617667 | 1619220 | plus | 1 | ||
ZEAMMB73_Zm00001d044122, DFR, FNR, GRMZM2G026930, pco110172(456) | Dihydroflavonol 4-reductase | dihydroflavonol 4-reductase|A1 (Teosinte)|Dihydrokaempferol 4-reductase|NADPH-dependent reductase|NADPH-dependent reductase-like protein|dihydroflavonol4-reductase|flavanone 4-reductase | 3 | NC_050098.1 | 221779797 | 221781573 | minus | 4 | ||
ZEAMMB73_Zm00001d050133, GRMZM2G365423, GRMZM2G389303, GRMZM2G437977, umc1175 | Bifunctional aspartokinase/homoserine dehydrogenase 1, chloroplastic | bifunctional aspartokinase/homoserine dehydrogenase 1, chloroplastic|AK-HD 1|AK-HSDH 1|Bifunctional aspartokinase/homoserine dehydrogenase 2 chloroplastic|Homoserine dehydrogenase|aspartate kinase-homoserine dehydrogenase | 4 | NC_050099.1 | 68464991 | 68510882 | minus | 19 | ||
ZEAMMB73_Zm00001d050339, ALDH10A8, AMADH1, GRMZM2G013214, GRMZM2G146754, ZmAMADH1a, gpm154 | Aminoaldehyde dehydrogenase 1a | aminoaldehyde dehydrogenase 1a|4-trimethylammoniobutyraldehyde dehydrogenase AMADH1a|Aminobutyraldehyde dehydrogenase AMADH1a|Betaine aldehyde dehydrogenase 2 mitochondrial|Betaine aldehyde dehydrogenase AMADH1a|Gamma-guanidinobutyraldehyde dehydrogenase AMADH1a|aminoaldehyde dehydrogenase 1 | 4 | NC_050099.1 | 82466139 | 82471927 | plus | 15 |
后面用正则慢慢配到Symbol这一列
"""
输入任意表头的单列基因编号文件 用于匹配相应基因注释
"""
import pandas as pd
# 读取基因文件
f_geneData = r'b73T1.csv'
f_geneData = pd.read_csv(f_geneData)
csv_ = pd.read_csv("input.txt")
print(csv_)
result_description = []
result_aliases = []
for i in csv_.iloc[:, 0]:
target = 'description'
f_g_t_aliases = f_geneData[f_geneData['Aliases'].str.contains(i)]
if len(f_g_t_aliases) != 0:
description = f_g_t_aliases[target].values[0]
aliases = f_g_t_aliases['other_designations'].values[0]
result_description.append(description)
result_aliases.append(aliases)
else:
f_g_t_symbol = f_geneData[f_geneData['Symbol'].str.contains(i)]
if len(f_g_t_symbol) != 0:
description = f_g_t_symbol[target].values[0]
aliases = f_g_t_symbol['other_designations'].values[0]
result_description.append(description)
result_aliases.append(aliases)
else:
print("没有匹配到:", i)
result_description.append(None)
result_aliases.append(None)
csv_['description'] = result_description
# csv_['Aliases'] = result_aliases
print(csv_)
csv_.to_csv("结果.csv", index=False)
分析补充2 :自己计算所有基因的log2FoldChange
"""
对这样的一个csv文件写一个脚本 首先
取表头的前半部分和最后一部分比如 12h_9782_lp_l 取到的应该是 12h_9782_l 可以考虑用split("_")进行拼接 新建一个dataframe,
之后按照set中的内容分别检索 比如12h_9782_l是一个set中的元素 取12h_9782_np_l 和 12h_9782_lp_l那么往这个新的df中加入一列,
命名为12h_9782_l-LPvsNP 其中的内容是12h_9782_lp_l/12h_9782_np_l 的值 最后生成一个含有12个纵列的比值df 存为csv ```
在修改后的代码中,我们定义了一个伪计数pseudocount,其值为0.001。在计算log2FoldChange之前,我们将处理组和对照组的表达量值分别加上伪计数。
result_df[f"{col}-LPvsNP"] = np.log2((df[lp_col] + pseudocount) / (df[np_col] + pseudocount))
通过添加伪计数,我们可以避免除以0或计算log2(0)的问题。这种方法在处理RNA-seq或其他高通量数据时常用,可以有效地处理表达量为0的情况。
请注意,伪计数的选择可能会影响结果的解释。选择一个合适的伪计数值取决于数据的特点和研究目的。常用的伪计数值包括0.001、1e-5或1。
"""
import pandas as pd
# 读取CSV文件
df = pd.read_csv('averaged_gene_FPKM.csv')
# 提取列名的第1、2、4个元素
columns = df.columns[1:]
column_set = set(["_".join(col.split("_")[:2] + [col.split("_")[3]]) for col in columns])
# 创建一个新的DataFrame用于存储比值
result_df = pd.DataFrame()
# 将gene_id添加到结果DataFrame中
result_df['gene_id'] = df['gene_id']
# 遍历集合中的每个元素
for col in column_set:
lp_col = [c for c in columns if col.split("_")[0] in c and col.split("_")[1] in c and col.split("_")[2] in c and "lp" in c]
np_col =[c for c in columns if col.split("_")[0] in c and col.split("_")[1] in c and col.split("_")[2] in c and "np" in c]
if lp_col and np_col:
lp_col = lp_col[0]
np_col = np_col[0]
# 计算比值并添加到结果DataFrame中
result_df[f"{col}-LPvsNP"] = df[lp_col] / df[np_col]
result_df = result_df.round(3)
# 保存结果为新的CSV文件
result_df.to_csv('output.csv', index=False)
这种方法可以获得差异倍数文件:
用于表达量作图 自定义基因的表达差异分析
gene_id,6d_9782_l-LPvsNP,6d_9782_r-LPvsNP,6d_9614_r-LPvsNP,6d_9614_l-LPvsNP,13d_9614_l-LPvsNP,12h_9782_l-LPvsNP,13d_9782_l-LPvsNP,12h_9614_l-LPvsNP,12h_9614_r-LPvsNP,12h_9782_r-LPvsNP,13d_9782_r-LPvsNP,13d_9614_r-LPvsNP
Zm00001d014914,-5.0426,0.9982,2.6822,1.6451,1.1105,1.0049,-3.8029,-0.1064,0.3621,4.3786,-0.4138,0.4589
Zm00001d023904,-0.1943,-0.1704,0.0946,0.2768,-0.2577,-0.231,0.2977,-0.2175,-0.1926,-0.4525,0.0164,-0.0857
Zm00001d007951,0.0035,-0.0275,-0.0519,0.3577,0.0437,0.221,0.523,-0.1679,-0.0765,-0.5323,0.3091,0.1732
Zm00001d006372,2.1643,0.0,-1.583,8.9687,6.6582,6.6582,-1.5754,0.0,-0.4846,0.5826,-1.5754,-6.6582
Zm00001d034644,0.6431,0.0353,0.0569,0.7685,0.0189,0.1345,0.5455,-0.0856,0.1672,0.1468,-0.2956,0.1132
Zm00001d016572,-0.3539,-0.0895,0.089,0.4088,-0.037,0.7711,0.3268,-0.5075,0.1355,0.1817,0.2278,-0.2194
Zm00001d017857,0.2529,-0.085,0.03,1.1262,-0.1072,0.0842,0.8826,-0.5577,0.5288,0.1322,0.1466,0.9272
Zm00001d051685,-0.1389,-0.1106,-0.0252,1.0196,-0.0179,0.0366,0.634,-0.5733,0.4165,-0.0442,0.1448,0.6312
Zm00001d040880,0.5826,-0.735,-0.5826,0.0,0.0,0.4138,0.5826,0.0,0.0,-1.3176,0.0,1.3176