借助chatGPT解决：将fasta文件按空行分割成独立文件并按第一行命名（Python）

最新推荐文章于 2024-04-26 13:38:27 发布

行琉

最新推荐文章于 2024-04-26 13:38:27 发布

阅读量340

点赞数

文章标签： python

本文链接：https://blog.csdn.net/q506122967/article/details/128922737

版权

该文章展示了一个Python函数，用于读取FASTA格式的序列文件，并按每条序列的头部标识符（以 ">" 开头的行）拆分为多个单独的文件。文章依次修正了两个问题：函数定义写法导致的语法错误，以及序列头中的非法字符导致文件无法创建的错误，后者通过正则表达式替换这些字符来解决。

摘要由CSDN通过智能技术生成

# First attempt, reproduced as written: it is not valid Python.
# - The `def` line lists string literals where parameter names are required.
# - The f-strings below place the raw path D:\AAseq\split inside `{...}`,
#   where an expression (e.g. a variable name) is expected.
# - Backslashes in ordinary string literals start escape sequences; `\A`, `\H`
#   and `\s` are not recognised escapes.
# NOTE(review): the body lines carry no indentation here — presumably lost when
# the snippet was pasted; confirm against the author's original.
def split_fasta("D:\AAseq\Homo_CBP_sequence.fasta", "D:\AAseq\split"):
with open("D:\AAseq\Homo_CBP_sequence.fasta", "r") as f:
lines = f.readlines()
seq = []
for line in lines:
if line.startswith(">"):
if seq:
with open(f"{D:\AAseq\split}/{header[1:].strip()}.fasta", "w") as f:
f.write("".join(seq))
seq = []
header = line
seq.append(line)
if seq:
with open(f"{D:\AAseq\split}/{header[1:].strip()}.fasta", "w") as f:
f.write("".join(seq))

# The doubled quotes around the path on the next line are a syntax error too.
file_path = ""D:\AAseq\Homo_CBP_sequence.fasta""
save_folder = "D:\AAseq\split"

split_fasta("D:\AAseq\Homo_CBP_sequence.fasta", save_folder)

#提示：SyntaxError: invalid syntax
#这是因为定义函数时，括号里只能写形参名（如 file_path、save_folder），不能直接写带引号的字符串；实际路径应在调用函数时作为实参传入。同理，f-string 的花括号里应写变量名 save_folder，而不是路径本身。修改后如下所示：

def split_fasta(file_path, save_folder):
    """Split a multi-record FASTA file into one file per record.

    A record starts at a line beginning with ">" and runs up to (but not
    including) the next such line. Each record is written, header line
    included, to ``{save_folder}/{header}.fasta`` where ``header`` is the
    header line without its leading ">" and surrounding whitespace.

    Lines that appear before the first header belong to no record and are
    skipped. Two records with the same header overwrite each other because
    the output file is opened in "w" mode.

    Known limitation: the header text is used verbatim as the file name, so
    a header containing "/" is interpreted as a sub-directory and ``open``
    fails with FileNotFoundError.

    Args:
        file_path: Path of the FASTA file to read.
        save_folder: Existing directory that receives the per-record files.

    Raises:
        FileNotFoundError: If ``file_path`` or the target directory of an
            output file does not exist.
    """
    out = None  # handle of the record currently being written, if any
    try:
        with open(file_path, "r") as src:
            # Stream the input instead of loading every line into memory.
            for line in src:
                if line.startswith(">"):
                    # A new header ends the previous record.
                    if out is not None:
                        out.close()
                    out = open(f"{save_folder}/{line[1:].strip()}.fasta", "w")
                # Nothing is open until the first header has been seen.
                if out is not None:
                    out.write(line)
    finally:
        if out is not None:
            out.close()

# Example run: destination folder for the per-record files, then the input FASTA.
save_folder = "D:/AAseq/split"
file_path = "D:/AAseq/Homo_CBP_sequence.fasta"

split_fasta(file_path=file_path, save_folder=save_folder)

#提示：Traceback (most recent call last):
# File "<stdin>", line 1, in <module>
# File "<stdin>", line 8, in split_fasta
# FileNotFoundError: [Errno 2] No such file or directory: 'D:/AAseq/split/NP_057455.1 3-keto-steroid reductase/17-beta-hydroxysteroid dehydrogenase 7 isoform 1 [Homo sapiens].fasta'
#这是因为序列头中含有 "/"，它会被当作路径分隔符，程序于是试图在并不存在的子目录中创建文件；空格本身是合法的文件名字符，这里一并替换只是为了让文件名更规整。解决方法是在拼接文件名之前先替换这些字符：

import re

def split_fasta(file_path, save_folder):
    """Split a multi-record FASTA file into one file per record.

    A record starts at a line beginning with ">" and runs up to (but not
    including) the next such line. Each record is written, header line
    included, to ``{save_folder}/{name}.fasta``. ``name`` is the stripped
    header line with every whitespace character and every character in
    ``/ \\ : * ? " < > |`` replaced by "_", minus the leading character
    (the original ">").

    Lines that appear before the first header belong to no record and are
    skipped. Two records whose sanitized names coincide overwrite each other
    because the output file is opened in "w" mode.

    Args:
        file_path: Path of the FASTA file to read.
        save_folder: Directory that receives the per-record files; it is
            created (including parents) when missing.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    import os  # local import: only ``re`` is imported at file level

    # Whitespace and "/" as before, plus the other characters Windows rejects
    # in file names; headers such as "sp|P12345|..." contain them.
    unsafe_chars = re.compile(r'[\s/\\:*?"<>|]')

    os.makedirs(save_folder, exist_ok=True)

    out = None  # handle of the record currently being written, if any
    try:
        with open(file_path, "r") as src:
            # Stream the input instead of loading every line into memory.
            for line in src:
                if line.startswith(">"):
                    # A new header ends the previous record.
                    if out is not None:
                        out.close()
                    # Sanitize first, then drop the replaced leading ">".
                    name = unsafe_chars.sub("_", line.strip())[1:]
                    out = open(f"{save_folder}/{name}.fasta", "w")
                # Nothing is open until the first header has been seen.
                if out is not None:
                    out.write(line)
    finally:
        if out is not None:
            out.close()

# Example run: destination folder for the per-record files, then the input FASTA.
save_folder = "D:/AAseq/split"
file_path = "D:/AAseq/Homo_CBP_sequence.fasta"

split_fasta(file_path=file_path, save_folder=save_folder)