在之前使用的几个版本的脚本中,发现由于读取的数据和index信息不一致导致写入的结果变少,故增加判断,在生成文件过程中方便直接发现异常,并进行调整输入的文件信息。
此版本是在之前的只做了修改,未进行优化:
后续优化:
txt文本可用pandas模块读取
本次的异常判断可用try except
后续再进行优化
#_*_coding:UTF-8_*_
import xlrd
import csv
import datetime
# import pandas as pd
def read_excel(lib_and_index_info):
pooling_file = xlrd.open_workbook(r"*.xlsx", "r")
table = pooling_file.sheet_by_name("Sheet1")
n_rows = table.nrows # 获取行数
# lib_and_index_info = {} # 新建字典存储编号以及index序列信息 键为实验室编号信息,值为index信息
for i in range(5, n_rows): # pooling表的第5行开始为实验室记录的文库及index取样等信息的表头
lib_num = table.cell(i, 1).value
index_info = table.cell(i, 3).value
# 如果单元格中有空白行或者只有一个cell有数据的删除
lib_and_index_info[lib_num] = index_info # 生成样本编号与index对应关系的字典
if index_info == "": # 判断index中信息是否为空值
lib_and_index_info.pop(lib_num) # 删除值为空的键,即从字典中删除了该信息
def read_index_seq(index_dict):
# index_dict = {}
"""
增加了对index碱基数的if判断,在调用该脚本是写入使用的是8或者6碱基的模式,
生成满足6碱基或者8碱基的index序列
:param index_dict:
:return:
"""
index_num = int(input("输入index的碱基数: ")) # 数值类型转换
# index_num = input("输入index的碱基数: ") # 该命令直接执行else后的语句
with open(r"D:\My Projects\DoubleIndex.txt", "r") as index_file:
if index_num == 8:
for i in range(1, 96):
lines = index_file.readline()
index_name = lines.split("\t")[0]
P7_seq = lines.split("\t")[2]
P5_seq = lines.split("\t")[4]
index_dict[index_name] = [P7_seq, P5_seq]
else:
for i in range(1, 96):
lines = index_file.readline()
index_name = lines.split("\t")[0]
P7_seq = lines.split("\t")[2][:6]
P5_seq = lines.split("\t")[4][:6]
index_dict[index_name] = [P7_seq, P5_seq]
index_file.close()
def write_csv(lib_and_index_info,index_dict):
global write_sam_num # 定义全局变量,用于统计写入的数量,并返回该结果用于后续判断
write_sam_num = 0
current_time = datetime.datetime.now().strftime('%Y/%m/%d')
with open(r"D:\My Projects\temp.csv", "w", newline="") as temp_csv: # newline=""控制行距,经测试只可在python3环境下使用
csv_writer = csv.writer(temp_csv)
csv_writer.writerow(["[Header]"])
col_names = ["Sample_ID", "Sample_Name", "Sample_Plate", "Sample_Well",
"I7_Index_ID", "index", "index2","Sample_Project", "Description"] # 表头信息
csv_writer = csv.DictWriter(temp_csv, fieldnames=col_names) # 添加表头信息,用于构建字典
csv_writer.writeheader()
for k_lib, v_lib in lib_and_index_info.items():
for k_index, v_index in index_dict.items():
if v_lib == k_index:
# index_P7 = v_index[0]
# index_P5 = v_index[1]
csv_writer.writerow(
{"Sample_ID": k_lib, "Sample_Name": k_lib, "I7_Index_ID": v_lib, "index": v_index[0],
"index2": v_index[1], "Sample_Project": "fastq", "Description": "fastq"})
write_sam_num += 1
temp_csv.close()
return write_sam_num
def main(): # 主函数
lib_and_index_info = {}
index_dict = {}
read_excel(lib_and_index_info)
read_index_seq(index_dict)
write_csv(lib_and_index_info,index_dict)
# 异常处理
if write_sam_num == len(lib_and_index_info): # 增加判断写入的数据和读取的数据个数是否一致
print("Numbers of sample to be analyzed:{}".format(len(lib_and_index_info)))
else:
print("Please check the sample information of the output file:Inconsistent results")
if __name__ == "__main__":
main()
记录于20191003