import pandas as pd
def process_doc(txt_path1, txt_path2):
    """Annotate the rows of file 2 with a value looked up in file 1.

    Both files are read as whitespace-separated tables without a header.
    For every row of file 2 whose column 11 equals column 0 of some row of
    file 1, column 1 of that file-1 row is stored in a ``'total_length'``
    column of file 2.  When several file-1 rows share the same key, the last
    one wins.  Rows of file 2 without a match are left untouched (NaN once
    the column exists).

    The annotated table is written to ``processed.txt`` (tab-separated, no
    header, no index) and also returned.

    Args:
        txt_path1: Path of the lookup table (key in column 0, value in
            column 1).
        txt_path2: Path of the table to annotate (key in column 11).

    Returns:
        The DataFrame read from ``txt_path2`` with ``'total_length'`` filled
        in for every matching row.
    """
    # Raw strings: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    file1 = pd.read_csv(txt_path1, sep=r'\s+', header=None)
    file2 = pd.read_csv(txt_path2, sep=r'\s+', header=None)

    # One pass over file 1 builds the lookup, replacing a nested scan of
    # every (file 1 row, file 2 row) pair.  Later rows overwrite earlier
    # ones, so the last file-1 row for a key wins.  NaN keys are skipped
    # because NaN never compares equal to anything.
    length_by_key = {
        key: value
        for key, value in zip(file1[0], file1[1])
        if pd.notna(key)
    }

    for label, key in file2[11].items():
        if key in length_by_key:
            file2.at[label, 'total_length'] = length_by_key[key]

    # Save the annotated table to a new file.
    file2.to_csv("processed.txt", sep='\t', index=False, header=False)
    return file2
def read_csv_file(filename, index_out, index):
    """Return the first two columns of a row range of a whitespace-separated file.

    The file is read without a header row.  Rows are selected by position,
    from ``index_out`` through ``index`` with both ends included.

    Args:
        filename: Path of the file to read.
        index_out: Position of the first row to return.
        index: Position of the last row to return (inclusive).

    Returns:
        A list of ``[col0, col1]`` lists, one per selected row.
    """
    # Raw string: '\s' inside a normal string literal is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    df = pd.read_csv(filename, sep=r'\s+', header=None)
    # ``index + 1`` because slice ends are exclusive but ``index`` is inclusive.
    selected = df.iloc[index_out:index + 1, :2]
    return selected.values.tolist()
def total_length_with_overlap(segments):
    """Return the total length covered by a collection of segments.

    Overlapping parts are counted only once: a segment that starts inside
    the region already covered contributes only the part that extends past
    it.  The input is not modified.

    Args:
        segments: Iterable of ``(start, end)`` pairs.  A pair given in
            reverse order (``start > end``) is treated as the same segment
            with its end points swapped, so it never contributes a negative
            length.

    Returns:
        The covered length, computed as ``end - start`` per disjoint
        stretch; ``0`` for an empty input.
    """
    # sorted() builds a new list, so the caller's sequence keeps its order
    # (an in-place sort would reorder it as a side effect).
    ordered = sorted(
        (start, end) if start <= end else (end, start)
        for start, end in segments
    )

    total = 0
    covered_to = float('-inf')  # right edge of everything counted so far
    for start, end in ordered:
        if start <= covered_to:
            # Starts inside the covered region: add only the new tail, or
            # nothing when the segment is fully contained.
            total += max(0, end - covered_to)
        else:
            # Disjoint from everything so far: add the whole segment.
            total += end - start
        covered_to = max(covered_to, end)
    return total
def calculate(filename, index_out, index):
    """Read a row range of *filename* and return its combined segment length.

    The rows ``index_out`` through ``index`` are fetched with
    ``read_csv_file`` and the resulting pairs are passed to
    ``total_length_with_overlap``, whose result is returned unchanged.
    """
    return total_length_with_overlap(read_csv_file(filename, index_out, index))
def count(data, filename="processed.txt"):
    """Compute a per-group segment length and print a ratio for each group.

    Consecutive rows of *data* that share the same value in column 11 form
    one group.  On the last row of every group the function:

    * stores ``calculate(filename, first_row, last_row)`` in the
      ``'total_sum'`` column of that row,
    * prints ``"<col 11>:<col 12>:<col 15 / col 14>"`` for that row.

    Finally the whole table is written to ``finally.txt`` (tab-separated,
    no header, no index).

    Columns 14 and 15 are addressed by position.  This assumes the table
    has 15 columns before ``'total_sum'`` is appended, so that position 14
    is the total length and position 15 is ``'total_sum'`` — TODO confirm
    against the input layout.

    Args:
        data: DataFrame to annotate; it is modified in place.
        filename: Path of the file from which ``calculate`` reads the
            segment rows.  Defaults to ``"processed.txt"``, so the function
            no longer depends on a module-level ``filename`` variable.

    Raises:
        ValueError: If column 14 or 15 of a group's last row is NaN
            (``int()`` cannot convert it).
        ZeroDivisionError: If column 14 of a group's last row is 0.
    """
    tag_line = 11     # column holding the query name
    echo_pos = 12     # second field echoed in the printed report
    length_pos = 14   # divisor of the printed ratio
    sum_pos = 15      # dividend of the printed ratio ('total_sum')

    last_pos = len(data) - 1
    index_out = 0     # position of the first row of the current group

    # enumerate() supplies the row position, so positional look-ups stay
    # correct even if the frame does not carry a default 0..n-1 index.
    for pos, (label, row) in enumerate(data.iterrows()):
        closes_group = (
            pos == last_pos
            or row[tag_line] != data.iloc[pos + 1, tag_line]
        )
        if not closes_group:
            continue

        data.at[label, 'total_sum'] = calculate(filename, index_out, pos)
        print("{}:{}:{}".format(
            data.iloc[pos, tag_line],
            data.iloc[pos, echo_pos],
            int(data.iloc[pos, sum_pos]) / int(data.iloc[pos, length_pos])))
        index_out = pos + 1

    data.to_csv("finally.txt", sep='\t', index=False, header=False)
if __name__ == '__main__':
    # Path of the intermediate file that process_doc writes.
    filename = "processed.txt"
    # Annotate t.txt with the values from test.txt, then report per group.
    count(process_doc("test.txt", "t.txt"))
# Script name: coverageV3.0.py
# (Web-page footer captured with the listing: "latest recommended article published 2024-09-15 22:31:42".)