from collections import deque
import difflib
import numpy as np
def calculate_similarity(text1, text2):
    """Return the similarity ratio between two sentences.

    The score is the ``ratio()`` reported by ``difflib.SequenceMatcher``
    (no junk filter) with ``text1`` as the first sequence and ``text2`` as
    the second.
    """
    matcher = difflib.SequenceMatcher(None, text1, text2)
    return matcher.ratio()
def find_most_similar_sentence(given_sentence, all_lines):
    """Find the line in ``all_lines`` that is most similar to a sentence.

    Args:
        given_sentence: The sentence to look up.
        all_lines: Sized sequence of candidate lines to compare against.

    Returns:
        tuple: ``(most_similar_sentence, max_similarity, position_percentage)``
        where ``position_percentage`` is the 1-based position of the best
        line expressed as a percentage of ``len(all_lines)``. If
        ``all_lines`` is empty, or no line scores a similarity above 0,
        the result is ``("", 0, 0.0)``.
    """
    total_sentences = len(all_lines)
    if total_sentences == 0:
        # Nothing to compare against; returning early also avoids a
        # division by zero in the percentage computation below.
        return "", 0, 0.0

    max_similarity = 0
    most_similar_sentence = ""
    most_similar_index = -1
    for index, line in enumerate(all_lines):
        similarity = calculate_similarity(given_sentence, line)
        # Strict comparison: on ties the earliest line wins, and lines with
        # a similarity of exactly 0 are never selected.
        if similarity > max_similarity:
            max_similarity = similarity
            most_similar_sentence = line
            most_similar_index = index

    # Position of the best match within the file, as a percentage.
    position_percentage = (most_similar_index + 1) / total_sentences * 100
    return most_similar_sentence, max_similarity, position_percentage
def remove_outliers_iqr(data):
    """Remove outliers from ``data`` using the IQR rule.

    Values outside ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` are dropped, where
    ``Q1``/``Q3`` are the 25th/75th percentiles and ``IQR = Q3 - Q1``.

    Args:
        data (array-like): Input values.

    Returns:
        np.ndarray: The values that lie within the bounds (inclusive).
        An empty input yields an empty array.
    """
    data = np.array(data)
    if data.size == 0:
        # Percentiles are undefined for an empty sample; nothing to filter.
        return data

    # Quartiles, computed in a single call.
    q1, q3 = np.percentile(data, [25, 75])
    iqr = q3 - q1

    # Outlier thresholds.
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Keep only the values inside the thresholds.
    return data[(data >= lower_bound) & (data <= upper_bound)]
# Source log file path
source_log_file_path = "/RAID5/projects/fuxingwen/fxw/code/FastSurferProgram/FastSurfer.log"
# Target log file path
target_log_file_path = "/RAID5/projects/fuxingwen/fxw/code/FastSurferProgram/FastSurferEX.log"

# Number of trailing source-log lines to match. The variable name and the
# step descriptions call for a 50-line tail (a maxlen of 5 contradicted them).
TAIL_LINE_COUNT = 50

# Read the last TAIL_LINE_COUNT lines of the source log; the bounded deque
# keeps only the tail while the file is streamed line by line.
with open(source_log_file_path, "r") as file:
    last_50_lines = deque(file, maxlen=TAIL_LINE_COUNT)

# Read every line of the target log.
with open(target_log_file_path, "r") as file:
    target_lines = file.readlines()

# For each tail line, find its most similar line in the target log and record
# where that line sits in the target file (as a percentage), keeping only
# matches whose similarity exceeds 0.7.
pplist = []
if target_lines:  # an empty target log offers nothing to match against
    for line in last_50_lines:
        line = line.strip()  # drop the trailing newline
        _, max_similarity, position_percentage = find_most_similar_sentence(line, target_lines)
        if max_similarity > 0.7:
            pplist.append(position_percentage)

if pplist:
    filtered_data = remove_outliers_iqr(pplist)
    print(np.mean(filtered_data))
else:
    # No sufficiently similar line was found: report "nan" explicitly rather
    # than computing statistics on an empty sample.
    filtered_data = np.array([])
    print(float("nan"))
# Compare sentence similarity between two log files and analyze the similarity data.
# First published on 2024-08-13 18:19:03.