相似度(参数调节代码)

// test1.cpp : Defines the entry point for the console application.
//

#include <vector>
#include <cstring>
#include <fstream>
#include <string>
#include <iostream>
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <map>
#include <ctime>
#include <iomanip>
#include <cmath>
#include <queue>

using namespace std;

// One labelled time series as loaded by readFile().
struct serie
{
	// Sample values in file order. getdis() reads them as interleaved
	// pairs (vec[2k], vec[2k + 1]); the other distance functions treat
	// them as a flat 1-D signal.
	vector<double> vec;
	// Label token that precedes the samples in the data file.
	string label;
	// Sum of absolute differences between consecutive samples,
	// accumulated while the series is read.
	double len;
};
// Divisor applied to the candidate window length on every step of
// find_win_size().
double speed = 1.6;
// Sentinel cost: returned by DPA()/DPB() for cells outside the band and
// used as the initial "best distance" in main() and getdistance4().
double INF = 1e10;
// Series of the file loaded most recently by readFile().
serie series[10000];
// Cost tables of getdistance00/getdistance0t/getdistance0, read back
// through DPA()/DPB().
double dpA[2000][2000];
double dpB[2000][2000];
// Samples per series; readFile() overwrites it with lens[filenum - 1] and
// down_sample() shrinks it.
int num = 637;
// Weight multiplied onto cross-series sample differences in the DP
// distances; set by getvalue() and main().
double coe=0.2;
// Half-width of the band |i - j| <= WIN used by the DP distances; set by
// getvalue() and main().
int WIN = 0;
// Fraction of samples that down_sample() removes from each series.
double down_ratio;
// Per-dataset metadata filled by readInfo(), indexed by 0-based row of
// numhelp.txt (row i describes dataset number i + 1).
// Dataset name used to build the data file path.
map<int, string> names;
// Samples per series.
vector<int> lens(38, 0);
// Number of series read when readFile() is called with train == false.
vector<int> num1(38, 0);
// Number of series read when readFile() is called with train == true.
vector<int> num2(38, 0);
// Used as a divisor for the neighbour subsampling stride; presumably the
// number of classes in the dataset -- TODO confirm against numhelp.txt.
vector<int> classes(38,0);
// Assigned to `coe` in main(); presumably a deviation/mean statistic of the
// dataset -- TODO confirm against numhelp.txt.
vector<double> dev_mean(38, 0);

// Formats `value` as a decimal string.
//
// The buffer holds any `int` rendering with room to spare (a 32-bit INT_MIN
// needs 12 bytes including the terminator), and snprintf bounds the write so
// a wider `int` could never overrun it either.
std::string to_string(int value)
{
	char buffer[32];
	std::snprintf(buffer, sizeof(buffer), "%d", value);
	return std::string(buffer);
}
// Band-aware read of dpA.
//
// Returns dpA[i][j] when (i, j) lies inside the band |i - j| <= win or is
// one of the two border cells (0, win + 1) / (win + 1, 0); every other cell
// yields INF.
double DPA(int i, int j, int win)
{
	const bool onRowBorder = (i == 0) && (j == win + 1);
	const bool onColBorder = (j == 0) && (i == win + 1);
	const bool insideBand = std::abs(i - j) <= win;

	if (onRowBorder || onColBorder || insideBand)
		return dpA[i][j];
	return INF;
}
// Band-aware read of dpB.
//
// Cells outside the band |i - j| <= win yield INF, except for the two
// border cells (0, win + 1) and (win + 1, 0), which are returned as stored.
double DPB(int i, int j, int win)
{
	const int edge = win + 1;
	if (std::abs(i - j) > win)
	{
		const bool borderCell = (i == 0 && j == edge) || (j == 0 && i == edge);
		if (!borderCell)
			return INF;
	}
	return dpB[i][j];
}
double getdis(int one, int two, int i, int j)
{
	vector<double>& vec1 = series[one].vec;
	vector<double>& vec2 = series[two].vec;
	int n = vec1.size();
	double ans = (vec1[i * 2] - vec2[j * 2])*(vec1[i * 2] - vec2[j * 2]) + (vec1[i * 2 + 1] - vec2[j * 2 + 1])*(vec1[i * 2 + 1] - vec2[j * 2 + 1]);
	return sqrt(ans);
}
// Banded two-table DP distance between two 2-D trajectories (points stored
// as interleaved pairs, see getdis()).
//
// dpA[i][j] is filled from row i - 1 and dpB[i][j] from column j - 1.
// Staying in the same table costs the length of the step inside that series;
// switching tables costs coe * distance(one[i - 1], two[j - 1]). Only cells
// with |i - j| <= WIN are filled; DPA()/DPB() report INF elsewhere.
//
// The row/column count is derived from series `two`; both series are
// assumed to hold the same number of points.
//
// Returns min(dpA, dpB) at the last cell.
double getdistance00(int one, int two)
{
	// Two values per point, plus one border row/column at index 0. Using
	// size / 2 + 1 (rather than (size + 1) / 2) keeps the last point in the
	// comparison, matching the 1-D variants that use size + 1.
	const int n = static_cast<int>(series[two].vec.size() / 2) + 1;
	const int win = WIN;

	dpA[0][0] = 0;
	dpB[0][0] = 0;
	dpA[1][0] = 0;
	dpB[0][1] = 0;

	// Border cells accumulate the path length along a single series.
	const int borderLast = std::min(n - 1, win + 1);
	for (int k = 2; k <= borderLast; k++)
	{
		dpA[k][0] = dpA[k - 1][0] + getdis(one, one, k - 1, k - 2);
		dpB[0][k] = dpB[0][k - 1] + getdis(two, two, k - 1, k - 2);
	}

	for (int i = 1; i < n; i++)
	{
		const int jEnd = std::min(n, i + win + 1);
		for (int j = std::max(1, i - win); j < jEnd; j++)
		{
			const double cross = coe * getdis(one, two, i - 1, j - 1);

			// The i == 1 / j == 1 guards avoid reading the point before the
			// first one when computing the in-series step.
			if (i != 1)
				dpA[i][j] = std::min(DPA(i - 1, j, win) + getdis(one, one, i - 1, i - 2), DPB(i - 1, j, win) + cross);
			else
				dpA[i][j] = DPB(i - 1, j, win) + cross;

			// The step inside series `two` runs from point j - 2 to point
			// j - 1 (both indices belong to `two`).
			if (j != 1)
				dpB[i][j] = std::min(DPA(i, j - 1, win) + cross, DPB(i, j - 1, win) + getdis(two, two, j - 1, j - 2));
			else
				dpB[i][j] = DPA(i, j - 1, win) + cross;
		}
	}
	return std::min(dpA[n - 1][n - 1], dpB[n - 1][n - 1]);
}
// Variant of getdistance0() in which advancing along series `two` is free:
// every in-series step cost of `two` is multiplied by zero, while steps
// along `one` and table switches (coe * |one[i - 1] - two[j - 1]|) keep
// their cost. Only cells with |i - j| <= WIN are filled.
//
// Returns min(dpA, dpB) at the last cell.
double getdistance0t(int one, int two)
{
	const std::vector<double>& a = series[one].vec;
	const std::vector<double>& b = series[two].vec;
	const int n = b.size() + 1;
	const int win = WIN;

	dpA[0][0] = 0;
	dpB[0][0] = 0;
	dpA[1][0] = 0;
	dpB[0][1] = 0;

	const int borderLast = std::min(n - 1, win + 1);
	for (int k = 2; k <= borderLast; k++)
	{
		dpA[k][0] = dpA[k - 1][0] + std::fabs(a[k - 1] - a[k - 2]);
		// The zero factor is intentional: steps inside `two` cost nothing.
		dpB[0][k] = dpB[0][k - 1] + 0 * std::fabs(b[k - 1] - b[k - 2]);
	}

	for (int i = 1; i < n; i++)
	{
		const int first = std::max(1, i - win);
		const int last = std::min(n, i + win + 1);
		for (int j = first; j < last; j++)
		{
			const double cross = coe * std::fabs(a[i - 1] - b[j - 1]);

			// Entering (i, j) from the row above.
			double viaRow = DPB(i - 1, j, win) + cross;
			if (i != 1)
				viaRow = std::min(DPA(i - 1, j, win) + std::fabs(a[i - 1] - a[i - 2]), viaRow);
			dpA[i][j] = viaRow;

			// Entering (i, j) from the column to the left.
			double viaCol = DPA(i, j - 1, win) + cross;
			if (j != 1)
				viaCol = std::min(viaCol, DPB(i, j - 1, win) + 0 * std::fabs(b[j - 1] - b[j - 2]));
			dpB[i][j] = viaCol;
		}
	}
	return std::min(dpA[n - 1][n - 1], dpB[n - 1][n - 1]);
}
// Banded two-table DP distance between two 1-D series.
//
// dpA[i][j] is entered from row i - 1, dpB[i][j] from column j - 1. Staying
// in the same table costs the absolute step inside that series; switching
// tables costs coe * |one[i - 1] - two[j - 1]|. Only cells with
// |i - j| <= WIN are filled; DPA()/DPB() report INF for the rest.
//
// Returns min(dpA, dpB) at the last cell.
double getdistance0(int one, int two)
{
	const std::vector<double>& a = series[one].vec;
	const std::vector<double>& b = series[two].vec;
	const int n = b.size() + 1;
	const int win = WIN;

	dpA[0][0] = 0;
	dpB[0][0] = 0;
	dpA[1][0] = 0;
	dpB[0][1] = 0;

	// Border cells accumulate the path length along a single series.
	const int borderLast = std::min(n - 1, win + 1);
	for (int k = 2; k <= borderLast; k++)
	{
		dpA[k][0] = dpA[k - 1][0] + std::fabs(a[k - 1] - a[k - 2]);
		dpB[0][k] = dpB[0][k - 1] + std::fabs(b[k - 1] - b[k - 2]);
	}

	for (int i = 1; i < n; i++)
	{
		const int first = std::max(1, i - win);
		const int last = std::min(n, i + win + 1);
		for (int j = first; j < last; j++)
		{
			const double cross = coe * std::fabs(a[i - 1] - b[j - 1]);

			// The i == 1 / j == 1 cases have no previous sample to step from.
			double viaRow = DPB(i - 1, j, win) + cross;
			if (i != 1)
				viaRow = std::min(DPA(i - 1, j, win) + std::fabs(a[i - 1] - a[i - 2]), viaRow);
			dpA[i][j] = viaRow;

			double viaCol = DPA(i, j - 1, win) + cross;
			if (j != 1)
				viaCol = std::min(viaCol, DPB(i, j - 1, win) + std::fabs(b[j - 1] - b[j - 2]));
			dpB[i][j] = viaCol;
		}
	}
	return std::min(dpA[n - 1][n - 1], dpB[n - 1][n - 1]);
}
// Cost table written and read by getdistance1() and getdistance4().
double dp[2000][2000];
// Banded dynamic-time-warping style distance between two 1-D series.
//
// dp[i][j] = |one[i] - two[j]| + the cheapest in-band predecessor among
// (i - 1, j - 1), (i, j - 1) and (i - 1, j). The band is |i - j| <= WIN; a
// negative WIN is treated as 0. With WIN == 0 only the diagonal remains and
// the result is the sum of |one[i] - two[i]|.
//
// The length is taken from series `two`; both series are assumed to have
// the same length. Returns 0 for an empty series.
double getdistance1(int one, int two)
{
	const std::vector<double>& a = series[one].vec;
	const std::vector<double>& b = series[two].vec;
	const int n = static_cast<int>(b.size());
	if (n == 0)
		return 0;
	const int win = std::max(0, WIN);

	dp[0][0] = std::fabs(a[0] - b[0]);
	// Border cells: the band below only ever reads indices up to `win`, so
	// that is all that needs initialising (and nothing when win == 0).
	const int borderEnd = std::min(win + 1, n);
	for (int k = 1; k < borderEnd; k++)
	{
		dp[k][0] = dp[k - 1][0] + std::fabs(a[k] - b[0]);
		dp[0][k] = dp[0][k - 1] + std::fabs(a[0] - b[k]);
	}

	for (int i = 1; i < n; i++)
	{
		const int jEnd = std::min(i + win + 1, n);
		for (int j = std::max(1, i - win); j < jEnd; j++)
		{
			// The diagonal predecessor is always inside the band.
			double best = dp[i - 1][j - 1];
			// (i, j - 1) is inside the band unless j sits on the lower edge.
			if (j > i - win)
				best = std::min(best, dp[i][j - 1]);
			// (i - 1, j) is inside the band unless j sits on the upper edge.
			if (j < i + win)
				best = std::min(best, dp[i - 1][j]);
			dp[i][j] = std::fabs(a[i] - b[j]) + best;
		}
	}
	return dp[n - 1][n - 1];
}

// Sum of absolute differences (L1 distance) between the two series,
// sample by sample.
//
// Only the common prefix is compared, so a shorter series `two` can no
// longer be read past its end.
double getdistance2(int one, int two)
{
	const std::vector<double>& a = series[one].vec;
	const std::vector<double>& b = series[two].vec;
	const std::vector<double>::size_type count = std::min(a.size(), b.size());

	double ans = 0;
	for (std::vector<double>::size_type i = 0; i < count; i++)
	{
		ans += std::fabs(a[i] - b[i]);
	}
	return ans;
}
// L1-style distance that also compares each sample with the following
// sample of the other series.
//
// For every i < n - 1 it adds |one[i] - two[i]|, |two[i] - one[i + 1]| and
// |one[i] - two[i + 1]|, then |one[n - 1] - two[n - 1]| for the last pair.
// The length n is taken from series `one`; both series are assumed to have
// the same length. Returns 0 for an empty series.
double getdistance3(int one, int two)
{
	const std::vector<double>& a = series[one].vec;
	const std::vector<double>& b = series[two].vec;
	const int n = static_cast<int>(a.size());
	if (n == 0)
		return 0;

	double ans = 0;
	for (int i = 0; i < n - 1; i++)
	{
		// std::fabs keeps the arithmetic in double regardless of which
		// abs overloads happen to be visible.
		ans += std::fabs(a[i] - b[i]);
		ans += std::fabs(b[i] - a[i + 1]);
		ans += std::fabs(a[i] - b[i + 1]);
	}
	ans += std::fabs(a[n - 1] - b[n - 1]);
	return ans;
}
// Banded single-table DP distance between two 1-D series.
//
// A cell (i, j) is reached either from (i - 1, j) or from (i, j - 1). Each
// move costs the cheaper of the in-series step (|one[i - 1] - one[i]| or
// |two[j - 1] - two[j]|) and the weighted cross difference
// coe * |one[i] - two[j]|. A predecessor is used only if it lies inside the
// band |i - j| <= WIN; a cell without an in-band predecessor gets INF.
//
// The length is taken from series `two`; both series are assumed to have
// the same length. Returns 0 for an empty series.
double getdistance4(int one, int two)
{
	const std::vector<double>& a = series[one].vec;
	const std::vector<double>& b = series[two].vec;
	const int n = static_cast<int>(b.size());
	if (n == 0)
		return 0;
	const int win = WIN;

	dp[0][0] = std::fabs(a[0] - b[0]);
	for (int k = 1; k < n; k++)
	{
		dp[k][0] = dp[k - 1][0] + std::fabs(a[k - 1] - a[k]);
		dp[0][k] = dp[0][k - 1] + std::fabs(b[k - 1] - b[k]);
	}

	for (int i = 1; i < n; i++)
	{
		const int jEnd = std::min(i + win + 1, n);
		for (int j = std::max(1, i - win); j < jEnd; j++)
		{
			const double cross = std::fabs(a[i] - b[j]) * coe;
			double best = INF;

			// (i - 1, j) is inside the band iff j - (i - 1) <= win.
			if (j - i + 1 <= win)
			{
				best = std::min(best, dp[i - 1][j] + std::fabs(a[i - 1] - a[i]));
				best = std::min(best, dp[i - 1][j] + cross);
			}
			// (i, j - 1) is inside the band iff i - (j - 1) <= win.
			if (i - j + 1 <= win)
			{
				best = std::min(best, dp[i][j - 1] + std::fabs(b[j - 1] - b[j]));
				best = std::min(best, dp[i][j - 1] + cross);
			}
			dp[i][j] = best;
		}
	}
	return dp[n - 1][n - 1];
}
// L1-style distance with a one-sample tolerance.
//
// The first and last samples are compared directly. Every interior sample
// one[i] is matched against the closest of two[i - 1], two[i] and
// two[i + 1]. Each sample of `one` contributes exactly once; in particular
// the last sample is counted a single time.
//
// The length n is taken from series `one`; both series are assumed to have
// the same length. Returns 0 for an empty series.
double getdistance5(int one, int two)
{
	const std::vector<double>& a = series[one].vec;
	const std::vector<double>& b = series[two].vec;
	const int n = static_cast<int>(a.size());
	if (n == 0)
		return 0;

	double ans = std::fabs(a[0] - b[0]);
	// With a single sample, first and last are the same element.
	if (n > 1)
		ans += std::fabs(a[n - 1] - b[n - 1]);

	for (int i = 1; i < n - 1; i++)
	{
		const double toPrev = std::fabs(a[i] - b[i - 1]);
		const double toSame = std::fabs(a[i] - b[i]);
		const double toNext = std::fabs(a[i] - b[i + 1]);
		ans += std::min(toPrev, std::min(toSame, toNext));
	}
	return ans;
}

void readInfo()
{
	ifstream file("/home/xiefubao/myproject/experiment/vldb_dataset/numhelp.txt");
	if (!file.is_open())
	{
		cout << "num.txt not open!" << endl;
		exit(0);
	}
	for (int i = 0; i < 38; i++)
	{
		string now;
		int no;
		file >> no;
		file >> now;
		file >> lens[i];
		file >> num1[i];
		file >> num2[i];
		file >> classes[i];
		file >> dev_mean[i];
		names[i] = now;
	}
}
// Randomly thins the first `counter` series.
//
// From each series, num - int(num * (1 - down_ratio)) samples are removed at
// positions drawn without replacement via rand(); the surviving samples keep
// their original order. Afterwards `num` is updated to the new length.
void down_sample(int counter)
{
	const int kept = num * (1 - down_ratio);
	const int dropped = num - kept;

	for (int s = 0; s < counter; s++)
	{
		// Pool of sample positions that are still candidates for removal.
		std::vector<int> pool(num, 0);
		for (int k = 0; k < num; k++)
		{
			pool[k] = k;
		}

		std::vector<bool> keep(num, true);
		int live = num;
		for (int d = 0; d < dropped; d++)
		{
			const int pick = std::rand() % live;
			keep[pool[pick]] = false;
			// Move the chosen position behind the live part of the pool so
			// it cannot be drawn again.
			--live;
			std::swap(pool[pick], pool[live]);
		}

		std::vector<double> thinned(kept, 0);
		int out = 0;
		for (int k = 0; k < num; k++)
		{
			if (keep[k])
				thinned[out++] = series[s].vec[k];
		}
		series[s].vec.swap(thinned);
	}
	num = kept;
}
void readFile(int filenum , bool train)
{
	num = lens[filenum - 1];
	string filepath = "/home/xiefubao/myproject/experiment/vldb_dataset/" + to_string(filenum) + "/" + names[filenum - 1] + (train ? "_TRAIN" : "_TEST") + ".txt";
	cout << filepath << endl;
	string now;
	ifstream in(filepath.c_str());
	if (!in.is_open())
	{
		cout << "not open" << endl;
		exit(1);
	}
	int number = train ? num2[filenum - 1] : num1[filenum - 1];
	for(int u = 0;u < number;u++)
	{
		in >> now;
		series[u].label = now;
		double len = 0;
		series[u].vec.clear();
		for (int i = 0; i < num; i++)
		{
			double point;
			in >> point;
			point = point;
			series[u].vec.push_back(point);
			if (i != 0)
				len += fabs(point - series[u].vec[i - 1]);
		}
		series[u].len = len;
	}
	cout<<"done 1"<<endl;
	//down_sample(number);
}
/*bool operator<(pair<double,string> n1,pair<double,string> n2) {
        return n1.first < n2.first;
}*/
// Scores one (window, coefficient) setting with a 3-nearest-neighbour vote.
//
// Sets the globals WIN = wlen and coe = c, then for every series i among the
// first `counter` series collects the three nearest sampled neighbours under
// `distance`. Neighbours are popped farthest-first; the u-th popped neighbour
// adds u * u to the score if its label equals the label of i, so closer
// matches weigh more. Candidates j are subsampled deterministically:
// only pairs with ((i + 3737) * (j + 4343)) % stride == 0 are evaluated,
// where stride = max(1, counter / classes[filenum] / 5).
//
// filenum  0-based index into `classes`.
// wlen     band half-width to evaluate.
// c        cross-series weight to evaluate.
// counter  number of series to use.
// distance distance function taking two series indices.
//
// Prints and returns the accumulated score.
int getvalue(int filenum,int wlen,double c,int counter,double(*distance) (int, int))
{
	WIN = wlen;
	coe = c;

	const int prinum = 3;
	// A zero entry in `classes` (e.g. metadata not loaded) must not divide
	// by zero; treat it as a single class.
	const int classCount = std::max(1, classes[filenum]);
	const int stride = std::max(1, counter / classCount / 5);

	int ans = 0;
	for (int i = 0; i < counter; i++)
	{
		// Max-heap on distance: top() is the farthest neighbour kept so far.
		std::priority_queue<std::pair<double, std::string> > nearest;
		for (int j = 0; j < counter; j++)
		{
			if (i == j) continue;
			if (((i + 3737) * (j + 4343)) % stride != 0) continue;

			const double dis = distance(i, j);
			if (static_cast<int>(nearest.size()) < prinum)
			{
				nearest.push(std::make_pair(dis, series[j].label));
			}
			else if (nearest.top().first > dis)
			{
				nearest.pop();
				nearest.push(std::make_pair(dis, series[j].label));
			}
		}

		for (int u = 1; u <= prinum && !nearest.empty(); u++)
		{
			if (nearest.top().second == series[i].label)
			{
				ans += u * u;
			}
			nearest.pop();
		}
	}
	std::cout << "ans " << ans << std::endl;
	return ans;
}
int   find_win_size(int filenum,double c,int counter,double(*distance) (int, int))
{
	int max_value = -1;
	int len = -1;
	for(int wlen = lens[filenum] / 2; wlen >= 2 ; wlen /= speed)
	{
		cout<<wlen<<endl;
		int value = getvalue(filenum,wlen,c,counter,distance);
		if(value >= max_value)
		{
			max_value = value;
			len = wlen;
		}
	}
	return len;
}
double find_ceo(int filenum,int counter, double(*distance) (int, int))
{
	int max_value = -1;
	double ans = -1;
	for(double rat = 1; rat > 0.001 ; rat /= 1.8)
	{
		int value = getvalue(filenum,WIN,rat,counter,distance);
		if(value >= max_value)
		{
			max_value = value;
			ans = rat;
		}
	}
	return ans;
}
int main()
{
	//cout << "xie" <<  << 123 << "123" << endl; getchar();
	readInfo(); 
	ofstream result;
	result.open("/home/xiefubao/myproject/experiment/vldb_dataset/compare.txt",ios::app);
	//file location
	//int filenum = 31;
	bool train = false;
	for (int filenum = 38; filenum <= 38; filenum++)
	{
		/*down_ratio = 0.2;
		readFile(filenum, false);
		cout << "start find winsize" << endl;
		WIN = find_win_size(filenum - 1,dev_mean[filenum-1]/2,num2[filenum -1],getdistance0);
		cout << "start find coe" << endl;
		coe = find_ceo(filenum - 1,num2[filenum -1],getdistance0);
		cout << "have done" << endl;
		cout << "WIN && coe : " << WIN << " " << coe <<endl;
        */
		clock_t start, finish;
		start = clock();

        WIN = 5;
        coe = dev_mean[filenum - 1];
		vector<int> hitnum(5, 0);
		int counter = num1[filenum - 1];

		cout << "reading" << endl;
		
		readFile(filenum, train);
		cout << "have read" << endl;
		cout << "xiefubao " << WIN << coe <<endl;
		//double(*distance[5]) (int, int) = { getdistance00, getdistance0, getdistance1, getdistance2, getdistance3 };
		double(*distance[5]) (int, int) = { getdistance0,getdistance1, getdistance2 };
		vector<int> computeDis;
		computeDis.push_back(0);
		computeDis.push_back(1);
		computeDis.push_back(2);
		//computeDis.push_back(3);
		//computeDis.push_back(4);
		for (int i = 0; i < counter; i++)
		{
			
			vector<double> mist(5, INF);
			vector<int> bestnum(5, -1);
			//cout << i << endl;  
			for (int j = 0; j < counter; j++)
			{
				if (i == j) continue;
				if (rand() %(max(1,counter/classes[filenum-1]/5)) != 0) continue;
				vector<double> dist(5, 0);
				for (int k = 0; k < computeDis.size(); k++)
				{
					dist[computeDis[k]] = distance[computeDis[k]](i, j);
				}
				for (int k = 0; k < computeDis.size(); k++)
					if (dist[computeDis[k]] < mist[computeDis[k]])
					{
						mist[computeDis[k]] = dist[computeDis[k]];
						bestnum[computeDis[k]] = j;
					}
			}
			if (i == counter - 1)
			{
				result << setw(2) << setfill(' ')  << filenum << " ";
			}
			for (int k = 0; k < computeDis.size(); k++)
			{
				if (series[i].label == series[bestnum[computeDis[k]]].label)
				{
					hitnum[computeDis[k]]++;
				}
				if (i % 100 == 0 || i == counter - 1)
					cout << "distance" << computeDis[k] << " hitsnum:" << hitnum[computeDis[k]] << " / " << i + 1 << " " << counter << endl;
				if (i == counter - 1)
				{
					result << setw(5) << setfill(' ')<< hitnum[computeDis[k]] << " ";
				}
			} 
			if (i % 100 == 0 || i == counter - 1)
				cout << endl;
			if (i == counter - 1)
			{
				result << setw(5) << setfill(' ') << counter <<" ";
				result << setw(4) << setfill(' ')<< WIN << "    " << setw(7) << setfill(' ') << coe << endl;
			}
		}
		finish = clock();
		//cout << "timeofcost: "<< finish - start << endl;
		cout << "series length: " << num << endl << endl;
		
	}
	return 0;
}

通过例子介绍使用方法如下: 1.差异统计 统计某一个版本的代码包相对于一个原始的基线代码包,变动的代码量 以及变动的代码量中各语言非空非注释行(NBNC)的结果 diffcount 缺省执行的就是差异统计,直接跟上两个代码包的目录即可 在diffcount目录下执行: diffcount test\sp1 test\sp2 实际使用中,可能会有文件名和目录名大小写不一致的情况,如果希望忽略 文件名大小写的差异,需要使用 --ignore-case 参数,否则两个一样的文件 一个会算作删除,一个会算作新增 G:\diffcount>diffcount test\sp1 test\sp2 Diffcount [test\sp1] and [test\sp2] result: LANG ADD MOD DEL A&M BLK CMT NBNC RATE ----------------------------------------------------------------------- C 44 7 26 51 8 11 35 1.00 Pascal 0 0 25 0 0 0 0 0.23 Java 7 4 11 11 0 3 9 0.41 Config 31 4 0 35 1 0 34 0.12 XML 126 0 0 126 2 0 124 0.12 ----------------------------------------------------------------------- Convert all NBNC lines to standard C Total: 57.65 (standard C lines) ADD MOD DEL A&M BLK CMT NBNC RATE 的 含义分别为: 新增、修改、删除、新增+修改、空行、注释、非空非注释行、标准C折算率 2.代码统计: 如果需要,可以把diffcount当作普通的代码行统计工具,统计一个代码包的代码行数。代码统计使用 -c (或者--count-only)参数, 在diffcount目录下执行 diffcount -c test\count 执行结果如下: G:\diffcount>diffcount -c test\count Counting package [test\count] result: LANG TOTAL BLK CMT NBNC RATE ----------------------------------------------------------------------- C 203 46 61 101 1.00 C++ 57 7 25 25 0.42 Pascal 117 24 17 79 0.23 Java 71 7 24 40 0.41 ASM 129 34 12 85 2.50 C# 18 1 1 17 0.20 Basic 447 140 20 295 0.21 Perl 97 4 26 74 0.16 TCL/TK 91 12 26 54 0.50 Config 116 13 0 103 0.12 XML 126 2 0 124 0.12 ----------------------------------------------------------------------- Convert
### Rerank 相似度计算方法概述 Rerank 是一种用于重新排列检索到的文档列表的技术,通常基于更复杂的模型来优化排名顺序。其核心目标是通过多种信号(如向量相似度、关键词匹配等)综合评估查询与文档之间的相关性。 #### 综合得分公式解析 综合得分可以通过加权的方式结合多个相似度指标得出。具体而言: \[ \text{综合得分} = (\text{向量相似度权重} \times \text{文档和 query 向量相似度得分}) + (\text{关键词相似度权重} \times \text{文档和 query 关键词相似度得分}) \] 其中: - **向量相似度**:利用嵌入表示(embeddings),衡量两个向量间的余弦距离或其他相似度函数[^1]。 - **关键词相似度**:通过对齐文本中的关键字频率或位置信息,量化两者的语义关联程度。 以下是两种主要相似度计算的具体实现方式及其代码示例。 --- ### 向量相似度计算 向量相似度通常是通过预训练的语言模型生成文本嵌入后得到的结果。常见的做法如下: 1. 使用 Sentence-BERT 或其他句编码器获取 `query` 和 `document` 的嵌入表示; 2. 应用余弦相似度作为两者间的关系强度测量工具。 ```python from sentence_transformers import SentenceTransformer, util def calculate_vector_similarity(query_embedding, document_embeddings): """ 计算 query 与多篇文档之间基于向量的相似度分数。 参数: query_embedding (np.ndarray): 查询的嵌入向量. document_embeddings (list[np.ndarray]): 多篇文档对应的嵌入向量列表. 返回: list[float]: 每篇文档相对于查询的相似度分数. """ scores = [] for doc_emb in document_embeddings: score = util.cos_sim(query_embedding, doc_emb).item() scores.append(score) return scores ``` 此部分实现了如何从已有的嵌入数据出发快速求解余弦相似度。 --- ### 关键词相似度计算 对于关键词相似度,则可以采用 BM25 或 TF-IDF 这样的传统 IR 技术完成初步筛选后再进一步精调。下面展示了一个简单的例子说明过程: ```python import math from sklearn.feature_extraction.text import TfidfVectorizer class KeywordSimilarityCalculator: def __init__(self, documents): self.vectorizer = TfidfVectorizer() self.tfidf_matrix = self.vectorizer.fit_transform(documents) def compute_similarity(self, query): """ 基于 TF-IDF 转换后的矩阵计算 Query 对每篇文章的关键字匹配分值 参数: query (str): 用户输入的问题或者请求字符串 返回: np.array: 表明各文章针对当前提问的重要性评分数组形式呈现出来 """ query_tfidf = self.vectorizer.transform([query]) cosine_similarities = util.cosine_similarity(query_tfidf, self.tfidf_matrix)[0] return cosine_similarities.tolist() # Example Usage calculator = KeywordSimilarityCalculator(["doc1 content", "another article text"]) print(calculator.compute_similarity("example search term")) ``` 上述脚本定义了一种机制用来估算给定条件下不同资源文件里所含主题元素彼此接近的程度。 --- ### 权重调整策略 
为了平衡两类特征的作用效果,在实际应用过程中往往还需要引入超参数调节各自贡献比例大小关系。例如设置默认情况下二者重要性的相对高低为7:3即意味着更加侧重考量前者因素的影响力度;当然也可以依据实验结果动态修改这些数值直至达到最佳性能表现为止。 最终完整的 reranking 函数可能看起来像这样: ```python def rerank_documents(query, documents, vector_model, keyword_calculator, weight_vec=0.7, weight_key=0.3): """ 根据 query 和 documents 列表执行 re-ranking 并返回排序后的索引 参数: query (str): 输入查询串 documents (list[str]): 待处理的文章集合 vector_model (...): 句子嵌入模型实例化对象 keyword_calculator (...) : 已初始化好的关键词计算器类实体 weight_vec (float): 向量相似度占比,默认值设为 0.7 weight_key (float): 关键词相似度占比,默认值设定成 0.3 返回: tuple[list[int], list[float]]: 排序后的文档索引以及对应总分 """ # 获取 Embedding 版本下的比较成果 embeddings = [vector_model.encode(d) for d in documents] vec_scores = calculate_vector_similarity(vector_model.encode(query), embeddings) # 执行 Keywords 方面的操作流程 key_scores = keyword_calculator.compute_similarity(query) combined_scores = [(i, wv*vec_score + wk*key_score) for i,(wv,wk),(vec_score,key_score) in zip(range(len(documents)),[(weight_vec,weight_key)]*len(documents), zip(vec_scores,key_scores))] sorted_indices_and_scores = sorted(combined_scores, key=lambda x:x[1], reverse=True) ranked_indices = [idx for idx,_ in sorted_indices_and_scores] final_scores = [score for _,score in sorted_indices_and_scores] return ranked_indices ,final_scores ``` 该版本不仅考虑到了原始检索阶段获得的信息同时也加入了额外层次上的判断从而提高了整体质量水平. ---
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值