基于编辑距离的代码查重

/* MSVC-specific switch: silences the CRT deprecation warnings for the classic
   stdio/string functions. It is defined before any standard header is included. */
#define _CRT_SECURE_NO_WARNINGS
/* Record terminators: InitFileData ends the current code record when a line
   returned by fgets compares equal to either of these two strings. */
#define ENDSIGN ""
#define ENDSIGNS "\n"
#include<malloc.h>
#include<stdbool.h>
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
/* Size passed to fgets when reading one line of the codes file
   (the line buffer itself is N + 1 bytes). */
#define N 200
/* Capacity, in bytes, of the Node.codes buffer. */
#define MAXLEN 3600
/* Size passed to fgets when reading one line of the keyword file. */
#define KEYLEN 20
/* Number of Node records in the array that main allocates. */
#define Numofnode 50
/* Minimum similarity (as computed by Sim) at which Output reports two
   records on the same line. */
#define threshold .95

/*
字符串编辑距离(Edit Distance),是俄罗斯科学家Vladimir Levenshtein提出的概念。两个字符串之间的最小距离就是指把一个字符串转换为另一个字符串时,所需要的最小编辑操作(插入、删除、替换)的次数。
编辑操作包含以下3种:
替换(substitution),将一个字符替换为另一个字符
插入(insertion),插入一个字符
删除(deletion),删除一个字符
下面给出了一个基于动态规划算法的编辑距离计算函数editdistDP 的C语言实现,该函数参数为2个字符串,返回值为2个字符串的编辑距离,函数实现如下:
*/

/* Larger of two values. This is a function-like macro, so the winning
   argument is evaluated twice: do not pass expressions with side effects. */
#define max2(a,b) ((a)>(b)?(a):(b))

/* Dynamic-programming table; initDP() allocates it as MaxDP rows of MaxDP ints. */
int** Dp;
/* Side length of the Dp table. */
int MaxDP = 3300;
/*
 * Return the smallest of three integers.
 */
int min3(int a, int b, int c)
{
	int smallest = a;

	if (b < smallest)
		smallest = b;
	if (c < smallest)
		smallest = c;
	return smallest;
}
/*
 * Fatal-error hook: terminates the process with exit status -1.
 *
 * s: description of the failure. It is accepted but not printed, so the
 *    program ends without any diagnostic output.
 *
 * Never returns; the int return type only lets the call appear inside
 * expressions.
 */
int error2(const char* s)
{
	(void)s;
	exit(-1);
}
int initDP()
{
	int i;
	Dp = (int**)malloc(MaxDP * sizeof(int*));
	for (i = 0; i < MaxDP; i++)
		Dp[i] = (int*)malloc(MaxDP * sizeof(int));
	return 0;
}
/*
 * Levenshtein edit distance between two NUL-terminated strings: the minimum
 * number of single-character insertions, deletions and substitutions needed
 * to turn str1 into str2.
 *
 * str1, str2: non-NULL, NUL-terminated strings; neither is modified.
 * Returns the distance as an int (string lengths are assumed to fit in int).
 *
 * Only two rows of the DP table are kept, so memory use is proportional to
 * strlen(str2) instead of a fixed square table, and there is no built-in
 * cap on the input length. If the two rows cannot be allocated, a message
 * is written to stderr and the process exits with status -1.
 */
int editdistDP(const char* str1, const char* str2)
{
	size_t len1 = strlen(str1);
	size_t len2 = strlen(str2);
	size_t i, j;
	int *prev, *curr, *tmp;
	int dist;

	/* Distance to an empty string is the length of the other one. */
	if (len1 == 0)
		return (int)len2;
	if (len2 == 0)
		return (int)len1;

	prev = (int*)malloc((len2 + 1) * sizeof *prev);
	curr = (int*)malloc((len2 + 1) * sizeof *curr);
	if (prev == NULL || curr == NULL) {
		free(prev);
		free(curr);
		fprintf(stderr, "DP memory error!\n");
		exit(-1);
	}

	/* Row 0: turning "" into the first j characters of str2 takes j insertions. */
	for (j = 0; j <= len2; j++)
		prev[j] = (int)j;

	for (i = 1; i <= len1; i++) {
		/* Column 0: turning the first i characters of str1 into "" takes i deletions. */
		curr[0] = (int)i;
		for (j = 1; j <= len2; j++) {
			if (str1[i - 1] == str2[j - 1]) {
				curr[j] = prev[j - 1];
			} else {
				int best = prev[j - 1];		/* substitution */
				if (prev[j] < best)
					best = prev[j];		/* deletion */
				if (curr[j - 1] < best)
					best = curr[j - 1];	/* insertion */
				curr[j] = best + 1;
			}
		}
		/* The row just filled becomes the "previous" row of the next pass. */
		tmp = prev;
		prev = curr;
		curr = tmp;
	}

	dist = prev[len2];
	free(prev);
	free(curr);
	return dist;
}



/*
 * One code record loaded from the codes file.
 *
 * The typedef names the type `Node`, so the rest of the file can write
 * `Node` without the `struct` keyword in C as well as in C++.
 */
typedef struct Node {
	int info;		/* integer read with "%d" at the start of the record; printed by Output */
	char codes[MAXLEN];	/* NUL-terminated record text as stored by InitFileData */
	int len;		/* number of characters stored in codes, excluding the terminator */
	bool flag;		/* false after loading; set to true by Output once the record has been printed */
} Node;
/*
 * Singly linked list node holding one keyword.
 * NOTE(review): the array size 20 equals KEYLEN, the size the keyword
 * reader passes to fgets; the two must be kept in step.
 */
struct keepword {
	char key[20];			/* keyword text, NUL-terminated */
	struct keepword* next;		/* following node, or NULL at the end of the list */
};

typedef struct keepword KW;		/* node type */
typedef struct keepword* K;		/* pointer-to-node type */
/*
 * Load code records from "project2024/codes.txt" into node[].
 *
 * Each record starts with an integer (stored in info) and is followed by
 * text lines up to a line equal to ENDSIGN or ENDSIGNS. Spaces, tabs, CR and
 * LF are dropped while copying. When a '(' is met while the '{' nesting
 * depth is 0, the '(' is replaced by '\n', one more '\n' is inserted earlier
 * in the buffer, and the remainder of that input line is skipped.
 *
 * node: array with room for Numofnode records; filled from index 0.
 * Returns the number of completed records, or -1 if the file cannot be
 * opened. A trailing record that is not followed by a terminator line is
 * not counted.
 *
 * Bounds: reading stops after Numofnode records, and characters that would
 * not fit in codes[MAXLEN] (leaving room for the terminator) are discarded.
 */
int InitFileData(Node node[Numofnode]) {
	int index = 0;
	FILE* file = fopen("project2024/codes.txt", "r");
	if (file == NULL)
		return -1;

	char line[N + 1];
	/* The index test comes first so fscanf never writes past the end of node[]. */
	while (index < Numofnode && fscanf(file, "%d", &node[index].info) != EOF) {
		int cindex = 0;		/* next write position in node[index].codes */
		int bracket = 0;	/* current '{' nesting depth */

		while (fgets(line, N, file) != NULL) {
			if (!strcmp(line, ENDSIGN) || !strcmp(line, ENDSIGNS)) {
				/* Terminator line: finish this record and move to the next one. */
				node[index].codes[cindex] = '\0';
				node[index].len = cindex;
				node[index].flag = false;
				index++;
				break;
			}

			for (int pos = 0; line[pos] != '\0'; pos++) {
				char ch = line[pos];

				if (ch == '\n' || ch == '\t' || ch == '\r' || ch == ' ')
					continue;
				if (ch == '{')
					bracket++;
				if (ch == '}')
					bracket--;

				/* Keep room for this character, the extra '\n' the '(' case
				   adds, and the final '\0'; drop anything beyond that. */
				if (cindex > MAXLEN - 3)
					continue;

				node[index].codes[cindex++] = ch;
				if (bracket == 0 && ch == '(') {
					/* Insertion point: the write position minus the number of
					   input characters consumed on this line (pos + 1).
					   Clamped at 0 because skipped whitespace can make the
					   difference negative, which would index before the buffer.
					   NOTE(review): with skipped whitespace this lands before
					   the start of the current line's text - confirm intent. */
					int start = cindex - (pos + 1);
					if (start < 0)
						start = 0;

					node[index].codes[cindex - 1] = '\n';	/* replaces the '(' */
					for (int i = cindex; i > start; i--)
						node[index].codes[i] = node[index].codes[i - 1];
					node[index].codes[start] = '\n';
					cindex++;
					break;	/* rest of this input line is ignored */
				}
			}
		}
	}
	fclose(file);
	return index;
}
K InitKeepwordData() {
	FILE* file = fopen("project2024/keepwords.txt","r");
	K head = (K)malloc(sizeof(KW));
	K p=head;
	p->next = NULL;
	while (fgets(p->key, KEYLEN, file) != NULL) {
		for (int i = 0; i < KEYLEN; i++) {
			if (p->key[i] == '\n') {
				p->key[i] = '\0';
				break;
			}
		}
		p->next = (K)malloc(sizeof(KW));
		p = p->next;
		p->next = NULL;
		p->key[0] = '\0';
		//printf("%s\n",key);
	}
	//p = head;
	//while (p != NULL) {
	//	printf("%s\n", p->key);
	//	p = p->next;
	//}
	return NULL;
}
/*
 * Similarity of records i and j: 1 minus their edit distance divided by the
 * larger of the two stored lengths.
 *
 * node: record array; i, j: indices of the two records to compare.
 * NOTE(review): when both records have len == 0 the divisor is 0 and the
 * result is not a usable number - confirm whether callers can hit this.
 */
double Sim(Node node[Numofnode], int i, int j) {
	const int distance = editdistDP(node[i].codes, node[j].codes);
	const int longer = max2(node[i].len, node[j].len);

	return 1 - (double)distance / longer;
}
/*
 * Print groups of similar records, one group per line.
 *
 * For every record i, each later record j that has not been printed yet and
 * whose similarity to i is at least `threshold` is written after i's info on
 * the same line, then marked with flag = true so it is not printed again as
 * a member of another group. Records with no match produce no output.
 *
 * node: record array; non: number of valid records in it.
 */
void Output(Node node[Numofnode],int non) {
	for (int i = 0; i < non; i++) {
		bool group_started = false;	/* true once node[i].info has been printed */

		for (int j = i + 1; j < non; j++) {
			/* Written as a negated >= so the comparison result is the same
			   as the positive test for every possible Sim value. */
			if (node[j].flag || !(Sim(node, i, j) >= threshold))
				continue;

			if (!group_started) {
				printf("%d ", node[i].info);
				group_started = true;
			}
			printf("%d ", node[j].info);
			node[j].flag = true;
		}

		if (group_started)
			printf("\n");
	}
}
/*
 * Entry point: load the code records, then print the groups of similar ones.
 * Always returns 0; if loading fails InitFileData returns -1 and Output
 * prints nothing.
 */
int main(void) {
	Node records[Numofnode];
	const int count = InitFileData(records);

	Output(records, count);
	return 0;
}
  • 27
    点赞
  • 27
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

alasnot

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值