// MSVC-specific: suppresses the CRT "unsafe function" deprecation warnings.
// Has to be defined before the first standard header is included.
#define _CRT_SECURE_NO_WARNINGS
// Line contents that InitFileData compares each fgets() result against to
// detect the end of one entry: an empty string, or a line holding only '\n'.
#define ENDSIGN ""
#define ENDSIGNS "\n"
#include<stdio.h>
#include<stdlib.h>
#include<malloc.h>
#include<stdbool.h>
#include<string.h>
// Line-buffer sizing in InitFileData: the buffer holds N + 1 bytes and each
// fgets() call is limited to N.
#define N 200
// Capacity, in bytes, of the codes array inside Node.
#define MAXLEN 3600
// Size limit passed to fgets() when InitKeepwordData reads one keyword.
#define KEYLEN 20
// Number of Node entries in the array that main allocates.
#define Numofnode 50
// Minimum Sim() value at which Output reports two entries on the same line.
#define threshold .95
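/*
Edit distance (Levenshtein distance) is a concept introduced by the Russian scientist Vladimir Levenshtein. The minimum distance between two strings is the smallest number of edit operations (insertion, deletion, substitution) needed to transform one string into the other.
There are three kinds of edit operation:
Substitution: replace one character with another
Insertion: insert one character
Deletion: delete one character
Below is a C implementation of the edit-distance function editdistDP, based on dynamic programming. It takes two strings as parameters and returns their edit distance:
*/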
// Larger of two values. Function-like macro: the selected argument is
// evaluated twice, so do not pass expressions with side effects.
#define max2(a,b) ((a)>(b)?(a):(b))
// Dp    - table used by editdistDP; initDP allocates it as MaxDP rows of
//         MaxDP ints each.
// MaxDP - dimension of that table; editdistDP calls error2 when either
//         string length + 1 reaches this value.
int** Dp, MaxDP = 3300; //for dynamic programming
/* Returns the smallest of the three integers a, b and c. */
int min3(int a, int b, int c)
{
    int smallest = a;

    if (b < smallest) {
        smallest = b;
    }
    if (c < smallest) {
        smallest = c;
    }
    return smallest;
}
/*
 * Fatal-error exit: terminates the process with status -1.
 *
 * s is the diagnostic text supplied by the caller. It is currently not
 * printed (the stderr output is disabled), so the parameter is unused.
 * The function never returns; the int return type only lets it appear
 * inside expressions.
 */
int error2(const char* s)
{
    (void)s; /* message intentionally not emitted */
    exit(-1);
}
int initDP()
{
int i;
Dp = (int**)malloc(MaxDP * sizeof(int*));
for (i = 0; i < MaxDP; i++)
Dp[i] = (int*)malloc(MaxDP * sizeof(int));
return 0;
}
int editdistDP(char* str1, char* str2)
{
int i, j;
int len1, len2;
static int flag = 0;
(flag++) ? 1 : initDP();
len1 = strlen(str1) + 1; len2 = strlen(str2) + 1;
(max2(len1, len2) >= MaxDP) ? error2("DP memory error!") : len1;
for (i = 0; i <= len1; i++) {
for (j = 0; j <= len2; j++) {
if (i == 0)
Dp[i][j] = j;
else if (j == 0)
Dp[i][j] = i;
else if (str1[i - 1] == str2[j - 1])
Dp[i][j] = Dp[i - 1][j - 1];
else
Dp[i][j] = 1 + min3(Dp[i][j - 1], Dp[i - 1][j], Dp[i - 1][j - 1]);
}
}
return Dp[len1][len2];
}
/*
 * One entry loaded from the codes file by InitFileData.
 *
 * The typedef names the type "Node" so that declarations such as
 * "Node node[Numofnode]" are valid in C as well as in C++.
 */
typedef struct Node {
    int info;           /* integer read in front of the entry; printed by Output (presumably the submission id) */
    char codes[MAXLEN]; /* entry text with spaces, tabs, '\r' and '\n' removed, NUL-terminated */
    int len;            /* number of characters stored in codes */
    bool flag;          /* false after loading; Output sets it once the entry has been reported as similar to an earlier one */
} Node;
/*
 * Singly linked list node holding one keyword.
 * KW is the node type, K a pointer to a node.
 */
struct keepword {
    char key[20];          /* keyword text; InitKeepwordData fills it with fgets(..., KEYLEN, ...) */
    struct keepword* next; /* following node, or NULL at the end of the list */
};
typedef struct keepword KW;
typedef struct keepword* K;
/*
 * Loads entries from "project2024/codes.txt" into node[].
 *
 * Each pass of the outer loop tries to read an integer into
 * node[index].info and then consumes lines with fgets() until one compares
 * equal to ENDSIGN or ENDSIGNS. That line finalizes the entry (terminates
 * codes, stores len, clears flag) and advances index. Spaces, tabs, '\r'
 * and '\n' are dropped from the stored text.
 *
 * Returns the number of finalized entries, or -1 if the file cannot be
 * opened.
 *
 * NOTE(review): nothing bounds index against Numofnode or cindex against
 * MAXLEN, and an entry that is still being read when fgets() reaches end
 * of file is never finalized or counted.
 */
int InitFileData(Node node[Numofnode]) {
int index = 0;
FILE* file = fopen("project2024/codes.txt", "r");
if (file == NULL)return -1;
// Line buffer; fgets() below reads at most N - 1 characters per call.
char codes[N + 1];
// Only EOF stops this loop: a failed match (return value 0) still runs the body.
while ((fscanf_s(file, "%d", &node[index].info)) != EOF) {
// cindex: write position in node[index].codes (counts stored characters only).
int cindex = 0;
// bracket: current '{' ... '}' nesting depth within this entry.
int bracket = 0;
while (fgets(codes, N, file) != NULL) {
// Terminator line: close the current entry and go back to reading the next integer.
if (!strcmp(codes, ENDSIGN) || !strcmp(codes, ENDSIGNS)) {
//printf("%d %d\n", index,cindex);
node[index++].codes[cindex] = '\0';
node[index - 1].len = cindex;
node[index - 1].flag = false;
//printf("%s", node[index - 1].codes);
break;
}
// ccindex: one past the position of the character being examined in this line
// (it is incremented in the loop condition, hence the "ccindex - 1" below).
int ccindex = 0;
while (codes[ccindex++] != '\0') {
// Whitespace is never stored.
if (codes[ccindex - 1] == '\n' || codes[ccindex - 1] == '\t' || codes[ccindex - 1] == '\r' || codes[ccindex - 1] == ' ')continue;
if (codes[ccindex - 1] == '{')bracket++;
if (codes[ccindex - 1] == '}')bracket--;
node[index].codes[cindex++] = codes[ccindex-1];
// A '(' seen outside all braces: the '(' just stored is overwritten with
// '\n', the stored text from offset (cindex - ccindex) onward is shifted
// right by one, a '\n' is written at that offset, and the rest of the
// input line is discarded by the break.
// NOTE(review): cindex counts only stored characters of the whole entry
// while ccindex counts every character of the current line, so
// (cindex - ccindex) is negative whenever fewer characters were stored
// from earlier lines than whitespace was skipped on this one (e.g. a first
// line "int main(" gives -1), which indexes before the start of codes.
// Confirm the intended offset.
if (bracket==0&&node[index].codes[cindex - 1] == '(') {
node[index].codes[cindex - 1] = '\n';
node[index].codes[cindex] = '\n';
for (int i = cindex; i > cindex - ccindex; i--) {
node[index].codes[i] = node[index].codes[i - 1];
}
node[index].codes[cindex - ccindex] = '\n';
cindex++;
break;
}
}
}
}
fclose(file);
return index;
}
K InitKeepwordData() {
FILE* file = fopen("project2024/keepwords.txt","r");
K head = (K)malloc(sizeof(KW));
K p=head;
p->next = NULL;
while (fgets(p->key, KEYLEN, file) != NULL) {
for (int i = 0; i < KEYLEN; i++) {
if (p->key[i] == '\n') {
p->key[i] = '\0';
break;
}
}
p->next = (K)malloc(sizeof(KW));
p = p->next;
p->next = NULL;
p->key[0] = '\0';
//printf("%s\n",key);
}
//p = head;
//while (p != NULL) {
// printf("%s\n", p->key);
// p = p->next;
//}
return NULL;
}
/*
 * Similarity of entries i and j of node[]:
 *     1 - editdistDP(codes_i, codes_j) / max(len_i, len_j)
 * so identical texts give 1.0 and larger edit distances give smaller values.
 *
 * When both entries are empty the ratio would be 0/0. In that case 0.0 is
 * returned, so empty entries are never reported as similar; this matches
 * the earlier outcome, where the resulting NaN never passed the threshold
 * comparison in Output.
 */
double Sim(Node node[Numofnode], int i, int j) {
    int longest = max2(node[i].len, node[j].len);
    if (longest <= 0) {
        return 0.0;
    }
    return 1 - (double)(editdistDP(node[i].codes, node[j].codes)) / longest;
}
/*
 * Prints groups of similar entries, one group per line.
 *
 * For every entry i, each later entry j that is not yet flagged and whose
 * Sim(node, i, j) is at least threshold is appended to i's line and flagged.
 * A line starts with node[i].info and is only printed when at least one
 * such j exists. Every number is followed by a single space.
 *
 * node - entries to compare; the flag field of reported entries is set
 * non  - number of valid entries in node
 */
void Output(Node node[Numofnode],int non) {
    for (int i = 0; i < non; i++) {
        bool lineStarted = false;

        for (int j = i + 1; j < non; j++) {
            if (node[j].flag) {
                continue;
            }
            /* Negated ">=" on purpose, so a NaN similarity is skipped too. */
            if (!(Sim(node, i, j) >= threshold)) {
                continue;
            }
            if (!lineStarted) {
                printf("%d ", node[i].info);
                lineStarted = true;
            }
            printf("%d ", node[j].info);
            node[j].flag = true;
        }

        if (lineStarted) {
            printf("\n");
        }
    }
}
/*
 * Program entry point: loads the entries with InitFileData and prints the
 * groups of similar entries with Output. When loading fails the count is
 * -1 and Output prints nothing. InitKeepwordData is not called here.
 * Always returns 0.
 */
int main(void) {
    Node node[Numofnode];

    Output(node, InitFileData(node));
    return 0;
}
/* Code plagiarism detection based on edit distance */
/* First published on 2024-06-28 17:57:30 */