利用余弦相似度在大量文章中找出抄袭的文章-CSDN博客

本文链接：https://blog.csdn.net/dhklsl/article/details/146251152

我前面的2篇文章分别讲了如果利用余弦相似度来判断2篇文章的相似度，来确定文章是否存在抄袭，和余弦相似度的原理，即余弦相似度到底是怎么来判断文章的相似性高低的等等。这一篇再说下，对于文章字数多和大量文章时，如果找到两篇相似度高的文章。这里就需要考虑内存溢出的风险了。所以对第一篇的代码进行了改造。在一定程度上降低了内存溢出的风险。

pom依赖

<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-math3</artifactId>
    <version>3.6.1</version>
</dependency>

这里和第一篇略有不同，即第一篇采用的hankcs包实现的余弦相似度算法。本篇文章时通过math3包实现的。但是原理相同。

代码如下：

package com.lsl.config;

import org.apache.commons.math3.linear.ArrayRealVector;
import org.apache.commons.math3.linear.RealVector;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;

/**
 * Finds pairs of similar documents in a directory by comparing word-frequency
 * vectors with cosine similarity.
 *
 * <p>Words are obtained by splitting on whitespace only, so text that has no
 * whitespace between words is treated as a small number of very long tokens.
 */
public class PlagiarismDetector {

    /** Pairs whose cosine similarity is strictly greater than this value are reported. */
    private static final double SIMILARITY_THRESHOLD = 0.9;

    /** Directory scanned by {@link #main} when no directory is passed on the command line. */
    private static final String DEFAULT_PAPERS_DIR = "D:\\codeabc";

    /**
     * Computes the cosine similarity of two vectors.
     *
     * @param vectorA first vector
     * @param vectorB second vector, same dimension as {@code vectorA}
     * @return {@code dot(A, B) / (|A| * |B|)}, or {@code 0.0} when either vector has
     *         zero norm (previously this case produced {@code NaN} from 0/0)
     */
    public static double cosineSimilarity(RealVector vectorA, RealVector vectorB) {
        // Evaluate the dot product first so a dimension mismatch is still reported
        // by the vector library before the zero-norm shortcut can hide it.
        double dotProduct = vectorA.dotProduct(vectorB);
        double normProduct = vectorA.getNorm() * vectorB.getNorm();
        if (normProduct == 0.0) {
            return 0.0;
        }
        return dotProduct / normProduct;
    }

    /**
     * Counts how often each whitespace-separated token occurs in the text.
     *
     * @param text text to analyse
     * @return map from token to occurrence count; empty for empty or all-whitespace text
     */
    public static Map<String, Integer> textToWordFrequency(String text) {
        Map<String, Integer> wordFrequency = new HashMap<>();
        for (String word : text.split("\\s+")) {
            // split() yields an empty first token for empty text or leading whitespace;
            // it is not a word and must not be counted.
            if (!word.isEmpty()) {
                wordFrequency.merge(word, 1, Integer::sum);
            }
        }
        return wordFrequency;
    }

    /**
     * Projects a word-frequency map onto a fixed vocabulary.
     *
     * @param wordFrequency token counts of one document
     * @param vocabulary    tokens defining the vector dimensions, in order
     * @return vector whose i-th component is the count of the i-th vocabulary token
     *         (0 when the document does not contain it)
     */
    public static RealVector wordFrequencyToVector(Map<String, Integer> wordFrequency, List<String> vocabulary) {
        double[] vector = new double[vocabulary.size()];
        int index = 0;
        // Iterate instead of calling get(i) so non-random-access lists stay linear.
        for (String word : vocabulary) {
            vector[index++] = wordFrequency.getOrDefault(word, 0);
        }
        return new ArrayRealVector(vector);
    }

    /**
     * Reads a whole text file line by line, terminating every line with {@code "\n"}.
     * The bytes are decoded with the default charset of {@link FileReader}.
     *
     * @param filePath path of the file to read
     * @return file content with normalized line endings
     * @throws IOException if the file cannot be opened or read
     */
    public static String readFile(String filePath) throws IOException {
        StringBuilder content = new StringBuilder();
        try (BufferedReader br = new BufferedReader(new FileReader(filePath))) {
            String line;
            while ((line = br.readLine()) != null) {
                content.append(line).append("\n");
            }
        }
        return content.toString();
    }

    /**
     * Builds the vocabulary (set of distinct tokens) of all regular files directly
     * inside the directory, reading one file at a time. Files that cannot be read
     * are reported on standard error and skipped.
     *
     * @param papersDir directory containing the documents
     * @return distinct tokens found in the readable files
     * @throws IOException if the directory itself cannot be listed
     */
    public static List<String> buildVocabulary(Path papersDir) throws IOException {
        Set<String> vocabulary = new HashSet<>();
        for (Path paper : listPaperFiles(papersDir)) {
            try {
                vocabulary.addAll(textToWordFrequency(readFile(paper.toString())).keySet());
            } catch (IOException e) {
                System.err.println("Skipping unreadable file " + paper + ": " + e);
            }
        }
        return new ArrayList<>(vocabulary);
    }

    /**
     * Entry point: compares every pair of documents in the papers directory and
     * prints the pairs whose similarity exceeds the threshold.
     *
     * @param args optional; {@code args[0]} overrides the default papers directory
     * @throws IOException if the papers directory cannot be listed
     */
    public static void main(String[] args) throws IOException {
        String dirName = (args != null && args.length > 0) ? args[0] : DEFAULT_PAPERS_DIR;
        Path papersDir = Paths.get(dirName);

        List<String> vocabulary = buildVocabulary(papersDir);

        // papers.get(i) is the file that produced vectors.get(i); a file is only
        // recorded once its vector exists, so the indices stay aligned even when
        // a file is skipped.
        List<Path> papers = new ArrayList<>();
        List<RealVector> vectors = new ArrayList<>();
        for (Path paper : listPaperFiles(papersDir)) {
            try {
                Map<String, Integer> wordFrequency = textToWordFrequency(readFile(paper.toString()));
                vectors.add(wordFrequencyToVector(wordFrequency, vocabulary));
                papers.add(paper);
            } catch (IOException e) {
                System.err.println("Skipping unreadable file " + paper + ": " + e);
            }
        }

        System.err.println("共有=" + vectors.size() + "文章");
        // Without this mapping the indices in the report cannot be traced back to files.
        for (int i = 0; i < papers.size(); i++) {
            System.err.println("Paper " + i + " = " + papers.get(i));
        }

        for (int i = 0; i < vectors.size(); i++) {
            for (int j = i + 1; j < vectors.size(); j++) {
                double similarity = cosineSimilarity(vectors.get(i), vectors.get(j));
                if (similarity > SIMILARITY_THRESHOLD) {
                    System.out.printf("Paper %d and Paper %d are similar with cosine similarity: %.2f%n", i, j, similarity);
                }
            }
        }
    }

    /**
     * Lists the regular files directly inside the directory in sorted order, so
     * every pass over the directory sees the same files at the same indices.
     */
    private static List<Path> listPaperFiles(Path papersDir) throws IOException {
        // Files.list holds an open directory handle until the stream is closed.
        try (Stream<Path> entries = Files.list(papersDir)) {
            return entries.filter(Files::isRegularFile).sorted().collect(Collectors.toList());
        }
    }
}

运行截图如下：