对比两个项目的代码重复率,并将结果输出为 CSV 报告(可用 Excel 打开),同时标明相似的代码行

v2

import java.io.*;
import java.text.DecimalFormat;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Version 2 of the code-duplication checker.
 *
 * <p>Every supported source file of project A is compared with every supported source file
 * of project B. Pairs whose identifier sets overlap by at least
 * {@link #FILE_SIMILARITY_THRESHOLD} are written to a CSV report together with the lines of
 * the A file that resemble some line of the B file.
 */
public class CodeSimilarityChecker2 {
    /** Matches identifier-like words; compiled once instead of on every call. */
    private static final Pattern IDENTIFIER = Pattern.compile("\\b[A-Za-z_]\\w*\\b");
    /** Separator used to cut a single line into word tokens. */
    private static final Pattern NON_WORD = Pattern.compile("\\W+");
    /** Minimum file-level similarity for a pair to be reported. */
    private static final double FILE_SIMILARITY_THRESHOLD = 0.2;
    /** Minimum line-level similarity for a line to be listed as similar. */
    private static final double LINE_SIMILARITY_THRESHOLD = 0.5;

    /**
     * Computes the similarity of two code files from their identifier sets.
     *
     * @param file1 first file
     * @param file2 second file
     * @return Jaccard index in the range 0..1
     * @throws IOException if either file cannot be read
     */
    private static double calculateSimilarity(File file1, File file2) throws IOException {
        return similarity(readContent(file1), readContent(file2));
    }

    /** Reads all lines of a file; the reader is closed even when reading fails. */
    private static List<String> readLines(File file) throws IOException {
        List<String> lines = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        }
        return lines;
    }

    /** Reads a file into one string with every line terminated by {@code \n}. */
    private static String readContent(File file) throws IOException {
        StringBuilder content = new StringBuilder();
        for (String line : readLines(file)) {
            content.append(line).append('\n');
        }
        return content.toString();
    }

    /**
     * Jaccard similarity of the identifier sets of two strings (this is a set overlap, not an
     * edit distance).
     *
     * @return a value in 0..1; 0 when neither string contains an identifier
     */
    private static double similarity(String s1, String s2) {
        return jaccard(tokenize(s1), tokenize(s2));
    }

    /** Returns |a ∩ b| / |a ∪ b|, defined as 0 for two empty sets to avoid 0/0 (NaN). */
    private static double jaccard(Set<String> a, Set<String> b) {
        int common = 0;
        for (String token : a) {
            if (b.contains(token)) {
                common++;
            }
        }
        int union = a.size() + b.size() - common;
        return union == 0 ? 0.0 : (double) common / union;
    }

    /** Extracts the distinct identifier-like words of a string. */
    private static Set<String> tokenize(String s) {
        Set<String> tokens = new HashSet<>();
        Matcher matcher = IDENTIFIER.matcher(s);
        while (matcher.find()) {
            // The pattern only matches [A-Za-z_]\w*, so no trimming or prefix checks are needed.
            tokens.add(matcher.group());
        }
        return tokens;
    }

    /** Distinct, non-empty word tokens of a single line. */
    private static Set<String> lineTokens(String line) {
        Set<String> tokens = new HashSet<>();
        for (String token : NON_WORD.split(line)) {
            // Indented lines yield an empty leading token; it must not count as a shared token.
            if (!token.isEmpty()) {
                tokens.add(token);
            }
        }
        return tokens;
    }

    /** A line takes part in the comparison when it is not blank and contains no filter string. */
    private static boolean isComparable(String line, Set<String> filterStrings) {
        if (line.trim().isEmpty()) {
            return false;
        }
        for (String filterString : filterStrings) {
            if (line.contains(filterString)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Finds the lines of {@code file1} that resemble at least one line of {@code file2}.
     *
     * @param file1         file whose lines are reported
     * @param file2         file searched for matching lines
     * @param filterStrings lines containing any of these substrings are ignored in both files
     * @return entries of the form {@code "<1-based line number in file1>: <trimmed line>"}
     * @throws IOException if either file cannot be read
     */
    private static List<String> findSimilarBlocks(File file1, File file2, Set<String> filterStrings) throws IOException {
        // Tokenise the second file once instead of once per line of the first file.
        List<Set<String>> candidates = new ArrayList<>();
        for (String line : readLines(file2)) {
            if (isComparable(line, filterStrings)) {
                Set<String> tokens = lineTokens(line);
                if (!tokens.isEmpty()) {
                    candidates.add(tokens);
                }
            }
        }

        List<String> similarBlocks = new ArrayList<>();
        int lineNum = 0;
        for (String line : readLines(file1)) {
            lineNum++; // advance for every physical line, including skipped ones
            if (!isComparable(line, filterStrings)) {
                continue;
            }
            Set<String> tokens = lineTokens(line);
            if (tokens.isEmpty()) {
                continue; // e.g. a lone brace: nothing to compare
            }
            for (Set<String> candidate : candidates) {
                if (jaccard(tokens, candidate) >= LINE_SIMILARITY_THRESHOLD) {
                    similarBlocks.add(lineNum + ": " + line.trim());
                    break;
                }
            }
        }
        return similarBlocks;
    }

    /** True when the file name ends with one of the given extensions. */
    private static boolean hasExtension(String fileName, String[] fileExtensions) {
        for (String fileExtension : fileExtensions) {
            if (fileName.endsWith(fileExtension)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Walks a folder breadth-first and collects the files with one of the given extensions.
     *
     * @return the matching files; empty when {@code folder} is not a readable directory
     */
    private static List<File> findCodeFiles(File folder, String[] fileExtensions) {
        List<File> codeFiles = new ArrayList<>();
        Deque<File> pending = new ArrayDeque<>();
        pending.add(folder);
        while (!pending.isEmpty()) {
            File[] entries = pending.poll().listFiles();
            if (entries == null) {
                continue; // not a directory, or it could not be listed
            }
            for (File entry : entries) {
                if (entry.isDirectory()) {
                    pending.add(entry);
                } else if (hasExtension(entry.getName(), fileExtensions)) {
                    codeFiles.add(entry);
                }
            }
        }
        return codeFiles;
    }

    /** Formats a ratio as a percentage with a locale-independent decimal point. */
    private static String formatPercent(double ratio) {
        return new DecimalFormat("0.000%", java.text.DecimalFormatSymbols.getInstance(Locale.ROOT)).format(ratio);
    }

    /** Parses a value produced by {@link #formatPercent(double)} back into a number. */
    private static double parsePercent(String text) {
        return Double.parseDouble(text.replace("%", ""));
    }

    /** Quotes a CSV field when it contains a comma, a quote or a line break. */
    private static String csvField(String value) {
        if (value == null) {
            return "";
        }
        boolean needsQuotes = value.indexOf(',') >= 0 || value.indexOf('"') >= 0
                || value.indexOf('\n') >= 0 || value.indexOf('\r') >= 0;
        return needsQuotes ? "\"" + value.replace("\"", "\"\"") + "\"" : value;
    }

    /** Returns {@code filePath}, or the first {@code name_N.csv} variant that does not exist yet. */
    private static File nextFreeReportFile(String filePath) {
        // Tolerate paths without a ".csv" suffix instead of failing in substring().
        int cut = filePath.endsWith(".csv") ? filePath.length() - ".csv".length() : filePath.length();
        String base = filePath.substring(0, cut);
        String extension = filePath.substring(cut);
        File candidate = new File(filePath);
        for (int counter = 1; candidate.exists(); counter++) {
            candidate = new File(base + "_" + counter + extension);
        }
        return candidate;
    }

    /**
     * Writes the results, sorted by descending similarity, to a CSV file.
     *
     * @param results  rows of {B file, A file, similarity percentage, similar lines};
     *                 the list is sorted in place
     * @param filePath desired report path; a numeric suffix is added if the file already exists
     */
    private static void writeResultToCSV(List<String[]> results, String filePath) {
        if (results.isEmpty()) {
            System.out.println("No similar code blocks found.");
            return;
        }

        results.sort((a, b) -> Double.compare(parsePercent(b[2]), parsePercent(a[2])));

        File reportFile = nextFreeReportFile(filePath);
        try (PrintWriter writer = new PrintWriter(new FileWriter(reportFile))) {
            writer.println("B项目文件,A项目文件,重复率,相似代码块");
            for (String[] result : results) {
                StringBuilder row = new StringBuilder();
                for (int i = 0; i < result.length; i++) {
                    if (i > 0) {
                        row.append(',');
                    }
                    row.append(csvField(result[i]));
                }
                writer.println(row);
            }
            // PrintWriter swallows write errors; surface them explicitly.
            if (writer.checkError()) {
                throw new IOException("error while writing " + reportFile);
            }
            System.out.println("CSV report generated successfully!");
            System.out.println("CSV report absolute path: " + reportFile.getAbsolutePath());
        } catch (IOException e) {
            System.err.println("Failed to write CSV report: " + e);
        }
    }

    /**
     * Entry point.
     *
     * @param args optional: {@code [0]} project A folder, {@code [1]} project B folder,
     *             {@code [2]} report path; the original hard-coded values are the defaults
     */
    public static void main(String[] args) {
        File projectA = new File(args.length > 0 ? args[0] : "E:\\code\\gulimall-2022\\gulimall-common");
        File projectB = new File(args.length > 1 ? args[1] : "E:\\code\\gulimall-2022\\gulimall-coupon");
        String reportPath = args.length > 2 ? args[2] : "code_similarity_report.csv";
        for (File folder : new File[]{projectA, projectB}) {
            if (!folder.isDirectory()) {
                System.err.println("Not a directory: " + folder);
            }
        }

        String[] fileExtensions = {".java", ".impl", ".vue", ".ts"}; // file suffixes to include

        List<File> filesA = findCodeFiles(projectA, fileExtensions);
        List<File> filesB = findCodeFiles(projectB, fileExtensions);

        // Lines containing comment markers are left out of the line-level comparison.
        // Note that "*" also excludes any line that merely contains an asterisk.
        Set<String> filterStrings = new HashSet<>(Arrays.asList("//", "/*", "*", "*/"));

        List<String[]> results = new ArrayList<>();
        double totalSimilarity = 0.0; // sum of the similarities of the reported pairs
        int totalComparisons = 0;     // number of reported pairs

        for (File fileB : filesB) {
            for (File fileA : filesA) {
                try {
                    double similarity = calculateSimilarity(fileA, fileB);
                    if (similarity < FILE_SIMILARITY_THRESHOLD) {
                        continue;
                    }
                    List<String> similarBlocks = findSimilarBlocks(fileA, fileB, filterStrings);
                    results.add(new String[]{fileB.getAbsolutePath(), fileA.getAbsolutePath(),
                            formatPercent(similarity), String.join(", ", similarBlocks)});
                    totalSimilarity += similarity;
                    totalComparisons++;
                } catch (IOException e) {
                    System.err.println("Skipping " + fileA + " vs " + fileB + ": " + e);
                }
            }
        }

        // Mean similarity of the reported pairs; 0 when nothing passed the threshold.
        double overallSimilarity = totalComparisons == 0 ? 0.0 : totalSimilarity / totalComparisons;
        System.out.println("Overall Code Similarity: " + formatPercent(overallSimilarity));

        writeResultToCSV(results, reportPath);
    }

}

v2Plus

import java.io.*;
import java.text.DecimalFormat;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Version "2Plus" of the code-duplication checker.
 *
 * <p>Every supported source file of the target project is compared with every supported
 * source file of the original project. Pairs whose identifier sets overlap by at least
 * {@link #FILE_SIMILARITY_THRESHOLD} are written to a CSV report together with the lines of
 * the target file that resemble some line of the original file.
 */
public class CodeSimilarityChecker2Plus {
    /**
     * Lines containing any of these substrings are ignored by the line-level comparison.
     * Populated here rather than in {@code main} so the helpers work on their own.
     * Note that "*" also excludes any line that merely contains an asterisk.
     */
    private static final Set<String> filterStrings = new HashSet<>(Arrays.asList("//", "/*", "*", "*/"));

    /** Matches identifier-like words; compiled once instead of on every call. */
    private static final Pattern IDENTIFIER = Pattern.compile("\\b[A-Za-z_]\\w*\\b");
    /** Separator used to cut a single line into word tokens. */
    private static final Pattern NON_WORD = Pattern.compile("\\W+");
    /** Minimum file-level similarity for a pair to be reported. */
    private static final double FILE_SIMILARITY_THRESHOLD = 0.2;
    /** Minimum line-level similarity for a line to be listed as similar. */
    private static final double LINE_SIMILARITY_THRESHOLD = 0.5;

    /**
     * Computes the similarity of two code files from their identifier sets.
     *
     * @return Jaccard index in the range 0..1
     * @throws IOException if either file cannot be read
     */
    private static double calculateSimilarity(File targetFile, File originalFile) throws IOException {
        return similarity(readContent(targetFile), readContent(originalFile));
    }

    /** Reads all lines of a file; the reader is closed even when reading fails. */
    private static List<String> readLines(File file) throws IOException {
        List<String> lines = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        }
        return lines;
    }

    /** Reads a file into one string with every line terminated by {@code \n}. */
    private static String readContent(File file) throws IOException {
        StringBuilder content = new StringBuilder();
        for (String line : readLines(file)) {
            content.append(line).append('\n');
        }
        return content.toString();
    }

    /**
     * Jaccard similarity of the identifier sets of two strings.
     *
     * @return a value in 0..1; 0 when neither string contains an identifier
     */
    private static double similarity(String s1, String s2) {
        return jaccard(tokenize(s1), tokenize(s2));
    }

    /** Returns |a ∩ b| / |a ∪ b|, defined as 0 for two empty sets to avoid 0/0 (NaN). */
    private static double jaccard(Set<String> a, Set<String> b) {
        int common = 0;
        for (String token : a) {
            if (b.contains(token)) {
                common++;
            }
        }
        int union = a.size() + b.size() - common;
        return union == 0 ? 0.0 : (double) common / union;
    }

    /** Extracts the distinct identifier-like words of a string. */
    private static Set<String> tokenize(String s) {
        Set<String> tokens = new HashSet<>();
        Matcher matcher = IDENTIFIER.matcher(s);
        while (matcher.find()) {
            // The pattern only matches [A-Za-z_]\w*, so no trimming or prefix checks are needed.
            tokens.add(matcher.group());
        }
        return tokens;
    }

    /** Distinct, non-empty word tokens of a single line. */
    private static Set<String> lineTokens(String line) {
        Set<String> tokens = new HashSet<>();
        for (String token : NON_WORD.split(line)) {
            // Indented lines yield an empty leading token; it must not count as a shared token.
            if (!token.isEmpty()) {
                tokens.add(token);
            }
        }
        return tokens;
    }

    /** A line takes part in the comparison when it is not blank and contains no filter string. */
    private static boolean isComparable(String line) {
        if (line.trim().isEmpty()) {
            return false;
        }
        for (String filterString : filterStrings) {
            if (line.contains(filterString)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Finds the lines of {@code targetFile} that resemble at least one line of
     * {@code originalFile}.
     *
     * @return entries of the form {@code "<1-based line number in targetFile>: <trimmed line>"}
     * @throws IOException if either file cannot be read
     */
    private static List<String> findSimilarBlocks(File targetFile, File originalFile) throws IOException {
        // Tokenise the original file once instead of once per target line.
        List<Set<String>> candidates = new ArrayList<>();
        for (String line : readLines(originalFile)) {
            if (isComparable(line)) {
                Set<String> tokens = lineTokens(line);
                if (!tokens.isEmpty()) {
                    candidates.add(tokens);
                }
            }
        }

        List<String> similarBlocks = new ArrayList<>();
        int lineNum = 0;
        for (String line : readLines(targetFile)) {
            lineNum++; // advance for every physical line, including skipped ones
            if (!isComparable(line)) {
                continue;
            }
            Set<String> tokens = lineTokens(line);
            if (tokens.isEmpty()) {
                continue; // e.g. a lone brace: nothing to compare
            }
            for (Set<String> candidate : candidates) {
                if (jaccard(tokens, candidate) >= LINE_SIMILARITY_THRESHOLD) {
                    similarBlocks.add(lineNum + ": " + line.trim());
                    break;
                }
            }
        }
        return similarBlocks;
    }

    /** True when the file name ends with one of the given extensions. */
    private static boolean hasExtension(String fileName, String[] fileExtensions) {
        for (String fileExtension : fileExtensions) {
            if (fileName.endsWith(fileExtension)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Walks a folder breadth-first and collects the files with one of the given extensions.
     *
     * @return the matching files; empty when {@code folder} is not a readable directory
     */
    private static List<File> findCodeFiles(File folder, String[] fileExtensions) {
        List<File> codeFiles = new ArrayList<>();
        Deque<File> pending = new ArrayDeque<>();
        pending.add(folder);
        while (!pending.isEmpty()) {
            File[] entries = pending.poll().listFiles();
            if (entries == null) {
                continue; // not a directory, or it could not be listed
            }
            for (File entry : entries) {
                if (entry.isDirectory()) {
                    pending.add(entry);
                } else if (hasExtension(entry.getName(), fileExtensions)) {
                    codeFiles.add(entry);
                }
            }
        }
        return codeFiles;
    }

    /** Formats a ratio as a percentage with a locale-independent decimal point. */
    private static String formatPercent(double ratio) {
        return new DecimalFormat("0.000%", java.text.DecimalFormatSymbols.getInstance(Locale.ROOT)).format(ratio);
    }

    /** Parses a value produced by {@link #formatPercent(double)} back into a number. */
    private static double parsePercent(String text) {
        return Double.parseDouble(text.replace("%", ""));
    }

    /** Quotes a CSV field when it contains a comma, a quote or a line break. */
    private static String csvField(String value) {
        if (value == null) {
            return "";
        }
        boolean needsQuotes = value.indexOf(',') >= 0 || value.indexOf('"') >= 0
                || value.indexOf('\n') >= 0 || value.indexOf('\r') >= 0;
        return needsQuotes ? "\"" + value.replace("\"", "\"\"") + "\"" : value;
    }

    /** Returns {@code filePath}, or the first {@code name_N.csv} variant that does not exist yet. */
    private static File nextFreeReportFile(String filePath) {
        // Tolerate paths without a ".csv" suffix instead of failing in substring().
        int cut = filePath.endsWith(".csv") ? filePath.length() - ".csv".length() : filePath.length();
        String base = filePath.substring(0, cut);
        String extension = filePath.substring(cut);
        File candidate = new File(filePath);
        for (int counter = 1; candidate.exists(); counter++) {
            candidate = new File(base + "_" + counter + extension);
        }
        return candidate;
    }

    /**
     * Writes the results, sorted by descending similarity, to a CSV file.
     *
     * @param results  rows of {target file, original file, similarity percentage, similar lines};
     *                 the list is sorted in place
     * @param filePath desired report path; a numeric suffix is added if the file already exists
     */
    private static void writeResultToCSV(List<String[]> results, String filePath) {
        if (results.isEmpty()) {
            System.out.println("未发现相似的代码块。");
            return;
        }

        results.sort((a, b) -> Double.compare(parsePercent(b[2]), parsePercent(a[2])));

        File reportFile = nextFreeReportFile(filePath);
        try (PrintWriter writer = new PrintWriter(new FileWriter(reportFile))) {
            writer.println("目标项目文件,原始项目文件,重复率,相似代码块");
            for (String[] result : results) {
                StringBuilder row = new StringBuilder();
                for (int i = 0; i < result.length; i++) {
                    if (i > 0) {
                        row.append(',');
                    }
                    row.append(csvField(result[i]));
                }
                writer.println(row);
            }
            // PrintWriter swallows write errors; surface them explicitly.
            if (writer.checkError()) {
                throw new IOException("error while writing " + reportFile);
            }
            System.out.println("CSV 报告生成成功!");
            System.out.println("CSV 报告绝对路径:" + reportFile.getAbsolutePath());
        } catch (IOException e) {
            System.err.println("Failed to write CSV report: " + e);
        }
    }

    /**
     * Entry point.
     *
     * @param args optional: {@code [0]} target project folder, {@code [1]} original project
     *             folder, {@code [2]} report path; the original hard-coded values are the defaults
     */
    public static void main(String[] args) {
        File targetProjectFolder = new File(args.length > 0 ? args[0] : "E:\\code\\gulimall-2022\\gulimall-common");
        File originalProjectFolder = new File(args.length > 1 ? args[1] : "E:\\code\\gulimall-2022\\gulimall-coupon");
        String reportPath = args.length > 2 ? args[2] : "code_similarity_report.csv";
        for (File folder : new File[]{targetProjectFolder, originalProjectFolder}) {
            if (!folder.isDirectory()) {
                System.err.println("Not a directory: " + folder);
            }
        }

        String[] fileExtensions = {".java", ".impl", ".vue", ".ts"}; // file suffixes to include

        List<File> targetFiles = findCodeFiles(targetProjectFolder, fileExtensions);
        List<File> originalFiles = findCodeFiles(originalProjectFolder, fileExtensions);

        List<String[]> results = new ArrayList<>();
        double totalSimilarity = 0.0; // sum of the similarities of the reported pairs
        int totalComparisons = 0;     // number of reported pairs

        for (File originalFile : originalFiles) {
            for (File targetFile : targetFiles) {
                try {
                    double similarity = calculateSimilarity(targetFile, originalFile);
                    if (similarity < FILE_SIMILARITY_THRESHOLD) {
                        continue;
                    }
                    List<String> similarBlocks = findSimilarBlocks(targetFile, originalFile);
                    // Column order follows the CSV header: target file first, original second.
                    results.add(new String[]{targetFile.getAbsolutePath(), originalFile.getAbsolutePath(),
                            formatPercent(similarity), String.join(", ", similarBlocks)});
                    totalSimilarity += similarity;
                    totalComparisons++;
                } catch (IOException e) {
                    System.err.println("Skipping " + targetFile + " vs " + originalFile + ": " + e);
                }
            }
        }

        // Mean similarity of the reported pairs; 0 when nothing passed the threshold.
        double overallSimilarity = totalComparisons == 0 ? 0.0 : totalSimilarity / totalComparisons;
        System.out.println("整体代码相似度: " + formatPercent(overallSimilarity));

        writeResultToCSV(results, reportPath);
    }
}

V2PP 进一步优化 防止栈溢出 但是结果好像不太一样

import java.io.*;
import java.text.DecimalFormat;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Version "2PP" of the code-duplication checker.
 *
 * <p>Every supported source file of the target project is compared with every supported
 * source file of the original project. Pairs whose identifier sets overlap by at least
 * {@link #FILE_SIMILARITY_THRESHOLD} are written to a CSV report together with the lines of
 * the target file that resemble some line of the original file.
 */
public class CodeSimilarityChecker2PP {
    /**
     * Lines containing any of these substrings are ignored by the line-level comparison.
     * Populated here rather than in {@code main} so the helpers work on their own.
     * Note that "*" also excludes any line that merely contains an asterisk.
     */
    private static final Set<String> filterStrings = new HashSet<>(Arrays.asList("//", "/*", "*", "*/"));

    /** Matches identifier-like words; compiled once instead of on every call. */
    private static final Pattern IDENTIFIER = Pattern.compile("\\b[A-Za-z_]\\w*\\b");
    /** Separator used to cut a single line into word tokens. */
    private static final Pattern NON_WORD = Pattern.compile("\\W+");
    /** Minimum file-level similarity for a pair to be reported. */
    private static final double FILE_SIMILARITY_THRESHOLD = 0.2;
    /** Minimum line-level similarity for a line to be listed as similar. */
    private static final double LINE_SIMILARITY_THRESHOLD = 0.5;

    /**
     * Computes the similarity of two code files from their identifier sets.
     *
     * @return Jaccard index in the range 0..1
     * @throws IOException if either file cannot be read
     */
    private static double calculateSimilarity(File targetFile, File originalFile) throws IOException {
        return similarity(readContent(targetFile), readContent(originalFile));
    }

    /** Reads all lines of a file; the reader is closed even when reading fails. */
    private static List<String> readLines(File file) throws IOException {
        List<String> lines = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        }
        return lines;
    }

    /** Reads a file into one string with every line terminated by {@code \n}. */
    private static String readContent(File file) throws IOException {
        StringBuilder content = new StringBuilder();
        for (String line : readLines(file)) {
            content.append(line).append('\n');
        }
        return content.toString();
    }

    /**
     * Jaccard similarity of the identifier sets of two strings (this is a set overlap, not an
     * edit distance).
     *
     * @return a value in 0..1; 0 when neither string contains an identifier
     */
    private static double similarity(String targetContent, String originalContent) {
        return jaccard(tokenize(targetContent), tokenize(originalContent));
    }

    /** Returns |a ∩ b| / |a ∪ b|, defined as 0 for two empty sets to avoid 0/0 (NaN). */
    private static double jaccard(Set<String> a, Set<String> b) {
        int common = 0;
        for (String token : a) {
            if (b.contains(token)) {
                common++;
            }
        }
        int union = a.size() + b.size() - common;
        return union == 0 ? 0.0 : (double) common / union;
    }

    /** Extracts the distinct identifier-like words of a string. */
    private static Set<String> tokenize(String content) {
        Set<String> tokens = new HashSet<>();
        Matcher matcher = IDENTIFIER.matcher(content);
        while (matcher.find()) {
            // The pattern only matches [A-Za-z_]\w*, so no trimming or prefix checks are needed.
            tokens.add(matcher.group());
        }
        return tokens;
    }

    /** Distinct, non-empty word tokens of a single line. */
    private static Set<String> lineTokens(String line) {
        Set<String> tokens = new HashSet<>();
        for (String token : NON_WORD.split(line)) {
            // Indented lines yield an empty leading token; it must not count as a shared token.
            if (!token.isEmpty()) {
                tokens.add(token);
            }
        }
        return tokens;
    }

    /** Checks whether the line contains one of the filter strings. */
    private static boolean containsFilterString(String line) {
        for (String filterString : filterStrings) {
            if (line.contains(filterString)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Finds the lines of {@code targetFile} that resemble at least one line of
     * {@code originalFile}.
     *
     * @return entries of the form {@code "<1-based line number in targetFile>: <trimmed line>"}
     * @throws IOException if either file cannot be read
     */
    private static List<String> findSimilarBlocks(File targetFile, File originalFile) throws IOException {
        // Tokenise the raw original lines once. They must not carry a "N: " prefix, otherwise
        // the line number itself becomes a token and distorts every score.
        List<Set<String>> candidates = new ArrayList<>();
        for (String line : readLines(originalFile)) {
            if (!line.trim().isEmpty() && !containsFilterString(line)) {
                Set<String> tokens = lineTokens(line);
                if (!tokens.isEmpty()) {
                    candidates.add(tokens);
                }
            }
        }

        List<String> similarBlocks = new ArrayList<>();
        int lineNumber = 0;
        for (String line : readLines(targetFile)) {
            lineNumber++; // advance for every physical line, including skipped ones
            if (line.trim().isEmpty() || containsFilterString(line)) {
                continue;
            }
            Set<String> tokens = lineTokens(line);
            if (tokens.isEmpty()) {
                continue; // e.g. a lone brace: nothing to compare
            }
            for (Set<String> candidate : candidates) {
                if (jaccard(tokens, candidate) >= LINE_SIMILARITY_THRESHOLD) {
                    similarBlocks.add(lineNumber + ": " + line.trim());
                    break;
                }
            }
        }
        return similarBlocks;
    }

    /** True when the file name ends with one of the given extensions. */
    private static boolean hasExtension(String fileName, String[] fileExtensions) {
        for (String fileExtension : fileExtensions) {
            if (fileName.endsWith(fileExtension)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Walks a folder breadth-first and collects the files with one of the given extensions.
     *
     * @return the matching files; empty when {@code folder} is not a readable directory
     */
    private static List<File> findCodeFiles(File folder, String[] fileExtensions) {
        List<File> codeFiles = new ArrayList<>();
        Deque<File> pending = new ArrayDeque<>();
        pending.add(folder);
        while (!pending.isEmpty()) {
            File[] entries = pending.poll().listFiles();
            if (entries == null) {
                continue; // not a directory, or it could not be listed
            }
            for (File entry : entries) {
                if (entry.isDirectory()) {
                    pending.add(entry);
                } else if (hasExtension(entry.getName(), fileExtensions)) {
                    codeFiles.add(entry);
                }
            }
        }
        return codeFiles;
    }

    /** Formats a ratio as a percentage with a locale-independent decimal point. */
    private static String formatPercent(double ratio) {
        return new DecimalFormat("0.000%", java.text.DecimalFormatSymbols.getInstance(Locale.ROOT)).format(ratio);
    }

    /** Parses a value produced by {@link #formatPercent(double)} back into a number. */
    private static double parsePercent(String text) {
        return Double.parseDouble(text.replace("%", ""));
    }

    /**
     * Quotes a CSV field when it contains a comma, a quote or a line break. This replaces the
     * earlier re-splitting of the joined block text, which dropped every code line that
     * contained ", " or ": ".
     */
    private static String csvField(String value) {
        if (value == null) {
            return "";
        }
        boolean needsQuotes = value.indexOf(',') >= 0 || value.indexOf('"') >= 0
                || value.indexOf('\n') >= 0 || value.indexOf('\r') >= 0;
        return needsQuotes ? "\"" + value.replace("\"", "\"\"") + "\"" : value;
    }

    /** Returns {@code filePath}, or the first {@code name_N.csv} variant that does not exist yet. */
    private static File nextFreeReportFile(String filePath) {
        // Tolerate paths without a ".csv" suffix instead of failing in substring().
        int cut = filePath.endsWith(".csv") ? filePath.length() - ".csv".length() : filePath.length();
        String base = filePath.substring(0, cut);
        String extension = filePath.substring(cut);
        File candidate = new File(filePath);
        for (int counter = 1; candidate.exists(); counter++) {
            candidate = new File(base + "_" + counter + extension);
        }
        return candidate;
    }

    /**
     * Writes the results, sorted by descending similarity, to a CSV file.
     *
     * @param results  rows of {target file, original file, similarity percentage, similar lines};
     *                 the list is sorted in place
     * @param filePath desired report path; a numeric suffix is added if the file already exists
     */
    private static void writeResultToCSV(List<String[]> results, String filePath) {
        if (results.isEmpty()) {
            System.out.println("未发现相似的代码块。");
            return;
        }

        results.sort((a, b) -> Double.compare(parsePercent(b[2]), parsePercent(a[2])));

        File reportFile = nextFreeReportFile(filePath);
        try (PrintWriter writer = new PrintWriter(new FileWriter(reportFile))) {
            writer.println("目标项目文件,原始项目文件,重复率,相似代码块");
            for (String[] result : results) {
                StringBuilder row = new StringBuilder();
                for (int i = 0; i < result.length; i++) {
                    if (i > 0) {
                        row.append(',');
                    }
                    row.append(csvField(result[i]));
                }
                writer.println(row);
            }
            // PrintWriter swallows write errors; surface them explicitly.
            if (writer.checkError()) {
                throw new IOException("error while writing " + reportFile);
            }
            System.out.println("CSV 报告生成成功!");
            System.out.println("CSV 报告绝对路径:" + reportFile.getAbsolutePath());
        } catch (IOException e) {
            System.err.println("Failed to write CSV report: " + e);
        }
    }

    /**
     * Entry point.
     *
     * @param args optional: {@code [0]} target project folder, {@code [1]} original project
     *             folder, {@code [2]} report path; the original hard-coded values are the defaults
     */
    public static void main(String[] args) {
        // NOTE(review): both defaults point at the same folder, so a run without arguments
        // compares the project with itself — confirm whether the second default was meant to
        // be a different project.
        File targetProjectFolder = new File(args.length > 0 ? args[0] : "E:\\code\\gulimall-2022\\gulimall-common");
        File originalProjectFolder = new File(args.length > 1 ? args[1] : "E:\\code\\gulimall-2022\\gulimall-common");
        String reportPath = args.length > 2 ? args[2] : "code_similarity_report.csv";
        for (File folder : new File[]{targetProjectFolder, originalProjectFolder}) {
            if (!folder.isDirectory()) {
                System.err.println("Not a directory: " + folder);
            }
        }

        String[] fileExtensions = {".java", ".impl", ".vue", ".ts"}; // file suffixes to include

        List<File> targetFiles = findCodeFiles(targetProjectFolder, fileExtensions);
        List<File> originalFiles = findCodeFiles(originalProjectFolder, fileExtensions);

        List<String[]> results = new ArrayList<>();
        double totalSimilarity = 0.0; // sum of the similarities of the reported pairs
        int totalComparisons = 0;     // number of reported pairs

        for (File originalFile : originalFiles) {
            for (File targetFile : targetFiles) {
                try {
                    double similarity = calculateSimilarity(targetFile, originalFile);
                    if (similarity < FILE_SIMILARITY_THRESHOLD) {
                        continue;
                    }
                    List<String> similarBlocks = findSimilarBlocks(targetFile, originalFile);
                    // Column order follows the CSV header: target file first, original second.
                    results.add(new String[]{targetFile.getAbsolutePath(), originalFile.getAbsolutePath(),
                            formatPercent(similarity), String.join(", ", similarBlocks)});
                    totalSimilarity += similarity;
                    totalComparisons++;
                } catch (IOException e) {
                    System.err.println("Skipping " + targetFile + " vs " + originalFile + ": " + e);
                }
            }
        }

        // Mean similarity of the reported pairs; 0 when nothing passed the threshold.
        double overallSimilarity = totalComparisons == 0 ? 0.0 : totalSimilarity / totalComparisons;
        System.out.println("Overall Code Similarity: " + formatPercent(overallSimilarity));

        writeResultToCSV(results, reportPath);
    }
}

V3 还有问题

import com.github.javaparser.StaticJavaParser;
import com.github.javaparser.ast.CompilationUnit;
import com.github.javaparser.ast.Node;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.Set;

/**
 * Version 3 of the code-similarity checker: file-level similarity is computed by comparing
 * the JavaParser syntax trees of two files instead of comparing their raw text.
 *
 * <p>The original author marked this version as still having known problems.
 *
 * <p>Requires an additional dependency:
 * <pre>{@code
 * <dependency>
 *   <groupId>com.github.javaparser</groupId>
 *   <artifactId>javaparser-core</artifactId>
 *   <version>3.24.0</version>
 * </dependency>
 * }</pre>
 */
public class CodeSimilarityCheckerV3 {

    /** Regex separating word tokens on a source line: any run of non-word characters. */
    private static final String TOKEN_SEPARATOR = "\\W+";

    /**
     * Minimum file-level similarity for a file pair to be reported.
     * NOTE(review): the value predates the normalisation fix in the tree comparison and may
     * need re-tuning.
     */
    private static final double FILE_SIMILARITY_THRESHOLD = 0.21;

    /** Minimum token-set similarity for a line to be reported as similar. */
    private static final double LINE_SIMILARITY_THRESHOLD = 0.5;

    /** Pattern used to render similarity ratios as percentages. */
    private static final String PERCENT_PATTERN = "0.000%";

    /**
     * Parses both files with JavaParser and returns the similarity of their syntax trees.
     *
     * @param file1 first source file
     * @param file2 second source file
     * @return a score in {@code [0, 1]}; {@code 1.0} when both files print identically
     * @throws FileNotFoundException if either file cannot be opened
     */
    public static double calculateSimilarity(File file1, File file2) throws FileNotFoundException {
        CompilationUnit cu1 = StaticJavaParser.parse(new FileInputStream(file1));
        CompilationUnit cu2 = StaticJavaParser.parse(new FileInputStream(file2));

        return calculateSimilarity(cu1, cu2);
    }

    /**
     * Recursively scores how similar two syntax-tree nodes are.
     *
     * <ul>
     *   <li>Nodes of different classes score {@code 0.0}.</li>
     *   <li>Nodes whose printed form is identical score {@code 1.0}.</li>
     *   <li>Otherwise the score is the sum of the position-wise child scores divided by the
     *       larger child count, so unmatched children count as {@code 0.0}.</li>
     * </ul>
     *
     * <p>The result is therefore always within {@code [0, 1]}. The previous implementation
     * scored identical leaves as 0 and could exceed 1 for some tree shapes.
     *
     * @param node1 node from the first tree
     * @param node2 node from the second tree
     * @return similarity in {@code [0, 1]}
     */
    private static double calculateSimilarity(Node node1, Node node2) {
        if (node1.getClass() != node2.getClass()) {
            return 0.0;
        }
        // Identical printed form means the whole subtree matches; no need to descend.
        if (node1.toString().equals(node2.toString())) {
            return 1.0;
        }

        List<Node> children1 = node1.getChildNodes();
        List<Node> children2 = node2.getChildNodes();
        int maxChildCount = Math.max(children1.size(), children2.size());
        if (maxChildCount == 0) {
            // Two different leaves of the same kind (e.g. two different names).
            return 0.0;
        }

        int minChildCount = Math.min(children1.size(), children2.size());
        double childSum = 0.0;
        for (int i = 0; i < minChildCount; i++) {
            childSum += calculateSimilarity(children1.get(i), children2.get(i));
        }
        return childSum / maxChildCount;
    }

    /**
     * Lists the lines of {@code file1} that have a sufficiently similar line in {@code file2}.
     *
     * <p>Blank lines and lines containing any of {@code filterStrings} are ignored on both
     * sides. Two lines are similar when the Jaccard similarity of their distinct word tokens
     * is at least {@link #LINE_SIMILARITY_THRESHOLD}.
     *
     * @param file1 file whose lines are reported
     * @param file2 file searched for matching lines
     * @param filterStrings substrings that exclude a line from the comparison
     * @return entries of the form {@code "<1-based line number in file1>: <trimmed line>"}
     * @throws IOException if either file cannot be read
     */
    private static List<String> findSimilarBlocks(File file1, File file2, Set<String> filterStrings) throws IOException {
        // Tokenise the second file once instead of once per line of the first file.
        List<Set<String>> candidates = new ArrayList<>();
        for (String candidateLine : readLines(file2)) {
            if (isIgnored(candidateLine, filterStrings)) {
                continue;
            }
            Set<String> candidateTokens = tokenize(candidateLine);
            if (!candidateTokens.isEmpty()) {
                candidates.add(candidateTokens);
            }
        }

        List<String> similarBlocks = new ArrayList<>();
        int lineNum = 0;
        for (String line : readLines(file1)) {
            // Count every physical line, including skipped ones, so reported numbers stay correct.
            lineNum++;
            if (isIgnored(line, filterStrings)) {
                continue;
            }
            Set<String> tokens = tokenize(line);
            if (tokens.isEmpty()) {
                continue; // punctuation-only line, e.g. a lone brace
            }
            for (Set<String> candidateTokens : candidates) {
                if (jaccard(tokens, candidateTokens) >= LINE_SIMILARITY_THRESHOLD) {
                    similarBlocks.add(lineNum + ": " + line.trim());
                    break;
                }
            }
        }
        return similarBlocks;
    }

    /** Reads all lines of a file, closing the reader even when reading fails. */
    private static List<String> readLines(File file) throws IOException {
        List<String> lines = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        }
        return lines;
    }

    /** Returns true for blank lines and for lines containing any of the filter substrings. */
    private static boolean isIgnored(String line, Set<String> filterStrings) {
        if (line.trim().isEmpty()) {
            return true;
        }
        for (String filterString : filterStrings) {
            if (line.contains(filterString)) {
                return true;
            }
        }
        return false;
    }

    /** Splits a line into its distinct, non-empty word tokens. */
    private static Set<String> tokenize(String line) {
        Set<String> tokens = new HashSet<>();
        for (String token : line.split(TOKEN_SEPARATOR)) {
            // split() yields a leading empty string for indented lines; it is not a token.
            if (!token.isEmpty()) {
                tokens.add(token);
            }
        }
        return tokens;
    }

    /** Jaccard similarity of two token sets; 0.0 when both are empty. */
    private static double jaccard(Set<String> tokens1, Set<String> tokens2) {
        int common = countCommonTokens(tokens1, tokens2);
        int union = tokens1.size() + tokens2.size() - common;
        return union == 0 ? 0.0 : (double) common / union;
    }

    /** Counts the tokens present in both sets. */
    private static int countCommonTokens(Set<String> tokens1, Set<String> tokens2) {
        int common = 0;
        for (String token : tokens1) {
            if (tokens2.contains(token)) {
                common++;
            }
        }
        return common;
    }

    /**
     * Walks {@code folder} breadth-first and collects the files whose name ends with one of
     * the given extensions.
     *
     * @param folder root folder to search
     * @param fileExtensions accepted file-name suffixes, e.g. {@code ".java"}
     * @return matching files; empty when {@code folder} cannot be listed
     */
    private static List<File> findCodeFiles(File folder, String[] fileExtensions) {
        List<File> codeFiles = new ArrayList<>();
        Queue<File> pending = new LinkedList<>();
        pending.add(folder);
        while (!pending.isEmpty()) {
            File[] entries = pending.poll().listFiles();
            if (entries == null) {
                continue; // not a directory, or it could not be read
            }
            for (File entry : entries) {
                if (entry.isDirectory()) {
                    pending.add(entry);
                } else if (hasAnyExtension(entry.getName(), fileExtensions)) {
                    codeFiles.add(entry);
                }
            }
        }
        return codeFiles;
    }

    /** Returns true when the file name ends with at least one of the extensions. */
    private static boolean hasAnyExtension(String fileName, String[] fileExtensions) {
        for (String fileExtension : fileExtensions) {
            if (fileName.endsWith(fileExtension)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Writes the result rows to a CSV file, sorted by similarity in descending order.
     *
     * <p>If {@code filePath} already exists, a numeric suffix ({@code _1}, {@code _2}, ...) is
     * appended to the base name until an unused name is found. Nothing is written when
     * {@code results} is empty.
     *
     * @param results rows of {@code {B file, A file, similarity percentage, similar lines}};
     *                the percentage must use {@code '.'} as decimal separator
     * @param filePath preferred path of the report
     */
    private static void writeResultToCSV(List<String[]> results, String filePath) {
        if (results.isEmpty()) {
            System.out.println("No similar code blocks found.");
            return;
        }

        // Sort a copy so the caller's list is left untouched.
        List<String[]> sorted = new ArrayList<>(results);
        sorted.sort((a, b) -> Double.compare(parsePercent(b[2]), parsePercent(a[2])));

        File reportFile = resolveUnusedReportFile(filePath);
        try (PrintWriter writer = new PrintWriter(new FileWriter(reportFile))) {
            writer.println("B项目文件,A项目文件,重复率,相似代码块");
            for (String[] row : sorted) {
                String[] escaped = new String[row.length];
                for (int i = 0; i < row.length; i++) {
                    escaped[i] = escapeCsv(row[i]);
                }
                writer.println(String.join(",", escaped));
            }
            // PrintWriter never throws on write; it only records that an error happened.
            if (writer.checkError()) {
                System.err.println("Failed to write CSV report: " + reportFile.getAbsolutePath());
                return;
            }
            System.out.println("CSV report generated successfully!");
            System.out.println("CSV report absolute path: " + reportFile.getAbsolutePath());
        } catch (IOException e) {
            System.err.println("Failed to create CSV report " + reportFile.getAbsolutePath() + ": " + e);
        }
    }

    /** Returns {@code filePath} if unused, otherwise the first unused {@code <base>_<n>.csv}. */
    private static File resolveUnusedReportFile(String filePath) {
        File candidate = new File(filePath);
        int extensionStart = filePath.lastIndexOf(".csv");
        String baseName = extensionStart >= 0 ? filePath.substring(0, extensionStart) : filePath;
        for (int counter = 1; candidate.exists(); counter++) {
            candidate = new File(baseName + "_" + counter + ".csv");
        }
        return candidate;
    }

    /** Parses a percentage such as {@code "21.000%"} into its numeric value (21.0). */
    private static double parsePercent(String percentage) {
        return Double.parseDouble(percentage.replace("%", ""));
    }

    /**
     * Quotes a CSV field when it contains a comma, a double quote or a line break, doubling
     * embedded double quotes. Paths and source lines routinely contain commas.
     */
    private static String escapeCsv(String field) {
        if (field == null) {
            return "";
        }
        boolean needsQuoting = field.indexOf(',') >= 0 || field.indexOf('"') >= 0
                || field.indexOf('\n') >= 0 || field.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return field;
        }
        return "\"" + field.replace("\"", "\"\"") + "\"";
    }

    /** Creates a percentage formatter whose output does not depend on the default locale. */
    private static DecimalFormat newPercentFormat() {
        return new DecimalFormat(PERCENT_PATTERN, DecimalFormatSymbols.getInstance(Locale.ROOT));
    }

    /**
     * Compares every code file of project B against every code file of project A, prints the
     * average similarity of the reported pairs and writes them to a CSV report.
     *
     * @param args optional: {@code args[0]} is the project A folder and {@code args[1]} the
     *             project B folder; the built-in default paths are used when omitted
     */
    public static void main(String[] args) {
        File projectAFolder = new File(args.length > 0 ? args[0] : "E:\\fp\\Folder1");
        File projectBFolder = new File(args.length > 1 ? args[1] : "E:\\fp\\Folder2");

        String[] fileExtensions = {".java", ".impl"}; // file suffixes to include

        List<File> aFiles = findCodeFiles(projectAFolder, fileExtensions);
        List<File> bFiles = findCodeFiles(projectBFolder, fileExtensions);

        // No line filters are active. Add substrings such as "import", "@", "//" or "/*" to
        // exclude lines containing them from the line-level comparison.
        Set<String> filterStrings = new HashSet<>();

        DecimalFormat percent = newPercentFormat();
        List<String[]> results = new ArrayList<>();
        double totalSimilarity = 0.0; // sum of the similarities of all reported pairs
        int totalComparisons = 0;     // number of reported pairs

        for (File bFile : bFiles) {
            for (File aFile : aFiles) {
                try {
                    double similarity = calculateSimilarity(aFile, bFile);
                    if (similarity >= FILE_SIMILARITY_THRESHOLD) {
                        List<String> similarBlocks = findSimilarBlocks(aFile, bFile, filterStrings);
                        results.add(new String[]{bFile.getAbsolutePath(), aFile.getAbsolutePath(),
                                percent.format(similarity), String.join(", ", similarBlocks)});
                        totalSimilarity += similarity;
                        totalComparisons++;
                    }
                } catch (IOException | RuntimeException e) {
                    // JavaParser reports unparsable sources with an unchecked exception; one bad
                    // file should not abort the whole run.
                    System.err.println("Skipping comparison of " + aFile + " and " + bFile + ": " + e);
                }
            }
        }

        // Average over the reported pairs only; avoid 0/0 when nothing reached the threshold.
        if (totalComparisons > 0) {
            double overallSimilarity = totalSimilarity / totalComparisons;
            System.out.println("Overall Code Similarity: " + percent.format(overallSimilarity));
        } else {
            System.out.println("Overall Code Similarity: n/a (no file pair reached the similarity threshold)");
        }

        writeResultToCSV(results, "code_similarity_report.csv");
    }

}

  • 2
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值