华为OD机试笔试2024年C卷D卷 - 音乐小说内容重复识别 (java/c++/python)

算法之旅

于 2024-08-18 01:21:41 发布

阅读量216

点赞数 3

文章标签：华为od c语言 java python c++ 数据结构算法

本文链接：https://blog.csdn.net/hrr397117313/article/details/141289777

版权

华为OD机试（C卷+D卷）2024真题目录(Java & c++ & python)

题目描述

实现一个简易的重复内容识别系统，通过给定的两个内容名称，和相似内容符号，判断两个内容是否相似；如果相似，返回相似内容；如果不相似，返回不相似的内容。

初始化：给出两个字符串，一些相似字符对，如顿号和逗号相似，的和de相似，猪和潴，给出两个字符串的相似判断结果

输入：两条语句，给出是否相似，对于相似的语句，返回True和相似的字符对；对于不相似的内容，则返回第一个内容的不相似信息，方便后续补充

注意：

相似关系是具有传递性的。例如，如果"顿号"和"逗号"是相似的，"逗号"和"分号"是相似的，则"顿号"和"分号"也是相似的。
为了过滤一些无意义的信息，这里***可以匹配任意长度的内容，例如：

给出相似对 "(***)" 和 "" 时，"异世邪君(人气玄幻作家)" 和 "异世邪君" 认为是相似，此时相似符号返回 (***) 即可

不相似的内容，需要给出不相似的字符串，多处不相似的字符串用空格分隔

输入描述

第一行表示第一张专辑的专辑名，其中 0 < 专辑长度 ≤ 50

第二行表示第二张专辑的专辑名，其中 0 < 专辑长度 ≤ 50

第三行开始每行为相似的字符串，每行一组，每组字符串不超过10个

总共相似字符串行不超过10行

输出描述
第一行返回相似判断的结果，即True或者False

第二行开始返回相似/不相似的字符串，每行一组

用例1

输入

林汉达上下五千年
林汉达上下5千年
五 5 ⑤ 伍 wu

输出

True
五 5

用例2

输入

幸福de猪的个人专辑
幸福的猪的个人专辑
得 的
得 de

输出

True
de 的

用例3

输入

异世邪君(人气玄幻作家)
异世邪君
(***)

输出

True
(***)

用例4

输入

浩然爸爸讲成语
浩然爸爸讲论语
论语 三字经

输出

False
成语 论语

解题思路

并查集。

判断任意两个字符串是否相似，这块逻辑是可以使用并查集实现的。

关于字符串的并查集实现，请直接看代码中 UnionFindSet 类实现。

假设第一个专辑名是a串，第二个专辑名是b串，那么：

1. 首先遍历所有相似字符串 p，去匹配 a、b 的开头，即找到使 a.startsWith(p1) 和 b.startsWith(p2) 为真的 p1、p2；如果 p1 和 p2 是相似串，则把 a 开头的 p1 去掉，b 开头的 p2 去掉，然后继续比较剩余部分
2. 如果 a、b 串开头没有相似串匹配，那么尝试对比 a[0] 和 b[0] 是否相同；如果相同，则去除 a[0] 和 b[0]，然后继续比较剩余部分
3. 如果 a[0] 和 b[0] 也不相同，比如 “浩然爸爸讲” 这部分可以被第2步比较掉，但是 a、b 串剩余部分：成语、论语不能形成相似串，且 a[0] 和 b[0] 也不相同，此时就认为 a、b 串剩余部分不相似，结束比较逻辑

C++、Java、Python代码如下：

C++参考代码

#include <bits/stdc++.h>
 
using namespace std;
 
// Disjoint-set (union-find) keyed by string. Two strings are considered
// similar when find() returns the same root for both.
class UnionFindSet {
public:
    // fa[s] is the parent of s; a string that maps to itself is a root.
    map<string, string> fa;

    // Registers every word as its own singleton set. Words that are already
    // present keep their current parent.
    void init(const vector<string> &words) {
        for (const auto &word: words) {
            fa.emplace(word, word); // no-op when the key already exists
        }
    }

    // Returns the root of s and compresses the path walked to reach it.
    // A string that was never registered is its own root and is NOT inserted
    // into fa. Taking a const reference lets callers pass literals,
    // temporaries and const strings.
    string find(const string &s) {
        const auto it = fa.find(s);
        if (it == fa.end() || it->second == s) {
            return s;
        }
        // Copy the parent before recursing: the recursive call rewrites
        // entries of fa while compressing the path.
        const string parent = it->second;
        string root = find(parent);
        it->second = root;
        return root;
    }

    // Unites the sets containing x and y; the root of x becomes the root of
    // the merged set.
    void merge(const string &x, const string &y) {
        const string x_fa = find(x);
        const string y_fa = find(y);

        if (x_fa != y_fa) {
            fa[y_fa] = x_fa;
        }
    }
};

// Global state shared by main() and compare()
map<string, string> dic; // maps each similar word to the regex pattern string used to match it
UnionFindSet ufs; // union-find instance; words with the same root are treated as similar
vector<string> simi; // similar pairs recorded by compare(), one entry per pair
vector<string> diff; // dissimilar remainders recorded by compare(), one "a b" entry each

// Returns the byte length of the UTF-8 character that starts s, clamped to
// s.size(). A byte that is not a valid lead byte counts as length 1, and an
// empty string yields 0.
static size_t leadingCharLength(const string &s) {
    if (s.empty()) {
        return 0;
    }
    const unsigned char lead = static_cast<unsigned char>(s[0]);
    size_t len = 1;
    if ((lead & 0xE0) == 0xC0) {
        len = 2;
    } else if ((lead & 0xF0) == 0xE0) {
        len = 3;
    } else if ((lead & 0xF8) == 0xF0) {
        len = 4;
    }
    return min(len, s.size());
}

// Compares a and b from the front and records the outcome in the globals
// simi / diff.
//
// At each step the patterns in dic are tried against the start of both
// strings. If a word matching the start of a and a word matching the start
// of b belong to the same set in ufs, both matched prefixes are dropped and
// the remainders are compared recursively; the pair is appended to simi when
// the two words differ. Otherwise one leading character is dropped from each
// side when they are equal, and if they are not, "a b" is appended to diff
// and the comparison stops.
void compare(const string &a, const string &b) {
    if (a.empty() && b.empty()) {
        return; // nothing left on either side
    }

    // Compile every pattern once per call and remember how many leading
    // bytes of a / b it consumes. match_continuous anchors the search at
    // offset 0, which is what the comparison needs.
    vector<pair<string, size_t>> hitsA;
    vector<pair<string, size_t>> hitsB;
    for (const auto &item: dic) {
        const regex pattern(item.second);
        smatch match;
        if (regex_search(a, match, pattern, regex_constants::match_continuous)) {
            hitsA.emplace_back(item.first, static_cast<size_t>(match.length(0)));
        }
        if (regex_search(b, match, pattern, regex_constants::match_continuous)) {
            hitsB.emplace_back(item.first, static_cast<size_t>(match.length(0)));
        }
    }

    for (const auto &hitA: hitsA) {
        for (const auto &hitB: hitsB) {
            // A pair that consumes nothing on both sides (e.g. the empty
            // word against itself) would recurse on identical arguments.
            if (hitA.second == 0 && hitB.second == 0) continue;

            string word1 = hitA.first;
            string word2 = hitB.first;
            if (ufs.find(word1) == ufs.find(word2)) {
                if (word1 != word2) {
                    if (word1.empty() || word2.empty()) {
                        // Only one real word: emit it without a stray space.
                        simi.emplace_back(word1 + word2);
                    } else {
                        simi.emplace_back(word1 + " " + word2);
                    }
                }
                compare(a.substr(hitA.second), b.substr(hitB.second));
                return;
            }
        }
    }

    // No similar pair at the front: compare one whole UTF-8 character (not a
    // single byte) so multi-byte characters are never cut in half.
    const size_t lenA = leadingCharLength(a);
    const size_t lenB = leadingCharLength(b);
    if (lenA > 0 && lenA == lenB && a.compare(0, lenA, b, 0, lenB) == 0) {
        compare(a.substr(lenA), b.substr(lenB));
    } else {
        diff.emplace_back(a + " " + b); // remainders are not similar
    }
}

// Returns a copy of s in which EVERY occurrence of oldSub is replaced by
// newSub. Scanning resumes after each inserted newSub, so a replacement that
// itself contains oldSub (e.g. "(" -> "\\(") is not processed again.
// An empty oldSub leaves s unchanged.
string replace(string s, const string &oldSub, const string &newSub) {
    if (oldSub.empty()) {
        return s; // searching for "" would never advance
    }
    size_t index = 0;
    while ((index = s.find(oldSub, index)) != string::npos) {
        s.replace(index, oldSub.size(), newSub);
        index += newSub.size();
    }
    return s;
}

// Cuts s at every occurrence of delimiter and returns the pieces in order.
// Adjacent delimiters yield empty pieces; a delimiter at the very end of s
// does not add a trailing empty piece, and an empty s yields no pieces.
vector<string> split(string &s, char delimiter) {
    vector<string> pieces;
    size_t begin = 0;
    while (begin < s.size()) {
        const size_t cut = s.find(delimiter, begin);
        if (cut == string::npos) {
            // Last piece: everything up to the end of the string.
            pieces.push_back(s.substr(begin));
            break;
        }
        pieces.push_back(s.substr(begin, cut - begin));
        begin = cut + 1;
    }
    return pieces;
}

// Turns one similar-word token into a regex pattern string: the wildcard
// "***" becomes ".*" and every regex metacharacter in the rest of the token
// is backslash-escaped so that it matches literally.
static string toPattern(const string &word) {
    static const string wildcard = "***";
    static const string metaChars = "\\^$.|?*+()[]{}";

    string pattern;
    size_t i = 0;
    while (i < word.size()) {
        if (word.compare(i, wildcard.size(), wildcard) == 0) {
            pattern += ".*";
            i += wildcard.size();
            continue;
        }
        if (metaChars.find(word[i]) != string::npos) {
            pattern += '\\';
        }
        pattern += word[i];
        ++i;
    }
    return pattern;
}

// Reads the two names and the groups of similar words from stdin, compares
// the names and prints "True" plus the similar pairs, or "False" plus the
// dissimilar remainders.
int main() {
    string a, b;
    getline(cin, a); // first name
    getline(cin, b); // second name

    string line;
    while (getline(cin, line)) {
        // Split on spaces and drop empty tokens: repeated spaces would
        // otherwise register the empty word as similar to the whole group.
        const vector<string> tokens = split(line, ' ');
        vector<string> words;
        for (const auto &token: tokens) {
            if (!token.empty()) {
                words.push_back(token);
            }
        }
        if (words.empty()) {
            continue;
        }

        // Every word is matched through a regex, so escape it first.
        for (const auto &word: words) {
            dic[word] = toPattern(word);
        }

        // All words on one line belong to the same similarity set.
        ufs.init(words);
        for (size_t i = 1; i < words.size(); i++) {
            ufs.merge(words[0], words[i]);
        }

        // A line with a single word means that word is similar to the empty
        // string, i.e. it may simply be absent from the other name.
        if (words.size() == 1) {
            ufs.merge(words[0], "");
            dic[""] = "";
        }
    }

    compare(a, b);

    if (diff.empty()) {
        cout << "True" << '\n';
        for (const auto &item: simi) {
            cout << item << '\n';
        }
    } else {
        cout << "False" << '\n';
        for (const auto &item: diff) {
            cout << item << '\n';
        }
    }

    return 0;
}

Java参考代码

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Main {
    // Union-find over the similar words; two words are similar when they share a root
    static UnionFindSet ufs = new UnionFindSet();

    // Maps each similar word to the regex used to match it at the start of a string
    static HashMap<String, String> map = new HashMap<>();

    // Similar pairs recorded by compare()
    static ArrayList<String> simi = new ArrayList<>();
    // Dissimilar remainders recorded by compare()
    static ArrayList<String> diff = new ArrayList<>();

    public static void main(String[] args) {
        Scanner sc = new Scanner(System.in);

        String a = sc.nextLine();  // first name
        String b = sc.nextLine();  // second name

        // Every remaining line is one group of similar words
        while (sc.hasNextLine()) {
            // Drop empty tokens: repeated spaces would otherwise register the
            // empty word as similar to the whole group
            String[] words = Arrays.stream(sc.nextLine().split(" "))
                    .filter(w -> !w.isEmpty())
                    .toArray(String[]::new);
            if (words.length == 0) {
                continue;
            }

            for (String word : words) {
                map.put(word, toRegex(word));
            }

            // All words on one line belong to the same similarity set
            ufs.init(words);
            for (int i = 1; i < words.length; i++) {
                ufs.union(words[0], words[i]);
            }

            // A single word on a line is similar to the empty string "",
            // i.e. it may simply be absent from the other name
            if (words.length == 1) {
                ufs.union(words[0], "");
                map.put("", "");
            }
        }

        compare(a, b);

        if (diff.isEmpty()) {
            System.out.println("True");
            simi.forEach(System.out::println);
        } else {
            System.out.println("False");
            diff.forEach(System.out::println);
        }
    }

    /**
     * Builds the regex for one similar word: the wildcard "***" becomes ".*"
     * and every other part of the word is quoted so it matches literally.
     * @param word similar word as read from the input
     * @return regex source for that word
     */
    private static String toRegex(String word) {
        StringBuilder regex = new StringBuilder();
        // limit -1 keeps trailing empty parts, so a word ending in "***" still gets its ".*"
        String[] literals = word.split("\\*\\*\\*", -1);
        for (int i = 0; i < literals.length; i++) {
            if (i > 0) {
                regex.append(".*");
            }
            if (!literals[i].isEmpty()) {
                regex.append(Pattern.quote(literals[i]));
            }
        }
        return regex.toString();
    }

    /**
     * Length of the prefix of text matched by regex, or -1 when the regex
     * does not match at the very start of text.
     */
    private static int prefixLength(String regex, String text) {
        Matcher matcher = Pattern.compile(regex).matcher(text);
        return matcher.lookingAt() ? matcher.end() : -1;
    }

    /**
     * Compares a and b from the front and records the outcome in simi / diff.
     * Matching similar words at the start of both strings are stripped first;
     * otherwise one equal leading character is stripped; otherwise the two
     * remainders are recorded as dissimilar and the comparison stops.
     * @param a remaining part of the first name
     * @param b remaining part of the second name
     */
    public static void compare(String a, String b) {
        if (a.isEmpty() && b.isEmpty()) {
            return;  // nothing left on either side
        }

        // Words matching the start of b, with the matched length; computed
        // once instead of once per candidate for a
        List<String> wordsB = new ArrayList<>();
        List<Integer> lengthsB = new ArrayList<>();
        for (String word : map.keySet()) {
            int len = prefixLength(map.get(word), b);
            if (len >= 0) {
                wordsB.add(word);
                lengthsB.add(len);
            }
        }

        for (String word1 : map.keySet()) {
            int len1 = prefixLength(map.get(word1), a);
            if (len1 < 0) continue;

            for (int k = 0; k < wordsB.size(); k++) {
                String word2 = wordsB.get(k);
                int len2 = lengthsB.get(k);

                // A pair that consumes nothing on both sides (e.g. "" against "")
                // would recurse on identical arguments
                if (len1 == 0 && len2 == 0) continue;

                if (ufs.find(word1).equals(ufs.find(word2))) {
                    if (!word1.equals(word2)) {
                        // With only one real word, emit it without a stray space
                        boolean oneSided = word1.isEmpty() || word2.isEmpty();
                        simi.add(oneSided ? word1 + word2 : word1 + " " + word2);
                    }
                    compare(a.substring(len1), b.substring(len2));
                    return;
                }
            }
        }

        // No similar pair at the front: compare one whole code point so that
        // surrogate pairs are never cut in half
        if (!a.isEmpty() && !b.isEmpty() && a.codePointAt(0) == b.codePointAt(0)) {
            int step = Character.charCount(a.codePointAt(0));
            compare(a.substring(step), b.substring(step));
        } else {
            diff.add(a + " " + b);
        }
    }
}

/**
 * Disjoint-set (union-find) structure keyed by strings.
 */
class UnionFindSet {
    HashMap<String, String> fa = new HashMap<>();

    /**
     * Registers each word as its own set unless it is already known.
     * @param words words to register
     */
    public void init(String[] words) {
        for (int i = 0; i < words.length; i++) {
            fa.putIfAbsent(words[i], words[i]);
        }
    }

    /**
     * Finds the root of s and points every node on the walked path at it.
     * A string that is not registered is returned unchanged.
     * @param s string to look up
     * @return root of the set containing s
     */
    public String find(String s) {
        // First pass: climb until a root (unregistered or self-parented) is reached
        String root = s;
        while (fa.containsKey(root) && !root.equals(fa.get(root))) {
            root = fa.get(root);
        }

        // Second pass: re-point every node on the path directly at the root
        String node = s;
        while (node != null && !node.equals(root)) {
            String next = fa.get(node);
            fa.put(node, root);
            node = next;
        }
        return root;
    }

    /**
     * Merges the sets of x and y; the root of x becomes the root of the union.
     * @param x first string
     * @param y second string
     */
    public void union(String x, String y) {
        String rootX = find(x);
        String rootY = find(y);

        if (rootX.equals(rootY)) {
            return;
        }
        fa.put(rootY, rootX);
    }
}

Python参考代码

import re

# Disjoint-set (union-find) over strings, used to track which words are similar
class UnionFindSet:
    def __init__(self):
        self.fa = {}  # parent map: fa[s] is the parent of s, roots map to themselves

    def init(self, words):
        # Register each word as its own set unless it is already known
        for w in words:
            if w not in self.fa:
                self.fa[w] = w

    def find(self, s):
        # First pass: climb until a root (unregistered or self-parented) is reached
        root = s
        while root in self.fa and self.fa[root] != root:
            root = self.fa[root]

        # Second pass: point every node on the walked path directly at the root
        node = s
        while node != root:
            parent = self.fa[node]
            self.fa[node] = root
            node = parent
        return root

    def union(self, x, y):
        root_x, root_y = self.find(x), self.find(y)
        if root_x == root_y:
            return
        self.fa[root_y] = root_x  # the root of x becomes the root of the merged set

# Module-level state shared by compare() and the __main__ script
dic = {}  # maps each similar word to the regex pattern string used to match it
ufs = UnionFindSet()  # union-find used to decide whether two words are similar
simi = []  # similar pairs recorded by compare()
diff = []  # dissimilar remainders recorded by compare()

# Compare strings a and b from the front, recording results in simi / diff
def compare(a, b):
    """Compare a and b and record the outcome in the globals simi / diff.

    At each step the patterns in dic are tried against the start of both
    strings. If a word matching the start of a and a word matching the start
    of b are in the same set of ufs, both matched prefixes are removed and
    the remainders are compared recursively (the pair is appended to simi
    when the words differ). Otherwise one equal leading character is removed
    from each side; if the leading characters differ, "a b" is appended to
    diff and the comparison stops.
    """
    if len(a) == 0 and len(b) == 0:
        return

    # (word, matched length) for every word whose pattern matches the start
    # of b; computed once instead of once per candidate for a
    hits_b = []
    for word2 in dic:
        match2 = re.match(dic[word2], b)
        if match2:
            hits_b.append((word2, match2.end()))

    for word1 in dic:
        match1 = re.match(dic[word1], a)
        if not match1:
            continue
        len1 = match1.end()

        for word2, len2 in hits_b:
            # A pair that consumes nothing on both sides (e.g. "" against "")
            # would recurse on identical arguments
            if len1 == 0 and len2 == 0:
                continue

            if ufs.find(word1) == ufs.find(word2):
                if word1 != word2:
                    # Skip an empty word so the entry has no stray space
                    simi.append(" ".join(w for w in (word1, word2) if w))
                # Drop only the matched prefixes. Match.string is the whole
                # searched string, so its length must not be used here.
                compare(a[len1:], b[len2:])
                return

    # No similar pair at the front: fall back to comparing one character
    if len(a) != 0 and len(b) != 0 and a[0] == b[0]:
        compare(a[1:], b[1:])
    else:
        # Leading characters differ (or one side ran out): not similar
        diff.append(a + " " + b)

if __name__ == '__main__':
    a = input()  # first name
    b = input()  # second name

    # Every remaining line is one group of similar words; stop at end of input.
    # Only EOFError ends the loop, so genuine errors are no longer swallowed.
    while True:
        try:
            line = input()
        except EOFError:
            break

        words = line.split()

        for word in words:
            # "***" is the wildcard and becomes ".*"; every other part of the
            # word is escaped so regex metacharacters match literally
            dic[word] = ".*".join(re.escape(part) for part in word.split("***"))

        # All words on one line belong to the same similarity set
        ufs.init(words)
        for i in range(1, len(words)):
            ufs.union(words[0], words[i])

        # A single word on a line is similar to the empty string,
        # i.e. it may simply be absent from the other name
        if len(words) == 1:
            ufs.union(words[0], "")
            dic[""] = ""

    compare(a, b)

    if len(diff) == 0:
        # No dissimilar remainder: print True followed by the similar pairs
        print("True")
        for s in simi:
            print(s)
    else:
        # Otherwise print False followed by the dissimilar remainders
        print("False")
        for s in diff:
            print(s)

算法之旅

关注

3
点赞
踩
10

收藏

觉得还不错? 一键收藏
打赏
0
评论
华为OD机试笔试2024年C卷D卷 - 音乐小说内容重复识别 (java/c++/python)

实现一个简易的重复内容识别系统，通过给定的两个内容名称，和相似内容符号，判断两个内容是否相似；如果不相似，返回不相似的内容。输入：两条语句，给出是否相似，对于相似的语句，返回True和相似的字符对；对于不相似的内容，则返回第一个内容的不相似信息，方便后续补充。给出相似对 "(***)" 和 "" 时，"异世邪君(人气玄幻作家)" 和 "异世邪君" 认为是相似，此时相似符号返回 (***) 即可。初始化：给出两个字符串，一些相似字符对，如顿号和逗号相似，的和de相似，猪和潴，给出两个字符串的相似判断结果。
复制链接

扫一扫