一个需要引用一个Unihan表，实现unicode转拼音的c++代码

ASDWYang

已于 2024-08-04 21:11:26 修改

阅读量270

点赞数 6

文章标签： c++ 算法开发语言

于 2024-08-04 13:47:55 首次发布

本文链接：https://blog.csdn.net/m0_47489229/article/details/140905275

版权

需要实现一个汉字转拼音的小功能，但是别人封装好的库大多是 GPL 协议的，使用了就得开源自己的代码；另外一些 MIT 协议的库写得很差，实时性不好。因此，自己写了一份 unicode 转拼音的代码：Unihan 表本身是持续更新的，只要替换数据文件就可以随时打补丁，比引用别人的库要好很多。缺点：这份代码是逐行读取 .txt 文件来查询的，性能比较差，查询耗时比较长，但是生僻字和多音字都可以查到。后面需要把 .txt 文件里的内容加载到代码里面，并把逻辑重新集成封装成类来提高性能。屎山版测试代码如下：


#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <iostream>
#include <thread>  // 引入线程库
#include <chrono>  // 引入时间库
#include <iostream>
#include <string>
#include <unordered_map>
#include <iostream>
#include <string>
#include <unordered_map>

// Replaces every tone-marked pinyin vowel in `str` with its unmarked form and
// appends the collected tone digits to the end of the string, in the order the
// marked vowels were encountered. Example: "lǜ" becomes "lü4".
//
// The string is modified in place. Characters that are not in the table are
// left untouched. Matching is done on raw bytes, so `str` must use the same
// encoding as the string literals in this source file.
void replaceTonedCharacters(std::string &str) {
	// One row per tone-marked vowel: the marked form, its plain replacement,
	// and the tone digit to record.
	struct ToneEntry {
		std::string marked;
		std::string plain;
		char digit;
	};

	// Built once on first use instead of on every call; the table is only
	// ever scanned linearly, so a plain array is sufficient.
	static const ToneEntry kToneTable[] = {
		{ "ā", "a", '1' }, { "á", "a", '2' }, { "ǎ", "a", '3' }, { "à", "a", '4' },
		{ "ō", "o", '1' }, { "ó", "o", '2' }, { "ǒ", "o", '3' }, { "ò", "o", '4' },
		{ "ē", "e", '1' }, { "é", "e", '2' }, { "ě", "e", '3' }, { "è", "e", '4' },
		{ "ī", "i", '1' }, { "í", "i", '2' }, { "ǐ", "i", '3' }, { "ì", "i", '4' },
		{ "ū", "u", '1' }, { "ú", "u", '2' }, { "ǔ", "u", '3' }, { "ù", "u", '4' },
		{ "ǖ", "ü", '1' }, { "ǘ", "ü", '2' }, { "ǚ", "ü", '3' }, { "ǜ", "ü", '4' }
	};

	std::string tones; // tone digits collected so far
	size_t i = 0;

	while (i < str.size()) {
		bool found = false;
		for (const ToneEntry &entry : kToneTable) {
			// compare() clamps the length to the end of the string, so a
			// marked vowel that would run past the end simply fails to match.
			if (str.compare(i, entry.marked.length(), entry.marked) == 0) {
				str.replace(i, entry.marked.length(), entry.plain);
				tones += entry.digit;
				i += entry.plain.length(); // continue after the replacement
				found = true;
				break;
			}
		}
		if (!found) {
			++i; // nothing matched at this byte, advance by one
		}
	}

	// The tone digits go after the whole string, not after each syllable.
	str += tones;
}

// Debug driver: asks for a code point such as "U+3433", scans the Unihan
// readings file line by line for that code point's kHanyuPinyin record, and
// prints the raw value, the part after the first ':' and the tone-converted
// form of that part.
//
// Returns 1 if the data file cannot be opened, 0 otherwise.
int main() {
	// Single definition of the file name so the error message below always
	// reports the file that was actually opened.
	const std::string dataFile = "Unihan_Readings123.txt";

	std::ifstream infile(dataFile);
	if (!infile) {
		std::cerr << "无法打开文件 " << dataFile << "！" << std::endl;
		return 1;
	}

	const std::string inputKey = "kHanyuPinyin";
	std::string line, inputUnicode;
	std::cout << "请输入Unicode码（例如：U+3433）: ";
	std::cin >> inputUnicode;

	bool found = false;

	// Scan the file one record per line. Lines are no longer echoed to the
	// console: printing (and flushing) every line of the data file dominated
	// the lookup time.
	while (std::getline(infile, line)) {
		std::istringstream iss(line);
		std::string unicode, key, value;

		// The first two whitespace-separated columns are the code point and
		// the field name; lines without both (blank lines etc.) are skipped.
		if (!(iss >> unicode >> key)) {
			continue;
		}
		if (unicode != inputUnicode || key != inputKey) {
			continue;
		}

		found = true;

		// Everything left on the line (including the leading separator) is
		// the field value.
		std::getline(iss, value);

		std::cout << (value) << std::endl;
		std::this_thread::sleep_for(std::chrono::seconds(1));

		// Only the text after the first ':' is treated as pinyin.
		const std::size_t pos = value.find(':');
		if (pos != std::string::npos) {
			std::string pinyin = value.substr(pos + 1);

			std::cout << (pinyin) << std::endl;
			replaceTonedCharacters(pinyin);
			std::cout << pinyin << std::endl;
		}
		break; // first matching record wins
	}

	if (!found) {
		std::cout << "未找到对应的Unicode码和字符1。" << std::endl;
	}

	infile.close();

	std::cout << "按任意键继续..." << std::endl;
	std::cin.ignore();  // drop the newline left behind by the earlier >>
	std::cin.get();     // wait for the user
	return 0;
}


#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <iostream>
#include <thread>  // 引入线程库
#include <chrono>  // 引入时间库
#include <iostream>
#include <string>
#include <unordered_map>
#include <iostream>
#include <string>
#include <unordered_map>

// Replaces every tone-marked pinyin vowel in `str` with its unmarked form and
// appends the collected tone digits to the end of the string, in the order the
// marked vowels were encountered. Example: "lǜ" becomes "lü4".
//
// The string is modified in place. Characters that are not in the table are
// left untouched. Matching is done on raw bytes, so `str` must use the same
// encoding as the string literals in this source file.
void replaceTonedCharacters(std::string &str) {
	// One row per tone-marked vowel: the marked form, its plain replacement,
	// and the tone digit to record.
	struct ToneEntry {
		std::string marked;
		std::string plain;
		char digit;
	};

	// Built once on first use instead of on every call; the table is only
	// ever scanned linearly, so a plain array is sufficient.
	static const ToneEntry kToneTable[] = {
		{ "ā", "a", '1' }, { "á", "a", '2' }, { "ǎ", "a", '3' }, { "à", "a", '4' },
		{ "ō", "o", '1' }, { "ó", "o", '2' }, { "ǒ", "o", '3' }, { "ò", "o", '4' },
		{ "ē", "e", '1' }, { "é", "e", '2' }, { "ě", "e", '3' }, { "è", "e", '4' },
		{ "ī", "i", '1' }, { "í", "i", '2' }, { "ǐ", "i", '3' }, { "ì", "i", '4' },
		{ "ū", "u", '1' }, { "ú", "u", '2' }, { "ǔ", "u", '3' }, { "ù", "u", '4' },
		{ "ǖ", "ü", '1' }, { "ǘ", "ü", '2' }, { "ǚ", "ü", '3' }, { "ǜ", "ü", '4' }
	};

	std::string tones; // tone digits collected so far
	size_t i = 0;

	while (i < str.size()) {
		bool found = false;
		for (const ToneEntry &entry : kToneTable) {
			// compare() clamps the length to the end of the string, so a
			// marked vowel that would run past the end simply fails to match.
			if (str.compare(i, entry.marked.length(), entry.marked) == 0) {
				str.replace(i, entry.marked.length(), entry.plain);
				tones += entry.digit;
				i += entry.plain.length(); // continue after the replacement
				found = true;
				break;
			}
		}
		if (!found) {
			++i; // nothing matched at this byte, advance by one
		}
	}

	// The tone digits go after the whole string, not after each syllable.
	str += tones;
}

// Prints every pinyin reading found in one field value, one per line, with
// tone marks converted to trailing digits by replaceTonedCharacters().
//
// `value` is the remainder of a data line after the field name. It is split
// into whitespace-separated entries; within an entry, anything up to and
// including the first ':' is discarded as a location prefix, and what remains
// is split on ',' into individual readings. Entries without ':' are treated
// as readings as they stand.
static void printReadings(const std::string &value) {
	std::istringstream entries(value);
	std::string entry;
	while (entries >> entry) {
		const std::size_t colon = entry.find(':');
		std::istringstream readings(
			colon == std::string::npos ? entry : entry.substr(colon + 1));

		std::string reading;
		while (std::getline(readings, reading, ',')) {
			if (reading.empty()) {
				continue; // stray comma, nothing to print
			}
			replaceTonedCharacters(reading);
			std::cout << reading << std::endl;
		}
	}
}

// Driver: asks for a code point such as "U+3433", scans the Unihan readings
// file line by line, and prints the readings from the first record for that
// code point whose field is one of the supported reading fields.
//
// Returns 1 if the data file cannot be opened, 0 otherwise.
int main() {
	// Single definition of the file name so the error message below always
	// reports the file that was actually opened.
	const std::string dataFile = "Unihan_Readings123.txt";

	std::ifstream infile(dataFile);
	if (!infile) {
		std::cerr << "无法打开文件 " << dataFile << "！" << std::endl;
		return 1;
	}

	// Fields that are accepted as a source of readings. Per the original
	// notes, kHanyuPinyin, kTGHZ2013 and kXHC1983 values contain a ':'
	// separator while kMandarin values do not; printReadings() copes with both.
	const std::string readingKeys[] = {
		"kHanyuPinyin", "kMandarin", "kTGHZ2013", "kXHC1983"
	};

	std::string line, inputUnicode;
	std::cout << "请输入Unicode码（例如：U+3433）: ";
	std::cin >> inputUnicode;

	bool found = false;

	while (std::getline(infile, line)) {
		std::istringstream iss(line);
		std::string unicode, key, value;

		// The first two whitespace-separated columns are the code point and
		// the field name; lines without both (blank lines etc.) are skipped.
		if (!(iss >> unicode >> key)) {
			continue;
		}
		if (unicode != inputUnicode) {
			continue;
		}

		bool wanted = false;
		for (const std::string &candidate : readingKeys) {
			if (key == candidate) {
				wanted = true;
				break;
			}
		}
		if (!wanted) {
			continue;
		}

		found = true;

		// Everything left on the line is the field value.
		std::getline(iss, value);
		printReadings(value);
		break; // first matching record in file order wins
	}

	if (!found) {
		std::cout << "未找到对应的Unicode码和字符1。" << std::endl;
	}

	infile.close();

	std::cout << "按任意键继续..." << std::endl;
	std::cin.ignore();  // drop the newline left behind by the earlier >>
	std::cin.get();     // wait for the user
	return 0;
}

踩坑：C++ 代码读取 UTF-8 文件时总是出问题，会出现乱码。尝试了很多方法，比如 u8 前缀、wchar_t、wstring 类型等，都不可以，问题很大。最后需要将 UTF-8 文件转换为 ANSI 编码：.txt 文件在“另存为”的时候有个编码选项，可以直接进行转换。光是这部分转换就浪费了半天。
如果不转换的话，带有音调的字母完全识别不出来，key-value 的映射方式也就无法使用。