文本相似度

说明:请在 VS2015 环境的 Release 模式下编译运行(Debug 模式下运行速度太慢)。

TextSimilarity.h

#pragma once
#include <unordered_map>
#include <string>
#include <unordered_set>
#include <cppjieba/jieba.hpp>

// Computes a similarity score for two text files: each file is segmented into
// words with cppjieba, word frequencies are counted (stop words removed), the
// most frequent words of both files form a shared vocabulary, and the two
// frequency vectors over that vocabulary are compared by cosine similarity.
class TextSimilarity
{
public:
	// word -> number of occurrences (hash map chosen for fast lookup)
	using wordFreq = std::unordered_map<std::string, int>;
	// set of distinct words
	using wordSet = std::unordered_set<std::string>;

	// dict: directory that holds the cppjieba dictionary files.
	TextSimilarity(std::string dict);

	// --- pipeline steps, in the order the test driver calls them ---

	// Loads one stop word per line from stopWordFile into the internal set.
	void getStopWordTable(const char* stopWordFile);
	// Segments the given file and returns its word-frequency table.
	wordFreq getWordFreq(const char* file);
	// Returns the (word, count) entries of wf ordered by descending count.
	std::vector<std::pair<std::string, int>> sortByValueReverse(wordFreq& wf);
	// Adds the leading words of wfvec (at most _maxWordNumber) to wset.
	void selectAimWords(std::vector<std::pair<std::string, int>>& wfvec, wordSet& wset);
	// Builds the frequency vector of wf over the words in wset (0 if absent).
	std::vector<double> getOneHot(wordSet& wset, wordFreq& wf);
	// Cosine similarity of two equally sized frequency vectors.
	double cosine(std::vector<double> oneHot1, std::vector<double> oneHot2);

	// --- encoding helpers (Win32 code-page conversions) ---

	std::string UTF8ToGBK(std::string str);
	std::string GBKToUTF8(std::string str);

private:
	// NOTE: keep this declaration order. The constructor builds the *_PATH
	// members from the dictionary directory and then passes them to _jieba;
	// members are initialized in declaration order, so the paths must be
	// declared before _jieba.
	std::string DICT;
	std::string DICT_PATH;
	std::string HMM_PATH;
	std::string USER_DICT_PATH;
	std::string IDF_PATH;
	std::string STOP_WORD_PATH;
	cppjieba::Jieba _jieba;

	// Words ignored when counting frequencies.
	wordSet _stopWordSet;
	// Upper bound on how many top words selectAimWords takes per document.
	int _maxWordNumber;
};

TextSimilarity.cpp

#define _CRT_SECURE_NO_WARNINGS 1
#include "TextSimilarity.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <fstream>
#include <iostream>

#include "cppjieba/Jieba.hpp"
#include <windows.h>
using namespace std;
//利用jieba分词实现文本相似度功能
//1.对两个文档进行分词
//2.统计文档中的词频
//3.将两个文本中的所有有效词全部编码,排序,取出前n个关键词
//4.按照码值构建词频向量如:
//文档1中的词频:[0:1,1:1,2:0,3:1]
//文档2中的词频:[0:2,1:1,2:1,3:1]
//文档1词频向量:[1,1,0,1]
//文档2词频向量:[2, 1, 1, 1]
//通过计算向量相似度(欧几里得距离,余弦相似度等等)来评估他们的相似度
//算法缺点:没有很好的解决同义词和多义词问题
// Constructs the analyzer from a dictionary directory.
//
// dict: directory containing the cppjieba resource files; the individual file
//       paths are derived from it by appending fixed file names.
//
// The path members are initialized before _jieba because _jieba is built from
// them (this relies on the member declaration order in the class). After all
// members are set up, the stop-word file is loaded into _stopWordSet.
//
// NOTE(review): the main dictionary is referenced as "Jieba.dict.utf8" with a
// capital J — presumably fine on a case-insensitive file system; confirm the
// actual file name if this is ever built elsewhere.
TextSimilarity::TextSimilarity(string dict)
	:DICT(dict)
	, DICT_PATH(dict + "/Jieba.dict.utf8")
	,HMM_PATH(dict + "/hmm_model.utf8")
    ,USER_DICT_PATH(dict + "/user.dict.utf8")
    ,IDF_PATH(dict + "/idf.utf8")
    ,STOP_WORD_PATH(dict + "/stop_words.utf8")
	// The segmenter receives all five resource paths built above.
	,_jieba(DICT_PATH,
		    HMM_PATH,
		    USER_DICT_PATH,
		    IDF_PATH,
		    STOP_WORD_PATH)
	// At most 10 top words per document are taken by selectAimWords.
	,_maxWordNumber(10)
{
	// Populate _stopWordSet so getWordFreq can filter stop words.
	getStopWordTable(STOP_WORD_PATH.c_str());
}

// Reads a text file line by line, segments every line into words and counts
// how often each non-stop word occurs.
//
// filename: path of the file to analyze. Each line is passed through
//           GBKToUTF8 before segmentation, so the file is expected to be in
//           the encoding that GBKToUTF8 accepts.
// Returns the word -> count table; an empty table if the file cannot be
// opened (a message is printed to stdout in that case).
TextSimilarity::wordFreq TextSimilarity::getWordFreq(const char* filename)
{
	std::ifstream fin(filename);
	if (!fin.is_open())
	{
		std::cout << "open file:" << filename << "failed" << std::endl;
		return wordFreq();
	}

	wordFreq wf;
	std::string line;
	std::vector<std::string> words;
	// Loop on the result of getline itself: testing eof() before reading
	// processes one extra (failed) read at the end of the file and never
	// terminates if the stream enters a non-eof error state.
	while (std::getline(fin, line))
	{
		// The segmenter is fed UTF-8 text.
		line = GBKToUTF8(line);

		// Start from an empty buffer for every line, as a fresh vector would.
		words.clear();
		// NOTE(review): the third argument presumably enables cppjieba's HMM
		// model for unknown words — confirm against the cppjieba API.
		_jieba.Cut(line, words, true);

		for (const auto& e : words)
		{
			// Skip stop words.
			if (_stopWordSet.count(e) > 0)
				continue;
			// operator[] value-initializes a missing entry to 0, so a single
			// increment covers both the first and later occurrences.
			++wf[e];
		}
	}
	return wf;
}

// Loads the stop-word list into _stopWordSet.
//
// stopWordFile: path of a text file with one stop word per line. Lines are
//               inserted as read, without any encoding conversion, so the
//               file must already use the encoding of the segmented words
//               (UTF-8).
// If the file cannot be opened a message is printed to stdout and the set is
// left unchanged.
void TextSimilarity::getStopWordTable(const char* stopWordFile)
{
	std::ifstream fin(stopWordFile);
	if (!fin.is_open())
	{
		std::cout << "open file:" << stopWordFile << "failed" << std::endl;
		return;
	}

	std::string line;
	// Loop on getline's result: checking eof() before the read would insert a
	// spurious empty string after the last line and could spin forever on a
	// stream error.
	while (std::getline(fin, line))
	{
		_stopWordSet.insert(line);
	}
	// fin is closed by its destructor.
}
// Ordering predicate for sorting (word, count) pairs by descending count.
// Returns true when lp has a strictly larger count than rp.
// Takes the pairs by const reference so that no std::string is copied on
// each comparison made by the sort.
bool cmpReverse(const std::pair<std::string, int>& lp, const std::pair<std::string, int>& rp)
{
	return lp.second > rp.second;
}
vector<std::pair<std::string, int>> TextSimilarity::sortByValueReverse(TextSimilarity::wordFreq& wf)
{
	vector<pair<string, int>> wfvector(wf.begin(), wf.end());//因为sort函数只能给有顺序的数据结构排序所以用vector
	sort(wfvector.begin(), wfvector.end(), cmpReverse);//第三个参数是函数指针,只写函数名即可
	return wfvector;
}
// Adds the leading words of a frequency-sorted list to a word set.
//
// wfvec: (word, count) pairs, expected to be sorted by descending count.
// wset:  in/out set that receives the words; calling this once per document
//        accumulates the union of the documents' top words.
// At most _maxWordNumber words are taken, fewer if wfvec is shorter.
void TextSimilarity::selectAimWords(std::vector<std::pair<std::string, int>>& wfvec, wordSet& wset)
{
	const int available = static_cast<int>(wfvec.size());
	// Plain conditional instead of std::min: <windows.h> is included in this
	// translation unit and defines a min macro.
	int remaining = (_maxWordNumber < available) ? _maxWordNumber : available;

	// remaining never exceeds the vector length, so the iterator stays valid.
	for (auto it = wfvec.begin(); remaining > 0; ++it, --remaining)
	{
		wset.insert(it->first);
	}
}

vector<double> TextSimilarity::getOneHot(TextSimilarity::wordSet& wset, TextSimilarity::wordFreq& wf)
{
	vector<double> oneHot;
	for (const auto& e : wset)
	{
		if (wf.count(e))
			oneHot.push_back(wf[e]);//若词频存在则把该次品放入词频向量中
		else
			oneHot.push_back(0);//否则将0放入词频向量中

	}
	return oneHot;
}
// Cosine similarity of two frequency vectors:
//     dot(a, b) / (|a| * |b|)
//
// oneHot1, oneHot2: vectors of equal length.
// Returns the cosine of the angle between the vectors. Returns 0.0 when
// either vector has zero length (all components 0 or empty) instead of
// dividing by zero, and 0.0 when the sizes differ.
double TextSimilarity::cosine(std::vector<double> oneHot1, std::vector<double> oneHot2)
{
	// A size mismatch is a caller bug; flag it in debug builds.
	assert(oneHot1.size() == oneHot2.size());
	// assert is compiled out under NDEBUG, so also guard explicitly to avoid
	// reading past the end of the shorter vector in release builds.
	if (oneHot1.size() != oneHot2.size())
		return 0.0;

	double products = 0.0;
	double squaredNorm1 = 0.0;
	double squaredNorm2 = 0.0;
	for (size_t i = 0; i < oneHot1.size(); ++i)
	{
		products += oneHot1[i] * oneHot2[i];
		squaredNorm1 += oneHot1[i] * oneHot1[i];
		squaredNorm2 += oneHot2[i] * oneHot2[i];
	}

	// A zero vector has no direction; report "no similarity" rather than NaN.
	if (squaredNorm1 == 0.0 || squaredNorm2 == 0.0)
		return 0.0;

	return products / (std::sqrt(squaredNorm1) * std::sqrt(squaredNorm2));
}
string TextSimilarity::GBKToUTF8(const string str)
{
	int len = MultiByteToWideChar(CP_ACP, 0, str.c_str(), -1, NULL, 0);
	//参数1:转换为UTF16的字符编码格式
	//参数2:转换类型标记,对于UTF8或者GBK而言,此值要设为0或者MB_ERR_INVALID_CHARS
	//参数3:要转换的字符串指针
	//参数4.要转换的字节大小,如果设为-1,则处理整个字符串,包括结束字符,返回值也包括
	//参数5.保存转换之后的字符串buffer(URF-16)
	//参数6:参数五的buffer大小。如果此值为0,函数返回buffer所要求的大小,包括结束字符
	//返回值:写入到参数五的buffer字符数量
	wchar_t* wstr = new wchar_t[len];
	MultiByteToWideChar(CP_ACP, 0, str.c_str(), -1, wstr, len);

	len = WideCharToMultiByte(CP_UTF8, 0, wstr, -1, NULL, 0, NULL, NULL);
	//UTF16转换为UTF8
	//参数7、8:默认检查,一般设为NULL
	char* utf8char = new char[len];
	WideCharToMultiByte(CP_UTF8, 0, wstr, -1, utf8char, len, NULL, NULL);
	string temp = utf8char;
	if (wstr)
	{
		delete[] wstr;
		wstr = NULL;
	}

	if (utf8char)
	{
		delete[] utf8char;
		utf8char = NULL;
	}

	return temp;
}
string TextSimilarity::UTF8ToGBK(const string str)
{
	int len = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), -1, NULL, 0);
	wchar_t* wstr = new wchar_t[len];
	MultiByteToWideChar(CP_UTF8, 0, str.c_str(), -1, wstr, len);

	len = WideCharToMultiByte(CP_ACP, 0, wstr, -1, NULL, 0, NULL, NULL);
	char* gbkchar = new char[len];
	WideCharToMultiByte(CP_ACP, 0, wstr, -1, gbkchar, len, NULL, NULL);
	string temp = gbkchar;
	if (wstr)
	{
		delete[] wstr;
		wstr = NULL;
	}

	if (gbkchar)
	{
		delete[] gbkchar;
		gbkchar = NULL;
	}

	return temp;
}

test.cpp

#define _CRT_SECURE_NO_WARNINGS 1
#include "TextSimilarity.h"
#include <iostream>
using namespace std;
void testTextSimilarity()
{
	TextSimilarity ts("dict");
	TextSimilarity::wordFreq wf = ts.getWordFreq("test.txt");
	TextSimilarity::wordFreq wf2 = ts.getWordFreq("test2.txt");
	vector<pair<string, int>> wfvec = ts.sortByValueReverse(wf);
	vector<pair<string, int>> wfvec2 = ts.sortByValueReverse(wf2);
	 
	cout << "wfvec:" << endl;
	for (int i = 0; i < 10; i++)
	{
	    //将字符编码格式从UTF8转到GBK进行打印
		cout << ts.UTF8ToGBK(wfvec[i].first) << ":" << wfvec[i].second << " ";
	}
	cout << endl;
	cout << "wfvec2:" << endl;
	for (int i = 0; i < 10; i++)
	{
		cout << ts.UTF8ToGBK(wfvec2[i].first) << ":" << wfvec2[i].second << " ";
	}
	cout << endl;
	TextSimilarity::wordSet wset;
    ts.selectAimWords(wfvec, wset);
    ts.selectAimWords(wfvec2, wset);
	cout << "wset" << endl;
	for (const auto& e : wset)
	{
		cout << ts.UTF8ToGBK(e) << " ";
	}
	cout << endl;
	vector<double> oneHot = ts.getOneHot(wset, wf);
	vector<double> oneHot2 = ts.getOneHot(wset, wf2);
	cout << "oneHot:" << endl;
	for (const auto& v : oneHot)
	{
		cout << v << " ";
	}
	cout << endl;
	cout << "oneHot2:" << endl;
	for (const auto& v : oneHot2)
	{
		cout << v << " ";
	}
	cout << endl;
	double db = 0;
	db = ts.cosine(oneHot, oneHot2);
	cout << "文档相似度为:"<<db << endl;

}
	


// Program entry point: runs the text-similarity demo and exits with status 0
// (main returns 0 implicitly when control reaches its end).
int main()
{
	testTextSimilarity();
}

测试结果

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值