如果我们要使用C++批量处理html文件(例如查找一些关键词), 很重要的一点是要对读入的html内容进行编码转换, 否则会出现中文乱码的现象。
对于文件批处理我们可以参考:点击打开链接
对于编码解码的问题我们可以参考:点击打开链接
因为这里的html文件是UTF-8编码的, 而Windows控制台默认使用系统的ANSI代码页(简体中文系统下通常是GBK), 所以读出来之后需要先转成宽字符(UTF-16), 再转成ANSI编码, 才能正确显示。
关于编码解码的函数如下:
/**
 * Converts a NUL-terminated UTF-8 string into a wide-character string using
 * the Win32 MultiByteToWideChar API with CP_UTF8.
 *
 * @param strUtf8 NUL-terminated UTF-8 input. A null pointer is treated as "".
 * @return A new[]-allocated, NUL-terminated wide string. The caller owns the
 *         buffer and must release it with delete[]. If the conversion fails,
 *         an empty string is returned (never a null pointer).
 */
wchar_t *UTF8ConverUncoide(const char *strUtf8)
{
    if (strUtf8 == NULL) {
        strUtf8 = "";
    }

    // With a source length of -1 the API also converts the terminator, so the
    // size it reports (in wide characters) already includes room for it.
    const int needed = MultiByteToWideChar(CP_UTF8, 0, strUtf8, -1, NULL, 0);
    const int len = needed > 0 ? needed : 1;

    // Value-initialisation zeroes the buffer regardless of sizeof(wchar_t).
    wchar_t *strUn = new wchar_t[len]();

    if (needed > 0 &&
        MultiByteToWideChar(CP_UTF8, 0, strUtf8, -1, strUn, len) == 0) {
        // The buffer contents are unspecified after a failure; hand back "".
        strUn[0] = L'\0';
    }
    return strUn;
}
/**
 * Converts a NUL-terminated wide-character string into a multibyte string in
 * the system ANSI code page (CP_ACP) using the Win32 WideCharToMultiByte API.
 *
 * @param strUn NUL-terminated wide string. A null pointer is treated as L"".
 * @return A new[]-allocated, NUL-terminated multibyte string. The caller owns
 *         the buffer and must release it with delete[]. If the conversion
 *         fails, an empty string is returned (never a null pointer).
 */
char *UncoideConverGB(const wchar_t *strUn)
{
    if (strUn == NULL) {
        strUn = L"";
    }

    // Ask the API for the exact byte count (terminator included) instead of
    // assuming two bytes per character, which does not hold for every code
    // page CP_ACP can resolve to.
    const int needed = WideCharToMultiByte(CP_ACP, 0, strUn, -1, NULL, 0, NULL, NULL);
    const int len = needed > 0 ? needed : 1;

    char *strGb = new char[len]();  // zero-filled

    if (needed > 0 &&
        WideCharToMultiByte(CP_ACP, 0, strUn, -1, strGb, len, NULL, NULL) == 0) {
        // The buffer contents are unspecified after a failure; hand back "".
        strGb[0] = '\0';
    }
    return strGb;
}
我们根据后缀名来选取html文件进行打开和读取:
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <cwchar>
#include <fstream>
#include <iostream>
#include <string>

#include <windows.h>
// Function-like max/min helpers. Each argument is evaluated twice, so do not
// pass expressions with side effects.
#define Max(a,b) ((a)>(b)?(a):(b))
#define Min(a,b) ((a)<(b)?(a):(b))
using namespace std;
typedef long long ll;
typedef long double ld;
const double eps = 1e-6;
const double PI = acos(-1.0);
const int mod = 1000000000 + 7;
const int INF = 0x3f3f3f3f;
// & 0x7FFFFFFF
const int seed = 131;
const ll INF64 = ll(1e18);
const int maxn = 1e6 + 10;
int n, m, k;
/**
 * Converts a NUL-terminated UTF-8 string into a wide-character string using
 * the Win32 MultiByteToWideChar API with CP_UTF8.
 *
 * @param strUtf8 NUL-terminated UTF-8 input. A null pointer is treated as "".
 * @return A new[]-allocated, NUL-terminated wide string. The caller owns the
 *         buffer and must release it with delete[]. If the conversion fails,
 *         an empty string is returned (never a null pointer).
 */
wchar_t *UTF8ConverUncoide(const char *strUtf8)
{
    if (strUtf8 == NULL) {
        strUtf8 = "";
    }

    // With a source length of -1 the API also converts the terminator, so the
    // size it reports (in wide characters) already includes room for it.
    const int needed = MultiByteToWideChar(CP_UTF8, 0, strUtf8, -1, NULL, 0);
    const int len = needed > 0 ? needed : 1;

    // Value-initialisation zeroes the buffer regardless of sizeof(wchar_t).
    wchar_t *strUn = new wchar_t[len]();

    if (needed > 0 &&
        MultiByteToWideChar(CP_UTF8, 0, strUtf8, -1, strUn, len) == 0) {
        // The buffer contents are unspecified after a failure; hand back "".
        strUn[0] = L'\0';
    }
    return strUn;
}
/**
 * Converts a NUL-terminated wide-character string into a multibyte string in
 * the system ANSI code page (CP_ACP) using the Win32 WideCharToMultiByte API.
 *
 * @param strUn NUL-terminated wide string. A null pointer is treated as L"".
 * @return A new[]-allocated, NUL-terminated multibyte string. The caller owns
 *         the buffer and must release it with delete[]. If the conversion
 *         fails, an empty string is returned (never a null pointer).
 */
char *UncoideConverGB(const wchar_t *strUn)
{
    if (strUn == NULL) {
        strUn = L"";
    }

    // Ask the API for the exact byte count (terminator included) instead of
    // assuming two bytes per character, which does not hold for every code
    // page CP_ACP can resolve to.
    const int needed = WideCharToMultiByte(CP_ACP, 0, strUn, -1, NULL, 0, NULL, NULL);
    const int len = needed > 0 ? needed : 1;

    char *strGb = new char[len]();  // zero-filled

    if (needed > 0 &&
        WideCharToMultiByte(CP_ACP, 0, strUn, -1, strGb, len, NULL, NULL) == 0) {
        // The buffer contents are unspecified after a failure; hand back "".
        strGb[0] = '\0';
    }
    return strGb;
}
/**
 * Writes a bare listing of the current directory to "file_list.txt", then for
 * every listed name ending in ".html" opens that file and prints it line by
 * line, converting each line from UTF-8 to the system ANSI code page first.
 *
 * @return 0 on completion, 1 if the listing file cannot be opened for reading.
 */
int main() {
    // Truncate any previous listing so the shell's ">>" starts from empty.
    {
        std::ofstream reset("file_list.txt", std::ios::out | std::ios::trunc);
    }
    // /a lists entries regardless of attributes, /b prints bare names only.
    std::system("dir /a /b >> file_list.txt");

    std::ifstream file_list("file_list.txt");
    if (!file_list) {
        std::cerr << "cannot open file_list.txt" << std::endl;
        return 1;
    }

    std::string str_line;
    // Looping on getline itself stops cleanly at end of file or on error,
    // and never processes a stale line.
    while (std::getline(file_list, str_line)) {
        std::cout << str_line << std::endl;

        const std::string::size_type dot = str_line.find_last_of('.');
        if (dot == std::string::npos) continue;

        const std::string str_last = str_line.substr(dot);
        std::cout << str_last << std::endl;
        if (str_last != ".html") continue;

        // A fresh stream per file avoids carrying over state from the
        // previously read file.
        std::ifstream code_file(str_line.c_str());
        std::cout << str_line << std::endl;
        if (!code_file) continue;  // unreadable entry: skip it

        std::string t_str;
        while (std::getline(code_file, t_str)) {
            // c_str() is NUL-terminated, which the converter relies on.
            wchar_t *wide = UTF8ConverUncoide(t_str.c_str());
            char *ans = UncoideConverGB(wide);
            std::cout << ans << std::endl;
            // Both converters return new[]-allocated buffers owned by us.
            delete[] ans;
            delete[] wide;
        }
        std::cout << std::endl;
    }
    return 0;
}