/**
 * Collects the inner content of every <td> element found in an HTML fragment.
 *
 * The fragment is parsed with HTML::ParserDom and every node of the resulting
 * tree is visited. For each node whose tag name equals "td" (case-insensitive)
 * the text between its opening and closing tag is appended to tdv.
 *
 * @param trString HTML fragment to scan (the caller passes one table row).
 *                 An empty string leaves tdv untouched.
 * @param tdv      Output vector; one entry is appended per <td> node, in
 *                 traversal order. Existing entries are kept.
 */
void GetTd(const std::string &trString, vector<std::string> &tdv) {
if (trString.empty()) {
return;
}
HTML::ParserDom parser;
tree<HTML::Node> dom = parser.parseTree(trString.data());
tree<HTML::Node>::iterator end = dom.end();
for (tree<HTML::Node>::iterator it = dom.begin(); it != end; ++it) {
if (stricmp(it->tagName().c_str(), "td") != 0) {
continue;
}
// NOTE(review): assumes htmlcxx semantics - text() is the opening tag,
// closingText() the closing tag, offset()/length() span the whole element.
const std::string::size_type openLen = it->text().length();
const std::string::size_type closeLen = it->closingText().length();
const std::string::size_type nodeLen = it->length();
const std::string::size_type innerStart = it->offset() + openLen;
std::string inner;
// Only slice when the reported range is consistent. An inconsistent range
// would otherwise underflow the length or make substr() throw
// std::out_of_range. An empty cell is still pushed so that column
// positions stay aligned.
if (innerStart <= trString.size() && nodeLen >= openLen + closeLen) {
inner = trString.substr(innerStart, nodeLen - openLen - closeLen);
}
tdv.push_back(inner);
}
}
bool __stdcall Html2CSV(const char* lpFileName, const char* startwith, char* outFilename) {
try {
ofstream outfile;
ifstream ifs(lpFileName);
char szLineHeader[4096] = { 0 };
string sLine;
bool isDeadLine = false;
while (!ifs.eof())//
{
ifs.getline(szLineHeader, 4095);
if (strstr(szLineHeader, startwith))
{
sLine.append(szLineHeader);
break;
}
if (strstr(szLineHeader, "</table>") ||
strstr(szLineHeader, "</html>") ||
strstr(szLineHeader, "</body>")) {
return false;
}
}
remove(outFilename);
outfile.open(outFilename, ios::out);
while (!ifs.eof()) {
ifs.getline(szLineHeader, 4095);
if (strstr(szLineHeader, "</table>") ||
strstr(szLineHeader, "</html>") ||
strstr(szLineHeader, "</body>")) {
break;
}
string s;
if (sLine.size())
{
s.append(sLine);
sLine.swap(string());
}
s.append(szLineHeader);
while (!ifs.eof()) {
char szLineHeader[4096] = { 0 };
ifs.getline(szLineHeader, 4095);
if (strstr(szLineHeader, "</tr>")) {
s.append(szLineHeader);
break;
}
if (strstr(szLineHeader, "</table>") ||
strstr(szLineHeader, "</html>") ||
strstr(szLineHeader, "</body>")) {
isDeadLine = true;
break;
}
s.append(szLineHeader);
}
if (isDeadLine)
{
break;
}
if (s.length())
{
vector<string> tr;
HTML::ParserDom parser2;
tree<HTML::Node>::iterator it2;
tree<HTML::Node>::iterator end2;
tree<HTML::Node> dom2;
string s1 = Utf8ToGbk(s.data());
string temp = s1.data();
temp += "\r\n";
//OutputDebugStringA(temp.data());
dom2 = parser2.parseTree(s1);
//输出所有的文本节点
it2 = dom2.begin();
end2 = dom2.end();
for (it2 = dom2.begin(); it2 != end2; ++it2) {
if (stricmp(it2->tagName().c_str(), "tr") == 0) {
int offset1 = it2->offset();
int length1 = it2->length();
string JStr1 = s.substr(offset1, length1);
tr.push_back(JStr1);
}
}
for (int i = 0; i < tr.size(); i++) {
vector<std::string> tdv;
GetTd(tr[i], tdv);
for (int j = 0; j < tdv.size(); j++) {
if (strlen(tdv[j].data())) {
outfile << "\"" << (tdv[j].data()) << "\"";
}
else {
outfile << "\"\"";
}
if (j != 14)
outfile << ",";
}
outfile << "\n";
}
}
else {
break;
}
}
ifs.close();
outfile.close();
return true;
}
catch (...) { return false; }
}