html2csv:将简单的网页表格转换为 CSV 文件(仅支持单个 HTML table)

void GetTd(const std::string &trString, vector<std::string> &tdv) {

    if (trString.empty()) {
        return;
    }
    HTML::ParserDom parser;
    tree<HTML::Node> dom = parser.parseTree(trString.data());
    tree<HTML::Node>::iterator it;
    tree<HTML::Node>::iterator end;
    //输出所有的文本节点
    it = dom.begin();
    end = dom.end();
    for (; it != end; ++it) {
        if (stricmp(it->tagName().c_str(), "td") == 0) {
            int offset1 = it->offset() + it->text().length();
            int length1 = it->length() - it->text().length() - it->closingText().length();
            string JStr1 = trString.substr(offset1, length1);
            tdv.push_back(JStr1);
        }
    }
}

bool __stdcall Html2CSV(const char* lpFileName, const char* startwith, char* outFilename) {
    try {
        ofstream outfile;
        ifstream ifs(lpFileName);
        char szLineHeader[4096] = { 0 };
        string sLine;
        bool isDeadLine = false;
        while (!ifs.eof())//
        {
            
            ifs.getline(szLineHeader, 4095);

            if (strstr(szLineHeader, startwith))
            {
                sLine.append(szLineHeader);
                break;
            }

            if (strstr(szLineHeader, "</table>") || 
                strstr(szLineHeader, "</html>") || 
                strstr(szLineHeader, "</body>")) {
                return false;
            }
        }

        remove(outFilename);
        outfile.open(outFilename, ios::out);


        while (!ifs.eof()) {
            ifs.getline(szLineHeader, 4095);
            if (strstr(szLineHeader, "</table>") ||
                strstr(szLineHeader, "</html>") ||
                strstr(szLineHeader, "</body>")) {
                break;
            }
            string s;
            if (sLine.size())
            {
                s.append(sLine);
                sLine.swap(string());
            }
            s.append(szLineHeader);
            while (!ifs.eof()) {
                char szLineHeader[4096] = { 0 };
                ifs.getline(szLineHeader, 4095);
                if (strstr(szLineHeader, "</tr>")) {
                    s.append(szLineHeader);
                    break;
                }
                if (strstr(szLineHeader, "</table>") ||
                    strstr(szLineHeader, "</html>") ||
                    strstr(szLineHeader, "</body>")) {
                    isDeadLine = true;
                    break;
                }
                s.append(szLineHeader);
            }
            if (isDeadLine)
            {
                break;
            }
            if (s.length())
            {
                vector<string> tr;
                HTML::ParserDom parser2;
                tree<HTML::Node>::iterator it2;
                tree<HTML::Node>::iterator end2;
                tree<HTML::Node> dom2;

                string s1 = Utf8ToGbk(s.data());
                string temp = s1.data();
                temp += "\r\n";
                //OutputDebugStringA(temp.data());
                dom2 = parser2.parseTree(s1);
                //输出所有的文本节点
                it2 = dom2.begin();
                end2 = dom2.end();
                for (it2 = dom2.begin(); it2 != end2; ++it2) {
                    if (stricmp(it2->tagName().c_str(), "tr") == 0) {
                        int offset1 = it2->offset();
                        int length1 = it2->length();
                        string JStr1 = s.substr(offset1, length1);
                        tr.push_back(JStr1);
                    }
                }
                for (int i = 0; i < tr.size(); i++) {
                    vector<std::string> tdv;
                    GetTd(tr[i], tdv);
                    for (int j = 0; j < tdv.size(); j++) {

                        if (strlen(tdv[j].data())) {
                            outfile << "\"" << (tdv[j].data()) << "\"";
                        }
                        else {
                            outfile << "\"\"";
                        }
                        if (j != 14)
                            outfile << ",";
                    }
                    outfile << "\n";
                }
            }
            else {
                break;
            }
        }
        ifs.close();
        outfile.close();
        return true;
    }
    catch (...) { return false; }
}

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值