C++实现新浪新闻部分抓取存储mysql

#define _CRT_SECURE_NO_WARNINGS
#include <stdio.h>
#include<windows.h>
#include <fstream>  
#include<Wininet.h>
#include<iostream>
#include<fstream>
#include<string.h>
#include <mysql.h>
#include <stdlib.h>
#include <locale.h>
#include <time.h>
#include "ParserDom.h"
#include "utils.h"

#include "mysql_connection.h"

#include <cppconn/driver.h>
#include <cppconn/resultset.h>
#include <cppconn/statement.h>
#include <cppconn/prepared_statement.h>

#pragma comment(lib,"htmlcxx.lib")
#pragma comment(lib,"WinInet.lib")
#pragma comment(lib,"libmysql.lib")
#pragma comment(lib, "mysqlcppconn.lib")

#define BUFFSIZE 1024*1024

using namespace htmlcxx;
using namespace std;

// Menu state read by show_menu(): 1 = offer to fetch, 2 = offer to store,
// any other value = report that storing finished.
int flag = 1;
// Capacity of the ns[] news array declared below.
const int N = 50;

// MySQL handle that init_mysql() initialises and connects.
MYSQL mysql;
// Scratch values written by get_html_data() while it walks the DOM:
// url    - href of the most recent <a> element seen,
// title  - raw text of the link,
// title2 - title after conversion through UTF8ToGBK().
string url = "";
string title = "";
string title2 = "";
// Index used by get_html_data() when writing into ns[]; main() uses it as the
// upper bound of the insert loop.
int num = 0;

// SQL statement text; init_mysql() builds its CREATE TABLE statement here.
string sqlstr;

// One scraped news entry, filled in by get_html_data().
struct News
{
    int new_id;    // set to the entry's index (value of num) when it is stored
    string url;    // href taken from the enclosing <a> element
    string title;  // link text after UTF8ToGBK() conversion
    string time;   // timestamp string returned by get_current_time()
};

// Fixed-size storage for the scraped entries (N slots).
struct News ns[N];

// Print the console prompt that matches the current menu state (global flag):
// 1 -> ask whether to start fetching, 2 -> ask whether to store the data,
// anything else -> report that the data was stored.
void show_menu()
{
    const char* prompt;
    switch (flag)
    {
    case 1:
        prompt = "是否开始抓取数据?请输入1;\n";
        break;
    case 2:
        prompt = "抓取成功!是否存入数据库?请输入2;\n";
        break;
    default:
        prompt = "存入成功!请查看数据库\n";
        break;
    }
    printf("%s", prompt);
}

//网站数据转化生成html
int catch_to_html ()
{
    HINTERNET hINet, hHttpFile;
    char szSizeBuffer[32];
    DWORD dwLengthSizeBuffer = sizeof(szSizeBuffer);

    hINet = InternetOpen("IE6.0", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);    //InternetOpen初始化WININET.DLL
    string url = "http://www.sina.com";          //抓新浪网

    if (!hINet)
    {
        cout << "InternetOpen fail" << endl;
        return 0;
    }

    hHttpFile = InternetOpenUrl(hINet, url.c_str(), NULL, 0, 0, 0);  //这个函数连接到一个网络服务器上并且最被从服务器上读取数据
    if (!hHttpFile)
    {
        cout << "error open url" << endl;
        return 0;
    }
    BOOL bQuery = HttpQueryInfo(hHttpFile,
        HTTP_QUERY_CONTENT_LENGTH,
        szSizeBuffer,
        &dwLengthSizeBuffer, NULL); //得到关于文件的信息
    if (bQuery == false)
    {
        InternetCloseHandle(hINet);
        cout << "error query info" << endl;
        return 0;
    }
    int FileSize = atol(szSizeBuffer);    //atol函数把字符串转换成长整型数
    string revData;
    revData.resize(FileSize);
    DWORD dwBytesRead;
    BOOL bRead = InternetReadFile(hHttpFile, &revData[0], FileSize, &dwBytesRead);     //web浏览器将在InternetReadFile上循环 ,不停地从Internet上读入数据块。
    if (!bRead)
    {
        cout << "error to read file" << endl;
        return 0;
    }
    ofstream   out_file("sina.html");
    out_file << revData;              //输出到文件
    InternetCloseHandle(hHttpFile);   //关闭句柄
    InternetCloseHandle(hINet);
    //cout << "抓取成功!/n" << endl;
    return 0;
}

// Return the current local time formatted as "YYYY-MM-DD HH:MM:SS".
//
// The previous implementation returned the raw ctime() text, which ends in a
// newline that then got stored in the database. The layout used here is the
// one the earlier (commented-out) GetLocalTime attempt and the sample INSERT
// statement in this file were aiming for.
// Returns an empty string if the local time cannot be obtained.
std::string get_current_time()
{
    const time_t now = time(NULL);
    const struct tm* local = localtime(&now);
    if (local == NULL)
    {
        return std::string();
    }

    char text[32];
    const size_t written = strftime(text, sizeof(text), "%Y-%m-%d %H:%M:%S", local);
    return std::string(text, written);
}

    //sqlstr =
    //"INSERT INTO `test`.`news` (`id`, `url`, `title`, `time`) ";
    //sqlstr += "VALUES (0, '组织者', '方提出更换存放骨灰存放', '2015-09-01 14:29:51');";


//参数化查询 注:开始考虑connecter库来实现(未实现),用本地库bind来实现该功能
void insert_mysql(int i)
{
    MYSQL* pConn;
    pConn = mysql_init(NULL);
    if (!mysql_real_connect(pConn, "127.0.0.1", "root", "12345678", "test", 3306, NULL, 0))
    {
        printf("数据库连接失败:%s", mysql_error(pConn));
        return;
    }

    mysql_query(pConn, "set names gbk");
    MYSQL_STMT    *stmt;
    MYSQL_BIND    bind[4];
    memset(bind, 0, sizeof(bind));//把is_null、length等字段默认值设置为NULL等默认值,否则执行会报错

    stmt = mysql_stmt_init(pConn);
    char* insertSQL = "insert into news(id, url, title, time) values(?,?,?,?)";
    //char* insertSQL = "insert into test.news(id) values(?)";
    if (mysql_stmt_prepare(stmt, insertSQL, strlen(insertSQL)))
    {
        fprintf(stderr, " mysql_stmt_prepare(), INSERT failed,%s\n", mysql_error(pConn));
        return;
    }

    //cout << ns[1].new_id << ns[1].time << ns[1].url << ns[1].title << endl;
    //
    /*int a = 0;
    bind[0].buffer_type = MYSQL_TYPE_LONG;
    bind[0].buffer = &a;
    bind[0].buffer_length = sizeof(a);
    printf("11 \n %d",a);*/

    //此处....
    bind[1].buffer_type = MYSQL_TYPE_STRING;
    bind[1].buffer = (char *)ns[i].url.c_str();
    bind[1].buffer_length = strlen(ns[i].url.c_str()) + 1;
    printf("\naa\n");
    printf(ns[i].url.c_str());

    bind[2].buffer_type = MYSQL_TYPE_STRING;
    bind[2].buffer = (char *)ns[i].title.c_str();
    bind[2].buffer_length = strlen(ns[i].title.c_str()) + 1;
    printf("\nbb\n");
    printf(ns[i].title.c_str());

    bind[3].buffer_type = MYSQL_TYPE_STRING;
    bind[3].buffer = (char *)ns[i].time.c_str();
    bind[3].buffer_length = strlen(ns[i].time.c_str()) + 1;
    printf("\ncc\n");
    printf(ns[i].time.c_str());

    /* Bind the buffers */
    if (mysql_stmt_bind_param(stmt, bind))
    {
        fprintf(stderr, " mysql_stmt_bind_param() failed %s\n", mysql_stmt_error(stmt));
        return;
    }

    /* Execute the INSERT statement - 1*/
    if (mysql_stmt_execute(stmt))
    {
        fprintf(stderr, " mysql_stmt_execute(), failed %s\n", mysql_stmt_error(stmt));
        return;
    }
    /* Close the statement */
    mysql_stmt_close(stmt);
    mysql_close(pConn);
    printf("参数化执行SQL结束");
}

//初始化数据库 表格创建,需要cmd创建数据库名称
int init_mysql()
{
    if (0 == mysql_library_init(0, NULL, NULL))
    {
        cout << "mysql_lib_init() succeed" << endl;
    }
    else
    {
        cout << "mysql_lib_init() failed" << endl;
        return -1;
    }

    //初始化数据结构
    if (NULL != mysql_init(&mysql)) {
        cout << "mysql_init() succeed" << endl;
    }
    else {
        cout << "mysql_init() failed" << endl;
        return -1;
    }

    //在连接数据库之前,设置额外的连接选项
    //可以设置的选项很多,这里设置字符集,否则无法处理中文
    if (0 == mysql_options(&mysql, MYSQL_SET_CHARSET_NAME, "gbk")) {设置编码格式,否则在cmd下无法显示中文
        cout << "mysql_options() succeed" << endl;
    }
    else {
        cout << "mysql_options() failed" << endl;
        return -1;
    }
    ;
    //连接数据库
    if (NULL != mysql_real_connect(&mysql, "localhost", "root", "12345678", "test",
        3306, NULL, 0))
    {
        cout << "mysql_real_connect() succeed" << endl;
    }
    else {
        cout << "mysql_real_connect() failed" << endl;
        return -1;
    }

    //创建一个表
    sqlstr = "CREATE TABLE IF NOT EXISTS news (";
    sqlstr += " id int(4) NOT NULL AUTO_INCREMENT,";
    sqlstr += " url varchar(255) NOT NULL,";
    sqlstr += " title varchar(255) NOT NULL,";
    sqlstr += " time varchar(255) NOT NULL,";
    sqlstr += " PRIMARY KEY(id)";
    sqlstr += " ) ";

    if (0 == mysql_query(&mysql, sqlstr.c_str())) {
        //printf(sqlstr.c_str());
        cout << "mysql_query() create table succeed" << endl;
    }
    else {
        cout << "mysql_query() create table failed" << endl;
        mysql_close(&mysql);
        return -1;
    }
    return 0;
}

// Convert a UTF-8 encoded string to the system ANSI code page (CP_ACP) by
// going through UTF-16.
//
// strUTF8: NUL-terminated UTF-8 text; conversion stops at the first NUL.
// Returns the converted text, or an empty string if either conversion step
// reports a length of zero.
//
// NOTE(review): the name says GBK, but CP_ACP is only GBK (code page 936)
// when the system locale is Simplified Chinese - confirm whether 936 should
// be requested explicitly.
//
// Buffers are owned by std::wstring/std::string, so nothing leaks if an
// allocation throws, and no unsigned short* <-> LPWSTR cast is needed.
std::string UTF8ToGBK(const std::string& strUTF8)
{
    // First call with a NULL buffer returns the required length in wide
    // characters, including the terminator (because -1 is passed as length).
    const int wideLen = MultiByteToWideChar(CP_UTF8, 0, strUTF8.c_str(), -1, NULL, 0);
    if (wideLen <= 0)
    {
        return std::string();
    }
    std::wstring wide(static_cast<size_t>(wideLen), L'\0');
    MultiByteToWideChar(CP_UTF8, 0, strUTF8.c_str(), -1, &wide[0], wideLen);

    const int ansiLen = WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1, NULL, 0, NULL, NULL);
    if (ansiLen <= 0)
    {
        return std::string();
    }
    std::string ansi(static_cast<size_t>(ansiLen), '\0');
    WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1, &ansi[0], ansiLen, NULL, NULL);

    // The buffer includes the terminator; cut the result at the first NUL.
    return std::string(ansi.c_str());
}

//解析HTML数据
int get_html_data() 
{       

    size_t offset = 0, length = 0;
    fstream htmlFileStream;

    htmlFileStream.open("D:\\Program Files\\Vs\\Projects\\catch\\catch\\sina.html", ios::in); //打开指定HTML文件
    istreambuf_iterator<char> fileBeg(htmlFileStream), fileEnd;
    string html(fileBeg, fileEnd);
    htmlFileStream.close();

    HTML::ParserDom parser;
    setlocale(LC_ALL,"");//这句很关键.OCP、.ACP、和环境代码页都受控制面板中“区域与语言选项”的设置影响。默认装完简体中文版 Windows 后,活动的 ANSI 代码页为:936(即 GBK),可用 chcp 控制台程序查看活动代码页。
    tree<HTML::Node> dom = parser.parseTree(html);
    tree<HTML::Node>::iterator it = dom.begin();
    tree<HTML::Node>::iterator end = dom.end();
    //遍历HTML文档 DOM解析

    for (; it != end; ++it)
    {   
        if (it->tagName() == "ul")
        {
            it->parseAttributes();// 附上节点属性
            // 获取class 的属性first 如果不存在为false
            if (it->attribute("class").first) 
            {
                if (it->attribute("class").second == "list-a news_top")
                {   
                    tree<HTML::Node>::iterator it0 = dom.begin(it);
                    tree<HTML::Node>::iterator end0 = dom.end(it);

                    for (;it0 != end0;++it0)
                    {
                        if (it0->tagName() == "li")
                        {
                            tree<HTML::Node>::iterator it1 = dom.begin(it0);
                            tree<HTML::Node>::iterator end1 = dom.end(it0);
                            for (;it1 != end1;++it1)
                            {   
                                it1->parseAttributes();
                                //std::cout << it1->text() << std::endl;
                                if (it1->tagName() == "a")  //查找链接<a href="http://">
                                {
                                    it1->parseAttributes();
                                    //指向内部属性href
                                    if (it1->attribute("href").first)
                                    {
                                        url = it1->attribute("href").second;
                                        //cout << url << endl;
                                    }
                                    // 再次获取子标签
                                    tree<HTML::Node>::iterator it2 = dom.begin(it1);
                                    tree<HTML::Node>::iterator end2 = dom.end(it1);
                                    for (;it2 != end2;++it2)
                                    {
                                        if ((!it2->isTag()) && (!it2->isComment()))
                                        {
                                            // title = it.node->first_child->data;//提取xxx.html
                                            //title = it->text(); //<a target="_blank" href="http://games.sina.com.cn/o/kb/12392.shtml"  suda-uatrack="key=index_new_menu&value=sina_apps_list_click">
                                            title = it2->text();
                                            title2 = UTF8ToGBK(title.c_str());
                                            //cout << title2 << endl;

                                            //存入结构体
                                            string time = get_current_time();   
                                            ns[num].new_id = num;
                                            ns[num].url = url;
                                            ns[num].title = title2;
                                            ns[num].time = time;
                                            cout << ns[num].new_id << ns[num].time << ns[num].url << ns[num].title <<endl;
                                        }
                                        num++;  
                                    }       
                                }
                            }
                        }
                    }           
                }
            }       
        }
    }
    system("pause");
    return 0;
}

//主函数
int main(int argc, char *argv[]) 
{
    int cmd = 0;
    while (1)
    {
        cmd = 0;
        show_menu();
        scanf("%d", &cmd);
        fflush(stdin);
        switch (cmd)
        {
        case 1:
            printf("正在抓取....\n");
            catch_to_html();
            flag = 2;
            break;
        case 2:
            get_html_data();
            printf("获取数据成功....\n");
            init_mysql();
            for (int i = 0;i < num;i++) 
            {
                insert_mysql(i);
            }
            printf("正在存入....\n");
            flag = 0;
            break;
        default:
            break;
        }
    }

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值