#define _CRT_SECURE_NO_WARNINGS
#include <stdio.h>
#include<windows.h>
#include <fstream>
#include<Wininet.h>
#include<iostream>
#include<fstream>
#include<string.h>
#include <mysql.h>
#include <stdlib.h>
#include <locale.h>
#include <time.h>
#include "ParserDom.h"
#include "utils.h"
#include "mysql_connection.h"
#include <cppconn/driver.h>
#include <cppconn/resultset.h>
#include <cppconn/statement.h>
#include <cppconn/prepared_statement.h>
#pragma comment(lib,"htmlcxx.lib")
#pragma comment(lib,"WinInet.lib")
#pragma comment(lib,"libmysql.lib")
#pragma comment(lib, "mysqlcppconn.lib")
#define BUFFSIZE 1024*1024
using namespace htmlcxx;
using namespace std;
//Menu state flag: show_menu() picks its prompt from it; main() sets it to 2
//after a fetch and to 0 after the rows have been stored.
int flag = 1;
//Capacity of the ns[] array declared below.
const int N = 50;
//Connection handle that init_mysql() initialises and connects.
MYSQL mysql;
//Scratch values written by get_html_data(): the current link's href, the raw
//text of the current node, and that text after UTF8ToGBK().
string url = "";
string title = "";
string title2 = "";
//Index of the next ns[] slot written by get_html_data(); main() uses it as the
//upper bound of its insert loop.
int num = 0;
//SQL statement text; init_mysql() builds the CREATE TABLE statement in it.
string sqlstr;
// One scraped headline record, as filled in by get_html_data() and read by
// insert_mysql().
struct News
{
    int         new_id;  // slot number assigned while scraping
    std::string url;     // value of the link's href attribute
    std::string title;   // link text after conversion by UTF8ToGBK()
    std::string time;    // timestamp string produced by get_current_time()
};
//Fixed-size array of N News records: get_html_data() writes ns[num] and
//insert_mysql(i) reads ns[i].
struct News ns[N];
// Print the menu prompt that matches the current value of the global `flag`:
// 1 -> ask whether to start fetching, 2 -> ask whether to store the result,
// anything else -> report that the data has been stored.
void show_menu()
{
    const char* prompt;
    switch (flag)
    {
    case 1:
        prompt = "是否开始抓取数据?请输入1;\n";
        break;
    case 2:
        prompt = "抓取成功!是否存入数据库?请输入2;\n";
        break;
    default:
        prompt = "存入成功!请查看数据库\n";
        break;
    }
    // The prompts contain no conversion specifiers, so "%s" emits them verbatim.
    printf("%s", prompt);
}
//网站数据转化生成html
int catch_to_html ()
{
HINTERNET hINet, hHttpFile;
char szSizeBuffer[32];
DWORD dwLengthSizeBuffer = sizeof(szSizeBuffer);
hINet = InternetOpen("IE6.0", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0); //InternetOpen初始化WININET.DLL
string url = "http://www.sina.com"; //抓新浪网
if (!hINet)
{
cout << "InternetOpen fail" << endl;
return 0;
}
hHttpFile = InternetOpenUrl(hINet, url.c_str(), NULL, 0, 0, 0); //这个函数连接到一个网络服务器上并且最被从服务器上读取数据
if (!hHttpFile)
{
cout << "error open url" << endl;
return 0;
}
BOOL bQuery = HttpQueryInfo(hHttpFile,
HTTP_QUERY_CONTENT_LENGTH,
szSizeBuffer,
&dwLengthSizeBuffer, NULL); //得到关于文件的信息
if (bQuery == false)
{
InternetCloseHandle(hINet);
cout << "error query info" << endl;
return 0;
}
int FileSize = atol(szSizeBuffer); //atol函数把字符串转换成长整型数
string revData;
revData.resize(FileSize);
DWORD dwBytesRead;
BOOL bRead = InternetReadFile(hHttpFile, &revData[0], FileSize, &dwBytesRead); //web浏览器将在InternetReadFile上循环 ,不停地从Internet上读入数据块。
if (!bRead)
{
cout << "error to read file" << endl;
return 0;
}
ofstream out_file("sina.html");
out_file << revData; //输出到文件
InternetCloseHandle(hHttpFile); //关闭句柄
InternetCloseHandle(hINet);
//cout << "抓取成功!/n" << endl;
return 0;
}
// Return the current system time as the text produced by ctime(), e.g.
// "Tue Sep  1 14:29:51 2015\n". The trailing newline that ctime() appends is
// part of the returned string.
string get_current_time()
{
    time_t now = time(NULL);
    return string(ctime(&now));
}
//sqlstr =
//"INSERT INTO `test`.`news` (`id`, `url`, `title`, `time`) ";
//sqlstr += "VALUES (0, '组织者', '方提出更换存放骨灰存放', '2015-09-01 14:29:51');";
//参数化查询 注:开始考虑connecter库来实现(未实现),用本地库bind来实现该功能
void insert_mysql(int i)
{
MYSQL* pConn;
pConn = mysql_init(NULL);
if (!mysql_real_connect(pConn, "127.0.0.1", "root", "12345678", "test", 3306, NULL, 0))
{
printf("数据库连接失败:%s", mysql_error(pConn));
return;
}
mysql_query(pConn, "set names gbk");
MYSQL_STMT *stmt;
MYSQL_BIND bind[4];
memset(bind, 0, sizeof(bind));//把is_null、length等字段默认值设置为NULL等默认值,否则执行会报错
stmt = mysql_stmt_init(pConn);
char* insertSQL = "insert into news(id, url, title, time) values(?,?,?,?)";
//char* insertSQL = "insert into test.news(id) values(?)";
if (mysql_stmt_prepare(stmt, insertSQL, strlen(insertSQL)))
{
fprintf(stderr, " mysql_stmt_prepare(), INSERT failed,%s\n", mysql_error(pConn));
return;
}
//cout << ns[1].new_id << ns[1].time << ns[1].url << ns[1].title << endl;
//
/*int a = 0;
bind[0].buffer_type = MYSQL_TYPE_LONG;
bind[0].buffer = &a;
bind[0].buffer_length = sizeof(a);
printf("11 \n %d",a);*/
//此处....
bind[1].buffer_type = MYSQL_TYPE_STRING;
bind[1].buffer = (char *)ns[i].url.c_str();
bind[1].buffer_length = strlen(ns[i].url.c_str()) + 1;
printf("\naa\n");
printf(ns[i].url.c_str());
bind[2].buffer_type = MYSQL_TYPE_STRING;
bind[2].buffer = (char *)ns[i].title.c_str();
bind[2].buffer_length = strlen(ns[i].title.c_str()) + 1;
printf("\nbb\n");
printf(ns[i].title.c_str());
bind[3].buffer_type = MYSQL_TYPE_STRING;
bind[3].buffer = (char *)ns[i].time.c_str();
bind[3].buffer_length = strlen(ns[i].time.c_str()) + 1;
printf("\ncc\n");
printf(ns[i].time.c_str());
/* Bind the buffers */
if (mysql_stmt_bind_param(stmt, bind))
{
fprintf(stderr, " mysql_stmt_bind_param() failed %s\n", mysql_stmt_error(stmt));
return;
}
/* Execute the INSERT statement - 1*/
if (mysql_stmt_execute(stmt))
{
fprintf(stderr, " mysql_stmt_execute(), failed %s\n", mysql_stmt_error(stmt));
return;
}
/* Close the statement */
mysql_stmt_close(stmt);
mysql_close(pConn);
printf("参数化执行SQL结束");
}
// Initialise the MySQL client library, connect the global `mysql` handle to the
// `test` schema on localhost and create the `news` table if it does not exist.
// The schema itself must already exist (create it from the command line first).
//
// Returns 0 on success and -1 on any failure. On success the global connection
// is left open; on failure after mysql_init() the handle is closed again.
int init_mysql()
{
    if (0 == mysql_library_init(0, NULL, NULL))
    {
        cout << "mysql_lib_init() succeed" << endl;
    }
    else
    {
        cout << "mysql_lib_init() failed" << endl;
        return -1;
    }

    // Initialise the connection structure.
    if (NULL != mysql_init(&mysql))
    {
        cout << "mysql_init() succeed" << endl;
    }
    else
    {
        cout << "mysql_init() failed" << endl;
        return -1;
    }

    // Connection options must be set before connecting. The character set is
    // set to gbk here; without it Chinese text cannot be handled or shown
    // correctly in cmd.
    if (0 == mysql_options(&mysql, MYSQL_SET_CHARSET_NAME, "gbk"))
    {
        cout << "mysql_options() succeed" << endl;
    }
    else
    {
        cout << "mysql_options() failed" << endl;
        mysql_close(&mysql);
        return -1;
    }

    // Connect to the database.
    if (NULL != mysql_real_connect(&mysql, "localhost", "root", "12345678", "test",
                                   3306, NULL, 0))
    {
        cout << "mysql_real_connect() succeed" << endl;
    }
    else
    {
        cout << "mysql_real_connect() failed" << endl;
        mysql_close(&mysql);
        return -1;
    }

    // Create the table.
    sqlstr = "CREATE TABLE IF NOT EXISTS news (";
    sqlstr += " id int(4) NOT NULL AUTO_INCREMENT,";
    sqlstr += " url varchar(255) NOT NULL,";
    sqlstr += " title varchar(255) NOT NULL,";
    sqlstr += " time varchar(255) NOT NULL,";
    sqlstr += " PRIMARY KEY(id)";
    sqlstr += " ) ";
    if (0 == mysql_query(&mysql, sqlstr.c_str()))
    {
        cout << "mysql_query() create table succeed" << endl;
    }
    else
    {
        cout << "mysql_query() create table failed" << endl;
        mysql_close(&mysql);
        return -1;
    }
    return 0;
}
// Convert a UTF-8 string to the system ANSI code page (CP_ACP) by going
// through UTF-16.
// NOTE(review): the result is GBK only when the active ANSI code page is 936;
// confirm that is the intended deployment.
//
// @param strUTF8  UTF-8 text; it is read as a C string, so anything after an
//                 embedded NUL is ignored.
// @return the converted text, or an empty string if any conversion step fails.
//
// The intermediate buffers are std::wstring / std::string, so they are released
// on every path (including exceptions) and no unsigned short / wchar_t pointer
// cast is needed.
string UTF8ToGBK(const std::string& strUTF8)
{
    // With cbMultiByte == -1 the reported length includes the terminating NUL.
    const int wideLen = MultiByteToWideChar(CP_UTF8, 0, strUTF8.c_str(), -1, NULL, 0);
    if (wideLen <= 0)
    {
        return std::string();
    }
    std::wstring wide(static_cast<size_t>(wideLen), L'\0');
    if (MultiByteToWideChar(CP_UTF8, 0, strUTF8.c_str(), -1, &wide[0], wideLen) <= 0)
    {
        return std::string();
    }

    const int ansiLen = WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1, NULL, 0, NULL, NULL);
    if (ansiLen <= 0)
    {
        return std::string();
    }
    std::string ansi(static_cast<size_t>(ansiLen), '\0');
    if (WideCharToMultiByte(CP_ACP, 0, wide.c_str(), -1, &ansi[0], ansiLen, NULL, NULL) <= 0)
    {
        return std::string();
    }

    // Rebuild from the C string so the terminator written by the API is not
    // part of the result.
    return std::string(ansi.c_str());
}
//解析HTML数据
int get_html_data()
{
size_t offset = 0, length = 0;
fstream htmlFileStream;
htmlFileStream.open("D:\\Program Files\\Vs\\Projects\\catch\\catch\\sina.html", ios::in); //打开指定HTML文件
istreambuf_iterator<char> fileBeg(htmlFileStream), fileEnd;
string html(fileBeg, fileEnd);
htmlFileStream.close();
HTML::ParserDom parser;
setlocale(LC_ALL,"");//这句很关键.OCP、.ACP、和环境代码页都受控制面板中“区域与语言选项”的设置影响。默认装完简体中文版 Windows 后,活动的 ANSI 代码页为:936(即 GBK),可用 chcp 控制台程序查看活动代码页。
tree<HTML::Node> dom = parser.parseTree(html);
tree<HTML::Node>::iterator it = dom.begin();
tree<HTML::Node>::iterator end = dom.end();
//遍历HTML文档 DOM解析
for (; it != end; ++it)
{
if (it->tagName() == "ul")
{
it->parseAttributes();// 附上节点属性
// 获取class 的属性first 如果不存在为false
if (it->attribute("class").first)
{
if (it->attribute("class").second == "list-a news_top")
{
tree<HTML::Node>::iterator it0 = dom.begin(it);
tree<HTML::Node>::iterator end0 = dom.end(it);
for (;it0 != end0;++it0)
{
if (it0->tagName() == "li")
{
tree<HTML::Node>::iterator it1 = dom.begin(it0);
tree<HTML::Node>::iterator end1 = dom.end(it0);
for (;it1 != end1;++it1)
{
it1->parseAttributes();
//std::cout << it1->text() << std::endl;
if (it1->tagName() == "a") //查找链接<a href="http://">
{
it1->parseAttributes();
//指向内部属性href
if (it1->attribute("href").first)
{
url = it1->attribute("href").second;
//cout << url << endl;
}
// 再次获取子标签
tree<HTML::Node>::iterator it2 = dom.begin(it1);
tree<HTML::Node>::iterator end2 = dom.end(it1);
for (;it2 != end2;++it2)
{
if ((!it2->isTag()) && (!it2->isComment()))
{
// title = it.node->first_child->data;//提取xxx.html
//title = it->text(); //<a target="_blank" href="http://games.sina.com.cn/o/kb/12392.shtml" suda-uatrack="key=index_new_menu&value=sina_apps_list_click">
title = it2->text();
title2 = UTF8ToGBK(title.c_str());
//cout << title2 << endl;
//存入结构体
string time = get_current_time();
ns[num].new_id = num;
ns[num].url = url;
ns[num].title = title2;
ns[num].time = time;
cout << ns[num].new_id << ns[num].time << ns[num].url << ns[num].title <<endl;
}
num++;
}
}
}
}
}
}
}
}
}
system("pause");
return 0;
}
// Entry point: a small console menu loop.
//   1 -> fetch the page (catch_to_html) and advance the menu to step 2
//   2 -> parse the saved page, initialise the database and insert every
//        collected record, then switch the menu to its final message
// Any other input just shows the menu again. The loop ends, returning 0, when
// standard input reaches end-of-file.
//
// Changes from the earlier version:
//  - the scanf result is checked, so EOF or non-numeric input can no longer
//    spin the loop forever;
//  - the rest of the input line is discarded with getchar() instead of
//    fflush(stdin), which is undefined for input streams;
//  - records are inserted only if parsing and init_mysql() succeeded;
//  - the "storing" message is printed before the inserts rather than after.
int main(int argc, char *argv[])
{
    while (1)
    {
        int cmd = 0;
        show_menu();

        const int fields = scanf("%d", &cmd);
        if (fields == EOF)
        {
            return 0; // no more input
        }
        // Drop whatever is left on the line so it is not re-read next time.
        int ch;
        while ((ch = getchar()) != '\n' && ch != EOF)
        {
        }
        if (fields != 1)
        {
            continue; // not a number: show the menu again
        }

        switch (cmd)
        {
        case 1:
            printf("正在抓取....\n");
            catch_to_html();
            flag = 2;
            break;
        case 2:
            if (get_html_data() != 0)
            {
                break;
            }
            printf("获取数据成功....\n");
            if (init_mysql() != 0)
            {
                break; // init_mysql() has already reported the reason
            }
            printf("正在存入....\n");
            for (int i = 0; i < num; i++)
            {
                insert_mysql(i);
            }
            flag = 0;
            break;
        default:
            break;
        }
    }
}
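// C++ implementation: scrape part of the Sina news front page and store it in MySQL.
// (Web-page residue: "latest recommended article published 2023-11-20 21:09:04".)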