实时汇率转换小程序(C++ 爬虫)
利用 C++ 网络爬虫抓取网页上的实时汇率,并据此进行汇率换算。
界面部分使用 Qt 进行设计。
#define _SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS
#include <time.h>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <hash_set>
#include <iostream>
#include <queue>
#include <string>
#include <vector>
#include "winsock2.h"
#pragma comment(lib, "ws2_32.lib")
#pragma warning(disable : 4996)
using namespace std;
#define DEFAULT_PAGE_BUF_SIZE 1048576
// Work queue of page URLs: HTMLParse pushes every newly seen href value here
// and main pops them one by one to crawl them.
queue<string> hrefUrl;
// href values that have already been pushed onto hrefUrl (plus the start URL,
// inserted by main), so that each URL is queued at most once.
hash_set<string> visitedUrl;
// Image URLs that HTMLParse has already handed out for download.
hash_set<string> visitedImg;
// Not referenced anywhere else in the visible part of this file.
int depth = 0;
// Only referenced from commented-out logging in DownLoadImg.
int g_ImgCnt = 1;
// Splits a URL into its host name and its resource path.
//
// If the text "http://" occurs in `url`, parsing starts right after it;
// otherwise parsing starts at the beginning of the string. The host is
// everything up to the first '/', the resource is that '/' and what follows
// it, cut at the first whitespace character.
//
// Returns true and fills `host` / `resource` on success. Returns false and
// leaves both outputs untouched when the URL is longer than 2000 characters,
// contains no '/' after the scheme, or has an empty host.
bool ParseURL(const string &url,
    string &host, string &resource)
{
    static const char kScheme[] = "http://";
    const string::size_type kSchemeLen = sizeof(kScheme) - 1;
    const string::size_type kMaxUrlLen = 2000;

    if (url.size() > kMaxUrlLen)
    {
        return false;
    }

    const string::size_type schemePos = url.find(kScheme);
    const string::size_type hostBegin =
        (schemePos == string::npos) ? 0 : schemePos + kSchemeLen;

    const string::size_type slash = url.find('/', hostBegin);
    if (slash == string::npos)
        return false;
    // A URL such as "/relative/path" has no host to connect to.
    if (slash == hostBegin)
        return false;

    // Stop the resource at the first whitespace character so that stray
    // spaces or line breaks can never leak into the HTTP request line.
    string::size_type resourceEnd = url.find_first_of(" \t\n\v\f\r", slash);
    if (resourceEnd == string::npos)
        resourceEnd = url.size();

    // std::string grows as needed, so an arbitrarily long host name is safe.
    host.assign(url, hostBegin, slash - hostBegin);
    resource.assign(url, slash, resourceEnd - slash);
    return true;
}
//使用Get请求,得到响应
bool GetHttpResponse(const string &url,
char *&response, int &bytesRead)
{
string host, resource;
if (!ParseURL(url, host, resource))
{
//cout << "Can not parse the url" << endl;
return false;
}
//建立socket
struct hostent *hp = gethostbyname(host.c_str());
if (hp == NULL)
{
//cout << "Can not find host address" << endl;
return false;
}
SOCKET sock = socket(AF_INET, SOCK_STREAM,
IPPROTO_TCP);
if (sock == -1 || sock == -2)
{
//cout << "Can not create sock." << endl;
return false;
}
//建立服务器地址
SOCKADDR_IN sa;
sa.sin_family = AF_INET;
sa.sin_port = htons(80); //char addr[5];
//memcpy( addr, hp->h_addr, 4 );
//sa.sin_addr.s_addr = inet_addr(hp->h_addr);
memcpy(&sa.sin_addr, hp->h_addr, 4);
//建立连接
if (0 != connect(sock, (SOCKADDR*)&sa, sizeof(sa)))
{
//cout << "Can not connect: " << url << endl;
closesocket(sock);
return false;
};
//准备发送数据
string request = "GET " + resource
+ " HTTP/1.1\r\nHost:" + host
+ "\r\nConnection:Close\r\n\r\n";
//发送数据
if (SOCKET_ERROR == send(sock, request.c_str(),
request.size(), 0))
{
//cout << "send error" << endl;
closesocket(sock);
return false;
}
//接收数据
int m_nContentLength = DEFAULT_PAGE_BUF_SIZE;
char *pageBuf = (char *)malloc(m_nContentLength);
memset(pageBuf, 0, m_nContentLength);
bytesRead = 0;
int ret = 1;
//cout << "Read: ";
while (ret > 0)
{
ret = recv(sock, pageBuf + bytesRead,
m_nContentLength - bytesRead, 0);
if (ret > 0)
{
bytesRead += ret;
}
if (m_nContentLength - bytesRead<100)
{
//cout << "\nRealloc memorry" << endl;
m_nContentLength *= 2;
//重新分配内存
pageBuf = (char*)realloc(pageBuf,
m_nContentLength);
}
//cout << ret << " ";
}
//cout << endl;
pageBuf[bytesRead] = '\0';
response = pageBuf;
closesocket(sock);
return true;
//cout<< response <<endl;
}
// Scans the text of a page for links and image URLs.
//
// Every value of an href="..." attribute that is not yet in visitedUrl is
// recorded there, appended as one line to the file "url.txt" and pushed onto
// the hrefUrl queue. After the scan two newlines are appended to "url.txt".
//
// For every "<img " tag the value of lazy-src="..." is used when that
// attribute appears before the tag's closing '>', otherwise the value of the
// next src="..." that follows. Values not yet in visitedImg are recorded there
// and appended to `imgurls`.
//
// Empty attribute values are skipped. `host` is not used.
void HTMLParse(string &htmlResponse,
    vector<string> &imgurls,
    const string &host)
{
    const char *const page = htmlResponse.c_str();

    // ---- links ----
    const char *const hrefTag = "href=\"";
    const size_t hrefTagLen = strlen(hrefTag);
    ofstream ofile("url.txt", ios::app);
    const char *pos = strstr(page, hrefTag);
    while (pos)
    {
        pos += hrefTagLen;
        const char *closeQuote = strchr(pos, '"');
        if (!closeQuote)
        {
            // Unterminated attribute: nothing after this point can be a
            // complete value, so stop instead of scanning forever.
            break;
        }
        // Build the string straight from the pointer range; this also handles
        // an empty value, for which there is nothing to record.
        const string surl(pos, closeQuote);
        if (!surl.empty() &&
            visitedUrl.find(surl) == visitedUrl.end())
        {
            visitedUrl.insert(surl);
            ofile << surl << endl;
            hrefUrl.push(surl);
        }
        pos = strstr(closeQuote, hrefTag);
    }
    ofile << endl << endl;
    ofile.close();

    // ---- images ----
    const char *const imgTag = "<img ";
    const size_t imgTagLen = strlen(imgTag);
    const char *const srcAttr = "src=\"";
    const char *const lazyAttr = "lazy-src=\"";
    const char *tagPos = strstr(page, imgTag);
    while (tagPos)
    {
        tagPos += imgTagLen;
        const char *const tagEnd = strchr(tagPos, '>');
        const char *const lazy = strstr(tagPos, lazyAttr);

        const char *value = NULL;
        if (lazy && tagEnd && lazy < tagEnd)
        {
            // The lazy-loading attribute belongs to this tag: prefer it.
            value = lazy + strlen(lazyAttr);
        }
        else
        {
            const char *src = strstr(tagPos, srcAttr);
            if (!src)
            {
                // No src attribute here; move on to the next image tag.
                tagPos = strstr(tagPos, imgTag);
                continue;
            }
            value = src + strlen(srcAttr);
        }

        const char *closeQuote = strchr(value, '"');
        if (!closeQuote)
        {
            // Unterminated attribute: stop instead of scanning forever.
            break;
        }
        const string imgUrl(value, closeQuote);
        if (!imgUrl.empty() &&
            visitedImg.find(imgUrl) == visitedImg.end())
        {
            visitedImg.insert(imgUrl);
            imgurls.push_back(imgUrl);
        }
        tagPos = strstr(tagPos, imgTag);
    }
}
// Turns a URL into a file name: every character from the set
// \ / : * ? " < > | is dropped and ".txt" is appended to what remains.
string ToFileName(const string &url)
{
    const string forbidden("\\/:*?\"<>|");

    string kept;
    kept.reserve(url.size());
    for (const char ch : url)
    {
        if (forbidden.find(ch) == string::npos)
        {
            kept.push_back(ch);
        }
    }
    kept += ".txt";
    return kept;
}
// Downloads the images listed in `imgurls` into a folder below "./img/".
//
// The folder name is "./img/" followed by ToFileName(url). Only URLs whose
// text after the last '.' is exactly bmp, jpg, jpeg, gif or png are fetched.
// For each reply the bytes after the first blank line ("\r\n\r\n") are written
// to a file named after the part of the image URL that starts at its last '/'.
// Images that cannot be fetched, have no header terminator, have no '/' in
// their URL or cannot be written are skipped.
void DownLoadImg(vector<string> & imgurls,
    const string &url)
{
    const string foldname = "./img/" + ToFileName(url);
    // Best effort: the result is ignored on purpose (the call also fails when
    // the directory already exists).
    CreateDirectory(foldname.c_str(), NULL);

    static const char kHeaderEnd[] = "\r\n\r\n";

    for (size_t i = 0; i < imgurls.size(); i++)
    {
        const string &imgUrl = imgurls[i];

        // Filter by file extension.
        const string::size_type dot = imgUrl.find_last_of('.');
        if (dot == string::npos)
            continue;
        const string ext = imgUrl.substr(dot + 1);
        if (ext != "bmp" && ext != "jpg"
            && ext != "jpeg" && ext != "gif"
            && ext != "png")
            continue;

        char *image = NULL;
        int byteRead = 0;
        if (!GetHttpResponse(imgUrl, image, byteRead))
            continue;

        // From here on `image` is owned by this loop iteration and is freed
        // exactly once at the bottom, whatever happens in between.
        const char *const headerEnd = strstr(image, kHeaderEnd);
        const string::size_type slash = imgUrl.find_last_of('/');
        if (headerEnd != NULL && slash != string::npos)
        {
            const char *const body = headerEnd + strlen(kHeaderEnd);
            // The name keeps its leading '/', which separates it from the folder.
            ofstream ofile(foldname + imgUrl.substr(slash),
                ios::binary);
            if (ofile.is_open())
            {
                ofile.write(body, byteRead - (body - image));
                ofile.close();
            }
        }
        free(image);
    }
}
//广度遍历
void BFS(const string & url)
{
char * response;
int bytes;
// 获取网页的相应,放入response中。
if (!GetHttpResponse(url, response, bytes))
{
//cout << "The url is wrong! ignore." << endl;
return;
}
string httpResponse = response;
free(response);
string filename = ToFileName(url);
ofstream ofile("./html/" + filename);
if (ofile.is_open())
{
// 保存该网页的文本内容
ofile << httpResponse << endl;
ofile.close();
}
vector<string> imgurls;
//解析该网页的所有图片链接,放入imgurls里面
HTMLParse(httpResponse, imgurls, url);
//下载所有的图片资源
DownLoadImg(imgurls, url);
}
// Converts UTF-8 text to the system ANSI code page (CP_ACP) by going through
// a wide-character intermediate.
string UTF8_To_string(const std::string & str)
{
    // Ask how many wide characters the converted text needs.
    const int wideLen = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), -1, NULL, 0);

    // One extra zeroed element is required, otherwise stray characters show up
    // at the end of the result.
    std::vector<wchar_t> wide(wideLen + 1, L'\0');
    MultiByteToWideChar(CP_UTF8, 0, str.c_str(), static_cast<int>(str.length()),
        &wide[0], wideLen);

    // Same two-step pattern for the wide -> ANSI direction.
    const int ansiLen = WideCharToMultiByte(CP_ACP, 0, &wide[0], -1, NULL, 0, NULL, NULL);
    std::vector<char> ansi(ansiLen + 1, '\0');
    WideCharToMultiByte(CP_ACP, 0, &wide[0], wideLen, &ansi[0], ansiLen, NULL, NULL);

    // The buffer is zero-filled, so this stops at the end of the converted text.
    return std::string(&ansi[0]);
}
// One row of the exchange-rate table as filled in by Get_exchange.
struct exchange {
    string name;  // text extracted by gotname from the line preceding the first rate
    double ex1;   // spot-exchange buying rate
    double ex2;   // cash buying rate
    double ex3;   // spot-exchange selling rate
    double ex4;   // cash selling rate
    double ex5;   // Bank of China conversion rate
    string date;  // two gotstring fields joined by a single space
};

// Parsed table rows; Get_exchange fills them starting at index 0.
exchange p[28];
// Extracts the number stored in a table-cell line.
//
// The value is expected to start at character index 28 of `a` and to run up
// to the next '<' (or to the end of the line when there is none).
//
// Returns the parsed value at full double precision. Returns 0 when the line
// is too short to hold a value, when the cell is empty, or when the text does
// not start with a number.
double gotex(string a) {
    const string::size_type kValueOffset = 28;
    if (a.size() <= kValueOffset) {
        return 0;
    }

    string::size_type valueEnd = a.find('<', kValueOffset);
    if (valueEnd == string::npos) {
        valueEnd = a.size();
    }
    if (valueEnd == kValueOffset) {
        return 0;  // empty cell
    }

    const string field = a.substr(kValueOffset, valueEnd - kValueOffset);
    // strtod reports "no number here" by returning 0 instead of throwing.
    return strtod(field.c_str(), NULL);
}
// Extracts the text stored in a table-cell line.
//
// The text is expected to start at character index 28 of `a` and to run up to
// the next '<' (or to the end of the line when there is none).
//
// Returns an empty string when the line is too short to hold any text.
// NOTE(review): for an empty cell (a '<' directly at index 28) exactly one
// character is still returned, i.e. the '<' itself. This is kept from the
// previous implementation; confirm whether it is intended.
string gotstring(string a) {
    const string::size_type kValueOffset = 28;
    if (a.size() <= kValueOffset) {
        return string();
    }

    string::size_type valueEnd = a.find('<', kValueOffset);
    if (valueEnd == string::npos) {
        valueEnd = a.size();
    }

    string::size_type length = valueEnd - kValueOffset;
    if (length == 0) {
        length = 1;
    }
    return a.substr(kValueOffset, length);
}
// Extracts the text of the first element in a line of markup: everything
// between the first '>' and the next '<' after it.
//
// Returns an empty string when the line contains no '>'. When no '<' follows
// the '>', the rest of the line is returned.
string gotname(string a) {
    const string::size_type open = a.find('>');
    if (open == string::npos) {
        return string();
    }

    const string::size_type textBegin = open + 1;
    const string::size_type close = a.find('<', textBegin);
    if (close == string::npos) {
        return a.substr(textBegin);
    }
    return a.substr(textBegin, close - textBegin);
}
// Reads a saved page and fills the global table `p` with its exchange rates.
//
// Step 1 converts every line of `filename` with UTF8_To_string and writes the
// result to "data.txt". Step 2 reads "data.txt" back and walks a 12-step
// cycle over the lines recognised as table cells: steps 1-5 store the five
// rates (step 1 also takes the name from the previous long line), steps 6 and
// 7 build the date string and finish the row, steps 8-12 are skipped.
//
// Rows are stored from index 0 on every call, and parsing stops once `p` is
// full, so the table is never written out of bounds.
void Get_exchange(string filename) {
    // Step 1: change UTF-8 to ANSI.
    {
        ifstream fin(filename.c_str());
        ofstream fout("data.txt");
        string rawLine;
        while (getline(fin, rawLine)) {
            fout << UTF8_To_string(rawLine) << endl;
        }
    }   // both files are closed here, before data.txt is reopened

    // Step 2: pick the values out of the converted text.
    const size_t capacity = sizeof(p) / sizeof(p[0]);
    size_t row = 0;     // next row of p to fill
    int step = 1;       // position inside the 12-step cycle
    string line;
    string marker;      // characters 1..27 of the most recent long line
    string previous;    // most recent long line without its first character
    ifstream fin_data("data.txt");
    while (getline(fin_data, line)) {
        const bool isLongLine = line.length() >= 30;
        // The marker is only refreshed by long lines; a short line therefore
        // inherits the verdict of the last long one. The step counting below
        // appears to rely on this, so it is kept as is.
        if (isLongLine) { marker.assign(line, 1, 27); }

        // NOTE(review): `marker` is always 27 characters long when set, while
        // this literal is shorter, so as written the comparison cannot
        // succeed. The literal presumably lost its leading whitespace; confirm
        // against the real indentation of the page.
        if (marker == " <td>") {
            if (row >= capacity) {
                break;  // table is full
            }
            switch (step)
            {
            case 1: p[row].ex1 = gotex(line); p[row].name = gotname(previous); step++; break;
            case 2: p[row].ex2 = gotex(line); step++; break;
            case 3: p[row].ex3 = gotex(line); step++; break;
            case 4: p[row].ex4 = gotex(line); step++; break;
            case 5: p[row].ex5 = gotex(line); step++; break;
            case 6: p[row].date = gotstring(line); step++; break;
            case 7: p[row].date += " "; p[row].date += gotstring(line); step++; row++; break;
            case 8:
            case 9:
            case 10:
            case 11: step++; break;
            case 12: step = 1; break;
            default: break;
            }
        }
        if (isLongLine) previous.assign(line, 1, line.size());
    }
}
// Program entry point: crawls the Bank of China exchange-rate page and every
// link reachable from it, then parses the saved start page and prints the
// exchange-rate table.
//
// Returns 0 on normal completion and 1 when Winsock cannot be initialised.
int main()
{
    // Initialise Winsock for the TCP connections made by GetHttpResponse.
    WSADATA wsaData;
    if (WSAStartup(MAKEWORD(2, 2), &wsaData) != 0)
    {
        return 1;
    }

    // Folders for downloaded images and saved page text.
    CreateDirectory("./img", 0);
    CreateDirectory("./html", 0);

    // Start address of the traversal.
    const string urlStart = "http://www.boc.cn/sourcedb/whpj/";

    // Mark the start page as visited before crawling it, so that a link from
    // the page back to itself is not queued and fetched a second time.
    visitedUrl.insert(urlStart);
    BFS(urlStart);

    // Crawl queued links in the order they were discovered; BFS may append
    // further links to the queue while it runs.
    while (!hrefUrl.empty())
    {
        const string url = hrefUrl.front();
        hrefUrl.pop();
        BFS(url);
    }
    WSACleanup();

    // The file name is what ToFileName produces for the start URL.
    Get_exchange("./html/httpwww.boc.cnsourcedbwhpj.txt");

    // NOTE(review): prints 27 rows although p holds 28 - confirm the intended count.
    for (int i = 0; i < 27; i++)
    {
        cout << p[i].name << '\n';
        cout << p[i].ex1 << '\n';
        cout << p[i].ex2 << '\n';
        cout << p[i].ex3 << '\n';
        cout << p[i].ex4 << '\n';
        cout << p[i].ex5 << '\n';
        cout << p[i].date << '\n';
    }
    cout.flush();
    return 0;
}