出于对之后找工作的考虑,最近决定将实验的编码工作都转到C++上来做。本来想所有的程序都在Linux平台下编写,但是GCC下糟糕的代码调试体验让我望而却步。唉,因此决定:简单的算法练习在GCC下做,实验中的工程则在VC平台上做。
之前用Java写的网页抓取程序需要重新写,因此在网上进行了检索,找到了相应的源码,自己试用之后留在这里备忘。
/*
Crawl the HTML code from the specified URL using the MFC WinInet classes.
input:  url     -- NUL-terminated string holding the URL of a webpage
output: content -- the lines read from the response, joined with "\r\n" and
                   with trailing whitespace trimmed. Empty when the URL could
                   not be opened as an HTTP resource, when the status code
                   could not be queried, or when it is not HTTP_STATUS_OK.
Side effect: on success the content is also echoed to stdout.
Exceptions thrown by the MFC calls are not handled here; they propagate to the
caller after the file object has been released.
*/
CString getHTMLCode (const char * url)
{
    CString content;
    CInternetSession session("HttpClient");

    // OpenURL hands back a generic CStdioFile*; it may be NULL or a non-HTTP
    // file object, so validate it before using it as a CHttpFile.
    CStdioFile* pRaw = session.OpenURL(url);
    if (pRaw == NULL)
    {
        session.Close();
        return content;
    }
    CHttpFile* pfile = DYNAMIC_DOWNCAST(CHttpFile, pRaw);

    try
    {
        DWORD dwStatusCode = 0;
        if (pfile != NULL
            && pfile->QueryInfoStatusCode(dwStatusCode)
            && dwStatusCode == HTTP_STATUS_OK)
        {
            CString data;
            while (pfile->ReadString(data))
            {
                content += data + "\r\n";
            }
            content.TrimRight();
            // A CString object must not be passed through varargs directly;
            // convert it to a character pointer explicitly.
            printf(" %s\n ", static_cast<LPCTSTR>(content));
        }
    }
    catch (...)
    {
        // Do not leak the file object when reading fails; the session object
        // is released by its own destructor during unwinding.
        delete pRaw;
        throw;
    }

    pRaw->Close();
    delete pRaw;
    session.Close();
    return content;
}
另外还有一个检索到的源码,是在Windows下用socket实现网页抓取的:
#include <string>
#include <iostream>
#include <fstream>
#include "winsock2.h"
#include <time.h>
#pragma comment(lib, "ws2_32.lib")
using namespace std;
#define DEFAULT_PAGE_BUF_SIZE 1048576
void main()
{
WSADATA wsaData;
int err;
err = WSAStartup(MAKEWORD(2,2), &wsaData);
if( err != 0 )
{
return;
}
// timer is start
clock_t start, finish;
double duration;
start = clock();
char host[] = "www.sina.com.cn";
char *request = "GET / HTTP/1.0\r\nHost: www.sina.com.cn\r\nConnection: Close\r\n\r\n";
struct hostent *hp;
hp = gethostbyname(host);
if(hp == NULL)
{
cout << "gethostbyname() error in GetIpByHost: " << host << endl;
return;
}
// 获取域名对应的IP
struct in_addr inAddr;
LPSTR lpAddr;
lpAddr = hp->h_addr;
memmove(&inAddr,lpAddr,4);
int sock, ret = 0, optval = 1;
struct sockaddr_in sa;
sa.sin_family = AF_INET;
sa.sin_port = htons(80);
sa.sin_addr.s_addr = inet_addr(inet_ntoa(inAddr));
sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
connect(sock, (SOCKADDR*)&sa, sizeof(sa));
if(sock == -1)
{
return;
}
if(sock == -2)
{
return;
}
// send the "GET" data
ret = send(sock, request, strlen(request), 0);
// 网页内容长度。可以从http头部数据中获取 "Content-Length:"
int m_nContentLength = DEFAULT_PAGE_BUF_SIZE;
char *pageBuf;
pageBuf = (char *)malloc(m_nContentLength);
memset(pageBuf, 0, m_nContentLength);
int bytesRead = 0;
while(ret > 0)
{
ret = recv(sock, pageBuf + bytesRead, m_nContentLength - bytesRead, 0);
if(ret > 0)
{
bytesRead += ret;
}
}
pageBuf[bytesRead] = '\0';
cout << bytesRead << endl;
// write the html content to the file
ofstream ofs;
ofs.open("ofs.txt");
ofs << pageBuf << endl;
ofs.close();
free(pageBuf);
closesocket(sock);
WSACleanup();
// timer is finish
finish = clock();
duration = (double)(finish - start) / CLOCKS_PER_SEC;
cout << "have cost " << duration << " seconds\n";
return;
}