1.首先,我们随便找个写文章的小网站。http://www.duanmeiwen.com/xinqing/
我们可以看到,这个网站有以下一些文章:
2.下面开始写我们的爬虫程序:
#include <stdio.h>
#include <string.h>
#include <winsock2.h>
int main () {
int j = 1;
int flag = 0;
WSADATA wsaData;
WSAStartup(MAKEWORD(1, 1), &wsaData);
//文章的域名地址
char szWeb[] = "www.duanmeiwen.com";
HOSTENT *pHost = gethostbyname(szWeb);
const char* pIPAddr = inet_ntoa(*((struct in_addr *)pHost->h_addr));
printf("web server ip is : %s\n", pIPAddr);
SOCKADDR_IN webServerAddr;
webServerAddr.sin_family = AF_INET;
webServerAddr.sin_addr.S_un.S_addr = inet_addr(pIPAddr);
webServerAddr.sin_port = htons(80);
SOCKET sockClient = socket(AF_INET, SOCK_STREAM, 0);
int nRet = connect(sockClient, (struct sockaddr*)&webServerAddr, sizeof(webServerAddr));
if (nRet < 0)
{
printf("connect error\n");
return ;
}
// 该博主博客列表访问格式/s/articlelist_5890965060_0_%d.html
char szHttpRest[1024] = { 0 };
sprintf(szHttpRest, "GET /xinqing HTTP/1.1\r\nHost:%s\r\nConnection: Keep-Alive\r\n\r\n", szWeb);
printf("send buf is:\n");
printf("%s\n", szHttpRest);
nRet = send(sockClient, szHttpRest, strlen(szHttpRest) + 1, 0);
if (nRet < 0)
{
printf("send error\n");
return ;
}
FILE *fp = fopen("test.txt", "a+");
while (1)
{
char szRecvBuf[2] = { 0 };
nRet = recv(sockClient, szRecvBuf, 1, 0);
if (nRet < 0)
{
printf("recv error\n");
goto LABEL;
}
if (0 == nRet)
{
printf("connection has been closed by web server\n");
goto LABEL;
}
if (0 == flag)
{
printf("writing data to file...\n");
flag = 1;
}
fputc(szRecvBuf[0], fp);
}
LABEL:
fclose(fp);
closesocket(sockClient);
WSACleanup();
printf("list index is ------------------------------------> %d\n\n\n", j);
Sleep(1000 * getRand(1));
printf("\n\n\ndone!!!!!!\n\n\n");
return ;
}
3.执行完程序,我们会看到生成了这样一个文件:
这样我们就能看到这个网页上所有的 HTML 信息:
希望对各位有用。本爬虫程序,切勿用于损害他人利益的用途。