linux网页数据采集,一个简单Http获取网页程序

/*
 * A simple HTTP client download program,
 * written for learning the HTTP protocol.
 *
 * Written by linux-person
 * v1.0
 */

#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <netdb.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>

#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/types.h>

#define MAXSIZE 1024

/**
 * Locate the start of the HTML document inside a received buffer.
 *
 * Scans at most len bytes of src for the opening "<html" tag, ignoring
 * letter case. The buffer does not have to be NUL-terminated; nothing
 * beyond src[len - 1] is ever read.
 *
 * @param src  buffer holding raw response data
 * @param len  number of valid bytes in src
 * @return pointer to the '<' of the tag inside src, or NULL when the tag is
 *         not present in the first len bytes (or src is NULL)
 */
char *pos_html(const char *src, int len)
{
    static const char tag[] = "<html";
    const int taglen = (int)(sizeof tag - 1);

    if (src == NULL || len < taglen)
        return NULL;

    for (int i = 0; i + taglen <= len; i++) {
        int k = 0;

        /* Cast to unsigned char: tolower() is undefined for negative values. */
        while (k < taglen && tolower((unsigned char)src[i + k]) == tag[k])
            k++;
        if (k == taglen)
            return (char *)(src + i);
    }
    return NULL;
}

/**
 * Split a URL into its host, port and page (path) components.
 *
 * Accepted shapes are "scheme://host[:port][/page]" and "host[:port][/page]".
 * The host starts right after a leading "scheme://"; without a scheme it
 * starts at the first "www." if there is one, otherwise at the beginning of
 * the URL.
 *
 * @param url     NUL-terminated URL to parse
 * @param len     length of url in bytes (used only to compute *pgsize)
 * @param host    receives the NUL-terminated host name
 * @param hsize   capacity of host in bytes
 * @param port    receives the NUL-terminated port text, "80" when the URL
 *                names no port; may be NULL (or ptsize <= 0) to skip it
 * @param ptsize  capacity of port in bytes
 * @param page    receives a pointer into url at the first '/' after the
 *                host/port, or NULL when the URL has no path
 * @param pgsize  receives len minus the offset of the page inside url
 * @return 0 on success; -1 when a required argument is missing or when the
 *         host or port did not fit (the outputs are still NUL-terminated,
 *         holding the truncated text)
 */
int parse_url(const char *url, int len, char *host, int hsize, char *port, int ptsize, char **page, int *pgsize)
{
    enum { IN_HOST, IN_PORT } state = IN_HOST;
    const char *p;
    const char *scheme_sep;
    int hlen = 0;
    int ptlen = 0;
    int truncated = 0;

    if (url == NULL || host == NULL || hsize <= 0 || page == NULL || pgsize == NULL)
        return -1;
    if (ptsize <= 0)
        port = NULL;

    /*
     * "://" only marks a scheme when its first '/' is the first '/' of the
     * whole URL; otherwise it belongs to the path or query string.
     */
    scheme_sep = strstr(url, "://");
    if (scheme_sep != NULL && strchr(url, '/') == scheme_sep + 1) {
        p = scheme_sep + strlen("://");
    } else {
        p = strstr(url, "www.");
        if (p == NULL)
            p = url;
    }

    /* The authority part ends at the first '/' (start of the page). */
    for (; *p != '\0' && *p != '/'; p++) {
        if (state == IN_HOST && *p == ':') {
            /* The separator itself belongs to neither host nor port. */
            state = IN_PORT;
            continue;
        }
        if (state == IN_HOST) {
            if (hlen < hsize - 1)
                host[hlen++] = *p;
            else
                truncated = 1;
        } else if (port != NULL) {
            if (ptlen < ptsize - 1)
                port[ptlen++] = *p;
            else
                truncated = 1;
        }
    }
    host[hlen] = '\0';

    if (port != NULL) {
        port[ptlen] = '\0';
        /* No port in the URL: fall back to the default HTTP port. */
        if (ptlen == 0) {
            if (ptsize >= (int)sizeof "80")
                memcpy(port, "80", sizeof "80");
            else
                truncated = 1;
        }
    }

    *page = (*p == '\0') ? NULL : (char *)p;
    *pgsize = len - (int)(p - url);

    return truncated ? -1 : 0;
}

/* Write all len bytes of buf to fd, retrying short writes and EINTR; returns 0, or -1 on error. */
static int write_all(int fd, const char *buf, size_t len)
{
    while (len > 0) {
        ssize_t written = write(fd, buf, len);

        if (written < 0) {
            if (errno == EINTR)
                continue;
            return -1;
        }
        buf += written;
        len -= (size_t)written;
    }
    return 0;
}

/*
 * Derive the local output file name from the request path: the last path
 * component with any "?query" / "#fragment" removed. Falls back to
 * "index.html" when that component is empty, has no '.', or does not fit.
 */
static void output_name(const char *page, char *out, size_t outsize)
{
    const char *base = "";
    size_t baselen = 0;

    if (page != NULL) {
        size_t pathlen = strcspn(page, "?#");

        base = page;
        for (size_t i = 0; i < pathlen; i++) {
            if (page[i] == '/')
                base = page + i + 1;
        }
        baselen = pathlen - (size_t)(base - page);
    }

    if (baselen == 0 || baselen >= outsize || memchr(base, '.', baselen) == NULL) {
        snprintf(out, outsize, "%s", "index.html");
        return;
    }
    memcpy(out, base, baselen);
    out[baselen] = '\0';
}

/*
 * Convert the textual port to a number in 1..65535. An empty string means
 * the default HTTP port 80; a leading ':' left in by the URL parser is
 * tolerated. Returns 0 on success, -1 on malformed input.
 */
static int parse_port(const char *text, unsigned short *out)
{
    char *end;
    long value;

    if (*text == ':')
        text++;
    if (*text == '\0') {
        *out = 80;
        return 0;
    }

    errno = 0;
    value = strtol(text, &end, 10);
    if (errno != 0 || end == text || *end != '\0' || value < 1 || value > 65535)
        return -1;

    *out = (unsigned short)value;
    return 0;
}

/**
 * Download the page named by argv[1] over plain HTTP and store it in the
 * current directory.
 *
 * Steps: parse the URL, resolve the host, connect to the first reachable
 * IPv4 address on the URL's port, send a GET request, then copy the response
 * to the output file starting at the position reported by pos_html().
 *
 * Known limitation: the start marker is searched for in each received chunk
 * separately, so a marker split across two reads is not detected.
 *
 * @return 0 when only usage was printed, EXIT_SUCCESS after a completed
 *         transfer, EXIT_FAILURE on any error
 */
int main(int argc, char **argv)
{
    char recvbuf[MAXSIZE];
    char sndmsg[1024];
    char host[512] = {0};
    char port[8] = {0};
    char hosthdr[sizeof(host) + 8];
    char outname[256];
    char *page = NULL;
    int pgsize = 0;
    int sockfd = -1;
    int fd = -1;
    int found = 0; /* set once the start of the document has been seen */
    int msglen;
    int status = EXIT_FAILURE;
    unsigned short portnum;
    struct hostent *hp;
    struct in_addr **addr;

    if (argc < 2) {
        printf("Usage:%s url\n", argv[0]);
        return 0;
    }

    if (parse_url(argv[1], (int)strlen(argv[1]), host, sizeof(host), port, sizeof(port), &page, &pgsize) != 0
        || host[0] == '\0') {
        fprintf(stderr, "invalid url\n");
        return EXIT_FAILURE;
    }
    if (parse_port(port, &portnum) != 0) {
        fprintf(stderr, "invalid port\n");
        return EXIT_FAILURE;
    }

    hp = gethostbyname(host);
    if (hp == NULL || hp->h_addrtype != AF_INET) {
        fprintf(stderr, "gethostbyname error\n");
        return EXIT_FAILURE;
    }

    /*
     * Try every address in turn. A socket whose connect() failed is in an
     * unspecified state, so each attempt gets a fresh one.
     */
    for (addr = (struct in_addr **)hp->h_addr_list; *addr != NULL; addr++) {
        struct sockaddr_in srvaddr;

        sockfd = socket(AF_INET, SOCK_STREAM, 0);
        if (sockfd < 0) {
            perror("socket");
            return EXIT_FAILURE;
        }

        memset(&srvaddr, 0, sizeof(srvaddr));
        srvaddr.sin_family = AF_INET;
        srvaddr.sin_port = htons(portnum);
        memcpy(&srvaddr.sin_addr, *addr, sizeof(struct in_addr));

        if (connect(sockfd, (const struct sockaddr *)&srvaddr, sizeof(srvaddr)) == 0) {
            printf("connected\n");
            break;
        }
        printf("connect error\n");
        close(sockfd);
        sockfd = -1;
    }
    if (sockfd < 0) {
        fprintf(stderr, "Unable to connect\n");
        return EXIT_FAILURE;
    }

    /* The Host header carries the port only when it is not the default. */
    if (portnum == 80)
        snprintf(hosthdr, sizeof(hosthdr), "%s", host);
    else
        snprintf(hosthdr, sizeof(hosthdr), "%s:%u", host, (unsigned)portnum);

    msglen = snprintf(sndmsg, sizeof(sndmsg),
                      "GET %s HTTP/1.1\r\nAccept-Encoding: identity\r\nHost: %s\r\nConnection: close\r\nUser-Agent: HttpClient\r\n\r\n",
                      (page ? page : "/"), hosthdr);
    if (msglen < 0 || (size_t)msglen >= sizeof(sndmsg)) {
        fprintf(stderr, "request too long\n");
        goto out;
    }
    if (write_all(sockfd, sndmsg, (size_t)msglen) != 0) {
        perror("write");
        goto out;
    }

    /* Create the output file only now, so a failed lookup or connect does
     * not truncate an existing file. */
    output_name(page, outname, sizeof(outname));
    fd = open(outname, O_CREAT | O_WRONLY | O_TRUNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
    if (fd == -1) {
        fprintf(stderr, "open file failed\n");
        goto out;
    }

    for (;;) {
        const char *start = recvbuf;
        size_t count;
        /* One byte is reserved so the chunk can be NUL-terminated below. */
        ssize_t n = read(sockfd, recvbuf, sizeof(recvbuf) - 1);

        if (n < 0) {
            if (errno == EINTR)
                continue;
            perror("read");
            goto out;
        }
        if (n == 0)
            break; /* peer closed the connection: transfer complete */

        count = (size_t)n;
        if (!found) {
            char *marker;

            /* Keep the chunk a valid C string for the marker search. */
            recvbuf[n] = '\0';
            marker = pos_html(recvbuf, (int)n);
            if (marker == NULL)
                continue; /* still before the document: drop this chunk */
            found = 1;
            start = marker;
            count = (size_t)n - (size_t)(marker - recvbuf);
        }

        if (write_all(fd, start, count) != 0) {
            perror("write");
            goto out;
        }
    }
    status = EXIT_SUCCESS;

out:
    if (fd != -1 && close(fd) != 0) {
        perror("close");
        status = EXIT_FAILURE;
    }
    close(sockfd);
    return status;
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值