/* Linux web crawler in C: fetches web page content (example application). */

#include <arpa/inet.h>
#include <errno.h>
#include <netdb.h>
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <sys/select.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>

/* TCP port the crawler connects to (80, the standard HTTP port). */
#define PORT 80

/* Buffer size in bytes (same value as httpget()'s receive buffer). */
#define BUFSIZE 8184

/* Output stream for discovered links; main() opens it on frontier.txt. */
static FILE *frontier;

/*
 * parse - scan a chunk of HTML for absolute http links and append them to
 * the frontier stream.
 *
 * buf: NUL-terminated text to scan; it is only read, never modified.
 *      A NULL buf is ignored.
 *
 * For every occurrence of   a href="http://TARGET"   one line "TARGET\n"
 * (scheme stripped) is written to `frontier`. Empty targets are skipped and
 * scanning stops at the first link whose closing quote is missing.
 * Nothing is written when `frontier` has not been opened.
 */
void parse(char *buf)
{
    /* Match the full scheme prefix so the target can never start past the
     * closing quote (which previously produced a negative length). */
    static const char prefix[] = "a href=\"http://";
    const size_t prefix_len = sizeof prefix - 1;
    char *cursor = buf;

    if (buf == NULL || frontier == NULL) {
        return;
    }

    while ((cursor = strstr(cursor, prefix)) != NULL) {
        char *target = cursor + prefix_len;
        char *closing = strchr(target, '"');

        if (closing == NULL) {
            break; /* unterminated attribute: nothing more to extract */
        }

        if (closing > target) {
            fwrite(target, 1, (size_t)(closing - target), frontier);
            putc('\n', frontier);
        }

        /* Resume at the closing quote so the same link is not matched again. */
        cursor = closing;
    }

    /* Push the links to disk once per chunk so an aborted crawl keeps them. */
    fflush(frontier);
}

/*
 * httpget - fetch the root page ("/") of a host over plain HTTP and hand
 * every received chunk to parse().
 *
 * url: host name (not a full URL). It is resolved with gethostbyname() and
 *      echoed in the Host header.
 *
 * Returns 0 once the server has closed the connection after its response,
 * -1 on any failure (bad argument, name resolution, socket, connect, send,
 * receive, or no data for IDLE_TIMEOUT_SEC seconds). The socket is closed on
 * every path.
 *
 * Limitations: each chunk is parsed on its own, so a link that straddles two
 * recv() calls is missed; parse() and the echo to stdout stop at the first
 * NUL byte in a chunk.
 */
int httpget(char *url)
{
    enum { REQUEST_MAX = 4096, RECV_CHUNK = 8184, IDLE_TIMEOUT_SEC = 30 };

    struct hostent *host;
    struct sockaddr_in servaddr;
    char request[REQUEST_MAX];
    char buf[RECV_CHUNK];
    int req_len;
    int sockfd;
    size_t sent = 0;
    int rc = -1;

    if (url == NULL || *url == '\0') {
        fprintf(stderr, "httpget: empty host name\n");
        return -1;
    }

    host = gethostbyname(url);
    if (host == NULL || host->h_addrtype != AF_INET ||
        host->h_addr_list[0] == NULL) {
        fprintf(stderr, "gethostbyname error\n");
        return -1;
    }

    /* Copy the resolved IPv4 address directly instead of round-tripping it
     * through its text form. */
    memset(&servaddr, 0, sizeof servaddr);
    servaddr.sin_family = AF_INET;
    servaddr.sin_port = htons(PORT);
    memcpy(&servaddr.sin_addr, host->h_addr_list[0], sizeof servaddr.sin_addr);
    printf("ip adress %s\n", inet_ntoa(servaddr.sin_addr));

    /* Build the whole request in one bounded call. "Connection: close" makes
     * the server end the stream after the response, which is what the
     * receive loop below waits for. */
    req_len = snprintf(request, sizeof request,
                       "GET / HTTP/1.0\r\n"
                       "Host: %s\r\n"
                       "Accept: */*\r\n"
                       "Accept-Language: zh-CN\r\n"
                       "User-Agent: Mozilla/4.0\r\n"
                       "Connection: close\r\n"
                       "\r\n",
                       url);
    if (req_len < 0 || (size_t)req_len >= sizeof request) {
        fprintf(stderr, "httpget: host name too long\n");
        return -1;
    }
    printf("%s\n", request);

    sockfd = socket(AF_INET, SOCK_STREAM, 0);
    if (sockfd < 0) {
        fprintf(stderr, "socket error!\n");
        return -1;
    }

    if (connect(sockfd, (struct sockaddr *)&servaddr, sizeof servaddr) < 0) {
        fprintf(stderr, "connect error!\n");
        goto out;
    }
    printf("connect success \n");

    /* send() may accept only part of the buffer; loop until all is sent. */
    while (sent < (size_t)req_len) {
        ssize_t n = send(sockfd, request + sent, (size_t)req_len - sent, 0);

        if (n < 0) {
            if (errno == EINTR) {
                continue;
            }
            fprintf(stderr, "send error %d,Error message'%s'\n",
                    errno, strerror(errno));
            goto out;
        }
        sent += (size_t)n;
    }
    printf("send success ,total send %zu \n", sent);

    for (;;) {
        fd_set readable;
        struct timeval tv;
        int ready;
        ssize_t got;

        /* select() may modify both arguments, so rebuild them every pass. */
        FD_ZERO(&readable);
        FD_SET(sockfd, &readable);
        tv.tv_sec = IDLE_TIMEOUT_SEC;
        tv.tv_usec = 0;

        ready = select(sockfd + 1, &readable, NULL, NULL, &tv);
        if (ready < 0) {
            if (errno == EINTR) {
                continue;
            }
            fprintf(stderr, "some thing read error!\n");
            goto out;
        }
        if (ready == 0) {
            fprintf(stderr, "httpget: timed out waiting for data\n");
            goto out;
        }

        /* Leave room for the terminator parse() and printf() rely on. */
        got = recv(sockfd, buf, sizeof buf - 1, 0);
        printf("i = %zd\n", got);
        if (got < 0) {
            if (errno == EINTR) {
                continue;
            }
            fprintf(stderr, "recv error %d,Error message'%s'\n",
                    errno, strerror(errno));
            goto out;
        }
        if (got == 0) {
            rc = 0; /* orderly shutdown by the peer: response complete */
            break;
        }

        buf[got] = '\0';
        parse(buf);
        printf("%s\n", buf);
    }

out:
    close(sockfd);
    return rc;
}

/*
 * Program entry point.
 *
 * Expects exactly one argument, the domain name to crawl. Opens
 * frontier.txt in append mode as the link output stream, runs httpget() on
 * the given name and closes the stream again.
 *
 * Exit status: EXIT_SUCCESS when httpget() returns 0 and frontier.txt is
 * closed without error, EXIT_FAILURE otherwise.
 */
int main(int argc, char *argv[])
{
    int status = EXIT_FAILURE;

    if (argc != 2) {
        fprintf(stderr, "input domain name\n");
        return EXIT_FAILURE;
    }

    frontier = fopen("frontier.txt", "a+");
    if (frontier == NULL) {
        fprintf(stderr, "open error\n");
        return EXIT_FAILURE;
    }

    if (httpget(argv[1]) == 0) {
        printf("httpget success\n");
        status = EXIT_SUCCESS;
    }

    /* fclose() flushes buffered links; a failure here means data was lost. */
    if (fclose(frontier) != 0) {
        fprintf(stderr, "close error\n");
        status = EXIT_FAILURE;
    }
    frontier = NULL;

    return status;
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值