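/* NOTE: Visual Studio does not detect a function call that passes too many arguments or arguments of the wrong type, so such mistakes can produce very strange bugs. */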
#pragma comment(lib, "ws2_32.lib")
/* Compile-time configuration for the crawler. */
#define _CRT_SECURE_NO_DEPRECATE
/* NOTE(review): the Winsock headers included below normally provide these two as well - confirm the manual definitions are needed. */
#define SOCK_STREAM 1
#define AF_INET 2
/* Sizes of the request and response buffers used by viste_one_cycle. */
#define SIZE_OF_SENDBUF 10000
#define SIZE_OF_RECEIVEBUF 1000
#define REASON_SIZE 128
/* Size of the buffer that receives the dotted IP address text. */
#define SIZE_OF_IP 128
#define SOURCE 10
/* Size of the buffer that receives the host part of a URL. */
#define SIZE_OF_DOMAIN_NAME 128
#define SIZE_OF_HOST 128
#define SIZE_OF_URL 100// each URL is at most 100 bytes long
/* Passed to initialize_hash as the requested capacity of the visited-URL table. */
#define NUMBER_OF_VISITED_URLS 1000
/* Number of times main calls viste_one_cycle. */
#define XUN_HUAN_CI_SHU 3
/* Size of the buffer that receives the path part of a URL. */
#define SIZE_OF_PATH 70
/* Maximum size passed to collect_information for one extracted value. */
#define SIZE_OF_INFORMATION 100
/* Singly linked list node that stores one URL. */
struct node
{
char url[SIZE_OF_URL]; /* NUL-terminated URL text */
struct node * next; /* following node, or NULL at the end of the list */
};
/* A list is handled through a pointer to its first node. */
typedef struct node * List;
/* Hash table of URLs; collisions are chained in per-bucket lists. */
struct hashtable
{
List *url; /* array of tablesize bucket lists (allocated in initialize_hash) */
int tablesize; /* number of buckets */
};
/* Handle type used by all hash-table functions. */
typedef struct hashtable *ahash;
#include <windows.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <winSock.h>
#include <string.h>
#include <mysql.h>
/* Forward declarations of the functions defined in this file. */
void add_one_in_name(MYSQL *mysql, char *name);
void collect_information(FILE *file, char *text, char *before, char *after, int max_size);
void insert_in_list(List head, char *string);
int standardization_url(char *url, char *root_domain_name, int max_size);
void delete_in_list(List head, List ptr);
void insert_in_hash(ahash ahash, char *url);
int isvisited(ahash hash, char *url);
void viste_one_cycle(FILE* collect_informations, FILE *writeto, List head, ahash hash);// takes exactly 4 parameters
int hash(ahash hash, char *string);
int Nextprime(int i);
int Isprime(int i);
/* Parameter names now follow the definition: the host buffer comes before the path buffer. */
void get_domain_name_and_path_from_url(char *url, char *domain_name, char *path);
ahash initialize_hash(int up_limit);
List initialize_url_list(FILE *readfrom);
SOCKET build_connect(char *IP, int port);
void domain_name_to_IP(char *domain_name, char *IP);
void myexit(char * error_position);
int Socket(int family, int type, int protocal);
void Connect(SOCKET s, const struct sockaddr * name, int namelen);
/* Scans text for values enclosed by before/after, stores the URLs accepted by
   standardization_url in the list and the file, and returns how many enclosed
   values were found. */
int collect_new_url(FILE *file, List head, const char* root_domain_name, const char * text, const char *before, const char *after, int max_size);
int main(void)
{
LPWSADATA wsaData;
WSAStartup(MAKEWORD(2, 1), &wsaData);
MYSQL * mysql = mysql_init(0);
mysql_real_connect(mysql, "localhost", "root", "root", "webrobot", 0, 0, 0);
FILE *readfrom = fopen("1.txt", "r");
FILE *collect_informations = fopen("3.txt", "w");
FILE *write_visited_url_to = fopen("2.txt", "w");
ahash hash = initialize_hash(NUMBER_OF_VISITED_URLS);
List head = initialize_url_list(readfrom);
for (int i = 1; i <= XUN_HUAN_CI_SHU; i++)
viste_one_cycle(mysql, collect_informations, write_visited_url_to, head, hash);//传了5个参数导致错位,但vs不会报错,由此产生奇怪的bug
fclose(collect_informations);
fclose(readfrom);
fclose(write_visited_url_to);
mysql_close(mysql);
WSACleanup();
return 0;
}
/*
 * Resolves domain_name and writes its first IPv4 address, in dotted-decimal
 * text form, into IP.
 *
 * IP must have room for at least 16 bytes. When the lookup fails the error is
 * reported through myexit and IP is left as an empty string.
 */
void domain_name_to_IP(char *domain_name, char *IP)
{
    struct hostent *target_server = gethostbyname(domain_name);
    struct in_addr addr;

    IP[0] = '\0';
    /* myexit only prints, so return explicitly instead of dereferencing NULL. */
    if (target_server == NULL || target_server->h_addr_list[0] == NULL)
    {
        myexit("获取DNS服务失败\n");
        return;
    }
    /* Copy the raw address bytes; avoids casting the byte buffer to a struct pointer. */
    memcpy(&addr, target_server->h_addr_list[0], sizeof(addr));
    strcpy(IP, inet_ntoa(addr));
}
SOCKET build_connect(char * IP, int port)
{
SOCKET s = Socket(AF_INET, SOCK_STREAM, 0);
struct sockaddr_in serveraddr;
memset(serveraddr.sin_zero, 0, sizeof(serveraddr.sin_zero));
serveraddr.sin_port = htons(port);
serveraddr.sin_family = AF_INET;
serveraddr.sin_addr.S_un.S_addr = inet_addr(IP);
Connect(s, (struct sockaddr *)&serveraddr, sizeof(serveraddr));
return s;
}
/*
 * Wrapper around connect(): on failure prints the Winsock error code and
 * reports the failure through myexit. Returns normally in both cases.
 */
void Connect(SOCKET s, const struct sockaddr * name, int namelen)
{
    if (connect(s, name, namelen) == 0)
        return;

    /* WSAGetLastError must be called; the original passed the function itself to printf. */
    printf("%d\n", WSAGetLastError());
    myexit("与服务器建立连接失败\n");
}
int Socket(int family, int type, int protocal)
{
SOCKET s = socket(family, type, protocal);
if (s > 0)
return s;
else
{
printf("%d", WSAGetLastError);
myexit("创建套接字失败\n");
}
}
/*
 * Prints the given error message to standard output.
 * Despite its name it does not terminate the program; control returns to the caller.
 */
void myexit(char * error_position)
{
    printf("%s", error_position);
}
/*
 * Extracts link targets from text.
 *
 * Every value that follows an occurrence of `before` and ends at the first
 * character of `after` is copied out. Values accepted by standardization_url
 * are pushed onto the list `head` and written to `file`, one per line.
 *
 * Values of max_size or more characters are skipped. Scanning stops when the
 * text ends before a closing delimiter is found.
 *
 * Returns the number of delimited values found (accepted or not).
 */
int collect_new_url(FILE *file, List head, const char* root_domain_name, const char * text, const char *before, const char *after, int max_size)
{
    size_t before_len = strlen(before);
    const char *cursor = text;
    char *url;
    int count = 0;

    if (max_size <= 0 || before_len == 0 || after[0] == '\0')
        return 0;

    url = (char *)malloc(max_size);
    if (url == NULL)
        return 0;

    /* strstr performs a real substring search, so a partial match of `before`
       can no longer cause the following character to be skipped. */
    while ((cursor = strstr(cursor, before)) != NULL)
    {
        const char *start = cursor + before_len;
        const char *end = strchr(start, after[0]);
        size_t len;

        if (end == NULL)
            break;

        len = (size_t)(end - start);
        cursor = end + 1;

        /* Each value is measured on its own, so one over-long value does not
           affect the values that follow it. */
        if (len >= (size_t)max_size)
            continue;

        memcpy(url, start, len);
        url[len] = '\0';
        /* standardization_url does not modify the domain name; the cast only
           matches its non-const parameter type. */
        if (standardization_url(url, (char *)root_domain_name, max_size))
        {
            insert_in_list(head, url);
            fputs(url, file);
            fputc('\n', file);
        }
        count++;
    }

    free(url);
    return count;
}
ahash initialize_hash(int max_number)
{
ahash hash = (ahash)malloc(sizeof(struct hashtable));
hash->tablesize = Nextprime(max_number);
hash->url = (List *)malloc(sizeof(List)*hash->tablesize);
for (int i = 0; i < hash->tablesize; i++)
{
hash->url[i] = (List)malloc(sizeof(struct node));
hash->url[i]->next = NULL;
}
return hash;
}
/*
 * Returns the first value, starting at n and counting upwards, for which
 * Isprime reports true.
 */
int Nextprime(int n)
{
    int candidate = n;

    for (;;)
    {
        if (Isprime(candidate))
            return candidate;
        candidate++;
    }
}
/*
 * Returns 1 when n is a prime number and 0 otherwise.
 * Values below 2 (including zero and negatives) are not prime.
 */
int Isprime(int n)
{
    if (n < 2)
        return 0;

    /* Trial division up to sqrt(n); "i <= n / i" avoids overflow of i * i.
       The original tested n / i == 0, which is never true in this range. */
    for (int i = 2; i <= n / i; i++)
    {
        if (n % i == 0)
            return 0;
    }
    return 1;
}
/*
 * Builds the list of seed URLs by reading one URL per line from readfrom.
 *
 * Line endings are removed, empty lines are ignored, and lines too long for a
 * node's url buffer are discarded instead of being split into fragments.
 * Each URL is pushed at the front, so the list is in reverse file order.
 *
 * Returns the sentinel head node (its own url is unused), or NULL when the
 * head cannot be allocated. A NULL stream yields an empty list.
 */
List initialize_url_list(FILE *readfrom)
{
    List head = (List)malloc(sizeof(struct node));
    char url[SIZE_OF_URL];

    if (head == NULL)
        return NULL;
    head->url[0] = '\0';
    head->next = NULL;
    if (readfrom == NULL)
        return head;

    while (fgets(url, sizeof(url), readfrom))
    {
        size_t len = strlen(url);

        if (len == 0 || url[len - 1] != '\n')
        {
            /* The buffer filled up (or the file ended) before a newline was
               seen. Peek at the next character to tell the cases apart. */
            int c = fgetc(readfrom);

            if (c != EOF && c != '\n')
            {
                /* Over-long line: drop the remainder and skip it. */
                while (c != EOF && c != '\n')
                    c = fgetc(readfrom);
                continue;
            }
        }

        /* fgets keeps the line ending; a URL must not contain it. */
        url[strcspn(url, "\r\n")] = '\0';
        if (url[0] == '\0')
            continue;

        List ptr = (List)malloc(sizeof(struct node));
        if (ptr == NULL)
            break;
        strcpy(ptr->url, url);
        ptr->next = head->next;
        head->next = ptr;
    }
    return head;
}
/*
 * Splits url at its first '/' into a host part and a path part.
 *
 * domain_name receives everything before the first '/'; path receives the
 * rest, including that '/'. When url contains no '/', the whole url is the
 * host and the path is "/". The caller must provide buffers large enough for
 * both parts. The url and both parts are echoed to standard output.
 */
void get_domain_name_and_path_from_url(char *url, char *domain_name, char *path)
{
    const char *slash = strchr(url, '/');
    size_t host_len = (slash != NULL) ? (size_t)(slash - url) : strlen(url);

    puts(url);

    memcpy(domain_name, url, host_len);
    domain_name[host_len] = '\0';

    /* Without a '/' the original scan ran past the end of the string. */
    strcpy(path, (slash != NULL) ? slash : "/");

    puts(domain_name);
    puts(path);
}
/*
 * Returns 1 when url is stored in the hash table and 0 otherwise.
 * Only the bucket selected by hash() is searched; its first node is a
 * sentinel and is skipped.
 */
int isvisited(ahash ahash, char *url)
{
    List entry;

    for (entry = ahash->url[hash(ahash, url)]->next; entry != NULL; entry = entry->next)
    {
        if (strcmp(entry->url, url) == 0)
            return 1;
    }
    return 0;
}
/*
 * Records url in the hash table by pushing a new node directly after the
 * sentinel of its bucket.
 *
 * Nothing is inserted when url does not fit into a node's url buffer or when
 * the node cannot be allocated.
 */
void insert_in_hash(ahash ahash, char *url)
{
    List ptr;
    int value;

    /* Reject instead of truncating: a truncated key could never be found again. */
    if (strlen(url) >= sizeof(ptr->url))
        return;

    ptr = (List)malloc(sizeof(struct node));
    if (ptr == NULL)
        return;

    value = hash(ahash, url);
    strcpy(ptr->url, url);
    ptr->next = ahash->url[value]->next;
    ahash->url[value]->next = ptr;
}
/*
 * Unlinks the node ptr from the list that starts at the sentinel head and
 * frees it.
 *
 * The node is located by identity, not by URL text, so other nodes holding
 * the same URL are left alone. When ptr is not part of the list nothing is
 * changed and nothing is freed.
 */
void delete_in_list(List head, List ptr)
{
    List prev = head;

    if (head == NULL || ptr == NULL)
        return;

    while (prev->next != NULL && prev->next != ptr)
        prev = prev->next;

    if (prev->next == NULL)
        return;

    prev->next = ptr->next;
    free(ptr);
}
/*
 * Computes the bucket index of string: each character is folded in with
 * "shift left by 5, then add", and the result is reduced modulo the table size.
 */
int hash(ahash hash, char *string)
{
    unsigned int hashvalue = 0;
    const char *p;

    for (p = string; *p != '\0'; p++)
        hashvalue = (hashvalue << 5) + *p;

    return hashvalue % hash->tablesize;
}
/*
 * Pushes a copy of string onto the front of the list, directly after the
 * sentinel head.
 *
 * Nothing is inserted when string does not fit into a node's url buffer or
 * when the node cannot be allocated.
 */
void insert_in_list(List head, char *string)
{
    List ptr;

    if (strlen(string) >= sizeof(ptr->url))
        return;

    ptr = (List)malloc(sizeof(struct node));
    if (ptr == NULL)
        return;

    strcpy(ptr->url, string);
    ptr->next = head->next;
    head->next = ptr;
}
/*
 * Rewrites url in place into the canonical "www.host/path" form.
 *
 * url must be a writable buffer of at least max_size bytes.
 *
 *   "www.…"          - already canonical, left unchanged
 *   "/path"          - root_domain_name is placed in front of it
 *   "http://www.…"   - the scheme prefix is removed
 *   "https://www.…"  - the scheme prefix is removed
 *
 * Returns 1 when url is (now) canonical. Returns 0, leaving url unchanged,
 * when it is 11 characters or shorter, does not fit in max_size, has any
 * other form, or when the joined result would not fit in max_size.
 */
int standardization_url(char *url, char *root_domain_name, int max_size)
{
    size_t len;
    size_t skip;

    if (max_size <= 0)
        return 0;

    len = strlen(url);
    if (len <= 11 || len >= (size_t)max_size) /* reject too short or too long */
        return 0;

    /* The dot of "www." is at index 3; the original tested index 4. */
    if (strncmp(url, "www.", 4) == 0)
        return 1;

    if (url[0] == '/')
    {
        size_t root_len = strlen(root_domain_name);

        /* The joined string must still fit, terminator included. */
        if (root_len + len >= (size_t)max_size)
            return 0;
        memmove(url + root_len, url, len + 1);
        memcpy(url, root_domain_name, root_len);
        return 1;
    }

    if (strncmp(url, "http://www.", 11) == 0)
        skip = strlen("http://");
    else if (strncmp(url, "https://www.", 12) == 0)
        skip = strlen("https://");
    else
        return 0;

    /* Shift the rest of the string, terminator included, to the front. */
    memmove(url, url + skip, len - skip + 1);
    return 1;
}
/*
 * Extracts delimited values from text and writes them to file, one per line.
 *
 * A value starts directly after an occurrence of `before` and ends at the
 * first character of `after`. Values of max_size or more characters are
 * skipped. Scanning stops when the text ends before a closing delimiter is
 * found. A status message is printed to standard output for every value.
 */
void collect_information(FILE *file, char *text, char *before, char *after, int max_size)
{
    size_t before_len = strlen(before);
    const char *cursor = text;

    if (max_size <= 0 || before_len == 0 || after[0] == '\0')
        return;

    /* strstr performs a real substring search, so a partial match of `before`
       can no longer cause the following character to be skipped. */
    while ((cursor = strstr(cursor, before)) != NULL)
    {
        const char *start = cursor + before_len;
        const char *end = strchr(start, after[0]);
        size_t len;

        if (end == NULL)
            break;

        len = (size_t)(end - start);
        cursor = end + 1;

        /* Each value is measured on its own, so one over-long value does not
           affect the values that follow it. */
        if (len >= (size_t)max_size)
            continue;

        /* The failure branch used to print the success message. */
        if (fwrite(start, 1, len, file) != len)
            printf("\n\n\n\n\n\n\n写入文件失败\n\n\n\n\n\n\n\n");
        else
            printf("\n\n\n\n\n\n\n写入文件\n\n\n\n\n\n\n\n");
        fputc('\n', file);
    }
}
/*
 * Increments the counter stored for name in table `robot`, or inserts a new
 * row with counter 1 when no row for name exists yet.
 *
 * name is escaped before it is placed in the SQL text. The function returns
 * without changing anything when the lookup query or an allocation fails.
 */
void add_one_in_name(MYSQL *mysql, char *name)
{
    /* Room for the longest fixed SQL text around the name, terminator included. */
    enum { SQL_OVERHEAD = 64 };
    MYSQL_RES *result;
    MYSQL_ROW row;
    size_t name_len;
    char *escaped;
    char *sql;
    int flag = 0;

    if (mysql == NULL || name == NULL)
        return;

    if (mysql_query(mysql, "select * from robot") != 0)
        return;
    result = mysql_store_result(mysql);
    if (result == NULL)
        return;

    /* The first column of each row is compared with name. */
    while ((row = mysql_fetch_row(result)) != NULL)
    {
        if (row[0] != NULL && strcmp(row[0], name) == 0)
        {
            flag = 1;
            break;
        }
    }
    mysql_free_result(result);

    /* mysql_real_escape_string needs up to two output bytes per input byte plus a terminator. */
    name_len = strlen(name);
    escaped = (char *)malloc(name_len * 2 + 1);
    sql = (char *)malloc(name_len * 2 + SQL_OVERHEAD);
    if (escaped != NULL && sql != NULL)
    {
        mysql_real_escape_string(mysql, escaped, name, (unsigned long)name_len);
        if (flag == 1)
            sprintf(sql, "update robot set value=value+1 where name='%s'", escaped);
        else
            sprintf(sql, "insert into robot values('%s',1)", escaped);
        mysql_query(mysql, sql);
    }
    free(escaped);
    free(sql);
}
/* Compile-time configuration for the crawler. */
#define _CRT_SECURE_NO_DEPRECATE
/* NOTE(review): the Winsock headers included below normally provide these two as well - confirm the manual definitions are needed. */
#define SOCK_STREAM 1
#define AF_INET 2
/* Sizes of the request and response buffers used by viste_one_cycle. */
#define SIZE_OF_SENDBUF 10000
#define SIZE_OF_RECEIVEBUF 1000
#define REASON_SIZE 128
/* Size of the buffer that receives the dotted IP address text. */
#define SIZE_OF_IP 128
#define SOURCE 10
/* Size of the buffer that receives the host part of a URL. */
#define SIZE_OF_DOMAIN_NAME 128
#define SIZE_OF_HOST 128
#define SIZE_OF_URL 100// each URL is at most 100 bytes long
/* Passed to initialize_hash as the requested capacity of the visited-URL table. */
#define NUMBER_OF_VISITED_URLS 1000
/* Number of times main calls viste_one_cycle. */
#define XUN_HUAN_CI_SHU 3
/* Size of the buffer that receives the path part of a URL. */
#define SIZE_OF_PATH 70
/* Maximum size passed to collect_information for one extracted value. */
#define SIZE_OF_INFORMATION 100
/* Singly linked list node that stores one URL. */
struct node
{
char url[SIZE_OF_URL]; /* NUL-terminated URL text */
struct node * next; /* following node, or NULL at the end of the list */
};
/* A list is handled through a pointer to its first node. */
typedef struct node * List;
/* Hash table of URLs; collisions are chained in per-bucket lists. */
struct hashtable
{
List *url; /* array of tablesize bucket lists (allocated in initialize_hash) */
int tablesize; /* number of buckets */
};
/* Handle type used by all hash-table functions. */
typedef struct hashtable *ahash;
#include <windows.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <winSock.h>
#include <string.h>
#include <mysql.h>
/* Forward declarations of the functions defined in this file. */
void add_one_in_name(MYSQL *mysql, char *name);
void collect_information(FILE *file, char *text, char *before, char *after, int max_size);
void insert_in_list(List head, char *string);
int standardization_url(char *url, char *root_domain_name, int max_size);
void delete_in_list(List head, List ptr);
void insert_in_hash(ahash ahash, char *url);
int isvisited(ahash hash, char *url);
void viste_one_cycle(FILE* collect_informations, FILE *writeto, List head, ahash hash);// takes exactly 4 parameters
int hash(ahash hash, char *string);
int Nextprime(int i);
int Isprime(int i);
/* Parameter names now follow the definition: the host buffer comes before the path buffer. */
void get_domain_name_and_path_from_url(char *url, char *domain_name, char *path);
ahash initialize_hash(int up_limit);
List initialize_url_list(FILE *readfrom);
SOCKET build_connect(char *IP, int port);
void domain_name_to_IP(char *domain_name, char *IP);
void myexit(char * error_position);
int Socket(int family, int type, int protocal);
void Connect(SOCKET s, const struct sockaddr * name, int namelen);
/* Scans text for values enclosed by before/after, stores the URLs accepted by
   standardization_url in the list and the file, and returns how many enclosed
   values were found. */
int collect_new_url(FILE *file, List head, const char* root_domain_name, const char * text, const char *before, const char *after, int max_size);
int main(void)
{
LPWSADATA wsaData;
WSAStartup(MAKEWORD(2, 1), &wsaData);
MYSQL * mysql = mysql_init(0);
mysql_real_connect(mysql, "localhost", "root", "root", "webrobot", 0, 0, 0);
FILE *readfrom = fopen("1.txt", "r");
FILE *collect_informations = fopen("3.txt", "w");
FILE *write_visited_url_to = fopen("2.txt", "w");
ahash hash = initialize_hash(NUMBER_OF_VISITED_URLS);
List head = initialize_url_list(readfrom);
for (int i = 1; i <= XUN_HUAN_CI_SHU; i++)
viste_one_cycle(mysql, collect_informations, write_visited_url_to, head, hash);//传了5个参数导致错位,但vs不会报错,由此产生奇怪的bug
fclose(collect_informations);
fclose(readfrom);
fclose(write_visited_url_to);
mysql_close(mysql);
WSACleanup();
return 0;
}
/*
 * Resolves domain_name and writes its first IPv4 address, in dotted-decimal
 * text form, into IP.
 *
 * IP must have room for at least 16 bytes. When the lookup fails the error is
 * reported through myexit and IP is left as an empty string.
 */
void domain_name_to_IP(char *domain_name, char *IP)
{
    struct hostent *target_server = gethostbyname(domain_name);
    struct in_addr addr;

    IP[0] = '\0';
    /* myexit only prints, so return explicitly instead of dereferencing NULL. */
    if (target_server == NULL || target_server->h_addr_list[0] == NULL)
    {
        myexit("获取DNS服务失败\n");
        return;
    }
    /* Copy the raw address bytes; avoids casting the byte buffer to a struct pointer. */
    memcpy(&addr, target_server->h_addr_list[0], sizeof(addr));
    strcpy(IP, inet_ntoa(addr));
}
SOCKET build_connect(char * IP, int port)
{
SOCKET s = Socket(AF_INET, SOCK_STREAM, 0);
struct sockaddr_in serveraddr;
memset(serveraddr.sin_zero, 0, sizeof(serveraddr.sin_zero));
serveraddr.sin_port = htons(port);
serveraddr.sin_family = AF_INET;
serveraddr.sin_addr.S_un.S_addr = inet_addr(IP);
Connect(s, (struct sockaddr *)&serveraddr, sizeof(serveraddr));
return s;
}
/*
 * Wrapper around connect(): on failure prints the Winsock error code and
 * reports the failure through myexit. Returns normally in both cases.
 */
void Connect(SOCKET s, const struct sockaddr * name, int namelen)
{
    if (connect(s, name, namelen) == 0)
        return;

    /* WSAGetLastError must be called; the original passed the function itself to printf. */
    printf("%d\n", WSAGetLastError());
    myexit("与服务器建立连接失败\n");
}
int Socket(int family, int type, int protocal)
{
SOCKET s = socket(family, type, protocal);
if (s > 0)
return s;
else
{
printf("%d", WSAGetLastError);
myexit("创建套接字失败\n");
}
}
/*
 * Prints the given error message to standard output.
 * Despite its name it does not terminate the program; control returns to the caller.
 */
void myexit(char * error_position)
{
    printf("%s", error_position);
}
/*
 * Extracts link targets from text.
 *
 * Every value that follows an occurrence of `before` and ends at the first
 * character of `after` is copied out. Values accepted by standardization_url
 * are pushed onto the list `head` and written to `file`, one per line.
 *
 * Values of max_size or more characters are skipped. Scanning stops when the
 * text ends before a closing delimiter is found.
 *
 * Returns the number of delimited values found (accepted or not).
 */
int collect_new_url(FILE *file, List head, const char* root_domain_name, const char * text, const char *before, const char *after, int max_size)
{
    size_t before_len = strlen(before);
    const char *cursor = text;
    char *url;
    int count = 0;

    if (max_size <= 0 || before_len == 0 || after[0] == '\0')
        return 0;

    url = (char *)malloc(max_size);
    if (url == NULL)
        return 0;

    /* strstr performs a real substring search, so a partial match of `before`
       can no longer cause the following character to be skipped. */
    while ((cursor = strstr(cursor, before)) != NULL)
    {
        const char *start = cursor + before_len;
        const char *end = strchr(start, after[0]);
        size_t len;

        if (end == NULL)
            break;

        len = (size_t)(end - start);
        cursor = end + 1;

        /* Each value is measured on its own, so one over-long value does not
           affect the values that follow it. */
        if (len >= (size_t)max_size)
            continue;

        memcpy(url, start, len);
        url[len] = '\0';
        /* standardization_url does not modify the domain name; the cast only
           matches its non-const parameter type. */
        if (standardization_url(url, (char *)root_domain_name, max_size))
        {
            insert_in_list(head, url);
            fputs(url, file);
            fputc('\n', file);
        }
        count++;
    }

    free(url);
    return count;
}
ahash initialize_hash(int max_number)
{
ahash hash = (ahash)malloc(sizeof(struct hashtable));
hash->tablesize = Nextprime(max_number);
hash->url = (List *)malloc(sizeof(List)*hash->tablesize);
for (int i = 0; i < hash->tablesize; i++)
{
hash->url[i] = (List)malloc(sizeof(struct node));
hash->url[i]->next = NULL;
}
return hash;
}
/*
 * Returns the first value, starting at n and counting upwards, for which
 * Isprime reports true.
 */
int Nextprime(int n)
{
    int candidate = n;

    for (;;)
    {
        if (Isprime(candidate))
            return candidate;
        candidate++;
    }
}
/*
 * Returns 1 when n is a prime number and 0 otherwise.
 * Values below 2 (including zero and negatives) are not prime.
 */
int Isprime(int n)
{
    if (n < 2)
        return 0;

    /* Trial division up to sqrt(n); "i <= n / i" avoids overflow of i * i.
       The original tested n / i == 0, which is never true in this range. */
    for (int i = 2; i <= n / i; i++)
    {
        if (n % i == 0)
            return 0;
    }
    return 1;
}
/*
 * Builds the list of seed URLs by reading one URL per line from readfrom.
 *
 * Line endings are removed, empty lines are ignored, and lines too long for a
 * node's url buffer are discarded instead of being split into fragments.
 * Each URL is pushed at the front, so the list is in reverse file order.
 *
 * Returns the sentinel head node (its own url is unused), or NULL when the
 * head cannot be allocated. A NULL stream yields an empty list.
 */
List initialize_url_list(FILE *readfrom)
{
    List head = (List)malloc(sizeof(struct node));
    char url[SIZE_OF_URL];

    if (head == NULL)
        return NULL;
    head->url[0] = '\0';
    head->next = NULL;
    if (readfrom == NULL)
        return head;

    while (fgets(url, sizeof(url), readfrom))
    {
        size_t len = strlen(url);

        if (len == 0 || url[len - 1] != '\n')
        {
            /* The buffer filled up (or the file ended) before a newline was
               seen. Peek at the next character to tell the cases apart. */
            int c = fgetc(readfrom);

            if (c != EOF && c != '\n')
            {
                /* Over-long line: drop the remainder and skip it. */
                while (c != EOF && c != '\n')
                    c = fgetc(readfrom);
                continue;
            }
        }

        /* fgets keeps the line ending; a URL must not contain it. */
        url[strcspn(url, "\r\n")] = '\0';
        if (url[0] == '\0')
            continue;

        List ptr = (List)malloc(sizeof(struct node));
        if (ptr == NULL)
            break;
        strcpy(ptr->url, url);
        ptr->next = head->next;
        head->next = ptr;
    }
    return head;
}
/*
 * Splits url at its first '/' into a host part and a path part.
 *
 * domain_name receives everything before the first '/'; path receives the
 * rest, including that '/'. When url contains no '/', the whole url is the
 * host and the path is "/". The caller must provide buffers large enough for
 * both parts. The url and both parts are echoed to standard output.
 */
void get_domain_name_and_path_from_url(char *url, char *domain_name, char *path)
{
    const char *slash = strchr(url, '/');
    size_t host_len = (slash != NULL) ? (size_t)(slash - url) : strlen(url);

    puts(url);

    memcpy(domain_name, url, host_len);
    domain_name[host_len] = '\0';

    /* Without a '/' the original scan ran past the end of the string. */
    strcpy(path, (slash != NULL) ? slash : "/");

    puts(domain_name);
    puts(path);
}
/*
 * Returns 1 when url is stored in the hash table and 0 otherwise.
 * Only the bucket selected by hash() is searched; its first node is a
 * sentinel and is skipped.
 */
int isvisited(ahash ahash, char *url)
{
    List entry;

    for (entry = ahash->url[hash(ahash, url)]->next; entry != NULL; entry = entry->next)
    {
        if (strcmp(entry->url, url) == 0)
            return 1;
    }
    return 0;
}
/*
 * Records url in the hash table by pushing a new node directly after the
 * sentinel of its bucket.
 *
 * Nothing is inserted when url does not fit into a node's url buffer or when
 * the node cannot be allocated.
 */
void insert_in_hash(ahash ahash, char *url)
{
    List ptr;
    int value;

    /* Reject instead of truncating: a truncated key could never be found again. */
    if (strlen(url) >= sizeof(ptr->url))
        return;

    ptr = (List)malloc(sizeof(struct node));
    if (ptr == NULL)
        return;

    value = hash(ahash, url);
    strcpy(ptr->url, url);
    ptr->next = ahash->url[value]->next;
    ahash->url[value]->next = ptr;
}
/*
 * Unlinks the node ptr from the list that starts at the sentinel head and
 * frees it.
 *
 * The node is located by identity, not by URL text, so other nodes holding
 * the same URL are left alone. When ptr is not part of the list nothing is
 * changed and nothing is freed.
 */
void delete_in_list(List head, List ptr)
{
    List prev = head;

    if (head == NULL || ptr == NULL)
        return;

    while (prev->next != NULL && prev->next != ptr)
        prev = prev->next;

    if (prev->next == NULL)
        return;

    prev->next = ptr->next;
    free(ptr);
}
/*
 * Computes the bucket index of string: each character is folded in with
 * "shift left by 5, then add", and the result is reduced modulo the table size.
 */
int hash(ahash hash, char *string)
{
    unsigned int hashvalue = 0;
    const char *p;

    for (p = string; *p != '\0'; p++)
        hashvalue = (hashvalue << 5) + *p;

    return hashvalue % hash->tablesize;
}
/*
 * Pushes a copy of string onto the front of the list, directly after the
 * sentinel head.
 *
 * Nothing is inserted when string does not fit into a node's url buffer or
 * when the node cannot be allocated.
 */
void insert_in_list(List head, char *string)
{
    List ptr;

    if (strlen(string) >= sizeof(ptr->url))
        return;

    ptr = (List)malloc(sizeof(struct node));
    if (ptr == NULL)
        return;

    strcpy(ptr->url, string);
    ptr->next = head->next;
    head->next = ptr;
}
/*
 * Rewrites url in place into the canonical "www.host/path" form.
 *
 * url must be a writable buffer of at least max_size bytes.
 *
 *   "www.…"          - already canonical, left unchanged
 *   "/path"          - root_domain_name is placed in front of it
 *   "http://www.…"   - the scheme prefix is removed
 *   "https://www.…"  - the scheme prefix is removed
 *
 * Returns 1 when url is (now) canonical. Returns 0, leaving url unchanged,
 * when it is 11 characters or shorter, does not fit in max_size, has any
 * other form, or when the joined result would not fit in max_size.
 */
int standardization_url(char *url, char *root_domain_name, int max_size)
{
    size_t len;
    size_t skip;

    if (max_size <= 0)
        return 0;

    len = strlen(url);
    if (len <= 11 || len >= (size_t)max_size) /* reject too short or too long */
        return 0;

    /* The dot of "www." is at index 3; the original tested index 4. */
    if (strncmp(url, "www.", 4) == 0)
        return 1;

    if (url[0] == '/')
    {
        size_t root_len = strlen(root_domain_name);

        /* The joined string must still fit, terminator included. */
        if (root_len + len >= (size_t)max_size)
            return 0;
        memmove(url + root_len, url, len + 1);
        memcpy(url, root_domain_name, root_len);
        return 1;
    }

    if (strncmp(url, "http://www.", 11) == 0)
        skip = strlen("http://");
    else if (strncmp(url, "https://www.", 12) == 0)
        skip = strlen("https://");
    else
        return 0;

    /* Shift the rest of the string, terminator included, to the front. */
    memmove(url, url + skip, len - skip + 1);
    return 1;
}
/*
 * Extracts delimited values from text and writes them to file, one per line.
 *
 * A value starts directly after an occurrence of `before` and ends at the
 * first character of `after`. Values of max_size or more characters are
 * skipped. Scanning stops when the text ends before a closing delimiter is
 * found. A status message is printed to standard output for every value.
 */
void collect_information(FILE *file, char *text, char *before, char *after, int max_size)
{
    size_t before_len = strlen(before);
    const char *cursor = text;

    if (max_size <= 0 || before_len == 0 || after[0] == '\0')
        return;

    /* strstr performs a real substring search, so a partial match of `before`
       can no longer cause the following character to be skipped. */
    while ((cursor = strstr(cursor, before)) != NULL)
    {
        const char *start = cursor + before_len;
        const char *end = strchr(start, after[0]);
        size_t len;

        if (end == NULL)
            break;

        len = (size_t)(end - start);
        cursor = end + 1;

        /* Each value is measured on its own, so one over-long value does not
           affect the values that follow it. */
        if (len >= (size_t)max_size)
            continue;

        /* The failure branch used to print the success message. */
        if (fwrite(start, 1, len, file) != len)
            printf("\n\n\n\n\n\n\n写入文件失败\n\n\n\n\n\n\n\n");
        else
            printf("\n\n\n\n\n\n\n写入文件\n\n\n\n\n\n\n\n");
        fputc('\n', file);
    }
}
/*
 * Increments the counter stored for name in table `robot`, or inserts a new
 * row with counter 1 when no row for name exists yet.
 *
 * name is escaped before it is placed in the SQL text. The function returns
 * without changing anything when the lookup query or an allocation fails.
 */
void add_one_in_name(MYSQL *mysql, char *name)
{
    /* Room for the longest fixed SQL text around the name, terminator included. */
    enum { SQL_OVERHEAD = 64 };
    MYSQL_RES *result;
    MYSQL_ROW row;
    size_t name_len;
    char *escaped;
    char *sql;
    int flag = 0;

    if (mysql == NULL || name == NULL)
        return;

    if (mysql_query(mysql, "select * from robot") != 0)
        return;
    result = mysql_store_result(mysql);
    if (result == NULL)
        return;

    /* The first column of each row is compared with name. */
    while ((row = mysql_fetch_row(result)) != NULL)
    {
        if (row[0] != NULL && strcmp(row[0], name) == 0)
        {
            flag = 1;
            break;
        }
    }
    mysql_free_result(result);

    /* mysql_real_escape_string needs up to two output bytes per input byte plus a terminator. */
    name_len = strlen(name);
    escaped = (char *)malloc(name_len * 2 + 1);
    sql = (char *)malloc(name_len * 2 + SQL_OVERHEAD);
    if (escaped != NULL && sql != NULL)
    {
        mysql_real_escape_string(mysql, escaped, name, (unsigned long)name_len);
        if (flag == 1)
            sprintf(sql, "update robot set value=value+1 where name='%s'", escaped);
        else
            sprintf(sql, "insert into robot values('%s',1)", escaped);
        mysql_query(mysql, sql);
    }
    free(escaped);
    free(sql);
}
void viste_one_cycle(FILE *collect_informations, FILE *write_visited_url_to, List head, ahash hash)
{
int port;
char sendbuf[SIZE_OF_SENDBUF], receivebuf[SIZE_OF_RECEIVEBUF] = { 0 }, target_page[SOURCE];
char IP[SIZE_OF_IP];
char domain_name[SIZE_OF_DOMAIN_NAME];
char path[SIZE_OF_PATH];
List ptr = head->next;
while (ptr)
{
while (isvisited(hash, ptr->url))
{
if (!ptr->next)
{
delete_in_list(head, ptr);
ptr = NULL;
break;
}
List next_ptr = ptr->next;
delete_in_list(head, ptr);
ptr = next_ptr;
}
if (!ptr)
break;
get_domain_name_and_path_from_url(ptr->url, domain_name, path);
domain_name_to_IP(domain_name, IP);
port = 80;
SOCKET client = build_connect(IP, port);
sprintf(sendbuf, "GET %s HTTP 1.1\r\nHost: %s\r\nAccept: *\r\nUser-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586\r\nAccept-Language: zh-CN\r\nAccept-Charset: us-ascii\r\n\r\n", path, domain_name);
send(client, sendbuf, strlen(sendbuf) + 1, 0);
if (recv(client, receivebuf, SIZE_OF_RECEIVEBUF, 0)>0)
{
puts(receivebuf);
collect_information(collect_informations, receivebuf, "data-author-name=\"", "\"", SIZE_OF_INFORMATION);
collect_new_url(write_visited_url_to, head, domain_name, receivebuf, "href=\"", "\"", SIZE_OF_URL);
while (recv(client, receivebuf, SIZE_OF_RECEIVEBUF, 0) > 0)
{
puts(receivebuf);
collect_information(collect_informations, receivebuf, "data-author-name=\"", "\"", SIZE_OF_INFORMATION);
collect_new_url(write_visited_url_to, head, domain_name, receivebuf, "href=\"", "\"", SIZE_OF_URL);
{
int port;
char sendbuf[SIZE_OF_SENDBUF], receivebuf[SIZE_OF_RECEIVEBUF] = { 0 }, target_page[SOURCE];
char IP[SIZE_OF_IP];
char domain_name[SIZE_OF_DOMAIN_NAME];
char path[SIZE_OF_PATH];
List ptr = head->next;
while (ptr)
{
while (isvisited(hash, ptr->url))
{
if (!ptr->next)
{
delete_in_list(head, ptr);
ptr = NULL;
break;
}
List next_ptr = ptr->next;
delete_in_list(head, ptr);
ptr = next_ptr;
}
if (!ptr)
break;
get_domain_name_and_path_from_url(ptr->url, domain_name, path);
domain_name_to_IP(domain_name, IP);
port = 80;
SOCKET client = build_connect(IP, port);
sprintf(sendbuf, "GET %s HTTP 1.1\r\nHost: %s\r\nAccept: *\r\nUser-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586\r\nAccept-Language: zh-CN\r\nAccept-Charset: us-ascii\r\n\r\n", path, domain_name);
send(client, sendbuf, strlen(sendbuf) + 1, 0);
if (recv(client, receivebuf, SIZE_OF_RECEIVEBUF, 0)>0)
{
puts(receivebuf);
collect_information(collect_informations, receivebuf, "data-author-name=\"", "\"", SIZE_OF_INFORMATION);
collect_new_url(write_visited_url_to, head, domain_name, receivebuf, "href=\"", "\"", SIZE_OF_URL);
while (recv(client, receivebuf, SIZE_OF_RECEIVEBUF, 0) > 0)
{
puts(receivebuf);
collect_information(collect_informations, receivebuf, "data-author-name=\"", "\"", SIZE_OF_INFORMATION);
collect_new_url(write_visited_url_to, head, domain_name, receivebuf, "href=\"", "\"", SIZE_OF_URL);
}
}
else
{
closesocket(client);
SOCKET client = build_connect(IP, port);
send(client, sendbuf, strlen(sendbuf) + 1, 0);
while (recv(client, receivebuf, SIZE_OF_RECEIVEBUF, 0) > 0)
{
puts(receivebuf);
collect_information(collect_informations, receivebuf, "data-author-name=\"", "\"", SIZE_OF_INFORMATION);
collect_new_url(write_visited_url_to, head, domain_name, receivebuf, "href=\"", "\"", SIZE_OF_URL);
}
}
closesocket(client);
insert_in_hash(hash, ptr->url);
List next_ptr = ptr->next;
delete_in_list(head, ptr);
ptr = next_ptr;
}
}
}
else
{
closesocket(client);
SOCKET client = build_connect(IP, port);
send(client, sendbuf, strlen(sendbuf) + 1, 0);
while (recv(client, receivebuf, SIZE_OF_RECEIVEBUF, 0) > 0)
{
puts(receivebuf);
collect_information(collect_informations, receivebuf, "data-author-name=\"", "\"", SIZE_OF_INFORMATION);
collect_new_url(write_visited_url_to, head, domain_name, receivebuf, "href=\"", "\"", SIZE_OF_URL);
}
}
closesocket(client);
insert_in_hash(hash, ptr->url);
List next_ptr = ptr->next;
delete_in_list(head, ptr);
ptr = next_ptr;
}
}