// Larbin // Sebastien Ailleret // 03-02-00 -> 23-11-01 #include <unistd.h> #include <iostream> using namespace std; #include <string.h> #include <sys/types.h> #include <sys/socket.h> #include "options.h" #include "types.h" #include "global.h" #include "utils/text.h" #include "utils/debug.h" #include "interf/input.h" #define INIT -1 #define END -2 /* input connexion */ struct Input { int fds;/* 代表对应的SOCKET*/ uint pos;/* 开始的位置*/ uint end_pos;/* 终止位置*/ uint end_posp; int priority;/* 优先级*/ uint depth; uint test; char buffer[BUF_SIZE];/* 存放内容*/ }; /** socket used for input */ static int inputFds; /** number of opened input connections */ static int nbInput; /** array for the opened input connections */ static Input *inputConns[maxInput]; // declaration of forward functions static bool readMore (Input *in); static char *readline (Input *in); int input () { if (nbInput < 0) { // Input is disabled return -1; } int n = -1; if (nbInput < maxInput-1 && global::ansPoll[inputFds]) { // test if there is a new connection /* 检验下是否有新的连接等待中,有则接受*/ struct sockaddr_in addr; int fdc; socklen_t len = sizeof(addr); fdc = accept(inputFds, (struct sockaddr *) &addr, &len);//接收网络中的连接请求 if (fdc != -1) { global::verifMax(fdc); fcntl(fdc, F_SETFL, O_NONBLOCK);/* 把新的连接加入*/ inputConns[nbInput]->fds = fdc; inputConns[nbInput]->pos = 0; inputConns[nbInput]->end_pos = 0; inputConns[nbInput]->end_posp = 0; inputConns[nbInput]->priority = INIT; nbInput++; #ifdef URL_TAGS ecrire(fdc, "Welcome to larbin input system !/nYour first line should look like /"priority:0 depth:5 test:1/"/nThe following should contain one id and one url separed by one space per line/n/"137 http://pauillac.inria.fr/~ailleret/prog/larbin//" for instance/n/n"); #else ecrire(fdc, "Welcome to larbin input system !/nYour first line should look like /"priority:0 depth:5 test:1/"/nThe following should contain one url per line (http://pauillac.inria.fr/~ailleret/prog/larbin/ for instance)/n/n"); #endif // URL_TAGS } } if (nbInput < maxInput-1) { n = inputFds; global::setPoll(inputFds, POLLIN); } // read open sockets int i=0; while (i<nbInput) {/* 对所有的SOCKETS处理*/ Input *in = inputConns[i]; if (global::ansPoll[in->fds] && readMore(in)) { char *line = readline(in);/* 从网络中读取一行数据 */ while (line != NULL) { if (in->priority == INIT) {/* 如果是初始化的,那么他就是由参数的信息过来的*/ // first line if (sscanf(line, "priority:%d depth:%u test:%u",//提取优先级,挖掘深度等信息。 &in->priority, &in->depth, &in->test) == 3) { line = readline(in); } else { ecrire(in->fds, "Incorrect input/n"); line = NULL; in->priority = END; }//end if ( sscanf() ) } else {/* 不是初始的*/ // this is an url url *u = new url(line, in->depth);/* 新建一个连接*/ if (u->isValid()) {//如果合法 if (in->test) { //如果test==1,则需要通过hash表检测,是否已经爬取过该url if (global::seen->testSet(u)) {/* 如果没有,放到hashUrls中去*/ hashUrls(); // stats 统计信息,hashUrls++ if (in->priority) {//如果prrority==1 即优先级为1 ,优先级比较高 global::URLsPriority->put(u);/* 如果有优先的那么放入优先队列中,否则放磁盘*/ } else {//低优先级 global::URLsDisk->put(u); }//end if(priority) } else { delete u; }//end if (global::seen->testSet(u)) } else {//test!=1,不做检测的情况 hashUrls(); //统计信息,hashUrls++ global::seen->set(u);//将当前的url信息,写入hash表中,以避免以后重复爬 if (in->priority) {//根据优先级,将url添加到不同的队列中去。同上 global::URLsPriority->put(u); } else { global::URLsDisk->put(u); }//end if (priority) }//end if (in->test) } else { delete u; }//end if (u->isValid()) line = readline(in); }//end if (in->priority == INIT) }//end while (line != NULL) }//end if (global::ansPoll[in->fds] && readMore(in)) if (in->priority == END) { // forget this connection ecrire(in->fds, "Bye bye.../n"); close(in->fds);//关闭套接字 nbInput--; Input *tmp = inputConns[i];/* 把最后的connection 放到被删除的 位置上*/ inputConns[i] = inputConns[nbInput]; inputConns[nbInput] = tmp; } else { // go to next connection/* 进行下一个连接*/ if (in->fds > n) n = in->fds; global::setPoll(in->fds, POLLIN); i++; }//end if (in->priority == END) }//end while (i<nbInput) return n; } static bool readMore (Input *in) { assert (in->end_posp == in->end_pos); if (in->end_posp - in->pos > maxUrlSize+100) { // error -> stop connection ecrire(in->fds, "Url submitted too long/n"); in->priority = END; return false; } if (2 * in->pos > BUF_SIZE) { in->end_pos -= in->pos; in->end_posp = in->end_pos; memmove(in->buffer, in->buffer+in->pos, in->end_pos); in->pos = 0; } int nb = read(in->fds, in->buffer+in->end_pos, BUF_SIZE-in->end_pos); if (nb == -1 && errno == EAGAIN) { return false; } else if (nb <= 0) { in->priority = END; return false; } else { in->end_pos += nb; return true; } } /* no allocation */ static char *readline (Input *in) { while (in->end_posp < in->end_pos && in->buffer[in->end_posp] != '/n') { in->end_posp++; } if (in->end_posp == in->end_pos) { return NULL; } else { if (in->buffer[in->end_posp-1] == '/r') { in->buffer[in->end_posp-1] = 0; } else { in->buffer[in->end_posp] = 0; } char *res = in->buffer+in->pos; in->pos = ++in->end_posp; return res; } } /** init everything */ void initInput () { if (global::inputPort != 0) { int allowReuse = 1; struct sockaddr_in addr; memset ((void *) &addr, 0, sizeof(addr));//置零操作 /*创建套接字建立网络连接 */ addr.sin_addr.s_addr = INADDR_ANY; addr.sin_family = AF_INET; addr.sin_port = htons(global::inputPort);//向Larbin添加url等输入信息的telnet接口 if ((inputFds = socket(AF_INET, SOCK_STREAM, 0)) == -1 || setsockopt(inputFds, SOL_SOCKET, SO_REUSEADDR, (char*)&allowReuse, sizeof(allowReuse)) || bind(inputFds, (struct sockaddr *) &addr, sizeof(addr)) != 0//绑定 || listen(inputFds, 4) != 0)/*监听*/ { cerr << "unable to get input socket (port " << global::inputPort << ") : " << strerror(errno) << "/n"; exit(1); } fcntl(inputFds, F_SETFL, O_NONBLOCK);//改变文件状态为非阻塞 for (int i=0; i<maxInput; i++) { inputConns[i] = new Input; } nbInput = 0; } else { nbInput = -1; } }