对从网络上抓取到的网页进行处理:建立网页库,分词,去重,用TF-IDF计算权重,归一化,然后根据查询词按文本相似度从高到低依次返回给客户
第一阶段:python网络爬虫抓取网页,并存盘
第二阶段:对磁盘上的网页文件建立网页库,将全部网页写入网页库,并建立相应网页的偏移量索引文件(1 23 100)-->(docid, offset, size),以便读取网页内容
注意的地方:1.写成格式:<doc>
<docid>1</docid> //对网页进行编号
<url>http://....</url>
<title>...</title>
<content>....
</content>
</doc>
2.处理每行的'\r\n';调用tellp()(也可以用FILE*的ftell()函数)得到并打印当前文件偏移量。
此部分代码:
pagelib.h
#ifndef PAGELIB_H
#define PAGELIB_H
// Note: the guard deliberately avoids a leading underscore followed by an
// uppercase letter; such identifiers are reserved for the implementation.
#include <iostream>
#include <string>
#include <vector>
#include <fstream>
#include <unistd.h>
#include <dirent.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <pwd.h>
#include <grp.h>

/**
 * Collects the files found under a directory into "<doc>" records held in
 * memory and writes them out as a single page-library file together with an
 * index file of "id offset size" lines.
 *
 * Typical use: construct, call readdirname() to load the pages, then call
 * store_to_lib() to write the library and its index.
 */
class pagelib
{
public:
    /**
     * @param dir      root directory whose files are read by readdirname()
     * @param libname  path of the page-library file written by store_to_lib()
     */
    pagelib(const std::string &dir,
            const std::string &libname)
        : dir_(dir),
          libname_(libname)
    {}

    /** Reads the files under the directory given to the constructor. */
    void readdirname()
    {
        readfile(dir_);
    }

    /**
     * Writes all collected records to the library file and one index line
     * per record to the file named by the argument.
     */
    void store_to_lib(const std::string&);

private:
    /** Walks the given directory and stores every non-directory entry. */
    void readfile(const std::string &);

    /** Reads one file and appends its "<doc>" record to vec. */
    void store_to_vector(const std::string&);

    std::string dir_;              // root directory to scan
    std::string libname_;          // output page-library path
    static int i;                  // next document id; static, so shared by all instances
    std::vector<std::string> vec;  // formatted "<doc>" records, in read order
};

#endif /* PAGELIB_H */
pagelib.cpp
1 #include "pagelib.h" 2 using namespace std; 3 int pagelib::i = 1; 4 5 void pagelib::readfile(const string &s) 6 { 7 DIR *dir; 8 struct dirent * mydir; 9 struct stat mystat; 10 char str[256]; 11 dir=opendir(s.c_str()); 12 if(!dir) 13 { 14 cout << s << endl; 15 exit(-1); 16 } 17 18 while((mydir=readdir(dir))!=NULL) 19 { 20 sprintf(str, "%s/%s", s.c_str(), mydir->d_name); 21 stat(str, &mystat); 22 if(!strcmp(mydir->d_name, ".") || !strcmp(mydir->d_name, "..")) 23 continue; 24 if( mystat.st_mode & 0040000 ) 25 { 26 string s1 = str; 27 readfile(s1); 28 } 29 else 30 { 31 cout << str << endl; 32 store_to_vector(str); 33 } 34 } 35 } 36 37 void pagelib::store_to_vector(const string &str) 38 { 39 string content, line; 40 ifstream is(str.c_str()); 41 42 string title; 43 getline(is, title); 44 { 45 for (size_t i = 0; i < title.size(); i++) { 46 if(title[i] == '\r') 47 title[i] = ' '; 48 } 49 } 50 51 while(getline(is, line) > 0) 52 { 53 for (size_t i = 0; i < line.size(); i++) { 54 if(line[i] == '\r') 55 line[i] = '\n'; 56 } 57 content += line; 58 } 59 char s1[5]; 60 sprintf(s1, "%d", i++); 61 string doc = string("<doc>\n <docid>") + s1 + "</docid>\n <url>" + string(str) 62 + "</url>\n <title>" + title + "</title>\n <content>\n" 63 + content + " </content>\n</doc>\n\n"; 64 vec.push_back(doc); 65 is.close(); 66 } 67 68 void pagelib::store_to_lib(const string& index) 69 { 70 ofstream os_lib(libname_.c_str()); 71 ofstream os_index(index.c_str()); 72 vector<string>::iterator it = vec.begin(); 73 int j = 1; 74 os_lib << "<pagelib>\n" << endl; 75 for(; it != vec.end(); it++) 76 { 77 os_index << j++ << " " << os_lib.tellp() ; 78 os_lib << *it; 79 os_index << " " << (*it).size() << endl; 80 } 81 os_lib << "</pagelib>\n" << endl; 82 os_index.close(); 83 os_lib.close(); 84 }