The DocIndex program covered earlier takes a Tianwang.raw.***** file as input and produces three files: Doc.idx, Url.idx, and DocId2Url.idx. Here we analyze the DocSegment program.
DocSegment takes three files as input, Tianwang.raw.*****, Doc.idx, and Url.idx.sort_uniq, and outputs a single file, Tianwang.raw.***.seg, containing the documents after word segmentation.
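Judging from the sscanf calls in the two loading loops below, each line of Url.idx.sort_uniq is assumed to hold an MD5 checksum followed by a document id, and each line of Doc.idx a document id, the document's byte offset within the raw file, its length, and its checksum. For example (the values here are made up purely for illustration):

Url.idx.sort_uniq:  000bfdfd8b2dedd926b58ba00d40986b 1234
Doc.idx:            1234 10485760 4096 000bfdfd8b2dedd926b58ba00d40986b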
- int main(int argc, char* argv[])
- {
- string strLine, strFileName=argv[1];
- CUrl iUrl;
- vector<CUrl> vecCUrl;
- CDocument iDocument;
- vector<CDocument> vecCDocument;
- unsigned int docId = 0;
- //ifstream ifs("Tianwang.raw.2559638448");
- ifstream ifs(strFileName.c_str()); //DocSegment Tianwang.raw.****
- if (!ifs)
- {
- cerr << "Cannot open tianwang.img.info for input/n";
- return -1;
- }
- ifstream ifsUrl("Url.idx.sort_uniq"); // the URL dictionary, sorted and deduplicated
- if (!ifsUrl)
- {
- cerr << "Cannot open Url.idx.sort_uniq for input/n";
- return -1;
- }
- ifstream ifsDoc("Doc.idx"); // the document offset index
- if (!ifsDoc)
- {
- cerr << "Cannot open Doc.idx for input/n";
- return -1;
- }
- while (getline(ifsUrl,strLine)) // walk the URL dictionary and load it into a vector in memory
- {
- char chksum[33];
- int docid;
- memset(chksum, 0, 33);
- sscanf( strLine.c_str(), "%s%d", chksum, &docid );
- iUrl.m_sChecksum = chksum;
- iUrl.m_nDocId = docid;
- vecCUrl.push_back(iUrl);
- }
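- // (note: vecCUrl is filled here but is not referenced again in this excerpt)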
- while (getline(ifsDoc,strLine)) // walk the document index and load it into a vector in memory
- {
- int docid,pos,length;
- char chksum[33];
- memset(chksum, 0, 33);
- sscanf( strLine.c_str(), "%d%d%d%s", &docid, &pos, &length,chksum );
- iDocument.m_nDocId = docid;
- iDocument.m_nPos = pos;
- iDocument.m_nLength = length;
- iDocument.m_sChecksum = chksum;
- vecCDocument.push_back(iDocument);
- }
- strFileName += ".seg";
- ofstream fout(strFileName.c_str(), ios::in|ios::out|ios::trunc|ios::binary); // output file for the segmented documents
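- // For each docId: locate the document's bytes via Doc.idx, skip the Tianwang
- // record header and the HTTP header (each ends at a blank line), strip the
- // HTML tags, and run the word segmenter over what remains.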
- for ( docId=0; docId<MAX_DOC_ID; docId++ )
- {
- // find document according to docId
- int length = vecCDocument[docId+1].m_nPos - vecCDocument[docId].m_nPos -1;
- char *pContent = new char[length+1];
- memset(pContent, 0, length+1);
- ifs.seekg(vecCDocument[docId].m_nPos);
- ifs.read(pContent, length);
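- // pContent now holds the whole raw record for this docId: the Tianwang
- // record header, the HTTP response header, and the page body.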
- char *s;
- s = pContent;
- // skip the Tianwang record header (ends at the first blank line)
- int bytesRead = 0,newlines = 0;
- while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1)
- {
- if (*s == '\n')
- newlines++;
- else
- newlines = 0;
- s++;
- bytesRead++;
- }
- if (bytesRead == HEADER_BUF_SIZE-1) { delete[] pContent; continue; } // free the buffer before skipping this record
- // skip the HTTP response header (ends at the second blank line)
- bytesRead = 0,newlines = 0;
- while (newlines != 2 && bytesRead != HEADER_BUF_SIZE-1)
- {
- if (*s == '\n')
- newlines++;
- else
- newlines = 0;
- s++;
- bytesRead++;
- }
- if (bytesRead == HEADER_BUF_SIZE-1) { delete[] pContent; continue; }
- //iDocument.m_sBody = s;
- iDocument.RemoveTags(s); // strip the HTML tags (everything between < and >)
- iDocument.m_sBodyNoTags = s;
- delete[] pContent;
- string strLine = iDocument.m_sBodyNoTags;
- CStrFun::ReplaceStr(strLine, "&nbsp;", " "); // replace HTML non-breaking spaces with plain spaces
- CStrFun::EmptyStr(strLine); // map "\t", "\r", "\n" to " "
- // segment the document: the actual word-segmentation step
- CHzSeg iHzSeg;
- strLine = iHzSeg.SegmentSentenceMM(iDict, strLine); // iDict is the CDict word dictionary, loaded elsewhere (omitted from this excerpt)
- fout << docId << endl << strLine;
- fout << endl;
- }
- return(0);
- }
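Each record written to the .seg file is therefore two lines: the document id, then the segmented text. As a minimal sketch of how a downstream tool might read Tianwang.raw.***.seg back, assuming that two-line record layout (this ReadSeg helper is hypothetical and not part of TSE):

- #include <cstdlib>
- #include <fstream>
- #include <iostream>
- #include <string>
- using namespace std;
- int main(int argc, char* argv[])
- {
- if (argc < 2) { cerr << "usage: ReadSeg Tianwang.raw.***.seg\n"; return -1; }
- ifstream ifs(argv[1]);
- if (!ifs) { cerr << "Cannot open " << argv[1] << " for input\n"; return -1; }
- string idLine, segText;
- while (getline(ifs, idLine) && getline(ifs, segText))
- {
- unsigned int docId = strtoul(idLine.c_str(), NULL, 10); // first line of a record: the document id
- // second line: the space-separated terms produced by SegmentSentenceMM
- cout << "doc " << docId << ": " << segText.size() << " bytes of segmented text" << endl;
- }
- return 0;
- }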
This has only been a quick, bird's-eye pass over the code; later I will devote dedicated articles to techniques such as HTML parsing and document segmentation.
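As a practical note, DocSegment is invoked with the raw collection file as its only command-line argument, for example DocSegment Tianwang.raw.2559638448 (the sample file name from the commented-out line above), run in a directory that also contains Doc.idx and Url.idx.sort_uniq; the result is written next to the input as Tianwang.raw.2559638448.seg.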