有大量繁体中文文本,都是从网上摘取的,总大小约6G。需要统计文本中相同字符串重复出现的最大次数(即最大重复度)
有几个关键问题:
1.字符串的粒度如何确定?如何表示字符串的唯一位置索引?
2.字符串过多,如何快速确定是否相同,时间复杂度要尽量低
3.文本过多,无法放入内存怎么办
4.如何统计相同hash值的个数,以及记录它们的索引位置
对于1,每次读取一行,然后以100个字节为一组作为比较的字符串。每行最后一组,以实际读取到的字符串大小为准;将文件号、行号和所在组号作为三元组,唯一标识字符串hash值的位置索引
对于2,必须hash了。但是冲突如何解决?网上有很多字符串hash算法,冲突率比较低。本人用二次hash来尽量避免冲突,就算有个别误差,也可以接受
对于3,由于最终比较的是hash值,将它们再次hash到10个小文件中,这样保证同一个hash值在同一个文件中。然后分别统计每个小文件中最大的重复度,最后取最大值输出即可。
对于4,用STL中的map,将两个hash值作为map的key,而vector<索引>作为map的value值,即map<HashKey_S, vector<PositionOfText_S>, Classcomp_S>,对于key还要定义小于号操作符Classcomp_S
- //map中key大小比较函数对象
- struct Classcomp_S {
- bool operator() (const HashKey_S &lkey, const HashKey_S &rkey) const
- {
- bool flag = false;
- if (lkey.uifirsthash < rkey.uifirsthash)
- flag = true;
- else if(lkey.uifirsthash == rkey.uifirsthash)
- {
- if (lkey.uisecondhash < rkey.uisecondhash)
- flag = true;
- }
- return flag;
- }
- };
写代码中发现,有些字符串重复度很高,它们是一些不可打印的相同字符或者几乎相同的字符。最后用函数countDiffChar统计不同字符个数,将比较小的过滤掉。
代码写得比较乱,"../../../data/filelist.txt"是6G原始文本的文件名列表,"../../../data/hash2line.txt"是提取hash值和位置索引后存储的文件,之后还要将它切分。
代码如下:
main.cpp
- #include <iostream>
- #include "statSameLine.h"
- using namespace std;
- int main()
- {
- MakeResult();
- return 0;
- }
statSameLine.h
#ifndef STATSAMELINE_H
#define STATSAMELINE_H

// Short aliases for the unsigned integer types used by the project.
typedef unsigned char BYTE;
typedef unsigned int DWORD;
typedef unsigned short WORD;

// Pair of hash values computed for one text segment; used as the map key.
//
// No copy constructor is declared on purpose: the compiler-generated one does
// the same member-wise copy and keeps the type trivially copyable, which the
// implementation relies on when it memsets records and moves them through
// raw stream read()/write().
struct HashKey_S
{
    HashKey_S(DWORD uiFirst = 0, DWORD uiSecond = 0)
        : uifirsthash(uiFirst), uisecondhash(uiSecond)
    {
    }

    DWORD uifirsthash;
    DWORD uisecondhash;
};

// Location of one text segment: file number, line number and segment (batch)
// number inside that line. Trivially copyable for the same reason as above.
struct PositionOfText_S
{
    PositionOfText_S(DWORD uiFile = 0, DWORD uiLine = 0, DWORD uiBatchNum = 0)
        : uiFileNum(uiFile), uiLineNum(uiLine), uiBatch(uiBatchNum)
    {
    }

    DWORD uiFileNum;
    DWORD uiLineNum;
    DWORD uiBatch;
};

// On-disk record: the position of a segment followed by its hash key.
// Written and read back as raw bytes.
struct Hash2LineTable_S
{
    DWORD uiFileNum;
    DWORD uiLineNum;
    DWORD uiBatch;
    HashKey_S sthashkey;
};

void TestStl();
void MakeResult();

#endif // STATSAMELINE_H
statSameLine.cpp
- #include "statSameLine.h"
- #include <cstring>
- #include <cassert>
- #include <cstdlib>
- #include <cstdio>
- #include <iostream>
- #include <fstream>
- #include <vector>
- #include <string>
- #include <map>
- #include <algorithm>
- #include <iomanip>
- #include <locale>
- #include <tchar.h>
- #include "hash.h"
- using namespace std;
- //map中key大小比较函数对象
- struct Classcomp_S {
- bool operator() (const HashKey_S &lkey, const HashKey_S &rkey) const
- {
- bool flag = false;
- if (lkey.uifirsthash < rkey.uifirsthash)
- flag = true;
- else if(lkey.uifirsthash == rkey.uifirsthash)
- {
- if (lkey.uisecondhash < rkey.uisecondhash)
- flag = true;
- }
- return flag;
- }
- };
// Strips spaces, tabs and carriage returns from both ends of a string.
//
// Side effect: the trailing blanks are erased from `str` itself; the leading
// blanks are removed only from the returned copy.
std::string trimEnd(std::string &str)
{
    const char *const blanks = " \t\r";

    // When the string holds only blanks find_last_not_of() yields npos, and
    // npos + 1 wraps around to 0, so the whole string is erased.
    const std::string::size_type lastKept = str.find_last_not_of(blanks);
    str.erase(lastKept + 1);

    std::string trimmed(str);
    const std::string::size_type firstKept = trimmed.find_first_not_of(blanks);
    trimmed.erase(0, firstKept);
    return trimmed;
}
// Counts how many distinct byte values occur in `stext`.
// Used to discard segments made of a few repeated, meaningless characters.
//
// Each byte is converted to unsigned char before it is used as a table index:
// where plain char is signed, bytes >= 0x80 (every byte of a multi-byte CJK
// character) would otherwise become negative indices and write outside the
// table.
int countDiffChar(std::string &stext)
{
    bool seen[256] = {false};
    int distinct = 0;
    for (std::string::size_type i = 0; i < stext.size(); ++i)
    {
        const unsigned char byte = static_cast<unsigned char>(stext[i]);
        if (!seen[byte])
        {
            seen[byte] = true;
            ++distinct;
        }
    }
    return distinct;
}
- //存储一行文本,将6G文件的hash值以及每段文本所在位置保存,便于第二次处理
- void storeOneLine(ofstream &ofs, const string &sLine, DWORD uiFileNum, DWORD uiLineNum, int &iFilterNumber)
- {
- const int iLengthOfBatch = 100;
- int iAllBatches, iLineSize;
- DWORD uiHashcode;
- char szBatchLine[iLengthOfBatch + 1];
- Hash2LineTable_S stHash2LineTable;
- iLineSize = sLine.size();
- iAllBatches = iLineSize / iLengthOfBatch + 1;
- int iHashNum = 0;
- for (int i = 0; i < iAllBatches; ++i)
- {
- int iBegin, iEnd, iLength = iLengthOfBatch;
- iBegin = i * iLengthOfBatch;
- iEnd = iBegin + iLengthOfBatch;
- //
- if (iBegin >= iLineSize)
- break;
- if (iEnd > iLineSize)
- iLength = iLineSize - iBegin;
- memset(&stHash2LineTable, 0, sizeof(Hash2LineTable_S));
- memset(szBatchLine, 0, sizeof(szBatchLine));
- //here is tangle, .................
- string sSubLine;
- sSubLine = sLine.substr(iBegin, iLength);
- sSubLine = trimEnd(sSubLine);
- size_t uiLenOfSubstr = sSubLine.size();
- //cout << "uilen is " << uiLenOfSubstr << endl;
- if (uiLenOfSubstr < 50)
- {
- continue;
- }
- if (countDiffChar(sSubLine) < 3)
- {
- ++iFilterNumber;
- continue;
- }
- sprintf(szBatchLine, "%s", sSubLine.c_str());
- uiHashcode = BKDRHash(szBatchLine);
- stHash2LineTable.sthashkey.uifirsthash = uiHashcode;
- uiHashcode = APHash(szBatchLine);
- stHash2LineTable.sthashkey.uisecondhash = uiHashcode;
- stHash2LineTable.uiFileNum = uiFileNum;
- stHash2LineTable.uiLineNum = uiLineNum;
- stHash2LineTable.uiBatch = i;
- //wirte one Hash2LineTable_S to file
- ofs.write((const char*)(&stHash2LineTable), sizeof(Hash2LineTable_S));
- }
- }
- //根据行号以及所在组,输出对应文本
- void outputSpecifyFile(ofstream &ofs, const string &sfilename, DWORD uiLine, DWORD uiBatch)
- {
- const int iLengthOfBatch = 100;
- ifstream ifs;
- ifs.open((sfilename.c_str()), ios::in);
- assert(ifs.is_open());
- string sLine, stHash2LineTable;
- DWORD uiLineNum = 0;
- while (getline(ifs, sLine))
- {
- ++uiLineNum;
- if (uiLineNum == uiLine)
- break;
- sLine.clear();
- }
- ifs.close();
- int iLineSize, iBegin, iEnd, iLength = iLengthOfBatch;
- iLineSize = sLine.size();
- iBegin = uiBatch * iLengthOfBatch;
- iEnd = iBegin + iLengthOfBatch;
- //
- if (iBegin >= iLineSize)
- {
- cout << "error" << endl;
- return;
- }
- if (iEnd > iLineSize)
- iLength = iLineSize - iBegin;
- stHash2LineTable = sLine.substr(iBegin, iLength);
- stHash2LineTable = trimEnd(stHash2LineTable);
- if (stHash2LineTable.size() < 50)
- {
- cout << "stHash2LineTable.size() < 50, error" << endl;
- return;
- }
- /*cout << "the size of substring is " << stHash2LineTable.size() << "-->";
- cout << "***************start\n";*/
- ofs.write((const char*)(stHash2LineTable.c_str()), stHash2LineTable.size());
- ofs << endl;
- }
- void parse_one_file(const char *pconefile, ofstream &ofs,
- vector<DWORD> &vecline, DWORD uiFileNum, int &iFilterNumber)
- {
- cout << "parse_one_file:" << pconefile << endl;
- int imaxbytes_line = 0;
- DWORD uiLineNum = 0;
- ifstream ifs;
- ifs.open((pconefile), ios::in);
- assert(ifs.is_open());
- //store every line of char
- string sLine;
- //ofstream ofsUtf8;
- //ofsUtf8.open("utf8_chinese.txt", ios::binary );
- while (getline(ifs, sLine))
- {
- ++uiLineNum;
- //erase some chars: \r, \t, \s
- sLine = trimEnd(sLine);
- int iLineSize = sLine.size();
- if (iLineSize == 0)
- {
- continue;
- }
- if (iLineSize > imaxbytes_line)
- {
- imaxbytes_line = iLineSize;
- }
- storeOneLine(ofs, sLine, uiFileNum, uiLineNum, iFilterNumber);
- /*ofsUtf8.write((char*)(sLine.c_str()), sLine.size());
- ofsUtf8 << endl;
- if (uiLineNum > 100)
- {
- break;
- }*/
- sLine.clear();
- }
- //ofsUtf8.close();
- ifs.close();
- //uiLineNum from 1 to end
- vecline.push_back(uiLineNum);
- cout << imaxbytes_line << " is max bytes of every line" << endl;
- }
- //解析原始文本,存储每个字符串对应的hash值以及索引位置
- int parse_all_file(const char *pcallfile, const char *pcwritefile)
- {
- vector<DWORD> vecline;
- char cread;
- ifstream ifs;
- ofstream ofs;
- int iFilterNumber = 0;
- ofs.open(pcwritefile, ios::out | ios::binary);
- assert(ofs.is_open());
- ifs.open(pcallfile, ios::in);
- assert(ifs.is_open());
- string sLine;
- DWORD uiFileNum = 0;
- while (ifs.read(&cread, 1))
- {
- if (cread != 0x0a)
- {
- sLine.push_back(cread);
- continue;
- }
- ++uiFileNum;
- sLine = string("../../../data/alldata/") + sLine;
- parse_one_file(sLine.c_str(), ofs, vecline, uiFileNum, iFilterNumber);
- sLine.clear();
- }
- ofs.close();
- ifs.close();
- ofs.open("../../../data/linenum.txt");
- for (size_t i = 0; i < vecline.size(); ++i)
- {
- ofs << vecline[i] << endl;
- cout << vecline[i] << endl;
- }
- ofs.close();
- return iFilterNumber;
- }
- //切分文件,便于直接在内存中建立map
- void splitfile(const char *pcfile)
- {
- ifstream ifs;
- ifs.open(pcfile, ios::in | ios::binary);
- assert(ifs.is_open());
- Hash2LineTable_S stHash2LineTable;
- DWORD uifirsthashcode;
- char szcbuffer[150], swritename[80];
- int inum = 0;
- const int ifilenum = 10;
- ofstream szofs[ifilenum];
- for(int i = 0; i < ifilenum; ++i)
- {
- sprintf (swritename, "../../../data/hash_%d.txt", i);
- szofs[i].open(swritename, ios::out | ios::binary);
- memset(swritename, 0, sizeof(swritename));
- }
- while(ifs.read((char*)(&stHash2LineTable), sizeof(Hash2LineTable_S)))
- {
- uifirsthashcode = stHash2LineTable.sthashkey.uifirsthash;
- DWORD uihashvalue;
- sprintf(szcbuffer, "%d", uifirsthashcode);
- uihashvalue = ELFHash(szcbuffer) % ifilenum;
- if (uihashvalue == 1)
- ++inum;
- szofs[uihashvalue].write((char*)(&stHash2LineTable), sizeof(Hash2LineTable_S));
- memset(&stHash2LineTable, 0, sizeof(Hash2LineTable_S));
- memset(swritename, 0, sizeof(swritename));
- }
- ifs.close();
- cout << "splitfile hash value 1, num is " << inum << endl;
- for (int i = 0; i < ifilenum; ++i)
- {
- szofs[i].close();
- }
- }
- void getVecLine(vector<DWORD> &vecline)
- {
- ifstream ifs;
- ifs.open("../../../data/linenum.txt", ios::out);
- assert(ifs.is_open());
- DWORD uiread;
- string sLine;
- vecline.clear();
- while (getline(ifs, sLine))
- {
- uiread = atoi(sLine.c_str());
- vecline.push_back(uiread);
- };
- ifs.close();
- }
- vector<PositionOfText_S> getSameLineOfCount(const char *pcfile, DWORD &uiDegreeOfDup, vector<PositionOfText_S> &stMaxPos)
- {
- cout << "getSameLineOfCount, file name is " << pcfile << endl;
- vector<PositionOfText_S> *pmaxlineOfPos;
- ifstream ifs;
- ifs.open(pcfile, ios::in | ios::binary);
- assert(ifs.is_open());
- Hash2LineTable_S stHash2LineTable;
- memset(&stHash2LineTable, 0, sizeof(Hash2LineTable_S));
- map<HashKey_S, vector<PositionOfText_S>, Classcomp_S> maphash2postion;
- long lkeynum = 0;
- while (ifs.eof() == false)
- {
- //cout << ifs.tellg() << endl;
- ifs.read((char*)(&stHash2LineTable), sizeof(Hash2LineTable_S));
- ++lkeynum;
- DWORD uiFileNum, uiLine, uiBatch;
- uiFileNum = stHash2LineTable.uiFileNum;
- uiLine = stHash2LineTable.uiLineNum;
- if (uiLine > 498417)
- {
- cout << "uilinenum is large than 498417, its value is " << uiLine << endl;
- break;
- }
- uiBatch = stHash2LineTable.uiBatch;
- maphash2postion[stHash2LineTable.sthashkey].push_back(PositionOfText_S(uiFileNum, uiLine, uiBatch));
- }
- cout << "*************map size************\n";
- // cout << "the num of key value is " << lkeynum << endl;
- cout << "the num of map is " << maphash2postion.size() << endl;
- ifs.close();
- DWORD uimaxcount = 0;
- const HashKey_S *pmaxkey = NULL;
- vector<PositionOfText_S> *pvecposOfText = NULL;
- ofstream ofs;
- ofs.open("../../../data/hash2linetable.txt");
- /*保存每个字符串重复度,hashkey,以及位置(虽然本程序用不到)
- *
- *file format
- *1.uicount, the max count of same line, the size of vector<PositionOfText_S>
- *2.HashKey_S
- *3.many PositionOfText_S, the num is uicount
- */
- for (map<HashKey_S, vector<PositionOfText_S>, Classcomp_S >::iterator it = maphash2postion.begin(); it != maphash2postion.end(); ++it)
- {
- DWORD uicount;
- const HashKey_S *pkey = &(it->first);
- pvecposOfText = &(it->second);
- uicount = pvecposOfText->size();
- if (uicount > uimaxcount)
- {
- uimaxcount = uicount;
- pmaxlineOfPos = pvecposOfText;
- pmaxkey = pkey;
- }
- ofs.write((char*)(&uicount), sizeof(DWORD));
- ofs.write((char*)(pkey), sizeof(HashKey_S));
- for (size_t i = 0; i < uicount; ++i)
- ofs.write((char*)(&(pvecposOfText->at(i))), sizeof(PositionOfText_S));
- }
- ofs.close();
- cout << "maxcount is " << uimaxcount << endl;
- // cout << "maxline is " << uimaxline << endl;
- // cout << "max key, first is " << pmaxkey->uifirsthash << ", second is " << pmaxkey->uisecondhash << endl;
- if (uimaxcount > uiDegreeOfDup)
- {
- uiDegreeOfDup = uimaxcount;
- stMaxPos = *pmaxlineOfPos;
- }
- return *pmaxlineOfPos;
- }
- void print_one_max_line(ofstream &ofs, const char *pcFileList, const PositionOfText_S &stPos, int *pibit)
- {
- DWORD uiLine, uiFileNum;
- uiFileNum = stPos.uiFileNum;
- uiLine = stPos.uiLineNum;
- ifstream ifs;
- ifs.open(pcFileList, ios::in);
- assert(ifs.is_open());
- string sLine;
- int inum = 0;
- while (getline(ifs, sLine))
- {
- ++inum;
- if (inum == uiFileNum)
- {
- ofs << sLine << ":the num of line is " << uiLine << ", batch is " << stPos.uiBatch << endl;
- sLine = string("../../../data/alldata/") + sLine;
- outputSpecifyFile(ofs, sLine, uiLine, stPos.uiBatch);
- break;
- }
- sLine.clear();
- }
- ifs.close();
- }
- void output_result(const char *pcFileList, vector<PositionOfText_S> *pmaxlineOfPos, const char *pcName)
- {
- cout << "output_result to file:" << pcName << endl;
- int isum = 0;
- ofstream ofs;
- ofs.open(pcName, ios::out | ios::binary);
- int szBit[30];
- for (vector<PositionOfText_S>::const_iterator cit = pmaxlineOfPos->begin(); cit != pmaxlineOfPos->end(); ++cit)
- {
- /*if (isum++ == 20)
- {
- break;
- }*/
- print_one_max_line(ofs, pcFileList, *cit, szBit);
- }
- ofs.close();
- }
- void MakeResult()
- {
- const char *pcWriteName = "../../../data/hash2line.txt", *pcFileList = "../../../data/filelist.txt";
- const int ifilenum = 10;
- char szName[80];
- int iFilterNumber = 0;
- //解析所有文件
- //iFilterNumber = parse_all_file(pcFileList, pcWriteName);
- //将大文件划分为10个小文件
- //splitfile(pcWriteName);
- DWORD uiDegreeOfDup = 0;
- vector<PositionOfText_S> stPos, stMaxPos;
- for (int i = 0; i < ifilenum; ++i)
- {
- cout << "***every file, the pos of max line***\n";
- memset(szName, 0, sizeof(szName));
- sprintf(szName, "../../../data/hash_%d.txt", i);
- //获取每个小文件的最大重复子串
- stPos = getSameLineOfCount(szName, uiDegreeOfDup, stMaxPos);
- memset(szName, 0, sizeof(szName));
- sprintf(szName, "../../../data/hash_%d_maxpostext.txt", i);
- output_result(pcFileList, &stPos, szName);
- stPos.clear();
- }
- const char *pcOutFileName = "degree_of_dup_result.txt";
- //output_result(pcFileList, &stMaxPos, pcOutFileName);
- ofstream ofs;
- ofs.open(pcOutFileName, ios::out | ios::app);
- ofs << "**************" << endl;
- ofs << "uiDegreeOfDup is " << uiDegreeOfDup << endl;
- ofs << "filter num is " << iFilterNumber << endl;
- ofs.close();
- }
hash.h
#ifndef HASH_H
#define HASH_H

typedef unsigned char BYTE;
typedef unsigned int DWORD;

// String hash functions. Each takes a NUL-terminated string and returns a
// hash value masked to 31 bits.

// SDBM Hash Function
DWORD SDBMHash(char *pcstr);

// RS Hash Function
DWORD RSHash(char *pcstr);

// JS Hash Function
DWORD JSHash(char *pcstr);

// P. J. Weinberger Hash Function
DWORD PJWHash(char *pcstr);

// ELF Hash Function
DWORD ELFHash(char *pcstr);

// BKDR Hash Function
DWORD BKDRHash(char *pcstr);

// DJB Hash Function
DWORD DJBHash(char *pcstr);

// AP Hash Function
DWORD APHash(char *pcstr);

#endif // HASH_H
hash.cpp
- #include "hash.h"
- DWORD SDBMHash(char *pcstr)
- {
- if (*pcstr == 0)
- return 0;
- DWORD hash = 0;
- while (*pcstr)
- {
- // equivalent to: hash = 65599*hash + (*pcstr++);
- hash = (*pcstr++) + (hash << 6) + (hash << 16) - hash;
- }
- return (hash & 0x7FFFFFFF);
- }
- // RS Hash Function
- DWORD RSHash(char *pcstr)
- {
- if (*pcstr == 0)
- return 0;
- DWORD b = 378551;
- DWORD a = 63689;
- DWORD hash = 0;
- while (*pcstr)
- {
- hash = hash * a + (*pcstr++);
- a *= b;
- }
- return (hash & 0x7FFFFFFF);
- }
- // JS Hash Function
- DWORD JSHash(char *pcstr)
- {
- if (*pcstr == 0)
- return 0;
- DWORD hash = 1315423911;
- while (*pcstr)
- {
- hash ^= ((hash << 5) + (*pcstr++) + (hash >> 2));
- }
- return (hash & 0x7FFFFFFF);
- }
- // P. J. Weinberger Hash Function
- DWORD PJWHash(char *pcstr)
- {
- if (*pcstr == 0)
- return 0;
- DWORD BitsInUnignedInt = (DWORD)(sizeof(DWORD) * 8);
- DWORD ThreeQuarters = (DWORD)((BitsInUnignedInt * 3) / 4);
- DWORD OneEighth = (DWORD)(BitsInUnignedInt / 8);
- DWORD HighBits = (DWORD)(0xFFFFFFFF) << (BitsInUnignedInt - OneEighth);
- DWORD hash = 0;
- DWORD test = 0;
- while (*pcstr)
- {
- hash = (hash << OneEighth) + (*pcstr++);
- if ((test = hash & HighBits) != 0)
- {
- hash = ((hash ^ (test >> ThreeQuarters)) & (~HighBits));
- }
- }
- return (hash & 0x7FFFFFFF);
- }
- // ELF Hash Function
- DWORD ELFHash(char *pcstr)
- {
- if (*pcstr == 0)
- return 0;
- DWORD hash = 0;
- DWORD x = 0;
- while (*pcstr)
- {
- hash = (hash << 4) + (*pcstr++);
- if ((x = hash & 0xF0000000L) != 0)
- {
- hash ^= (x >> 24);
- hash &= ~x;
- }
- }
- return (hash & 0x7FFFFFFF);
- }
- // BKDR Hash Function
- DWORD BKDRHash(char *pcstr)
- {
- if (*pcstr == 0)
- return 0;
- DWORD seed = 131; // 31 131 1313 13131 131313 etc..
- DWORD hash = 0;
- while (*pcstr)
- {
- hash = hash * seed + (*pcstr++);
- }
- return (hash & 0x7FFFFFFF);
- }
- // DJB Hash Function
- DWORD DJBHash(char *pcstr)
- {
- if (*pcstr == 0)
- return 0;
- DWORD hash = 5381;
- while (*pcstr)
- {
- hash += (hash << 5) + (*pcstr++);
- }
- return (hash & 0x7FFFFFFF);
- }
- // AP Hash Function
- DWORD APHash(char *pcstr)
- {
- if (*pcstr == 0)
- return 0;
- DWORD hash = 0;
- int i;
- for (i=0; *pcstr; i++)
- {
- if ((i & 1) == 0)
- {
- hash ^= ((hash << 7) ^ (*pcstr++) ^ (hash >> 3));
- }
- else
- {
- hash ^= (~((hash << 11) ^ (*pcstr++) ^ (hash >> 5)));
- }
- }
- return (hash & 0x7FFFFFFF);
- }