统计大量文本中重复字符串的最大个数

最新推荐文章于 2024-05-14 16:56:48 发布

空穴来风

最新推荐文章于 2024-05-14 16:56:48 发布

阅读量7.1k

点赞数 1

分类专栏：算法　文章标签：hash、重复度、内存、时间复杂度

本文链接：https://blog.csdn.net/qiaofangjie/article/details/16945873

版权

算法专栏收录该内容

2 篇文章 0 订阅

订阅专栏

有大量中文繁体的文本，都是网上摘取的，大小有6G。需要统计文本中相同字符串的最大个数，即重复次数最多的字符串重复了多少次。

有几个关键问题：

1.字符串的粒度如何确定？如何表示字符串的唯一位置索引？

2.字符串过多，如何快速确定是否相同，时间复杂度要尽量低

3.文本过多，无法放入内存怎么办

4.如何统计相同hash值的个数，以及记录它们的索引位置

对于1，每次读取一行，然后以100个字节为一组作为比较的字符串。每行最后一组，以实际读取到的字符串大小为准；由于原始文本分散在多个文件中，将文件号、行号和所在组号作为三元组，唯一标识字符串hash值的位置索引。

对于2，必须hash了。但是冲突如何解决？网上有很多字符串hash算法，冲突率比较低。本人对同一字符串用两个不同的hash函数（代码中是BKDRHash和APHash）各计算一次，只有两个hash值都相等才认为字符串相同，以此尽量避免冲突；就算有个别误差，也可以接受。

对于3，由于最终比较的是hash值，将它们再次hash到10个小文件中，这样保证同一个hash值在同一个文件中。然后分别统计每个小文件中最大的重复度，最后取最大值输出即可。

对于4，用STL中的map，将两个hash值组成的HashKey_S作为map的key，而vector<索引>作为map的value值，即map<HashKey_S, vector<PositionOfText_S>, Classcomp_S>。由于HashKey_S是自定义结构体，还要为key提供“小于”比较的函数对象Classcomp_S。

// Comparison functor used as the ordering predicate of the hash-key map.
// Keys are ordered lexicographically: by uifirsthash first and, when those are
// equal, by uisecondhash.
struct Classcomp_S {
    bool operator() (const HashKey_S &lkey, const HashKey_S &rkey) const
    {
        // Different first hashes decide the order on their own.
        if (lkey.uifirsthash != rkey.uifirsthash)
            return lkey.uifirsthash < rkey.uifirsthash;

        // Tie on the first hash: fall back to the second one.
        return lkey.uisecondhash < rkey.uisecondhash;
    }
};

写代码中发现，有些字符串重复度很高，但它们只是一些不可打印的相同字符或者几乎相同的字符，没有统计意义。因此用函数countDiffChar统计字符串中不同字节的个数，将不同字节数过少（代码中为小于3）的字符串过滤掉。

代码写的比较乱。代码中的"../../../data/filelist.txt"是6G原始文本的文件名列表，"../../../data/hash2line.txt"是提取hash值和位置索引后存储的文件，之后还要将它切分成10个小文件。

代码如下：

main.cpp

#include <iostream>
#include "statSameLine.h"
using namespace std;

// Program entry point: delegates all work to MakeResult() (declared in
// statSameLine.h) and then returns 0.
int main()
{
    MakeResult();
    return 0;
}

statSameLine.h

#ifndef STATSAMELINE_H
#define STATSAMELINE_H

typedef unsigned char BYTE;
typedef unsigned int DWORD;
typedef unsigned short WORD;


// Map key made of two independent hashes of the same text batch. Two batches
// are treated as equal only when both hashes match.
//
// The struct is stored inside Hash2LineTable_S, which is written to and read
// from disk as raw bytes, so it must stay trivially copyable. For that reason
// no copy constructor is declared here: the compiler-generated one performs the
// same member-wise copy as the former hand-written one, but keeps the type
// trivially copyable.
struct HashKey_S
{
    // Both hashes default to 0.
    HashKey_S(DWORD uiFirst = 0, DWORD uiSecond = 0):uifirsthash(uiFirst), uisecondhash(uiSecond){
    }

    DWORD uifirsthash;   // first hash of the batch
    DWORD uisecondhash;  // second hash of the batch
};

// Location of one text batch: which file, which line of that file, and which
// batch within the line.
//
// Instances are dumped to disk as raw bytes, so the type must stay trivially
// copyable. No copy constructor is declared: the compiler-generated one does
// the same member-wise copy the former hand-written one did.
struct PositionOfText_S
{
    // All three components default to 0.
    PositionOfText_S(DWORD uiFile = 0, DWORD uiLine = 0, DWORD uiBatch = 0):uiFileNum(uiFile), uiLineNum(uiLine), uiBatch(uiBatch){
    }

    DWORD uiFileNum;  // file number
    DWORD uiLineNum;  // line number inside that file
    DWORD uiBatch;    // batch index inside that line
};

// One fixed-size record of the intermediate hash files. storeOneLine() writes
// these records with ofstream::write, and splitfile() / getSameLineOfCount()
// read them back with ifstream::read, so the member order below is the on-disk
// layout and must not be changed.
struct Hash2LineTable_S
{
	DWORD uiFileNum;      // file number; parse_all_file() counts files starting at 1
    DWORD uiLineNum;      // line number; parse_one_file() counts lines starting at 1
    DWORD uiBatch;        // index of the 100-byte batch inside the line, starting at 0
    HashKey_S sthashkey;  // the two hashes computed for the batch text
};


// NOTE(review): no definition of TestStl() is visible in the accompanying
// sources; calling it would presumably fail at link time - confirm or remove.
void TestStl();
// Runs the analysis: for each of the 10 hash_<i>.txt bucket files it finds the
// most frequent hash key, writes the matching text batches to
// hash_<i>_maxpostext.txt, and appends the overall maximum duplication degree
// to degree_of_dup_result.txt.
void MakeResult();



#endif // STATSAMELINE_H

statSameLine.cpp

#include "statSameLine.h"

#include <cstring>
#include <cassert>
#include <cstdlib>
#include <cstdio>

#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <map>
#include <algorithm>
#include <iomanip>
#include <locale>
#include <tchar.h>

#include "hash.h"

using namespace std;
// Comparison functor used as the ordering predicate of the hash-key map.
// Keys are ordered lexicographically: by uifirsthash first and, when those are
// equal, by uisecondhash.
struct Classcomp_S {
    bool operator() (const HashKey_S &lkey, const HashKey_S &rkey) const
    {
        // Different first hashes decide the order on their own.
        if (lkey.uifirsthash != rkey.uifirsthash)
            return lkey.uifirsthash < rkey.uifirsthash;

        // Tie on the first hash: fall back to the second one.
        return lkey.uisecondhash < rkey.uisecondhash;
    }
};
// Returns a copy of str without leading and trailing spaces, tabs and carriage
// returns.
// Side effect: the trailing blanks are also erased from str itself; the leading
// blanks are removed only from the returned copy.
std::string trimEnd(std::string &str)
{
    const char kBlanks[] = " \t\r";

    // For an all-blank (or empty) string find_last_not_of() yields npos, and
    // npos + 1 wraps around to 0, so the erase below empties the string.
    const std::string::size_type lastKept = str.find_last_not_of(kBlanks);
    str.erase(lastKept + 1);

    std::string trimmed(str);
    const std::string::size_type firstKept = trimmed.find_first_not_of(kBlanks);
    // A count of npos simply means "erase to the end", which is harmless on "".
    trimmed.erase(0, firstKept);
    return trimmed;
}
// Counts how many distinct byte values occur in stext. Callers use the result
// to discard batches that consist of (almost) a single repeated byte.
//
// Each char is converted to unsigned char before it is used as an array index.
// Where plain char is signed, bytes >= 0x80 (every byte of multi-byte CJK text)
// are negative, and indexing with them directly would write outside the table.
//
// stext is only read; the non-const reference is kept for interface
// compatibility.
// Returns a value in the range [0, 256].
int countDiffChar(std::string &stext)
{
    const int isize = 256;
    bool seen[isize] = {false};

    int icount = 0;
    for (std::string::size_type i = 0; i < stext.size(); ++i)
    {
        const unsigned char ucByte = static_cast<unsigned char>(stext[i]);
        if (!seen[ucByte])
        {
            seen[ucByte] = true;
            ++icount;
        }
    }
    return icount;
}
//存储一行文本，将6G文件的hash值以及每段文本所在位置保存，便于第二次处理
void storeOneLine(ofstream &ofs, const string &sLine, DWORD uiFileNum, DWORD uiLineNum, int &iFilterNumber)
{
	
    const int iLengthOfBatch = 100;

    int iAllBatches, iLineSize;


    DWORD uiHashcode;

    char szBatchLine[iLengthOfBatch + 1];

    Hash2LineTable_S stHash2LineTable;

    iLineSize = sLine.size();
    iAllBatches = iLineSize / iLengthOfBatch + 1;

	int iHashNum = 0;
	

    for (int i = 0; i < iAllBatches; ++i)
    {
        int iBegin, iEnd, iLength = iLengthOfBatch;

        iBegin = i * iLengthOfBatch;
        iEnd = iBegin + iLengthOfBatch;
        //
        if (iBegin >= iLineSize)
            break;
        if (iEnd > iLineSize)
            iLength = iLineSize - iBegin;

        memset(&stHash2LineTable, 0, sizeof(Hash2LineTable_S));
        memset(szBatchLine, 0, sizeof(szBatchLine));

        //here is tangle, .................
        string sSubLine;

        sSubLine = sLine.substr(iBegin, iLength);
        sSubLine = trimEnd(sSubLine);
		size_t uiLenOfSubstr = sSubLine.size();
		//cout << "uilen is " << uiLenOfSubstr << endl;
        if (uiLenOfSubstr < 50)
        {
            continue;
        }
        if (countDiffChar(sSubLine) < 3)
        {
            ++iFilterNumber;
            continue;
        }

		sprintf(szBatchLine, "%s", sSubLine.c_str());

        uiHashcode = BKDRHash(szBatchLine);
        stHash2LineTable.sthashkey.uifirsthash = uiHashcode;
        uiHashcode = APHash(szBatchLine);
        stHash2LineTable.sthashkey.uisecondhash = uiHashcode;
		stHash2LineTable.uiFileNum = uiFileNum;
        stHash2LineTable.uiLineNum = uiLineNum;
		
        stHash2LineTable.uiBatch = i;
        //wirte one Hash2LineTable_S to file
        ofs.write((const char*)(&stHash2LineTable), sizeof(Hash2LineTable_S));
    }
}
//根据行号以及所在组，输出对应文本
void outputSpecifyFile(ofstream &ofs, const string &sfilename, DWORD uiLine, DWORD uiBatch)
{

    const int iLengthOfBatch = 100;

    ifstream ifs;
    ifs.open((sfilename.c_str()), ios::in);
    assert(ifs.is_open());
    string sLine, stHash2LineTable;

    DWORD uiLineNum = 0;
    while (getline(ifs, sLine))
    {
        ++uiLineNum;
        if (uiLineNum == uiLine)
            break;
        sLine.clear();
    }
    ifs.close();

    int iLineSize, iBegin, iEnd, iLength = iLengthOfBatch;
    iLineSize = sLine.size();

    iBegin = uiBatch * iLengthOfBatch;
    iEnd = iBegin + iLengthOfBatch;
    //
    if (iBegin >= iLineSize)
    {
        cout << "error" << endl;
        return;
    }

    if (iEnd > iLineSize)
        iLength = iLineSize - iBegin;


    stHash2LineTable = sLine.substr(iBegin, iLength);
    stHash2LineTable = trimEnd(stHash2LineTable);
    if (stHash2LineTable.size() < 50)
    {
        cout << "stHash2LineTable.size() < 50, error" << endl;
        return;
    }
    /*cout << "the size of substring is " << stHash2LineTable.size() << "-->";

    cout << "***************start\n";*/
	ofs.write((const char*)(stHash2LineTable.c_str()), stHash2LineTable.size());
	ofs << endl;
	
}

void parse_one_file(const char *pconefile, ofstream &ofs,
                   vector<DWORD> &vecline, DWORD uiFileNum, int &iFilterNumber)
{
    cout << "parse_one_file:" << pconefile << endl;
    int imaxbytes_line = 0;

    DWORD uiLineNum = 0;

    ifstream ifs;
    ifs.open((pconefile), ios::in);
    assert(ifs.is_open());
    //store every line of char
    string sLine;

	//ofstream ofsUtf8;
	//ofsUtf8.open("utf8_chinese.txt", ios::binary );

    while (getline(ifs, sLine))
    {
        ++uiLineNum;
        //erase some chars: \r, \t, \s
        sLine = trimEnd(sLine);
        int iLineSize = sLine.size();
        if (iLineSize == 0)
        {
            continue;
        }
        if (iLineSize > imaxbytes_line)
        {
            imaxbytes_line = iLineSize;
        }
        storeOneLine(ofs, sLine, uiFileNum, uiLineNum, iFilterNumber);
		/*ofsUtf8.write((char*)(sLine.c_str()), sLine.size());
		ofsUtf8 << endl;
		if (uiLineNum > 100)
		{
			
			break;
		}*/
        sLine.clear();
		

    }
	//ofsUtf8.close();
    ifs.close();

	//uiLineNum from 1 to end
    vecline.push_back(uiLineNum);
    cout << imaxbytes_line << " is max bytes of every line" << endl;
}
// Parses every text file named in the list file and stores, for each accepted
// batch, its hashes and position in pcwritefile.
//
//   pcallfile   - list file with one file name per line; names are resolved
//                 relative to "../../../data/alldata/"
//   pcwritefile - binary output file receiving the Hash2LineTable_S records
//
// Afterwards the line count of every parsed file is written to
// "../../../data/linenum.txt" and echoed to stdout.
// Returns the number of batches rejected for having too few distinct bytes.
//
// The list is read with std::getline, so the last entry is processed even when
// the list file does not end with a newline. File numbers match the line
// numbering print_one_max_line() uses on the same list.
int parse_all_file(const char *pcallfile, const char *pcwritefile)
{
    std::vector<DWORD> vecline;
    int iFilterNumber = 0;

    std::ofstream ofs;
    ofs.open(pcwritefile, std::ios::out | std::ios::binary);
    assert(ofs.is_open());

    std::ifstream ifs;
    ifs.open(pcallfile, std::ios::in);
    assert(ifs.is_open());

    std::string sName;
    DWORD uiFileNum = 0;   // file numbers start at 1
    while (std::getline(ifs, sName))
    {
        ++uiFileNum;

        const std::string sPath = std::string("../../../data/alldata/") + sName;
        parse_one_file(sPath.c_str(), ofs, vecline, uiFileNum, iFilterNumber);
    }
    ofs.close();
    ifs.close();

    // Persist the per-file line counts.
    ofs.open("../../../data/linenum.txt");
    for (std::vector<DWORD>::size_type i = 0; i < vecline.size(); ++i)
    {
        ofs << vecline[i] << std::endl;
        std::cout << vecline[i] << std::endl;
    }
    ofs.close();

    return iFilterNumber;
}
//切分文件，便于直接在内存中建立map
void splitfile(const char *pcfile)
{
    ifstream ifs;
    ifs.open(pcfile, ios::in | ios::binary);
    assert(ifs.is_open());

    Hash2LineTable_S stHash2LineTable;
    DWORD uifirsthashcode;

    char szcbuffer[150], swritename[80];
    int inum = 0;
	const int ifilenum = 10;

    ofstream szofs[ifilenum];
    for(int i = 0; i < ifilenum; ++i)
    {
        sprintf (swritename, "../../../data/hash_%d.txt", i);
        szofs[i].open(swritename, ios::out | ios::binary);

        memset(swritename, 0, sizeof(swritename));
    }

    while(ifs.read((char*)(&stHash2LineTable), sizeof(Hash2LineTable_S)))
    {
        uifirsthashcode = stHash2LineTable.sthashkey.uifirsthash;

        DWORD uihashvalue;
        sprintf(szcbuffer, "%d", uifirsthashcode);

        uihashvalue = ELFHash(szcbuffer) % ifilenum;

        if (uihashvalue == 1)
            ++inum;

        szofs[uihashvalue].write((char*)(&stHash2LineTable), sizeof(Hash2LineTable_S));

        memset(&stHash2LineTable, 0, sizeof(Hash2LineTable_S));
        memset(swritename, 0, sizeof(swritename));

    }
    ifs.close();
    cout << "splitfile hash value 1, num is " << inum << endl;

    for (int i = 0; i < ifilenum; ++i)
    {
        szofs[i].close();
    }
}



// Loads the per-file line counts from "../../../data/linenum.txt" (one decimal
// number per line, as written by parse_all_file) into vecline, replacing its
// previous content.
//
// The file is opened for reading only; the former ios::out flag made the open
// depend on write permission for no reason. Numbers are parsed with strtoul
// because the target type is unsigned; a line that is not a number yields 0.
void getVecLine(std::vector<DWORD> &vecline)
{
    std::ifstream ifs;
    ifs.open("../../../data/linenum.txt", std::ios::in);
    assert(ifs.is_open());

    vecline.clear();

    std::string sLine;
    while (std::getline(ifs, sLine))
    {
        vecline.push_back(static_cast<DWORD>(std::strtoul(sLine.c_str(), NULL, 10)));
    }
    ifs.close();
}


vector<PositionOfText_S> getSameLineOfCount(const char *pcfile, DWORD &uiDegreeOfDup, vector<PositionOfText_S> &stMaxPos)
{
    cout << "getSameLineOfCount, file name is " << pcfile << endl;
    vector<PositionOfText_S> *pmaxlineOfPos;
    ifstream ifs;
	ifs.open(pcfile, ios::in | ios::binary);
    assert(ifs.is_open());

    Hash2LineTable_S stHash2LineTable;
    memset(&stHash2LineTable, 0, sizeof(Hash2LineTable_S));


    map<HashKey_S, vector<PositionOfText_S>, Classcomp_S> maphash2postion;
    long lkeynum = 0;

    while (ifs.eof() == false)
    {
		//cout << ifs.tellg() << endl;
		ifs.read((char*)(&stHash2LineTable), sizeof(Hash2LineTable_S));
        ++lkeynum;
        DWORD uiFileNum, uiLine, uiBatch;
		uiFileNum = stHash2LineTable.uiFileNum;
        uiLine = stHash2LineTable.uiLineNum;
		if (uiLine > 498417)
		{
			cout << "uilinenum is large than 498417, its value is " << uiLine << endl;
			break;
		}
        uiBatch = stHash2LineTable.uiBatch;
        maphash2postion[stHash2LineTable.sthashkey].push_back(PositionOfText_S(uiFileNum, uiLine, uiBatch));
    }
    cout << "*************map size************\n";
//    cout << "the num of key value is " << lkeynum << endl;
    cout << "the num of map is " << maphash2postion.size() << endl;

    ifs.close();
    DWORD uimaxcount = 0;

    const HashKey_S *pmaxkey = NULL;
    vector<PositionOfText_S> *pvecposOfText = NULL;
    ofstream ofs;
    ofs.open("../../../data/hash2linetable.txt");
    /*保存每个字符串重复度，hashkey，以及位置(虽然本程序用不到)
     *
     *file format
     *1.uicount, the max count of same line, the size of vector<PositionOfText_S>
     *2.HashKey_S
     *3.many PositionOfText_S, the num is uicount
     */
    for (map<HashKey_S, vector<PositionOfText_S>, Classcomp_S >::iterator it = maphash2postion.begin(); it != maphash2postion.end(); ++it)
    {
        DWORD uicount;
        const HashKey_S *pkey = &(it->first);
        pvecposOfText = &(it->second);
        uicount = pvecposOfText->size();
        if (uicount > uimaxcount)
        {
            uimaxcount = uicount;
            pmaxlineOfPos = pvecposOfText;
            pmaxkey = pkey;
        }
		
        ofs.write((char*)(&uicount), sizeof(DWORD));
        ofs.write((char*)(pkey), sizeof(HashKey_S));
        for (size_t i = 0; i < uicount; ++i)
            ofs.write((char*)(&(pvecposOfText->at(i))), sizeof(PositionOfText_S));
    }
    ofs.close();
    cout << "maxcount is " << uimaxcount << endl;
//    cout << "maxline is " << uimaxline << endl;
  //  cout << "max key, first is " << pmaxkey->uifirsthash << ", second is " << pmaxkey->uisecondhash << endl;
    if (uimaxcount > uiDegreeOfDup)
    {
        uiDegreeOfDup = uimaxcount;
        stMaxPos = *pmaxlineOfPos;
    }

    return *pmaxlineOfPos;

}

// Writes one position to ofs: a header line naming the source file, line number
// and batch index, followed by the batch text itself (via outputSpecifyFile).
//
//   ofs        - output stream
//   pcFileList - list file with one file name per line; the stPos.uiFileNum-th
//                line (counted from 1) names the source file
//   stPos      - position to print
//   pibit      - not used
//
// Nothing is written when the list has fewer than stPos.uiFileNum lines.
void print_one_max_line(std::ofstream &ofs, const char *pcFileList, const PositionOfText_S &stPos, int *pibit)
{
    (void)pibit;

    const DWORD wantedFile = stPos.uiFileNum;
    const DWORD wantedLine = stPos.uiLineNum;

    std::ifstream listing(pcFileList, std::ios::in);
    assert(listing.is_open());

    std::string entry;
    for (DWORD ordinal = 1; std::getline(listing, entry); ++ordinal)
    {
        if (ordinal != wantedFile)
            continue;

        ofs << entry << ":the num of line is " << wantedLine << ", batch is " << stPos.uiBatch << std::endl;

        const std::string fullPath = std::string("../../../data/alldata/") + entry;
        outputSpecifyFile(ofs, fullPath, wantedLine, stPos.uiBatch);
        break;
    }
    listing.close();
}

void output_result(const char *pcFileList, vector<PositionOfText_S> *pmaxlineOfPos, const char *pcName)
{
    cout << "output_result to file:" << pcName << endl;
    int isum = 0;
    ofstream ofs;
	ofs.open(pcName, ios::out | ios::binary);
    int szBit[30];
    for (vector<PositionOfText_S>::const_iterator cit = pmaxlineOfPos->begin(); cit != pmaxlineOfPos->end(); ++cit)
    {
		/*if (isum++ == 20)
		{
			break;
		}*/
        print_one_max_line(ofs, pcFileList, *cit, szBit);
    }
    ofs.close();
}

// Driver of the analysis.
//
// The two preparation steps (parsing the raw text and splitting the hash file)
// are commented out, so this function expects the bucket files
// "../../../data/hash_<i>.txt" to exist already. For each of the 10 buckets it
// finds the most frequent key and writes the matching text to
// "../../../data/hash_<i>_maxpostext.txt". Finally it appends the overall
// maximum duplication degree and the filter counter to
// "degree_of_dup_result.txt".
void MakeResult()
{
    const char *pcWriteName = "../../../data/hash2line.txt";
    const char *pcFileList = "../../../data/filelist.txt";
    const int ifilenum = 10;

    int iFilterNumber = 0;
    // Step 1: parse all files.
    //iFilterNumber = parse_all_file(pcFileList, pcWriteName);
    // Step 2: split the large hash file into 10 small files.
    //splitfile(pcWriteName);

    DWORD uiDegreeOfDup = 0;
    std::vector<PositionOfText_S> stMaxPos;
    char szName[80];

    for (int bucket = 0; bucket < ifilenum; ++bucket)
    {
        std::cout << "***every file, the pos of max line***\n";

        // Most duplicated batch of this bucket.
        std::sprintf(szName, "../../../data/hash_%d.txt", bucket);
        std::vector<PositionOfText_S> stPos = getSameLineOfCount(szName, uiDegreeOfDup, stMaxPos);

        std::sprintf(szName, "../../../data/hash_%d_maxpostext.txt", bucket);
        output_result(pcFileList, &stPos, szName);
    }

    const char *pcOutFileName = "degree_of_dup_result.txt";
    //output_result(pcFileList, &stMaxPos, pcOutFileName);

    // Append, so results of earlier runs are kept.
    std::ofstream summary(pcOutFileName, std::ios::out | std::ios::app);
    summary << "**************" << std::endl;
    summary << "uiDegreeOfDup is " << uiDegreeOfDup << std::endl;
    summary << "filter num is " << iFilterNumber << std::endl;
    summary.close();
}

hash.h

#ifndef HASH_H
#define HASH_H
typedef unsigned char BYTE;
typedef unsigned int DWORD;

// String hash functions. Each one takes a NUL-terminated string, returns 0 for
// the empty string, and otherwise returns a hash with the top bit cleared
// (masked with 0x7FFFFFFF). The definitions in hash.cpp only read the string.
// Of these, the program uses BKDRHash and APHash as the two key hashes and
// ELFHash to pick the bucket file.

// SDBM Hash Function
DWORD SDBMHash(char *pcstr);

// RS Hash Function
DWORD RSHash(char *pcstr);
// JS Hash Function
DWORD JSHash(char *pcstr);

// P. J. Weinberger Hash Function
DWORD PJWHash(char *pcstr);

// ELF Hash Function
DWORD ELFHash(char *pstr);

// BKDR Hash Function
DWORD BKDRHash(char *pcstr);
// DJB Hash Function
DWORD DJBHash(char *pcstr);
// AP Hash Function
DWORD APHash(char *pcstr);
#endif // HASH_H

hash.cpp

#include "hash.h"

// SDBM Hash Function
// Returns 0 for the empty string. Otherwise folds every char into the hash as
// hash = hash * 65599 + char (unsigned wrap-around) and clears the top bit.
// Chars are converted from plain char without going through unsigned char.
DWORD SDBMHash(char *pcstr)
{
    if ('\0' == *pcstr)
        return 0;

    DWORD acc = 0;
    for (const char *cur = pcstr; *cur != '\0'; ++cur)
    {
        // acc * 65599 equals (acc << 6) + (acc << 16) - acc in unsigned arithmetic.
        acc = acc * 65599u + static_cast<DWORD>(*cur);
    }

    return acc & 0x7FFFFFFF;
}

// RS Hash Function
// Returns 0 for the empty string. Otherwise computes
// hash = hash * multiplier + char for every char, where the multiplier starts
// at 63689 and is itself multiplied by 378551 after each char; the top bit of
// the result is cleared.
DWORD RSHash(char *pcstr)
{
    if ('\0' == *pcstr)
        return 0;

    const DWORD kStep = 378551;
    DWORD multiplier = 63689;
    DWORD acc = 0;

    for (const char *cur = pcstr; *cur != '\0'; ++cur)
    {
        acc = acc * multiplier + static_cast<DWORD>(*cur);
        multiplier *= kStep;
    }

    return acc & 0x7FFFFFFF;
}

// JS Hash Function
// Returns 0 for the empty string. Otherwise starts from 1315423911 and, for
// every char, XORs the hash with (hash << 5) + char + (hash >> 2); the top bit
// of the result is cleared.
DWORD JSHash(char *pcstr)
{
    if ('\0' == *pcstr)
        return 0;

    DWORD acc = 1315423911;
    for (const char *cur = pcstr; *cur != '\0'; ++cur)
    {
        const DWORD mix = (acc << 5) + static_cast<DWORD>(*cur) + (acc >> 2);
        acc ^= mix;
    }

    return acc & 0x7FFFFFFF;
}

// P. J. Weinberger Hash Function
// Returns 0 for the empty string. Otherwise shifts the hash left by one eighth
// of its width and adds the char; whenever bits reach the top eighth they are
// folded back in (shifted right by three quarters of the width) and cleared.
// The top bit of the result is cleared.
DWORD PJWHash(char *pcstr)
{
    if ('\0' == *pcstr)
        return 0;

    const DWORD kBits          = (DWORD)(sizeof(DWORD) * 8);
    const DWORD kThreeQuarters = (DWORD)((kBits * 3) / 4);
    const DWORD kOneEighth     = (DWORD)(kBits / 8);
    const DWORD kHighMask      = (DWORD)(0xFFFFFFFF) << (kBits - kOneEighth);

    DWORD acc = 0;
    for (const char *cur = pcstr; *cur != '\0'; ++cur)
    {
        acc = (acc << kOneEighth) + static_cast<DWORD>(*cur);

        const DWORD overflow = acc & kHighMask;
        if (overflow != 0)
            acc = (acc ^ (overflow >> kThreeQuarters)) & (~kHighMask);
    }

    return acc & 0x7FFFFFFF;
}

// ELF Hash Function
// Returns 0 for the empty string. Otherwise shifts the hash left by 4 and adds
// the char; whenever any of the top four bits is set, those bits are XORed back
// in (shifted right by 24) and then cleared. The top bit of the result is
// cleared.
DWORD ELFHash(char *pcstr)
{
    if ('\0' == *pcstr)
        return 0;

    DWORD acc = 0;
    for (const char *cur = pcstr; *cur != '\0'; ++cur)
    {
        acc = (acc << 4) + static_cast<DWORD>(*cur);

        const DWORD top = acc & 0xF0000000u;
        if (top != 0)
        {
            acc ^= (top >> 24);
            acc &= ~top;
        }
    }

    return acc & 0x7FFFFFFF;
}

// BKDR Hash Function
// Returns 0 for the empty string. Otherwise computes hash = hash * 131 + char
// for every char (unsigned wrap-around) and clears the top bit of the result.
DWORD BKDRHash(char *pcstr)
{
    if ('\0' == *pcstr)
        return 0;

    const DWORD kSeed = 131; // 31 131 1313 13131 131313 etc..
    DWORD acc = 0;
    for (const char *cur = pcstr; *cur != '\0'; ++cur)
    {
        acc = acc * kSeed + static_cast<DWORD>(*cur);
    }

    return acc & 0x7FFFFFFF;
}

// DJB Hash Function
// Returns 0 for the empty string. Otherwise starts from 5381 and computes
// hash = hash * 33 + char for every char (unsigned wrap-around); the top bit of
// the result is cleared.
DWORD DJBHash(char *pcstr)
{
    if ('\0' == *pcstr)
        return 0;

    DWORD acc = 5381;
    for (const char *cur = pcstr; *cur != '\0'; ++cur)
    {
        // acc + (acc << 5) equals acc * 33 in unsigned arithmetic.
        acc = acc * 33u + static_cast<DWORD>(*cur);
    }

    return acc & 0x7FFFFFFF;
}

// AP Hash Function
// Returns 0 for the empty string. Otherwise alternates between two mixing steps:
// chars at even positions (0, 2, ...) XOR the hash with
// (hash << 7) ^ char ^ (hash >> 3); chars at odd positions XOR it with the
// complement of (hash << 11) ^ char ^ (hash >> 5). The top bit of the result is
// cleared.
DWORD APHash(char *pcstr)
{
    if ('\0' == *pcstr)
        return 0;

    DWORD acc = 0;
    bool evenPosition = true;
    for (const char *cur = pcstr; *cur != '\0'; ++cur, evenPosition = !evenPosition)
    {
        const DWORD ch = static_cast<DWORD>(*cur);
        if (evenPosition)
            acc ^= ((acc << 7) ^ ch ^ (acc >> 3));
        else
            acc ^= (~((acc << 11) ^ ch ^ (acc >> 5)));
    }

    return acc & 0x7FFFFFFF;
}

空穴来风

关注

1
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
统计大量文本中重复字符串的最大个数

有大量中文繁体的文本，都是网上摘取的，大小有6G。需要提取文本中相同的字符串的最大个数有几个关键问题：1.字符串的粒度如何确定？如何表示字符串的唯一位置索引？2.字符串过多，如何快速确定是否相同，时间复杂度要尽量低3.文本过多，无法放入内存怎么办4.如何统计相同hash值的个数，以及记录它们的索引位置对于1，每次读取一行，然后以100个字节为一组作为比较的字符串。每行最后一
复制链接

扫一扫