DataExtraction 本工具可以从文件中按关键字提取数据和排序功能 [版本 0.88] BY Hong Wenjun

最新推荐文章于 2024-05-08 10:13:58 发布

hongwenjun

最新推荐文章于 2024-05-08 10:13:58 发布

阅读量4.6k

点赞数

文章标签：工具 string 优化 iterator null include

本文链接：https://blog.csdn.net/hongwenjun/article/details/6216895

版权

使用图文帮助

本工具可以从文件中按关键字提取数据和排序功能 [版本 0.88] BY Hong Wenjun

示例 1 ：D:/>DataExtraction.exe "关键字" D:/原始数据.txt

示例 2 ：D:/>DataExtraction.exe /W D:/原始数据.txt -S

示例 3 ：D:/>DataExtraction.exe *** D:/原始数据.txt -S

请输入一个关键字！！[ "关键字" *** /W /D ] 按四种方式选择一种

*** 关键字提取所有数据，和-S参数配合排序

/W 关键字匹配纯字母，不包含数字

/D 关键字匹配全数字，不包含字母

-S 参数排序优化，删除重复数据

D:/>DE /W D:/600W.txt -s //文件名改成DE.exe 方便使用

当前提取关键字：/W

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

原来数据记录数目：6583939 新的数据记录数目：3235633

提取数据花费时间：9.578 秒

正在排序优化中，请等候....

排序优化后数据量：2981874 排序数据花费时间：7.532 秒

正在把数据写到新文件中...... 如果数据重复,可以最后加参数 -S 优化排序

<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<保存新数据文件花费时间：17.438 秒

已经生成新的包含关键字的数据文件 newData.txt

工程项目源代码以及用 VC 和 GCC 编译的可执行文件下载地址
http://cid-0604030941ce925d.office.live.com/self.aspx/CPPSourceCode/DataExtractionV88.7z

源代码，就一个文件 DataExtraction.cpp

#include <iostream>
#include <string>
#include <vector>
#include <algorithm>
#include <cstring>
#include <cctype>
#include <fstream>
#include <sstream>
// #define WIN32 //VC 编译器开关
#include <time.h>
#ifdef WIN32
#   include <windows.h>
#else
#   include <sys\time.h>
#endif
#ifdef WIN32
int gettimeofday(struct timeval *tp, void *tzp)
{
    time_t clock;
    struct tm tm;
    SYSTEMTIME wtm;
    GetLocalTime(&wtm);
    tm.tm_year     = wtm.wYear - 1900;
    tm.tm_mon     = wtm.wMonth - 1;
    tm.tm_mday     = wtm.wDay;
    tm.tm_hour     = wtm.wHour;
    tm.tm_min     = wtm.wMinute;
    tm.tm_sec     = wtm.wSecond;
    tm. tm_isdst    = -1;
    clock = mktime(&tm);
    tp->tv_sec = clock;
    tp->tv_usec = wtm.wMilliseconds * 1000;
    return (0);
}
#endif /* WIN32 */
using std::stringstream;
using std::vector;
using std::string;
using std::fstream;
void help(); // 调用使用帮助
bool isdigit(const std::string& str); // overload function  重载函数
bool isalpha(const std::string& str);
stringstream & load_sstream(stringstream &oss, fstream &infile); // 加载文件到sstream
vector<string> & sortVecData(vector<string> & vecData); // 排序和删除重复

/**
 * Returns the current wall-clock time in seconds, including the
 * microsecond fraction reported by gettimeofday().
 */
static double nowSeconds()
{
    timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec + static_cast<double>(tv.tv_usec) / 1000000;
}

/**
 * Program entry point.
 *
 * Usage: DataExtraction <keyword | *** | /W | /D> <input file> [-S]
 *
 * Reads whitespace-separated tokens from the input file, keeps the ones
 * selected by the first argument, optionally sorts and de-duplicates them
 * (third argument whose second character is 's' or 'S'), and writes one
 * token per line to "newData.txt" in the current directory.
 *
 * @return 0 on success, -1 on bad arguments or any file error.
 */
int main(int argc, char* argv[])
{
    using namespace std;

    // Both the keyword and the input file name are mandatory.
    if (argc < 3) {
        help();
        return -1;
    }

    const string Value = argv[1];

    fstream inFile;
    inFile.open(argv[2], ios_base::in);
    if (!inFile) {
        cerr << "文件错误：不能打开输入文件: " << argv[2] << endl << endl;
        help();
        return -1;
    }

    // Sort switch: the second character of the third argument selects it.
    // The first character is checked for '\0' so that an empty argument
    // never causes a read past its terminator.
    bool sortData = false;
    if (argc >= 4 && argv[3][0] != '\0'
            && ('s' == argv[3][1] || 'S' == argv[3][1])) {
        sortData = true;
    }

    // Open the output before the (possibly long) extraction so an
    // unwritable destination is reported immediately.
    ofstream ofDataFile;
    ofDataFile.open("newData.txt");
    if (!ofDataFile) {
        cerr << "文件错误：不能创建输出文件: newData.txt" << endl << endl;
        return -1;
    }

    // Decode the extraction mode once instead of once per token.
    // When argv[1][0] is '/', argv[1][1] is at worst the terminator.
    const bool switchMode = ('/' == argv[1][0]);
    const bool wantAlpha  = switchMode && ('w' == argv[1][1] || 'W' == argv[1][1]);
    const bool wantDigit  = switchMode && ('d' == argv[1][1] || 'D' == argv[1][1]);
    const bool takeAll    = !switchMode && ('*' == argv[1][0]);

    long ixold = 0, ixnew = 0;
    string strData;
    vector<string> vecData;

    cout << "当前提取关键字：" << Value << endl;

    double cl = nowSeconds();   // start of the extraction phase

    // Records are whitespace-delimited tokens; a getline()-based loop
    // would be needed to treat whole lines as records.
    while (inFile >> strData) {
        bool keep;
        if (switchMode) {
            // /W keeps purely alphabetic tokens, /D purely numeric ones;
            // any other switch letter keeps nothing.
            keep = (wantAlpha && isalpha(strData))
                || (wantDigit && isdigit(strData));
        } else {
            // "***" keeps everything, otherwise keep tokens containing Value.
            keep = takeAll || strData.find(Value) != string::npos;
        }
        if (keep) {
            vecData.push_back(strData);
        }
        ixold++;                                   // input record counter
        if (ixold % 100000 == 0) cout << ">";      // progress indicator
    }

    cl = nowSeconds() - cl;
    cout << "\n原来数据记录数目：" << ixold << "\t";
    cout << "新的数据记录数目：" << vecData.size() << endl;
    cout << "提取数据花费时间：" << cl << " 秒\n";

    if (sortData) {
        cout << "正在排序优化中，请等候....\n";
        cl = nowSeconds();
        sortVecData(vecData);                      // sort and drop duplicates
        cout << "排序优化后数据量：" << vecData.size() << "\t";
        cl = nowSeconds() - cl;
        cout << "排序数据花费时间：" << cl << " 秒\n";
    }

    cout << "正在把数据写到新文件中......\t如果数据重复,可以最后加参数 -S 优化排序\n" ;
    cl = nowSeconds();

    // Stream records straight to the file. Staging them in a stringstream
    // and copying it out with str() would hold two extra copies of the
    // whole data set in memory.
    for (vector<string>::const_iterator iter = vecData.begin();
            iter != vecData.end(); ++iter) {
        ofDataFile << *iter << '\n';
        if (ixnew++ % 100000 == 0) cout << "<";    // output record counter
    }

    // close() flushes; a failure here or during the writes above leaves
    // the stream in a failed state.
    ofDataFile.close();
    if (ofDataFile.fail()) {
        cerr << "\n文件错误：写入输出文件失败: newData.txt" << endl << endl;
        return -1;
    }

    cl = nowSeconds() - cl;
    cout << "保存新数据文件花费时间：" << cl << " 秒\n";
    cout << "\n已经生成新的包含关键字的数据文件 newData.txt" << endl;

    inFile.close();
    return 0;
}
/**
 * String overload of isdigit: tells whether every character of `str`
 * is a decimal digit according to std::isdigit.
 *
 * @param str  Text to examine.
 * @return     true if no character fails std::isdigit (an empty string
 *             therefore yields true); false at the first non-digit.
 */
bool isdigit(const std::string& str)
{
    for (std::string::const_iterator it = str.begin(); it != str.end(); ++it) {
        // The <cctype> classifiers are only defined for values representable
        // as unsigned char (or EOF); a plain char may be negative for
        // non-ASCII bytes, so convert before the call.
        if (!std::isdigit(static_cast<unsigned char>(*it))) {
            return false;
        }
    }
    return true;
}
/**
 * String overload of isalpha: tells whether every character of `str`
 * is alphabetic according to std::isalpha.
 *
 * @param str  Text to examine.
 * @return     true if no character fails std::isalpha (an empty string
 *             therefore yields true); false at the first non-letter.
 */
bool isalpha(const std::string& str)
{
    for (std::string::const_iterator it = str.begin(); it != str.end(); ++it) {
        // The <cctype> classifiers are only defined for values representable
        // as unsigned char (or EOF); a plain char may be negative for
        // non-ASCII bytes, so convert before the call.
        if (!std::isalpha(static_cast<unsigned char>(*it))) {
            return false;
        }
    }
    return true;
}
/**
 * Prints the usage summary to standard error.
 *
 * The switches shown here must be spelled the way main() parses them:
 * a first argument starting with '/' followed by W or D. Paths and
 * switches are therefore written with single slashes.
 */
void help()
{
    using std::cerr;
    cerr << "本工具可以从文件中按关键字提取数据和排序功能 [版本 0.98] BY Hong Wenjun\n\n";
    cerr << "示例 1 ：D:/>DataExtraction.exe  \"关键字\"  D:/原始数据.txt  \n" ;
    cerr << "示例 2 ：D:/>DataExtraction.exe  /W   D:/原始数据.txt  -S\n" ;
    cerr << "示例 3 ：D:/>DataExtraction.exe  ***  D:/原始数据.txt  -S\n\n" ;
    cerr << "请输入一个关键字！！[ \"关键字\"    ***    /W    /D ] 按四种方式选择一种\n";
    cerr << "*** 关键字  提取所有数据，和-S参数配合排序\n";
    cerr << "/W  关键字 匹配纯字母，不包含数字\n";
    cerr << "/D  关键字 匹配全数字，不包含字母\n";
    cerr << "-S  参数   排序优化，删除重复数据\n";
}

/**
 * Streams the contents of `infile`'s buffer into `oss`.
 *
 * @param oss     Destination string stream; receives the characters.
 * @param infile  Source file stream whose buffer is read.
 * @return        `oss`, to allow chaining.
 */
stringstream & load_sstream(stringstream &oss, fstream &infile)
{
    std::filebuf *source = infile.rdbuf();
    oss << source;   // inserting a stream buffer copies its characters
    return oss;
}

/**
 * Sorts `vecData` in ascending order and removes duplicate entries,
 * modifying the vector in place.
 *
 * @param vecData  Strings to sort and de-duplicate.
 * @return         The same vector, to allow chaining.
 */
vector<string> & sortVecData(vector<string> &vecData)
{
    std::sort(vecData.begin(), vecData.end());
    // unique() compacts the distinct elements to the front and returns the
    // new logical end; erase() then drops the leftover tail.
    vecData.erase(std::unique(vecData.begin(), vecData.end()), vecData.end());
    return vecData;
}