大文件字符串替换

最新推荐文章于 2021-05-17 14:31:13 发布

iteye_5492

最新推荐文章于 2021-05-17 14:31:13 发布

阅读量455

点赞数

分类专栏：技术杂绘文章标签： c/c++

本文链接：https://blog.csdn.net/iteye_5492/article/details/82366511

版权

技术杂绘专栏收录该内容

15 篇文章 0 订阅

订阅专栏

　　前一段时间,在导数据的时候,常常要替换掉导出数据中的特殊字符,而且一些字符是不可打印的,如由ASCII码0x05、0xFF等构成的字符串,并且处理的文件都在几百G以上,故写了一个字符串替换的小程序予以自乐.在字符串替换的过程中,很重要的一点就是字符匹配. 先看看基于内存的匹配方法.
　　1. 无限内存中的字符串匹配
　　假设内存足够大,所以待处理的文本和要匹配的串都可以完全放入内存,且处理的文本为text[tsize], 要匹配的串为pat[psize],要替换的串为val[vsize] ,此时字符串匹配最简单的方式伪代码如下:
　　input: text[tsize] //待处理的文本大小为tsize
　　pat[psize] //要匹配的模式
　　val[vsize] //要替换的值
　　output: ret //处理好后的文本
　　begin
　　int i, j;
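　　for(i=0; i<tsize; i++)
　　{
　　for(j=0; j<psize && i+j<tsize && text[i+j]==pat[j]; j++)
　　;
　　if(j==psize) //匹配成功,输出替换值并跳过已匹配的模式
　　{
　　把val[0..vsize-1]追加到ret;
　　i += psize-1;
　　}
　　else //匹配失败,原样输出当前字符
　　把text[i]追加到ret;
　　}
　　end
　　2. 有限内存中的字符串匹配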
　　当内存有限时,最极端的情况是,每一次从文件中读一个byte,且和pattern进行匹配,这样就会增加读文件的I/O次数,故应该设置in_buff 用于读入一段适合内存的数据,当为空了后才再次读入in_buff大小的数据, out_buff 把处理好的数据放入其中,当out_buff满了才输出.这样就可以有效的减少读数据和写数据的I/O次数.
　　实验表明,在同一个(不详细说明环境的配置)实验环境下,使用1个字节的缓存比使用1024个字节的缓存读取和写入数据多花3倍的时间.
　　相关的c语言实现的code如下:
　　/*
　　* replace.c
　　*
　　* Created on: Jan 27, 2010
　　* Author: grand
　　*/
　　// Preprocessor definitions intended to enable large-file handling; they are placed before the #include lines.
　　// NOTE(review): the double-underscore __USE_* names are reserved for the implementation —
　　// presumably _FILE_OFFSET_BITS (and _LARGEFILE64_SOURCE) alone are sufficient; confirm against the libc documentation.
　　#define _FILE_OFFSET_BITS 64
　　#define __USE_FILE_OFFSET64
　　#define __USE_LARGEFILE64
　　#define _LARGEFILE64_SOURCE
　　#include <stdio.h>
　　#include <stdlib.h>
　　#include <string.h>
　　#define BUFF_SIZE 1024 // buffer size in bytes; can be adjusted from 1 byte upward
　　// boolean constants, used for the flush flag passed to outBuff()
　　#define true 1
　　#define false 0
　　// in_buff is filled by inBuff(); out_buff is the staging area written to by outBuff()
　　char in_buff[BUFF_SIZE];
　　char out_buff[BUFF_SIZE];
　　//the pattern string (copied from argv[3] in main) and its length in bytes
　　char* pPat = NULL;
　　unsigned int len_pat = 0;
　　//the new value for pattern string (copied from argv[4] in main) and its length in bytes
　　char* pVal = NULL;
　　unsigned int len_val = 0;
　　//statistics result: bytes read by inBuff(), number of matches replaced,
　　//and bytes written by outBuff(); all three are printed by main()
　　unsigned long long inTotal = 0;
　　unsigned long long match = 0;
　　unsigned long long outTotal = 0;
　　/*
　　 * Print the command-line synopsis to stdout.
　　 *
　　 * main() requires exactly four arguments (argc == 5): the source file,
　　 * the destination file, the pattern to search for and the replacement
　　 * value, in that order. The previous message named none of them.
　　 */
　　void usage(void)
　　{
　　    printf("replace <src_file> <dest_file> <pattern> <value>\n");
　　}
　　/*
　　 * Stage output bytes in out_buff and write them to pDest in large chunks.
　　 *
　　 * pDest    : destination stream.
　　 * pBuff    : source of the bytes to emit (ignored when flush is true).
　　 * in_start : index of the first byte of pBuff to emit.
　　 * in_end   : index one past the last byte of pBuff to emit.
　　 * flush    : when true, write whatever is staged in out_buff and return;
　　 *            pBuff/in_start/in_end are not used in that case.
　　 *
　　 * Every successful fwrite() count is added to the global outTotal.
　　 */
　　void outBuff(FILE * pDest, char* pBuff, int in_start, int in_end, int flush)
　　{
　　    /* number of bytes currently staged in out_buff; persists across calls */
　　    static unsigned int outPos = 0;
　　    int len;

　　    //force to write data to file.
　　    if (flush == true)
　　    {
　　        outTotal += fwrite(out_buff, sizeof(char), outPos, pDest);
　　        /* reset so a later flush does not write the same bytes again */
　　        outPos = 0;
　　        return;
　　    }

　　    len = in_end - in_start;
　　    /* empty (or inverted) range: nothing to copy, and a negative length
　　       must never reach memcpy()/fwrite() as a huge unsigned value */
　　    if (len <= 0)
　　    {
　　        return;
　　    }

　　    //write data into out_buffer
　　    if (BUFF_SIZE - outPos > (unsigned int)len)
　　    {
　　        memcpy(out_buff + outPos, pBuff + in_start, (size_t)len);
　　        outPos += (unsigned int)len;
　　    }
　　    //the out_buff is full, so write data to file.
　　    else
　　    {
　　        /* staged bytes first, then the new chunk directly, to keep order */
　　        outTotal += fwrite(out_buff, sizeof(char), outPos, pDest);
　　        outTotal += fwrite(pBuff + in_start, sizeof(char), (size_t)len,
　　                           pDest);
　　        outPos = 0;
　　    }
　　}
　　/*
　　 * Refill in_buff from pSrc.
　　 *
　　 * Requests up to BUFF_SIZE bytes with a single fread(), adds the number
　　 * of bytes actually obtained to the global inTotal and returns that number.
　　 */
　　int inBuff(FILE * pSrc)
　　{
　　    const int got = fread(in_buff, 1, sizeof in_buff, pSrc);

　　    inTotal += got;
　　    return got;
　　}
　　// adjust(patPos, pDest): only the first few statements of this function survive.
　　// NOTE(review): the text from the `<` in the while condition below up to a `>` inside the
　　// following function is missing, so the rest of adjust(), the signature of the function
　　// that follows (presumably matchPat, which main() calls with pSrc and pDest) and the
　　// start of its scan loop are lost. The `-` operators also appear to have been dropped
　　// (`in_len patPos`, `i patPos`) — presumably subtractions; confirm against the original source.
　　// The code is left exactly as found and does not compile in this form.
　　int adjust(int patPos, FILE* pDest)
　　{
　　int help = patPos;
　　int temp = 1;
　　int beg = 1;
　　patPos = 0;
　　while (temp inPos)
　　// hand the pending bytes of in_buff starting at inPos to the output stage
　　outBuff(pDest, in_buff, inPos, in_len patPos, false);
　　// continue from the start of the input buffer
　　inPos = 0;
　　// source exhausted: flush what is staged in out_buff and stop
　　if (feof(pSrc))
　　{
　　outBuff(pDest, NULL,0, 0, true);
　　return;
　　}
　　// refill in_buff; in_len receives the number of bytes read
　　in_len = inBuff(pSrc);
　　}
　　// succeed in matching a pattern
　　else
　　{
　　// emit the bytes of in_buff that come before the matched pattern
　　if (i patPos > inPos)
　　outBuff(pDest, in_buff, inPos, i patPos, false);
　　inPos = i;
　　// emit the replacement value in place of the matched bytes
　　outBuff(pDest, pVal, 0, len_val, false);
　　// reset the match progress and count the replacement
　　patPos = 0;
　　match++;
　　}
　　}
　　}
　　/*
　　 * Entry point: replace <src> <dest> <pattern> <value>.
　　 *
　　 * Opens argv[1] for binary reading and argv[2] for binary writing, copies
　　 * argv[3] / argv[4] into the globals pPat / pVal (lengths in len_pat /
　　 * len_val), runs matchPat() and prints the statistics counters.
　　 *
　　 * Returns 0 on success and 1 on a usage, open, allocation or close error.
　　 * All resources acquired so far are released on every exit path.
　　 */
　　int main(int argc, char** argv)
　　{
　　    FILE* pSrc = NULL;
　　    FILE* pDest = NULL;
　　    size_t n;
　　    int rc = 1;

　　    if (argc != 5)
　　    {
　　        usage();
　　        return 1;
　　    }
　　    if ((pSrc = fopen(argv[1], "rb")) == NULL)
　　    {
　　        printf("can not open the src file:%s \n", argv[1]);
　　        goto cleanup;
　　    }
　　    if ((pDest = fopen(argv[2], "wb")) == NULL)
　　    {
　　        printf("can not open the dest file:%s \n", argv[2]);
　　        goto cleanup;
　　    }

　　    /* Allocate one extra byte: malloc(0) may legitimately return NULL, which
　　       would make an empty argument look like an out-of-memory condition. */
　　    n = strlen(argv[3]);
　　    if ((pPat = malloc(n + 1)) == NULL)
　　    {
　　        printf("encounter error when allocating memory for pattern\n");
　　        goto cleanup;
　　    }
　　    memcpy(pPat, argv[3], n + 1);
　　    len_pat = (unsigned int)n;

　　    n = strlen(argv[4]);
　　    if ((pVal = malloc(n + 1)) == NULL)
　　    {
　　        printf("encounter error when allocating memory for value\n");
　　        goto cleanup;
　　    }
　　    memcpy(pVal, argv[4], n + 1);
　　    len_val = (unsigned int)n;

　　    // replace start
　　    matchPat(pSrc, pDest);
　　    // replace end

　　    /* fclose() flushes the stream; a failure here means the output is incomplete */
　　    if (fclose(pDest) != 0)
　　    {
　　        pDest = NULL;
　　        printf("encounter error when closing the dest file:%s \n", argv[2]);
　　        goto cleanup;
　　    }
　　    pDest = NULL;

　　    /* the counters are unsigned long long, so they need %llu (not %u) */
　　    printf("========================================== =====================\n");
　　    printf("buffer size: %d bytes\n", BUFF_SIZE);
　　    printf("src file size: %llu bytes\n", inTotal);
　　    printf("dest file size: %llu bytes\n", outTotal);
　　    printf("length of pattern: %u bytes\n", len_pat);
　　    printf("length of replace value: %u bytes\n", len_val);
　　    printf("match count: %llu \n", match);
　　    printf("========================================== =====================\n");
　　    rc = 0;

　　cleanup:
　　    // clean the resource
　　    if (pSrc != NULL)
　　    {
　　        fclose(pSrc);
　　    }
　　    if (pDest != NULL)
　　    {
　　        fclose(pDest);
　　    }
　　    free(pPat);
　　    pPat = NULL;
　　    free(pVal);
　　    pVal = NULL;
　　    return rc;
　　}