前一段时间,在导数据的时候,常常要替换掉导出数据中的特殊字符,而且一些字符是不可打印的,如由ASCII码为0x05、0xFF等的字符构成的字符串. 而且处理的文件都在几百G以上,故写了一个字符串替换的小程序以自娱. 在字符串替换的过程中,很重要的一点就是字符匹配. 先看看基于内存的匹配方法.
1. 无限内存中的字符串匹配
假设内存足够大,所以待处理的文本和要匹配的串都可以完全放入内存,且处理的文本为text[tsize], 要匹配的串为pat[psize],要替换的串为val[vsize] ,此时字符串匹配最简单的方式伪代码如下:
input: text[tsize] //待处理的文本大小为tsize
pat[psize] //要匹配的模式
val[vsize] //要替换的值
output: ret //处理好后的文本
begin
int i, j;
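for(i=0; i<tsize; ) {
    //从text[i]开始逐字符与pat比较(假设psize>0)
    for(j=0; j<psize && i+j<tsize && text[i+j]==pat[j]; j++)
        ;
    if(j==psize) { 把val[0..vsize-1]追加到ret; i+=psize; } //匹配成功,输出替换值
    else { 把text[i]追加到ret; i++; } //匹配失败,原样输出一个字符
}
end
(注:原文此处的循环体在提取时丢失,以上是根据上文定义重建的朴素匹配写法.)
2. 有限内存中的字符串匹配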
当内存有限时,最极端的情况是,每一次从文件中读一个byte,且和pattern进行匹配,这样就会增加读文件的I/O次数. 故应该设置in_buff,用于读入一段适合内存大小的数据,当其中的数据处理完后才再次读入in_buff大小的数据; out_buff用于存放处理好的数据,当out_buff满了才输出. 这样就可以有效地减少读数据和写数据的I/O次数.
实验表明,在同一个实验环境下(环境配置不详细说明),使用1个字节的缓存比使用1024个字节的缓存读取和写入数据多花3倍的时间.
相关的c语言实现的code如下:
/*
* replace.c
*
* Created on: Jan 27, 2010
* Author: grand
*/
/*
 * Large-file support. These feature-test macros must be defined before the
 * first system header is included. The glibc-internal __USE_* macros are
 * derived from them by the C library itself and are reserved identifiers,
 * so user code does not define them.
 */
#ifndef _FILE_OFFSET_BITS
#define _FILE_OFFSET_BITS 64
#endif
#ifndef _LARGEFILE64_SOURCE
#define _LARGEFILE64_SOURCE
#endif
#include <stdio.h>   /* FILE, fopen, fread, fwrite, fclose, feof, printf */
#include <stdlib.h>  /* malloc, free */
#include <string.h>  /* memcpy, strlen */
#define BUFF_SIZE 1024 /* size in bytes of in_buff and out_buff; may be tuned from 1 byte upwards */
#define true 1
#define false 0
/* Staging buffers: in_buff is filled by inBuff(), out_buff is filled by outBuff(). */
char in_buff[BUFF_SIZE];
char out_buff[BUFF_SIZE];
/* The pattern to search for and its length in bytes. */
char* pPat = NULL;
unsigned int len_pat = 0;
/* The replacement value for the pattern and its length in bytes. */
char* pVal = NULL;
unsigned int len_val = 0;
/* Statistics: bytes read, number of matches, bytes written. */
unsigned long long inTotal = 0;
unsigned long long match = 0;
unsigned long long outTotal = 0;
/*
 * Print the command-line synopsis to stdout.
 *
 * The argument order matches what main() reads: argv[1] is the source file,
 * argv[2] the destination file, argv[3] the pattern and argv[4] the
 * replacement value.
 */
void usage(void)
{
    printf("usage: replace <src_file> <dest_file> <pattern> <value>\n");
}
/*
 * Append the bytes pBuff[in_start .. in_end-1] to the output, staging them in
 * out_buff so that pDest is written in larger pieces.
 *
 * pDest    - destination stream.
 * pBuff    - source of the bytes; ignored when flush is true.
 * in_start - index of the first byte to copy.
 * in_end   - index one past the last byte to copy.
 * flush    - when true, write whatever is staged in out_buff to pDest and
 *            return without looking at pBuff.
 *
 * outTotal is advanced by the number of bytes fwrite() reports as written.
 */
void outBuff(FILE * pDest, char* pBuff, int in_start, int in_end, int flush)
{
    static unsigned int outPos = 0; /* number of bytes currently staged in out_buff */
    unsigned int len;

    /* force to write data to file. */
    if (flush == true)
    {
        outTotal += fwrite(out_buff, sizeof(char), outPos, pDest);
        outPos = 0; /* the staged bytes are consumed; a second flush must not repeat them */
        return;
    }

    /* An empty or inverted range has nothing to copy. Rejecting it here also
     * keeps a negative length from wrapping to a huge unsigned count below. */
    if (pBuff == NULL || in_start < 0 || in_end <= in_start)
    {
        return;
    }
    len = (unsigned int)(in_end - in_start);

    /* write data into out_buffer */
    if (BUFF_SIZE - outPos > len)
    {
        memcpy(out_buff + outPos, pBuff + in_start, len);
        outPos += len;
    }
    /* the out_buff is full, so write the staged bytes and then the new bytes to file. */
    else
    {
        outTotal += fwrite(out_buff, sizeof(char), outPos, pDest);
        outTotal += fwrite(pBuff + in_start, sizeof(char), len, pDest);
        outPos = 0;
    }
}
/*
 * Refill in_buff with up to BUFF_SIZE bytes from pSrc.
 *
 * Adds the number of bytes obtained to inTotal and returns that number.
 */
int inBuff(FILE * pSrc)
{
    const int got = fread(in_buff, 1, BUFF_SIZE, pSrc);

    inTotal += got;
    return got;
}
/*
 * NOTE(review): this span is damaged and does not compile as shown.
 * Text is missing between `temp` and `inPos` on the `while` line below.
 * What follows that gap uses pSrc, inPos, in_len and i, none of which are
 * declared in adjust(), and contains a bare `return;` although adjust()
 * returns int. main() calls matchPat(pSrc, pDest), which is not defined
 * anywhere else in view, so the lines after the gap presumably belong to
 * matchPat() - TODO restore both functions from the original source.
 * The expressions `in_len patPos` and `i patPos` also appear to have lost
 * an operator (possibly '-', as in the other functions) - confirm.
 */
int adjust(int patPos, FILE* pDest)
{
int help = patPos; // keeps the incoming patPos, which is reset to 0 below
int temp = 1;
int beg = 1;
patPos = 0;
// NOTE(review): the loop condition and the text after it are truncated here.
while (temp inPos)
outBuff(pDest, in_buff, inPos, in_len patPos, false);
inPos = 0;
// input exhausted: flush the bytes staged by outBuff() and stop
if (feof(pSrc))
{
outBuff(pDest, NULL,0, 0, true);
return;
}
// refill in_buff; in_len receives the byte count returned by inBuff()
in_len = inBuff(pSrc);
}
// succeed in matching a pattern
else
{
// emit the pending input bytes from in_buff when the range is non-empty
if (i patPos > inPos)
outBuff(pDest, in_buff, inPos, i patPos, false);
inPos = i;
// emit the replacement value (len_val bytes of pVal) in place of the match
outBuff(pDest, pVal, 0, len_val, false);
patPos = 0;
match++;
}
}
}
/*
 * Entry point. Expects exactly four arguments:
 *   argv[1] source file, argv[2] destination file,
 *   argv[3] pattern to search for, argv[4] replacement value.
 *
 * Copies the pattern and value into pPat/pVal, runs matchPat() over the two
 * streams and prints the statistics gathered in inTotal, outTotal and match.
 *
 * Returns 0 on success and 1 on a usage error, an empty pattern, a file that
 * cannot be opened, an allocation failure or an I/O error. Every path through
 * the cleanup label releases the files and buffers acquired so far.
 */
int main(int argc, char** argv)
{
    FILE* pSrc = NULL;
    FILE* pDest = NULL;
    int rc = 1;

    if (argc != 5)
    {
        usage();
        return 1;
    }

    len_pat = (unsigned int)strlen(argv[3]);
    len_val = (unsigned int)strlen(argv[4]);
    /* An empty pattern cannot be searched for; reject it before touching any file. */
    if (len_pat == 0)
    {
        printf("the pattern must not be empty\n");
        return 1;
    }

    if ((pSrc = fopen(argv[1], "rb")) == NULL)
    {
        printf("can not open the src file:%s \n", argv[1]);
        goto cleanup;
    }
    if ((pDest = fopen(argv[2], "wb")) == NULL)
    {
        printf("can not open the dest file:%s \n", argv[2]);
        goto cleanup;
    }

    /* One extra byte for the terminator, so an empty replacement value never
     * turns into malloc(0), which is allowed to return NULL. */
    if ((pPat = malloc((size_t)len_pat + 1)) == NULL)
    {
        printf("encounter error when allocating memory for pattern\n");
        goto cleanup;
    }
    memcpy(pPat, argv[3], (size_t)len_pat + 1);
    if ((pVal = malloc((size_t)len_val + 1)) == NULL)
    {
        printf("encounter error when allocating memory for value\n");
        goto cleanup;
    }
    memcpy(pVal, argv[4], (size_t)len_val + 1);

    /* replace start */
    matchPat(pSrc, pDest);
    /* replace end */

    if (ferror(pSrc) || ferror(pDest))
    {
        printf("encounter an I/O error while replacing\n");
        goto cleanup;
    }
    /* fclose flushes the stream, so a failure here means lost output. */
    if (fclose(pDest) != 0)
    {
        pDest = NULL;
        printf("can not finish writing the dest file:%s \n", argv[2]);
        goto cleanup;
    }
    pDest = NULL;

    printf("========================================== =====================\n");
    printf("buffer size: %d bytes\n", BUFF_SIZE);
    /* the counters are unsigned long long, so they need %llu */
    printf("src file size: %llu bytes\n", inTotal);
    printf("dest file size: %llu bytes\n", outTotal);
    printf("length of pattern: %u bytes\n", len_pat);
    printf("length of replace value: %u bytes\n", len_val);
    printf("match count: %llu \n", match);
    printf("========================================== =====================\n");
    rc = 0;

cleanup:
    /* clean the resource */
    if (pDest != NULL)
    {
        fclose(pDest);
    }
    if (pSrc != NULL)
    {
        fclose(pSrc);
    }
    free(pPat);
    pPat = NULL;
    free(pVal);
    pVal = NULL;
    return rc;
}
1. 无限内存中的字符串匹配
假设内存足够大,所以待处理的文本和要匹配的串都可以完全放入内存,且处理的文本为text[tsize], 要匹配的串为pat[psize],要替换的串为val[vsize] ,此时字符串匹配最简单的方式伪代码如下:
input: text[tsize] //待处理的文本大小为tsize
pat[psize] //要匹配的模式
val[vsize] //要替换的值
output: ret //处理好后的文本
begin
int i, j;
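for(i=0; i<tsize; ) {
    //从text[i]开始逐字符与pat比较(假设psize>0)
    for(j=0; j<psize && i+j<tsize && text[i+j]==pat[j]; j++)
        ;
    if(j==psize) { 把val[0..vsize-1]追加到ret; i+=psize; } //匹配成功,输出替换值
    else { 把text[i]追加到ret; i++; } //匹配失败,原样输出一个字符
}
end
(注:原文此处的循环体在提取时丢失,以上是根据上文定义重建的朴素匹配写法.)
2. 有限内存中的字符串匹配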
当内存有限时,最极端的情况是,每一次从文件中读一个byte,且和pattern进行匹配,这样就会增加读文件的I/O次数. 故应该设置in_buff,用于读入一段适合内存大小的数据,当其中的数据处理完后才再次读入in_buff大小的数据; out_buff用于存放处理好的数据,当out_buff满了才输出. 这样就可以有效地减少读数据和写数据的I/O次数.
实验表明,在同一个实验环境下(环境配置不详细说明),使用1个字节的缓存比使用1024个字节的缓存读取和写入数据多花3倍的时间.
相关的c语言实现的code如下:
/*
* replace.c
*
* Created on: Jan 27, 2010
* Author: grand
*/
/*
 * Large-file support. These feature-test macros must be defined before the
 * first system header is included. The glibc-internal __USE_* macros are
 * derived from them by the C library itself and are reserved identifiers,
 * so user code does not define them.
 */
#ifndef _FILE_OFFSET_BITS
#define _FILE_OFFSET_BITS 64
#endif
#ifndef _LARGEFILE64_SOURCE
#define _LARGEFILE64_SOURCE
#endif
#include <stdio.h>   /* FILE, fopen, fread, fwrite, fclose, feof, printf */
#include <stdlib.h>  /* malloc, free */
#include <string.h>  /* memcpy, strlen */
#define BUFF_SIZE 1024 /* size in bytes of in_buff and out_buff; may be tuned from 1 byte upwards */
#define true 1
#define false 0
/* Staging buffers: in_buff is filled by inBuff(), out_buff is filled by outBuff(). */
char in_buff[BUFF_SIZE];
char out_buff[BUFF_SIZE];
/* The pattern to search for and its length in bytes. */
char* pPat = NULL;
unsigned int len_pat = 0;
/* The replacement value for the pattern and its length in bytes. */
char* pVal = NULL;
unsigned int len_val = 0;
/* Statistics: bytes read, number of matches, bytes written. */
unsigned long long inTotal = 0;
unsigned long long match = 0;
unsigned long long outTotal = 0;
/*
 * Print the command-line synopsis to stdout.
 *
 * The argument order matches what main() reads: argv[1] is the source file,
 * argv[2] the destination file, argv[3] the pattern and argv[4] the
 * replacement value.
 */
void usage(void)
{
    printf("usage: replace <src_file> <dest_file> <pattern> <value>\n");
}
/*
 * Append the bytes pBuff[in_start .. in_end-1] to the output, staging them in
 * out_buff so that pDest is written in larger pieces.
 *
 * pDest    - destination stream.
 * pBuff    - source of the bytes; ignored when flush is true.
 * in_start - index of the first byte to copy.
 * in_end   - index one past the last byte to copy.
 * flush    - when true, write whatever is staged in out_buff to pDest and
 *            return without looking at pBuff.
 *
 * outTotal is advanced by the number of bytes fwrite() reports as written.
 */
void outBuff(FILE * pDest, char* pBuff, int in_start, int in_end, int flush)
{
    static unsigned int outPos = 0; /* number of bytes currently staged in out_buff */
    unsigned int len;

    /* force to write data to file. */
    if (flush == true)
    {
        outTotal += fwrite(out_buff, sizeof(char), outPos, pDest);
        outPos = 0; /* the staged bytes are consumed; a second flush must not repeat them */
        return;
    }

    /* An empty or inverted range has nothing to copy. Rejecting it here also
     * keeps a negative length from wrapping to a huge unsigned count below. */
    if (pBuff == NULL || in_start < 0 || in_end <= in_start)
    {
        return;
    }
    len = (unsigned int)(in_end - in_start);

    /* write data into out_buffer */
    if (BUFF_SIZE - outPos > len)
    {
        memcpy(out_buff + outPos, pBuff + in_start, len);
        outPos += len;
    }
    /* the out_buff is full, so write the staged bytes and then the new bytes to file. */
    else
    {
        outTotal += fwrite(out_buff, sizeof(char), outPos, pDest);
        outTotal += fwrite(pBuff + in_start, sizeof(char), len, pDest);
        outPos = 0;
    }
}
/*
 * Refill in_buff with up to BUFF_SIZE bytes from pSrc.
 *
 * Adds the number of bytes obtained to inTotal and returns that number.
 */
int inBuff(FILE * pSrc)
{
    const int got = fread(in_buff, 1, BUFF_SIZE, pSrc);

    inTotal += got;
    return got;
}
/*
 * NOTE(review): this span is damaged and does not compile as shown.
 * Text is missing between `temp` and `inPos` on the `while` line below.
 * What follows that gap uses pSrc, inPos, in_len and i, none of which are
 * declared in adjust(), and contains a bare `return;` although adjust()
 * returns int. main() calls matchPat(pSrc, pDest), which is not defined
 * anywhere else in view, so the lines after the gap presumably belong to
 * matchPat() - TODO restore both functions from the original source.
 * The expressions `in_len patPos` and `i patPos` also appear to have lost
 * an operator (possibly '-', as in the other functions) - confirm.
 */
int adjust(int patPos, FILE* pDest)
{
int help = patPos; // keeps the incoming patPos, which is reset to 0 below
int temp = 1;
int beg = 1;
patPos = 0;
// NOTE(review): the loop condition and the text after it are truncated here.
while (temp inPos)
outBuff(pDest, in_buff, inPos, in_len patPos, false);
inPos = 0;
// input exhausted: flush the bytes staged by outBuff() and stop
if (feof(pSrc))
{
outBuff(pDest, NULL,0, 0, true);
return;
}
// refill in_buff; in_len receives the byte count returned by inBuff()
in_len = inBuff(pSrc);
}
// succeed in matching a pattern
else
{
// emit the pending input bytes from in_buff when the range is non-empty
if (i patPos > inPos)
outBuff(pDest, in_buff, inPos, i patPos, false);
inPos = i;
// emit the replacement value (len_val bytes of pVal) in place of the match
outBuff(pDest, pVal, 0, len_val, false);
patPos = 0;
match++;
}
}
}
/*
 * Entry point. Expects exactly four arguments:
 *   argv[1] source file, argv[2] destination file,
 *   argv[3] pattern to search for, argv[4] replacement value.
 *
 * Copies the pattern and value into pPat/pVal, runs matchPat() over the two
 * streams and prints the statistics gathered in inTotal, outTotal and match.
 *
 * Returns 0 on success and 1 on a usage error, an empty pattern, a file that
 * cannot be opened, an allocation failure or an I/O error. Every path through
 * the cleanup label releases the files and buffers acquired so far.
 */
int main(int argc, char** argv)
{
    FILE* pSrc = NULL;
    FILE* pDest = NULL;
    int rc = 1;

    if (argc != 5)
    {
        usage();
        return 1;
    }

    len_pat = (unsigned int)strlen(argv[3]);
    len_val = (unsigned int)strlen(argv[4]);
    /* An empty pattern cannot be searched for; reject it before touching any file. */
    if (len_pat == 0)
    {
        printf("the pattern must not be empty\n");
        return 1;
    }

    if ((pSrc = fopen(argv[1], "rb")) == NULL)
    {
        printf("can not open the src file:%s \n", argv[1]);
        goto cleanup;
    }
    if ((pDest = fopen(argv[2], "wb")) == NULL)
    {
        printf("can not open the dest file:%s \n", argv[2]);
        goto cleanup;
    }

    /* One extra byte for the terminator, so an empty replacement value never
     * turns into malloc(0), which is allowed to return NULL. */
    if ((pPat = malloc((size_t)len_pat + 1)) == NULL)
    {
        printf("encounter error when allocating memory for pattern\n");
        goto cleanup;
    }
    memcpy(pPat, argv[3], (size_t)len_pat + 1);
    if ((pVal = malloc((size_t)len_val + 1)) == NULL)
    {
        printf("encounter error when allocating memory for value\n");
        goto cleanup;
    }
    memcpy(pVal, argv[4], (size_t)len_val + 1);

    /* replace start */
    matchPat(pSrc, pDest);
    /* replace end */

    if (ferror(pSrc) || ferror(pDest))
    {
        printf("encounter an I/O error while replacing\n");
        goto cleanup;
    }
    /* fclose flushes the stream, so a failure here means lost output. */
    if (fclose(pDest) != 0)
    {
        pDest = NULL;
        printf("can not finish writing the dest file:%s \n", argv[2]);
        goto cleanup;
    }
    pDest = NULL;

    printf("========================================== =====================\n");
    printf("buffer size: %d bytes\n", BUFF_SIZE);
    /* the counters are unsigned long long, so they need %llu */
    printf("src file size: %llu bytes\n", inTotal);
    printf("dest file size: %llu bytes\n", outTotal);
    printf("length of pattern: %u bytes\n", len_pat);
    printf("length of replace value: %u bytes\n", len_val);
    printf("match count: %llu \n", match);
    printf("========================================== =====================\n");
    rc = 0;

cleanup:
    /* clean the resource */
    if (pDest != NULL)
    {
        fclose(pDest);
    }
    if (pSrc != NULL)
    {
        fclose(pSrc);
    }
    free(pPat);
    pPat = NULL;
    free(pVal);
    pVal = NULL;
    return rc;
}