全文以SSD6的Exercise4为例。
任务目的,优化程序的时间效率。
/* substitute -- substitute strings in a list of files
This program operates on a set of files listed on
the command line. The first file specifies a list of
string substitutions to be performed on the remaining
files. The list of string substitutions has the form:
"string 1" "replacement 1"
"string 2" "replacement 2"
...
If a string contains a double quote character or
a backslash character, escape the character with
backslash: "\"" denotes the string with one double
quote character. "\\" contains one backslash.
Each file is searched for instances of "string 1".
Any occurrences are replaced with "replacement 1".
In a similar manner, all "string 2"s are replaced
with "replacement 2"s, and so on.
The results are written to the input file. Be sure
to keep a backup of files if you do not want to lose
the originals when you run this program.
*/
#include "afx.h"
#include "iostream"
using namespace std;
// Parse one double-quoted string out of *buffer.
//
// The search for the opening quote begins at index `start`. Inside the
// quotes a backslash escapes the character that follows it, so \" yields a
// literal double quote and \\ yields a single backslash.
//
// buffer: line of text to scan (not modified)
// start:  index at which to start looking for the opening quote
// str:    receives the unescaped contents; it is cleared only once an
//         opening quote has been found
//
// Returns the index just past the closing quote, or -1 when there is no
// opening quote, no closing quote, or the text ends right after a backslash.
int parse1(CString* buffer, int start, CString* str) {
    // look for initial quote:
    int i = buffer->Find('\"', start);
    if (i == -1) {
        return -1;
    }
    const int length = buffer->GetLength();
    str->Empty();
    // skip over the opening double-quote, then scan up to the closing one:
    for (i++; i < length; i++) {
        if ((*buffer)[i] == '\\') {
            // step onto the escaped character; it is copied exactly once by
            // the append below
            i++;
            if (i >= length) {
                break;  // dangling backslash: the string is unterminated
            }
        } else if ((*buffer)[i] == '\"') {
            return i + 1;
        }
        *str += (*buffer)[i];
    }
    return -1;
}
// Read a "pattern" "replacement" pair from one line of text.
// The replacement is searched for starting where the pattern's closing
// quote ended. Returns true only when both quoted strings were parsed.
bool parse(CString* buffer, CString* pattern, CString* replacement) {
    const int after_pattern = parse1(buffer, 0, pattern);
    if (after_pattern >= 0) {
        const int after_replacement = parse1(buffer, after_pattern, replacement);
        return after_replacement >= 0;
    }
    return false;
}
// Replace occurrences of *pattern in *data with *replacement, editing *data
// in place.
//
// data:        text to modify
// pattern:     string to search for
// replacement: string written where each match was found
//
// Every search starts again at index 0, so text produced by an earlier
// replacement is searched again on the following pass.
// NOTE(review): because of that restart the loop cannot finish when the
// replacement itself contains the pattern; an empty pattern presumably has
// the same effect (confirm what CString::Find returns for an empty string).
void substitute(CString* data, CString* pattern, CString* replacement) {
int loc;
// find every occurrence of pattern (each search restarts at index 0):
for (loc = data->Find(*pattern, 0); loc >= 0; loc = data->Find(*pattern, 0)) {
// delete the pattern string from loc:
data->Delete(loc, pattern->GetLength());
// insert each character of the replacement string, one Insert call per
// character, so the replacement ends up at loc in its original order:
for (int i = 0; i < replacement->GetLength(); i++) {
data->Insert(loc + i, (*replacement)[i]);
}
}
}
// Apply every substitution listed in the file *subs_filename to *data.
//
// data:          text to modify in place
// subs_filename: path of the file holding one "pattern" "replacement" pair
//                per line
//
// Blank lines are skipped. The first line that cannot be parsed is reported
// on cout and processing stops there; substitutions from earlier lines have
// already been applied. A CFileException raised while opening or reading the
// file is reported on cout and not propagated.
void do_substitutions(CString* data, CString* subs_filename) {
    TRY {
        CStdioFile file(*subs_filename, CFile::modeRead);
        CString buffer;  // holds line from file
        CString pattern;
        CString replacement;
        // ReadString reports end of file through its return value, so an
        // empty line is no longer mistaken for the end of the list
        while (file.ReadString(buffer)) {
            if (buffer.GetLength() == 0) {
                continue;
            }
            if (!parse(&buffer, &pattern, &replacement)) {
                cout << "Bad pattern/replacement line: " << buffer << endl;
                return;
            }
            substitute(data, &pattern, &replacement);
        }
    }
    CATCH(CFileException, e) { cout << "File could not be opened or read " << e->m_cause << endl; }
    END_CATCH
}
// Run all substitutions on one file and write the result back over it.
//
// filename:      path of the file to rewrite in place
// subs_filename: path of the substitution list, forwarded to
//                do_substitutions
//
// The file is read as a zero-terminated string, so content after an embedded
// zero byte is not carried over. A CFileException is reported on cout
// together with the file name and not propagated.
void process_file(CString* filename, CString* subs_filename) {
    TRY {
        // read in filename to a CString
        CFile file(*filename, CFile::modeRead);
        int size = file.GetLength();
        // read the data, allocate more than we need
        char* data = new char[size + 16];
        // Read may deliver fewer bytes than requested; terminate the string
        // after what actually arrived
        int bytes_read = file.Read(data, size);
        // files are not zero-terminated but string should be:
        data[bytes_read] = 0;
        // now we can make a CString from the data:
        CString content(data);
        delete[] data;  // array form must match new char[...]
        do_substitutions(&content, subs_filename);
        // write the data: reopen for writing, overwrite from the start, then
        // cut the file to the new length in case the content became shorter
        file.Close();
        file.Open(*filename, CFile::modeWrite);
        file.Write(content, content.GetLength());
        file.SetLength(content.GetLength());
        file.Close();
    }
    CATCH(CFileException, e) { cout << "File could not be opened or read " << e->m_cause << " " << *filename << endl; }
    END_CATCH
}
// Entry point: argv[1] names the substitution list, every later argument
// names a file to rewrite. Prints a usage message when fewer than two
// arguments are supplied. The exit status is always 0.
int main(int argc, char* argv[]) {
    const bool enough_arguments = (argc >= 3);
    if (!enough_arguments) {
        cout << "Not enough input arguments" << endl;
        cout << "Usage: substitute subs-file src1 src2 ..." << endl;
        return 0;
    }

    CString subs_filename(argv[1]);
    int index = 2;
    while (index < argc) {
        CString filename(argv[index]);
        process_file(&filename, &subs_filename);
        ++index;
    }
    return 0;
}
程序做的事简单来说就是从replace.txt里获得模式串和替换串,然后遍历其他文件进行字符串替换。
性能优化任务,首先我们要进行性能检测。
性能检测的方式有很多,我选择VS自带的性能探测器,
分别用检测选项和CPU使用率选项,对程序检测的结果如下图。
可以看到,最占时间的是substitute里的for循环。
这里做的事是遍历文件找匹配串,我们看for循环的条件,每一轮的loc的值,是调用find函数拿到的,而find函数的第二个参数是起始位置,所以显而易见我们不需要每一轮都从文件开头开始find,所以这里需要这样改。
// Replace every occurrence of *pattern in *data with *replacement, editing
// *data in place.
//
// data:        text to modify
// pattern:     string to search for; an empty pattern leaves *data untouched
// replacement: string written where each match was found
//
// Each search resumes just past the text that was inserted, so inserted text
// is never searched again. This keeps the scan a single left-to-right pass
// and lets the loop finish even when the replacement contains the pattern.
void substitute(CString* data, CString* pattern, CString* replacement) {
    const int pattern_length = pattern->GetLength();
    if (pattern_length == 0) {
        return;  // nothing sensible to search for
    }
    const int replacement_length = replacement->GetLength();

    // find every occurrence of pattern:
    int loc = data->Find(*pattern, 0);
    while (loc >= 0) {
        // delete the pattern string from loc:
        data->Delete(loc, pattern_length);
        // insert the whole replacement string in one call:
        data->Insert(loc, *replacement);
        // continue behind the inserted text, not at its first character
        loc = data->Find(*pattern, loc + replacement_length);
    }
}
同时insert这里不需要一个字符一个字符插入,因为插入一个字符和一个字符串的效率是差不多的。
这里改完之后,程序执行时间由0.28降到0.25。
继续通过CPU占用率查看,发现IO操作占了很大一块时间,在这里也就是读取文件占时间很多,是一个可以优化的方向。
首先我们看程序的整体逻辑是遍历五个文件,然后每次遍历都要去从replace.txt里读字符串,这要执行5次不必要的IO操作,所以我们可以把这段重新写一下,改成先读replace.txt,然后把模式串和替换串都存在数组里,需要的时候直接操作数组就好,减少IO的时间。
void myIO(CStdioFile* fileSub, CString patterns[], CString replacements[]) {
CString buffer;
int i = 0;
while (true) {
fileSub->ReadString(buffer);
// handle end of file
if (buffer.GetLength() == 0)
break;
if(parse(&buffer, &patterns[i], &replacements[i])){
}else {
cout << "Bad pattern/replacement line: " << buffer << endl;
return;
}
i++;
}
}
int main(int argc, char* argv[]) {
if (argc < 3) {
cout << "Not enough input arguments" << endl;
cout << "Usage: substitute subs-file src1 src2 ..." << endl;
} else {
CString subs_filename(argv[1]);
CStdioFile fileSub(subs_filename, CFile::modeRead);
CString patterns[20];
CString replacements[20];
myIO(&fileSub, patterns, replacements);
for (int i = 2; i < argc; i++) {
CString filename(argv[i]);
process_file(&filename, &fileSub, patterns, replacements);
}
}
return 0;
}
这里优化完之后执行时间由0.25减少到0.22。
接着查看CPU占用率,发现delete占比还是很高,于是我试着用replace方法代替,结果发现效率真的变高了。
void do_substitutions(CString* data, CStdioFile* fileSub, CString patterns[], CString replacements[]) {
TRY {
int i = 0;
while (true) {
// fileSub->ReadString(buffer);
// handle end of file
if (patterns[i].GetLength() == 0)
break;
//substitute(data, &patterns[i], &replacements[i]);
data->Replace(patterns[i], replacements[i]);
i++;
}
}
CATCH(CFileException, e) { cout << "File could not be opened or read " << e->m_cause << endl; }
END_CATCH
}
可能是replace的实现比较优秀吧,并且减少了一次函数调用。
然后接着改了几处getLength重复调用的情况。
最后一处优化点在读文件的地方。
5个文件每次都是读一遍关闭再打开再写入,我们可以让他只打开一次。
// Open once for both reading and writing so the file does not have to be
// closed and reopened between the two phases.
CFile file(*filename, CFile::modeReadWrite);
int size = file.GetLength();
// read the data, allocate more than we need
char* data = new char[size + 16];
// Read may deliver fewer bytes than requested; terminate the string after
// what actually arrived
int bytes_read = file.Read(data, size);
// files are not zero-terminated but string should be:
data[bytes_read] = 0;
// now we can make a CString from the data:
CString content(data);
delete[] data;  // array form must match new char[...]
do_substitutions(&content, fileSub, patterns, replacements);
// write the data: rewind to the start, overwrite, then cut the file to the
// new length in case the content became shorter
file.SeekToBegin();
int32_t len = content.GetLength();
file.Write(content, len);
file.SetLength(len);
file.Close();
将CFile的打开模式由modeRead改成modeReadWrite,读完文件之后调用SeekToBegin把文件指针移回文件开头,接着直接写入就可以了。
总结
优化的点:
- 遍历字符串尽量保证不重复。
- 尽量处理整个字符串,避免一个字符一个字符读取或写入。
- 减少I/O操作,能一次处理完尽量一次处理完。
- 读写文件只打开一次即可。