词频统计---python与C++的执行效率分析

最新推荐文章于 2022-09-08 17:59:32 发布

ActionLi

最新推荐文章于 2022-09-08 17:59:32 发布

阅读量6.1k

点赞数

分类专栏： python c++ 文章标签： c++ string python vector iostream iterator

本文链接：https://blog.csdn.net/ActionLi/article/details/1645123

版权

c++ 同时被 2 个专栏收录

9 篇文章 0 订阅

订阅专栏

python

5 篇文章 0 订阅

订阅专栏

Question: 一个全英文文本，统计每个单词出现的次数，按次数从大到小排列，输出到文本文件中。
Data：MIT的Python教程---Python Programming : An Introduction to Computer Science，大约800K。
C++:

#include <stdafx.h>
#include <windows.h>
#include <fstream>
#include <map>
#include <algorithm>
#include <vector>
#include <string>
#include <iostream>
// CThreadTime reports the CPU time (kernel + user) that the calling thread
// consumed between two calls, in milliseconds.
//
// Usage:
//
//   CThreadTime ElapsedTime;
//   ElapsedTime.BeginGetElapsedTime();
//   // TODO: your performance code
//   __int64 nThreadTime = ElapsedTime.EndGetElapsedTime();
//

class CThreadTime
{
public:
    // Records the thread's current kernel/user times as the starting point.
    void BeginGetElapsedTime();

    // Returns the kernel + user time, in ms, spent since BeginGetElapsedTime().
    __int64 EndGetElapsedTime();

private:
    // Packs the two 32-bit halves of a FILETIME into one 64-bit integer.
    __int64 FileTimeToQuadWord(PFILETIME pft);

    FILETIME ftKernelTimeStart;  // kernel time captured by BeginGetElapsedTime()
    FILETIME ftKernelTimeEnd;    // kernel time captured by EndGetElapsedTime()
    FILETIME ftUserTimeStart;    // user time captured by BeginGetElapsedTime()
    FILETIME ftUserTimeEnd;      // user time captured by EndGetElapsedTime()
    FILETIME ftDummy;            // sink for the GetThreadTimes() outputs that are not used
};

// Start a measurement: remember how much kernel and user CPU time the
// calling thread has used so far.
inline void CThreadTime::BeginGetElapsedTime()
{
    // The first two outputs of GetThreadTimes() are not needed here, so both
    // are written to the throw-away member ftDummy.
    GetThreadTimes(GetCurrentThread(),
                   &ftDummy,
                   &ftDummy,
                   &ftKernelTimeStart,
                   &ftUserTimeStart);
}

// Finish a measurement: return the kernel + user CPU time, in milliseconds,
// that the calling thread used since BeginGetElapsedTime().
inline __int64 CThreadTime::EndGetElapsedTime()
{
    GetThreadTimes(GetCurrentThread(),
                   &ftDummy,
                   &ftDummy,
                   &ftKernelTimeEnd,
                   &ftUserTimeEnd);

    const __int64 kernelTicks =
        FileTimeToQuadWord(&ftKernelTimeEnd) - FileTimeToQuadWord(&ftKernelTimeStart);
    const __int64 userTicks =
        FileTimeToQuadWord(&ftUserTimeEnd) - FileTimeToQuadWord(&ftUserTimeStart);

    // The values are counted in 100 ns units, so 10000 of them make one ms.
    const __int64 ticksPerMillisecond = 10000;
    return (kernelTicks + userTicks) / ticksPerMillisecond;
}

// Combine the high and low 32-bit halves of *pft into one 64-bit value.
//
// A plain 64-bit shift is used on purpose. Int64ShllMod32() is only specified
// for shift counts 0..31, so shifting by 32 with it is outside its contract
// (on 32-bit x86 the count is masked and the high half would not move at all).
// The shift is done on an unsigned value so a set top bit is well defined.
inline __int64 CThreadTime::FileTimeToQuadWord(PFILETIME pft)
{
    const unsigned __int64 high = static_cast<unsigned __int64>(pft->dwHighDateTime) << 32;
    return static_cast<__int64>(high | pft->dwLowDateTime);
}
// Everything above is the run-time measurement helper (taken from the web);
// the word-count program itself starts here.

// The functions below use unqualified standard-library names and the two
// typedefs; without these declarations this listing does not compile.
using namespace std;

// Maps each word to the number of times it occurs.
typedef map<string, int> wordc;

// One (word, count) entry, copied out of a wordc so it can be sorted.
typedef pair<string, int> wordp;

// Sort predicate: returns true when wa has a strictly larger count than wb,
// which orders entries from most to least frequent.
bool countcmp(const wordp &wa, const wordp &wb)
{
    return wa.second > wb.second;
}
void wordscount()
{
   string ifile("d://words.txt"),ofile("d://wordscount.txt");
   ifstream infile(ifile.c_str());
if(infile){
    wordc wordsmap;
    string str;
    while(infile >> str){
      ++wordsmap[str];
    }
    ofstream outfile(ofile.c_str());
    wordc::iterator iter=wordsmap.begin();
   vector<wordp> vv(wordsmap.begin(),wordsmap.end());
   stable_sort(vv.begin(),vv.end(),countcmp);
   vector<wordp> ::iterator viter=vv.begin();
   while(viter != vv.end()){
      outfile<<viter->first<<" "<<viter->second<<"/n";
      viter++;
   }
   outfile.close();
}//if(infile)

infile.close();

}
int main()
{
CThreadTime   ElapsedTime;
ElapsedTime.BeginGetElapsedTime();

     for(int i=0;i<100;i++)
        wordscount();

int   nThreadTime   =   ElapsedTime.EndGetElapsedTime();
   cout<<"have used time(millsecond): "<<nThreadTime/100<<endl;
   return 0;
}

WinXp,VS2005开发平台,用时260ms

#include < fstream >

#include < map >

#include < algorithm >

#include < vector >

#include < string >

#include < iostream >

#include < time.h >

using namespace std;

// Maps each word to the number of times it occurs.
typedef std::map<std::string, int> wordc;

// One (word, count) entry, copied out of a wordc so it can be sorted.
typedef std::pair<std::string, int> wordp;

// Sort predicate for (word, count) entries: true when wa's count is strictly
// greater than wb's, so sorting with it lists the most frequent words first.
bool countcmp(const wordp &wa, const wordp &wb)
{
    return wb.second < wa.second;
}

void wordscount()

{

string ifile("./words"),ofile("./wordscount");

ifstream infile(ifile.c_str());

if(infile){

wordc wordsmap;

string str;

while(infile >> str){

++wordsmap[str];

}

ofstream outfile(ofile.c_str());

wordc::iterator iter=wordsmap.begin();

vector<wordp> vv(wordsmap.begin(),wordsmap.end());

stable_sort(vv.begin(),vv.end(),countcmp);

vector<wordp> ::iterator viter=vv.begin();

while(viter != vv.end()){

outfile<<viter->first<<" "<<viter->second<<" ";

viter++;

}

outfile.close();

}//if(infile)

infile.close();

}

// Run wordscount() 100 times and print the average processor time per run,
// in milliseconds.
int main()
{
    const int runs = 100;

    const clock_t start = clock();
    for (int i = 0; i < runs; i++)
        wordscount();
    const clock_t finish = clock();

    // clock() counts ticks, CLOCKS_PER_SEC of them per second. Converting via
    // that constant replaces the hard-coded 100000, which was only right when
    // CLOCKS_PER_SEC is 1000000; difftime() is meant for time_t, not ticks.
    const double totalMs = static_cast<double>(finish - start) * 1000.0 / CLOCKS_PER_SEC;
    cout << "the time is : " << totalMs / runs << endl;

    return 0;
}

Linux平台，g++ 4.1.2 用时200ms
Python:

import timeit

import string

def compareitems(item1, item2):
    """Three-way comparison of two (word, count) pairs.

    Orders pairs by count, larger count first; pairs with the same count are
    ordered by word, ascending.

    Args:
        item1: a (word, count) pair.
        item2: a (word, count) pair.

    Returns:
        -1 if item1 sorts before item2, 1 if it sorts after, 0 if both the
        counts and the words are equal.
    """
    # Unpack inside the body: tuple parameters in the signature and the
    # built-in cmp() only exist in Python 2.
    w1, c1 = item1
    w2, c2 = item2
    if c1 > c2:
        return -1
    elif c1 == c2:
        # Same result as cmp(w1, w2).
        return (w1 > w2) - (w1 < w2)
    else:
        return 1

def main(inpath="d:/words.txt", outpath="d:/wordscount.txt"):
    """Count the words of a text file and write them out by frequency.

    Reads the whole of `inpath`, splits it on whitespace, counts every
    distinct word and writes one "word count" line per word to `outpath`.
    Words are ordered by count, largest first, and by word among equal
    counts (the same order compareitems defines).

    Args:
        inpath: text file to read; defaults to the path used by the article.
        outpath: result file to write; defaults to the path used by the article.
    """
    wordf = open(inpath, "r")
    try:
        words = wordf.read().split()
    finally:
        wordf.close()

    # An earlier, commented-out variant split the file line by line and
    # tested has_key() before incrementing; the whole-file split replaced it.
    worddict = {}
    for word in words:
        try:
            worddict[word] += 1
        except KeyError:
            worddict[word] = 1

    # A key function gives the compareitems ordering and works where
    # list.sort(cmpfunc) is not available.
    ranked = sorted(worddict.items(), key=lambda item: (-item[1], item[0]))

    wordc = open(outpath, "w")
    try:
        # Join once instead of growing a string inside a loop.
        wordc.write("".join(["%s %d\n" % item for item in ranked]))
    finally:
        wordc.close()

if __name__ == "__main__":
    # The setup statement imports a module named "wordcount", so this is
    # presumably meant to be saved as wordcount.py - confirm before running.
    t = timeit.Timer("wordcount.main()", "import wordcount")

    # Three timing rounds of 100 main() calls each; prints the three totals
    # that timeit reports.
    print(t.repeat(3, 100))

    # t.timeit()

winxp平台, python 2.5 用时 402ms
效率分析:
这两处代码写得都不够优化,但大体反映了C++与python不同的执行效率.
但看到python不到40 行的代码数,优雅的语法,为什么fans喜爱它就不言自明了.

[注]转载请注明出处
本文作为python代码3，题目是一个月前(6.9)想好的，直到今天才真正的写起来，可能很多时候确实没有时间，但是一拖再拖的原因有二： 1. 目标没有侧重点，究竟为什么要写这样一个系列，我本意是为google编程竞赛作些准备，但是却被眼前的事情所累，准备接项目，准备开源竞赛，准备购买基金，准备买数码相机...小事很多，我又在小事中间读些BBS，查看Email，玩玩卡丁车； 2. 最近心态不好，又找不到有效的舒缓渠道，很多事情做不下去...