题目:给定一个目录,统计该目录及其子目录下所有文件内容中的词频,并输出最多的词频。假设所有的文件都是txt格式,且所有的内容都是英文。
分析:
1、可以用Dictionary来存储每个词的出现次数
2、时间复杂度至少为O(n)
分析:
1、可以用Dictionary来存储每个词的出现次数
2、时间复杂度至少为O(n)
似乎无法再进行优化了。那么我们通过多线程来并行处理。C#多线程非常方便。代码如下:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.IO;
namespace MyTest
{
class Program
{
static object _sync = new object();
static int _unreadFileCount;
static AutoResetEvent _event = new AutoResetEvent(false);
static Dictionary<string, int> wordsCount = new Dictionary<string, int>();
static void Main(string[] args)
{
string path = @"D:\Users\ruiyli\testData";
List<FileInfo> files = getFiles(path);
_unreadFileCount = files.Count;
foreach (var file in files)
{
CountWordsAsync(file);
}
_event.WaitOne();
int i = 0;
foreach (var kv in wordsCount.OrderByDescending(o => o.Value))
{
Console.WriteLine(kv.Key + ":" + kv.Value);
i++;
if (i > 10)
{
break;
}
}
Console.ReadKey();
}
static async void CountWordsAsync(FileInfo file)
{
try
{
//Console.WriteLine(file.FullName);
using (StreamReader sr = new StreamReader(file.FullName))
{
StringBuilder word = new StringBuilder();
string line;
while ((line = await sr.ReadLineAsync()) != null)
{
for (int i = 0; i < line.Length; i++ )
{
if (check(line[i]))
{
word.Append(line[i]);
}
else
{
if (word.Length > 0)
{
string words = word.ToString();
lock (_sync)
{
if (!wordsCount.ContainsKey(words))
{
wordsCount[words] = 1;
}
else
{
wordsCount[words]++;
}
}
word.Clear();
}
}
}
}
}
}
catch (Exception e)
{
}
lock (_sync)
{
_unreadFileCount--;
if (_unreadFileCount == 0)
{
_event.Set();
}
}
}
static bool check(char c)
{
return char.IsLetterOrDigit(c);
}
static List<FileInfo> getFiles(string path)
{
List<FileInfo> files = new List<FileInfo>();
DirectoryInfo di = new DirectoryInfo(path);
files.AddRange(di.GetFiles());
foreach (var d in di.GetDirectories())
{
files.AddRange(getFiles(path + @"\" + d.Name));
}
return files;
}
}
}