// 这是网上看到的代码,折腾了一会儿。一开始觉得用哈希表效率比较高,但改完代码后发现内存占用太大:50 万条记录要 50 多 MB 内存;而直接使用 Stream 或内存映射文件(MMF)时,内存占用只有 3 MB 左右。两者的查询效率相差不大,都是毫秒级,只是使用哈希表时耗时一直显示为 0 ms。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.IO.MemoryMappedFiles;
using System.Diagnostics;
namespace ConsoleApplication1
{
/// <summary>
/// Console front end: opens the dictionary files and answers look-ups typed
/// on standard input until the input stream ends.
/// </summary>
class Program
{
    /// <summary>
    /// Program entry point.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        // To (re)build the dictionary files, uncomment the sample below and
        // run it once before querying:
        //Dict builder = new Dict("病症");
        //builder.Add("中国", "China");
        //builder.Add("北京", "beijing");
        //builder.Add("中国人", "Chinese");
        //for (int i = 0; i < 100000; i++)
        //{
        //    builder.Add("中国人1" + i, "Chinese1" + i);
        //    builder.Add("中国人2" + i, "Chinese2" + i);
        //    builder.Add("中国人3" + i, "Chinese3" + i);
        //    builder.Add("中国人4" + i, "Chinese4" + i);
        //}
        //builder.Save();

        Dict dict = new Dict();
        while (true)
        {
            Console.Write("请输入词语:");
            string w = Console.ReadLine();
            if (w == null)
            {
                // ReadLine returns null once standard input is exhausted
                // (redirected file, closed pipe, Ctrl+Z / Ctrl+D). Without
                // this check the loop would spin forever.
                break;
            }

            // The timed region includes the console writes, as before.
            Stopwatch sw = Stopwatch.StartNew();
            Console.WriteLine("找到词语:");
            Console.WriteLine(dict.GetDescription(w));
            sw.Stop();
            Console.WriteLine("耗时:" + sw.ElapsedMilliseconds + "ms");
        }
    }
}
/// <summary>
/// A file-backed dictionary made of three files: an info file (name, word
/// count, data size), an index file of fixed-size records sorted by word,
/// and a data file holding the UTF-8 descriptions back to back.
/// Create it with <see cref="Dict(string)"/> to build and <see cref="Save"/>
/// the files, or with <see cref="Dict()"/> to query existing files through
/// memory-mapped views.
/// </summary>
class Dict : IDisposable
{
    /// <summary>
    /// Size in bytes of the zero-padded UTF-8 word field of an index record.
    /// </summary>
    const int WordFieldSize = 128;

    /// <summary>
    /// Size in bytes of one index record: word field + 4-byte offset + 4-byte size.
    /// </summary>
    const int IndexRecordSize = WordFieldSize + 4 + 4;

    DictInfo info;
    SortedList<string, DictIndex> indexs = new SortedList<string, DictIndex>();
    List<DictWord> words = new List<DictWord>();

    /// <summary>
    /// Index file.
    /// </summary>
    string idxFile = "dic.idx";

    /// <summary>
    /// Data file.
    /// </summary>
    string dictfile = "dic.dict";

    /// <summary>
    /// Dictionary information file.
    /// </summary>
    string ifoFile = "dic.ifo";

    MemoryMappedFile idxMMFReader;
    MemoryMappedFile dictMMFReader;
    MemoryMappedViewStream idixStream;
    private MemoryMappedViewStream dictStream;

    /// <summary>
    /// Opens the existing dictionary files for querying.
    /// </summary>
    public Dict()
    {
        LoadDictInfo();
        try
        {
            idxMMFReader = MemoryMappedFile.CreateFromFile(idxFile, FileMode.Open);
            idixStream = idxMMFReader.CreateViewStream();
            dictMMFReader = MemoryMappedFile.CreateFromFile(dictfile, FileMode.Open);
            dictStream = dictMMFReader.CreateViewStream();
        }
        catch
        {
            // Do not leak the mappings that were opened before the failure.
            Dispose();
            throw;
        }
    }

    /// <summary>
    /// Creates an empty in-memory dictionary to be filled with
    /// <see cref="Add"/> and written out with <see cref="Save"/>.
    /// </summary>
    /// <param name="name">Book name stored in the info file.</param>
    public Dict(string name)
    {
        info = new DictInfo { BookName = name, WordCount = 0, CurrentOffset = 0 };
    }

    /// <summary>
    /// Releases the memory-mapped files and their view streams, if any.
    /// Safe to call more than once.
    /// </summary>
    public void Dispose()
    {
        if (idixStream != null)
        {
            idixStream.Dispose();
            idixStream = null;
        }
        if (dictStream != null)
        {
            dictStream.Dispose();
            dictStream = null;
        }
        if (idxMMFReader != null)
        {
            idxMMFReader.Dispose();
            idxMMFReader = null;
        }
        if (dictMMFReader != null)
        {
            dictMMFReader.Dispose();
            dictMMFReader = null;
        }
    }

    /// <summary>
    /// Looks a word up with a binary search over the index file.
    /// </summary>
    /// <param name="word">The word to look up.</param>
    /// <returns>
    /// "[headword]\n" followed by the description. If there is no exact
    /// match, the last entry probed by the search is returned instead, as in
    /// the original implementation; the headword in the result shows which
    /// entry was used. An empty dictionary yields "[]\n".
    /// </returns>
    public string GetDescription(string word)
    {
        EnsureOpenForQuery();

        int low = 0;
        // Valid record numbers are 0 .. WordCount - 1; the previous upper
        // bound of WordCount let the search read one record past the end.
        int high = info.WordCount - 1;
        DictIndex probe = null;
        while (low <= high)
        {
            int mid = low + (high - low) / 2;
            probe = GetWordIndex(mid);
            // Same default comparison the SortedList in Save() orders its
            // keys with, so the search order matches the file order.
            int order = string.Compare(probe.Word, word);
            if (order == 0)
            {
                break;
            }
            if (order > 0)
            {
                high = mid - 1;
            }
            else
            {
                low = mid + 1;
            }
        }

        if (probe == null)
        {
            return "[]\n";
        }
        return "[" + probe.Word + "]\n" + GetWordDescription(probe);
    }

    /// <summary>
    /// Reads the index record at the given position.
    /// </summary>
    /// <param name="wordIndex">Zero-based record number in the index file.</param>
    /// <returns>The decoded word, data offset and data size.</returns>
    public DictIndex GetWordIndex(int wordIndex)
    {
        EnsureOpenForQuery();

        byte[] record = new byte[IndexRecordSize];
        // Multiply as long so a large record number cannot overflow int.
        idixStream.Seek((long)wordIndex * IndexRecordSize, SeekOrigin.Begin);
        // Bytes that could not be read stay zero, as with the original buffers.
        ReadBlock(idixStream, record, record.Length);

        var dicIndex = new DictIndex();
        dicIndex.Word = Encoding.UTF8.GetString(record, 0, WordFieldSize).Replace("\0", "");
        dicIndex.Offset = BitConverter.ToInt32(record, WordFieldSize);
        dicIndex.DataSize = BitConverter.ToInt32(record, WordFieldSize + 4);
        return dicIndex;
    }

    /// <summary>
    /// Reads the description an index record points to.
    /// </summary>
    /// <param name="dictIndex">Index record giving the offset and size in the data file.</param>
    /// <returns>The description decoded as UTF-8.</returns>
    public string GetWordDescription(DictIndex dictIndex)
    {
        if (dictIndex == null)
        {
            throw new ArgumentNullException("dictIndex");
        }
        EnsureOpenForQuery();

        byte[] data = new byte[dictIndex.DataSize];
        dictStream.Seek(dictIndex.Offset, SeekOrigin.Begin);
        // Read from the DATA stream. The original read from the index stream
        // here, which returned bytes of the index file instead.
        int read = ReadBlock(dictStream, data, data.Length);
        return Encoding.UTF8.GetString(data, 0, read).Replace("\0", "");
    }

    /// <summary>
    /// Adds a word and its description to the in-memory dictionary.
    /// </summary>
    /// <param name="word">The word; must not already have been added.</param>
    /// <param name="description">The description text.</param>
    public void Add(string word, string description)
    {
        int dataSize = Encoding.UTF8.GetByteCount(description);

        // Register the index entry first: SortedList.Add throws on a null or
        // duplicate key, and nothing else must change in that case, otherwise
        // the data file written by Save() would contain an unindexed
        // description that shifts every later offset.
        indexs.Add(word, new DictIndex { Word = word, Offset = info.CurrentOffset, DataSize = dataSize });
        words.Add(new DictWord() { Description = description });

        info.WordCount++;
        info.CurrentOffset += dataSize;
    }

    /// <summary>
    /// Loads the name, word count and current offset from the info file.
    /// </summary>
    void LoadDictInfo()
    {
        var infos = File.ReadAllLines(ifoFile);
        if (infos.Length < 3)
        {
            throw new InvalidDataException("Dictionary info file '" + ifoFile + "' must contain BookName, WordCount and CurrentOffset lines.");
        }
        info = new DictInfo
        {
            BookName = infos[0].Replace("BookName=", "").Trim(),
            WordCount = int.Parse(infos[1].Replace("WordCount=", "").Trim()),
            CurrentOffset = int.Parse(infos[2].Replace("CurrentOffset=", "").Trim()),
        };
    }

    /// <summary>
    /// Writes the info, data and index files, replacing any existing ones.
    /// </summary>
    public void Save()
    {
        StringBuilder infoText = new StringBuilder();
        infoText.AppendLine(string.Format("BookName={0}", info.BookName));
        infoText.AppendLine(string.Format("WordCount={0}", info.WordCount));
        infoText.AppendLine(string.Format("CurrentOffset={0}", info.CurrentOffset));
        File.WriteAllText(ifoFile, infoText.ToString(), Encoding.UTF8);

        // FileMode.Create truncates an existing file. OpenOrCreate kept the
        // old length, leaving stale bytes behind after a shorter rewrite.
        using (BinaryWriter dataWriter = new BinaryWriter(File.Open(dictfile, FileMode.Create)))
        {
            // Descriptions are written in insertion order, which is the order
            // their offsets were assigned in Add().
            foreach (var word in words)
            {
                dataWriter.Write(Encoding.UTF8.GetBytes(word.Description));
            }
        }

        using (BinaryWriter idxWriter = new BinaryWriter(File.Open(idxFile, FileMode.Create)))
        {
            // Records are written in the SortedList's key order so the index
            // file can be binary-searched.
            foreach (var index in indexs)
            {
                // NOTE: words longer than WordFieldSize bytes are truncated,
                // possibly in the middle of a multi-byte character.
                byte[] wordField = new byte[WordFieldSize];
                byte[] wordData = Encoding.UTF8.GetBytes(index.Key);
                Array.Copy(wordData, wordField, Math.Min(WordFieldSize, wordData.Length));
                idxWriter.Write(wordField);
                idxWriter.Write(index.Value.Offset);
                idxWriter.Write(index.Value.DataSize);
            }
        }
    }

    /// <summary>
    /// Throws when the instance has no open query streams, i.e. it was
    /// created for building or has been disposed.
    /// </summary>
    void EnsureOpenForQuery()
    {
        if (idixStream == null || dictStream == null)
        {
            throw new InvalidOperationException("The dictionary is not open for querying.");
        }
    }

    /// <summary>
    /// Reads up to <paramref name="count"/> bytes into the start of
    /// <paramref name="buffer"/>, looping until the stream stops returning data.
    /// </summary>
    /// <returns>The number of bytes actually read.</returns>
    static int ReadBlock(Stream stream, byte[] buffer, int count)
    {
        int total = 0;
        while (total < count)
        {
            int read = stream.Read(buffer, total, count - total);
            if (read <= 0)
            {
                break;
            }
            total += read;
        }
        return total;
    }
}
/// <summary>
/// One dictionary entry's explanation text.
/// </summary>
class DictWord
{
    /// <summary>
    /// The description (explanation) of the word.
    /// </summary>
    public string Description { get; set; }
}
/// <summary>
/// One index record of the dictionary: a word and where its description is stored.
/// </summary>
class DictIndex
{
    /// <summary>
    /// The word.
    /// </summary>
    public string Word { get; set; }

    /// <summary>
    /// Offset of the description.
    /// </summary>
    public int Offset { get; set; }

    /// <summary>
    /// Size of the description data.
    /// </summary>
    public int DataSize { get; set; }
}
/// <summary>
/// Dictionary-level information.
/// </summary>
class DictInfo
{
    /// <summary>
    /// Name of the dictionary.
    /// </summary>
    public string BookName { get; set; }

    /// <summary>
    /// Number of words in the dictionary.
    /// </summary>
    public int WordCount { get; set; }

    /// <summary>
    /// Current offset.
    /// </summary>
    public int CurrentOffset { get; set; }
}
}