本项目需要把数据存档为二进制文件,载入时只载入文件索引,再通过索引快速定位到数据内容,从而实现最小存储和最快速的查找。下面的代码是初步实现;通过扩展,还可以实现搜索引擎的关键字匹配度、权重、分词等效果,这是后话,这里先把最基础的“通过偏移量快速查找”分享一下。
/// <summary>
/// One entry of the index file: a key plus the location of its record
/// inside the data content.
/// </summary>
struct Token
{
/// <summary>
/// Key (identifier) of the record; used as the dictionary key when the
/// index is loaded into memory.
/// </summary>
public string ID;
/// <summary>
/// Byte offset of the record, measured from the start of the UTF-8
/// encoded data content.
/// </summary>
public int Offset;
/// <summary>
/// Length of the record in bytes (UTF-8 encoded).
/// </summary>
public int Length;
}
/// <summary>
/// Builds a small index file / data file pair and looks records up by key
/// using the byte offset stored in the index, so only the index has to be
/// loaded into memory.
/// </summary>
/// <remarks>
/// On-disk layout (unchanged from the original implementation):
/// <list type="bullet">
/// <item>index.txt: a sequence of <see cref="BinaryWriter"/> length-prefixed
/// strings, each of the form <c>id,offset,length</c>.</item>
/// <item>data.txt: one <see cref="BinaryWriter"/> length-prefixed string that
/// is the concatenation of all records. Offsets in the index are relative to
/// the first byte after that length prefix.</item>
/// </list>
/// </remarks>
class Search
{
    private const string IndexFileName = "index.txt";
    private const string DataFileName = "data.txt";

    // Number of demo records generated by BuildFile.
    private const int RecordCount = 15;

    /// <summary>
    /// Generates the index file and the data file, replacing any existing ones.
    /// Each record is <c>id|test programe&lt;random number&gt;</c>, where the id
    /// is the record number followed by today's date as yyyyMMdd.
    /// </summary>
    public void BuildFile()
    {
        Random random = new Random();

        // Local buffer: every call starts from empty content, so repeated calls
        // do not accumulate records from earlier runs.
        StringBuilder content = new StringBuilder();

        // Running byte offset of the next record inside the content.
        int offset = 0;

        // Invariant culture keeps the date digits in the Gregorian calendar
        // regardless of the machine's regional settings.
        string today = DateTime.Today.ToString("yyyyMMdd", System.Globalization.CultureInfo.InvariantCulture);

        // FileMode.Create truncates an existing file, so no separate delete is needed.
        using (FileStream indexStream = new FileStream(IndexFileName, FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
        using (BinaryWriter indexWriter = new BinaryWriter(indexStream, Encoding.UTF8))
        {
            for (int i = 0; i < RecordCount; i++)
            {
                string id = i.ToString(System.Globalization.CultureInfo.InvariantCulture) + today;
                string record = id + "|test programe"
                    + random.Next(10, 305000).ToString(System.Globalization.CultureInfo.InvariantCulture);

                // Offsets and lengths are in UTF-8 bytes, not characters.
                int length = Encoding.UTF8.GetByteCount(record);

                indexWriter.Write(id + ","
                    + offset.ToString(System.Globalization.CultureInfo.InvariantCulture) + ","
                    + length.ToString(System.Globalization.CultureInfo.InvariantCulture));

                content.Append(record);
                offset += length;
            }
        }

        using (FileStream dataStream = new FileStream(DataFileName, FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
        using (BinaryWriter dataWriter = new BinaryWriter(dataStream, Encoding.UTF8))
        {
            // Written as a length-prefixed string to stay compatible with
            // existing data files; ReadFile measures and skips the prefix.
            dataWriter.Write(content.ToString());
        }
    }

    /// <summary>
    /// Loads the whole index file into memory.
    /// </summary>
    /// <returns>A dictionary mapping each record id to its <see cref="Token"/>.</returns>
    /// <exception cref="FormatException">An offset or length field is not a valid integer.</exception>
    /// <exception cref="ArgumentException">The index contains the same id twice.</exception>
    public Dictionary<string, Token> GetTokenDic()
    {
        Dictionary<string, Token> dic = new Dictionary<string, Token>();

        using (FileStream indexStream = new FileStream(IndexFileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
        using (BinaryReader reader = new BinaryReader(indexStream, Encoding.UTF8))
        {
            // Read until the end of the file instead of assuming a fixed entry count.
            while (indexStream.Position < indexStream.Length)
            {
                string entry = reader.ReadString();
                string[] parts = entry.Split(',');

                // Skip entries that do not have the id,offset,length shape.
                if (parts.Length < 3)
                {
                    continue;
                }

                Token token = new Token();
                token.ID = parts[0];
                token.Offset = int.Parse(parts[1], System.Globalization.CultureInfo.InvariantCulture);
                token.Length = int.Parse(parts[2], System.Globalization.CultureInfo.InvariantCulture);
                dic.Add(token.ID, token);
            }
        }

        return dic;
    }

    /// <summary>
    /// Looks up <paramref name="key"/> in the index, reads the matching record
    /// from the data file by seeking to its offset, and writes it to the console.
    /// An unknown key writes an empty line.
    /// </summary>
    /// <param name="key">Record id to look up.</param>
    /// <exception cref="EndOfStreamException">
    /// The data file is shorter than the offset/length recorded in the index.
    /// </exception>
    public void ReadFile(string key)
    {
        Dictionary<string, Token> dic = GetTokenDic();

        Token token;
        if (!dic.TryGetValue(key, out token))
        {
            // Same console output as before for an unknown key, without
            // opening the data file.
            Console.WriteLine(string.Empty);
            return;
        }

        byte[] payload = new byte[token.Length];

        using (FileStream dataStream = new FileStream(DataFileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
        {
            // The content is preceded by a variable-size length prefix
            // (1 byte below 128 bytes of content, 2 below 16384, ...).
            int prefixSize = ReadLengthPrefixSize(dataStream);
            dataStream.Seek((long)prefixSize + token.Offset, SeekOrigin.Begin);

            // Stream.Read may return fewer bytes than requested; keep reading.
            int total = 0;
            while (total < payload.Length)
            {
                int read = dataStream.Read(payload, total, payload.Length - total);
                if (read == 0)
                {
                    throw new EndOfStreamException("data.txt ended before the whole record could be read.");
                }
                total += read;
            }
        }

        Console.WriteLine(Encoding.UTF8.GetString(payload));
    }

    /// <summary>
    /// Returns the size in bytes of the 7-bit encoded length prefix at the
    /// current position of <paramref name="stream"/>, consuming it.
    /// </summary>
    private static int ReadLengthPrefixSize(Stream stream)
    {
        // A 32-bit length needs at most five 7-bit groups.
        for (int size = 1; size <= 5; size++)
        {
            int value = stream.ReadByte();
            if (value < 0)
            {
                throw new EndOfStreamException("data.txt ended inside its length prefix.");
            }

            // The high bit is clear on the last byte of the prefix.
            if ((value & 0x80) == 0)
            {
                return size;
            }
        }

        throw new InvalidDataException("data.txt has a malformed length prefix.");
    }
}
/// <summary>
/// Demo entry point: looks up one record by key and prints it.
/// </summary>
static void Main(string[] args)
{
    // Call BuildFile() on a Search instance first if index.txt / data.txt
    // have not been generated yet.
    new Search().ReadFile("1420130825");

    // Wait for input before exiting.
    Console.Read();
}