背景
-
在产品迭代时,个人版产品已经将联系人和消息实时备份到本地,而消息的备份的目的仍然是为了快速查询对自己有用的上下文,并能快速定位到这些用户以及这些有用的信息。另外包括未来喂给 chatgpt-4o 的数据也是需要调用搜索获取的,也不是全量喂,以减少无用信息的资源浪费。
-
关系型数据库建立的是 B 树索引,而全文索引采用的是文本倒排索引,前者是正排查询,在处理文字间隔情况下,几乎是白痴级遍历和比对,后者却是利用词根的方式反差文档 ID,效率上高了太多倍。
SQLite 自带全文倒排索引
咨询 ChatGPT 询问是否有轻量级可以在客户端上使用的倒排索引库,ChatGPT 推荐了 SQLite FTS5,实际测试效果还不错。
实际使用的 SQL 语句,明显发现对中文的检索支持比较差
CREATE VIRTUAL TABLE documents USING fts5(title, body);
INSERT INTO documents (title, body) VALUES ('Twelfth-Night', 'If music be the food of love, play on: Give me excess of it…');
INSERT INTO documents (title, body) VALUES ('Macbeth', 'When shall we three meet again In thunder, lightning, or in rain?');
INSERT INTO documents (title, body) VALUES ('zhengxi', '😇🤣😆#;;郑钦文已于当地时间5日早上离开巴黎,备战北美赛季,这也意味着郑钦文无缘本次巴黎奥运会的中国代表团闭幕式的旗手.');
INSERT INTO documents (title, body) VALUES ('eluosi','Он очень прост в использовании, поддерживается загрузка как в Excel, так и в CSV, и в основном он бесплатен.');
INSERT INTO documents (title, body) VALUES ('email', 'leiluo88888888@gmail.com');
INSERT INTO documents (title, body) VALUES ('wadesk', '我们客户端也有群发https://wadesk.io');
INSERT INTO documents (title, body) VALUES ('Depois', 'Depois de comprar, tire um print da tela do preço e quantidade de compra e informe a assistente Michele . Ela ajudará a registrar e');
SELECT rowid, title, body FROM documents WHERE documents MATCH 'очень';
SQLite FTS5 可以自定义分词器
在进一步搜索过程中发现,微信在本地消息搜索过程中也有,消息检索的需求,且他们还做了进一步的优化
https://juejin.cn/post/6844903504419504135
进一步定位到一个针对中文搜索的优化库
https://www.wangfenjin.com/posts/simple-tokenizer/
这是一个开源的库,并且还给了使用 example,非常 Nice
simple/examples/node/node-sqlite3.js at master · wangfenjin/simple · GitHub
const path = require("path");
const sqlite3 = require('sqlite3').verbose();
const db = new sqlite3.Database(':memory:');
// 获取应用路径
const ext_path = path.resolve(__dirname, 'lib');
const dict_path = path.resolve(__dirname, 'lib/dict');
db.serialize(function() {
console.log("extension path: " + ext_path + ", dict path: " + dict_path);
// 加载扩展
const platform = process.env.npm_config_target_platform || process.platform;
const arch = process.arch;
if (platform === 'win32') {
if (arch === 'x64') {
db.loadExtension(path.join(ext_path, "libsimple-windows-x64.dll"));
} else {
db.loadExtension(path.join(ext_path, "libsimple-windows-x86.dll"));
}
} else if (platform === 'darwin' && arch === 'arm64') {
db.loadExtension(path.join(ext_path, "libsimple-aarch64-linux-gnu-gcc.so"));
} else if (platform === 'darwin' && arch === 'x64') {
db.loadExtension(path.join(ext_path, "libsimple-osx-x64.dylib"));
} else {
db.loadExtension(path.join(ext_path, "libsimple-linux.so"));
}
// 设置 jieba 字典文件路径
db.run("select jieba_dict(?)", dict_path);
// 创建表并插入数据
db.run("CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = 'simple')");
db.run("insert into t1(x) values ('周杰伦 Jay Chou:我已分不清,你是友情还是错过的爱情'), ('周杰伦 Jay Chou:最美的不是下雨天,是曾与你躲过雨的屋檐'), ('I love China! 我爱中国!我是中华人民共和国公民!'), ('@English &special _characters.\"''bacon-&and''-eggs%')");
// 执行全文搜索
db.each("select rowid as id, simple_highlight(t1, 0, '[', ']') as info from t1 where x match simple_query('zjl')", function(err, row) {
console.log(row.id + ": " + row.info);
});
db.each("select rowid as id, simple_highlight(t1, 0, '[', ']') as info from t1 where x match simple_query('中国')", function(err, row) {
console.log(row.id + ": " + row.info);
});
db.each("select rowid as id, simple_highlight(t1, 0, '[', ']') as info from t1 where x match jieba_query('中国')", function(err, row) {
console.log(row.id + ": " + row.info);
});
});
db.close();