havenask/aios/indexlib at main · alibaba/havenask · GitHub
indexlib 是 havenask 中核心的索引库。本文将用脑图详解其设计。
倒排索引:content(term)-->docid
正排索引:docid-->(Attribute/Summary)
KV索引: key-->value
测试用例
#include "gtest/gtest.h"
#include <chrono>
#include "autil/legacy/any_jsonizable.h"
#include "glog/logging.h"
#include "autil/legacy/jsonizable.h"
#include "config/index_partition_schema.h"
#include "index_base/schema_adapter.h"
#include "storage/file_system_wrapper.h"
#include "document/raw_document.h"
#include "document/raw_document/default_raw_document.h"
#include "document/extend_document/indexlib_extend_document.h"
#include "document/index_document/normal_document/index_document.h"
#include "document/index_document/normal_document/normal_document.h"
#include "document/index_document/normal_document/attribute_document.h"
#include "document/index_document/normal_document/summary_document.h"
#include "document/document_factory_wrapper.h"
#include "document/document_parser.h"
#include "document/document_parser/normal_parser/normal_document_parser.h"
#include "config/field_schema.h"
#include "partition/index_builder.h"
#include "util/memory_control/quota_control.h"
#include "util/memory_control/memory_quota_controller.h"
#include "partition/online_partition.h"
#include "util/term.h"
#include "partition/index_partition_reader.h"
#include "index/normal/summary/summary_reader.h"
#include "document/index_document/normal_document/search_summary_document.h"
#include "alog/Configurator.h"
// Root alog logger shared by the tests in this file; it is assigned in the
// test_json test and read in TestOnlineBuild.
static alog::Logger *_logger;
// Short alias for the autil::legacy namespace.
namespace ajson = autil::legacy;
class MySchema : public autil::legacy::Jsonizable
{
public:
void Jsonize(autil::legacy::Jsonizable::JsonWrapper& json) override
{
json.Jsonize("name", name);
json.Jsonize("age", age);
}
void Print(){
LOG(INFO) << "name is: " << name << " age: " << age;
}
private:
std::string name;
int age;
};
// Smoke test for autil JSON parsing: configures alog, then deserializes a
// small JSON object into MySchema and logs the result.
TEST(TestBuilder, test_json) {
    // Set up alog from the local config file and remember the root logger in
    // the file-level _logger variable.
    alog::Configurator::configureLogger("./logger.conf");
    _logger = alog::Logger::getRootLogger();
    // IE_ROOT_LOG_SETLEVEL(DEBUG);
    LOG(INFO) << "test json...";

    // The raw string is kept at column 0 so the logged text is unchanged.
    const std::string payload = R"(
{
"name" : "hello",
"age" : 3
}
)";
    LOG(INFO) << "raw json: " << payload;

    MySchema parsed;
    autil::legacy::FromJsonString(parsed, payload);
    parsed.Print();
    // LOG(INFO) << parsed;
}
static std::string GetSchema() {
std::string mJsonStringHead = R"(
{
"table_name": "mainse_summary",
"table_type": "normal",
"fields": [
{ "field_name": "quantity", "field_type": "INTEGER" },
{ "field_name": "provcity", "compress_type": "uniq|equal", "field_type": "STRING" },
{ "field_name": "category", "field_type": "INTEGER" },
{ "field_name": "nid", "field_type": "STRING" },
{ "field_name": "zk_time", "field_type": "STRING" },
{ "field_name": "title", "field_type": "STRING" },
{ "field_name": "user", "field_type": "STRING" },
{ "field_name": "user_id", "field_type": "STRING" },
{ "field_name": "vip", "field_type": "STRING" },
{ "field_name": "ends", "field_type": "STRING" },
{ "field_name": "pid", "field_type": "STRING" },
{ "field_name": "nick", "field_type": "STRING" },
{ "field_name":"int32_multi", "field_type":"int32", "multi_value": true}
],
"indexs": [
{
"index_fields": "nid",
"index_name": "pk",
"index_type": "PRIMARYKEY64",
"pk_hash_type": "default_hash",
"pk_storage_type": "hash_table"
}
],
"attributes" : [
{
"pack_name" : "pack_attr",
"sub_attributes" : ["category", "int32_multi"]
},
{
"pack_name" : "uniq_pack_attr",
"sub_attributes" : ["nick", "pid"],
"compress_type" : "uniq"
},
"quantity", "provcity", "vip"
],
)";
std::string mJsonStringTail = R"(
}
)";
std::string jsonString = mJsonStringHead + R"(
"summarys": {
"summary_fields": [ "nid", "title", "pid", "provcity", "category" ]
}
)" + mJsonStringTail;
return jsonString;
}
// Schema stored by the load_schema test and required (asserted non-null) by
// TestOfflineBuilder and TestOnlineBuild.
static indexlib::config::IndexPartitionSchemaPtr global_schema;
// Loads the test schema in three ways (direct JSON parse, SchemaAdapter from a
// string, SchemaAdapter from a file under /tmp), logs what it contains and
// publishes the file-loaded schema through global_schema.
TEST(TestBuilder, load_schema) {
    using SchemaPtr = indexlib::config::IndexPartitionSchemaPtr;
    using indexlib::index_base::SchemaAdapter;
    using indexlib::storage::FileSystemWrapper;

    LOG(INFO) << "test load schema";
    std::string schemaText = GetSchema();
    LOG(INFO) << schemaText;

    // 1) Parse the JSON straight into a freshly created schema object.
    SchemaPtr schema(new indexlib::config::IndexPartitionSchema("myschema"));
    FromJsonString(*schema, schemaText);

    // 2) Let SchemaAdapter load from the JSON string; it must not throw.
    ASSERT_NO_THROW(SchemaAdapter::LoadSchema(schemaText, schema));

    // 3) Round-trip through a file: remove any stale copy, store, reload.
    FileSystemWrapper::Delete("/tmp/myschema.json", true);
    FileSystemWrapper::AtomicStore("/tmp/myschema.json", schemaText);
    schema = SchemaAdapter::LoadSchema("/tmp", "myschema.json");
    ASSERT_TRUE(schema);

    // Log the size of each schema section.
    LOG(INFO) << "field count:" << schema->GetFieldSchema()->GetFieldCount();
    LOG(INFO) << "index count:" << schema->GetIndexSchema()->GetIndexCount();
    // LOG(INFO) << "index count:" << schema->GetSummarySchema()->GetSummaryCount();
    LOG(INFO) << "attr count:" << schema->GetAttributeSchema()->GetAttributeCount();

    auto attrSchema = schema->GetAttributeSchema();

    // Plain attributes: name and id.
    for (int attrIdx = 0; attrIdx < attrSchema->GetAttributeCount(); ++attrIdx) {
        LOG(INFO) << "attr name: " << attrSchema->GetAttributeConfig(attrIdx)->GetAttrName()
                  << " id: " << attrSchema->GetAttributeConfig(attrIdx)->GetAttrId();
    }

    // Pack attributes: name followed by the names of their sub attributes.
    for (int packIdx = 0; packIdx < attrSchema->GetPackAttributeCount(); ++packIdx) {
        LOG(INFO) << "pack attr: " << attrSchema->GetPackAttributeConfig(packIdx)->GetAttrName();
        std::vector<std::string> subNames;
        attrSchema->GetPackAttributeConfig(packIdx)->GetSubAttributeNames(subNames);
        for (auto& subName : subNames) {
            LOG(INFO) << "sub name: " << subName;
        }
    }

    // Only fetched; the summary inspection below is disabled.
    auto summarySchema = schema->GetSummarySchema();
    // LOG(INFO) << "compress: " << summarySchema->GetSummaryConfig("quantity")->GetFieldConfig()->GetCompressType().GetCompressStr();
    // for(int i = 0; i < summarySchema->GetSummaryGroupConfigCount(); i++) {
    //     auto group = summarySchema->GetSummaryGroupConfig(i);
    //     LOG(INFO) << "summary group: " << group->GetGroupName()
    //               << " compress_type: " << group->GetCompressType()
    //               << " filed_count: " << group->GetSummaryFieldsCount();
    // }

    // Publish for the builder tests that run later.
    global_schema = schema;
}
/// Parses a flat "key=value" document string into a DefaultRawDocument.
///
/// Fields are separated by '|'; when that yields at most one token the string
/// is re-split on ','. Each token is split on '='; key and value are trimmed.
/// A token that does not split into exactly two parts is stored with an empty
/// value. Tokens that yield no key at all are skipped.
///
/// No command terminator is stripped: a ';' in the input stays part of the
/// value it appears in.
///
/// @param docStr document text, e.g. "cmd=add,nid=123,title=hello".
/// @return a new raw document holding one field per parsed token.
static indexlib::document::RawDocumentPtr ParseDocStr(const std::string& docStr)
{
    const std::string kSpatialFieldSeparator = "|";
    const std::string kFieldSeparator = ",";
    const std::string kKeyValueSeparator = "=";

    std::vector<std::string> keyValues = autil::StringUtil::split(docStr, kSpatialFieldSeparator);
    if (keyValues.size() <= 1)
    {
        keyValues = autil::StringUtil::split(docStr, kFieldSeparator);
    }

    indexlib::document::RawDocumentPtr rawDoc(new indexlib::document::DefaultRawDocument);
    for (const std::string& token : keyValues)
    {
        const std::vector<std::string> keyValue =
            autil::StringUtil::split(token, kKeyValueSeparator);
        // Guard against tokens that produce no parts; indexing [0] on an
        // empty vector would be undefined behaviour.
        if (keyValue.empty())
        {
            continue;
        }
        std::string key = keyValue[0];
        autil::StringUtil::trim(key);
        std::string value;
        if (keyValue.size() == 2)
        {
            value = keyValue[1];
            autil::StringUtil::trim(value);
        }
        rawDoc->setField(key, value);
    }
    return rawDoc;
}
/// Returns the current std::chrono::system_clock time as the number of
/// microseconds since the clock's epoch.
static int64_t GetTsMirco() {
    using std::chrono::microseconds;
    const auto sinceEpoch = std::chrono::system_clock::now().time_since_epoch();
    return std::chrono::duration_cast<microseconds>(sinceEpoch).count();
}
// Document stored by the CreateDoc test and required (asserted non-null) by
// TestOfflineBuilder and TestOnlineBuild, which build it repeatedly.
static indexlib::document::DocumentPtr global_doc;
// Turns a "key=value,..." document string into an indexlib document:
// string -> RawDocument -> IndexlibExtendDocument -> parsed Document, and
// stores the result in global_doc.
TEST(TestBuilder, CreateDoc){
// Three sample "add" documents. Only doc_str1 is parsed below; doc_str2 and
// doc_str3 are currently unused.
std::string doc_str1 =
"cmd=add,quantity=10,provcity=shanghai,category=1,nid=123,zk_time=1684305956,title=hello1,user=huozai,user_id=98765,vip=svip,ends=some_ends,pid=203,nick=world,int32_multi=2 3;";
std::string doc_str2 =
"cmd=add,quantity=11,provcity=shanghaj,category=2,nid=124,zk_time=1684305957,title=hello2,user=huozaj,user_id=98766,vip=tvip,ends=some_fnds,pid=204,nick=worle,int32_multi=4 5;";
std::string doc_str3 =
"cmd=add,quantity=12,provcity=shanghak,category=3,nid=125,zk_time=1684305958,title=hello3,user=huozak,user_id=98767,vip=ivip,ends=some_gnds,pid=205,nick=worlf,int32_multi=6 7;";
// Step 1: parse the document string into a raw document.
auto raw_doc = ParseDocStr(doc_str1);
ASSERT_TRUE(raw_doc);
// Stamp the raw document with the current time in microseconds.
raw_doc->setDocTimestamp(GetTsMirco());
LOG(INFO) << raw_doc->toString();
raw_doc->SetTimestamp(GetTsMirco());
// "__ts__" is the timestamp divided by 1,000,000 plus 10,000,000
// (presumably seconds, pushed into the future for the TTL experiment that is
// commented out below -- TODO confirm).
raw_doc->setField("__ts__", std::to_string(raw_doc->GetTimestamp()/1000000 + 10000000));
// Overwrites the doc timestamp set above with the value from GetTimestamp().
raw_doc->setDocTimestamp(raw_doc->GetTimestamp());
// Step 2: wrap the raw document in an extend document and build a normal doc.
indexlib::document::IndexlibExtendDocumentPtr extend_doc(new indexlib::document::IndexlibExtendDocument());
extend_doc->setRawDocument(raw_doc);
// Log the state of the classified document before the parser has run.
auto classified_doc = extend_doc->getClassifiedDocument();
LOG(INFO) << "classifiedDoc: pk: " << classified_doc->getPrimaryKey();
LOG(INFO) << "classifiedDoc: index: " << classified_doc->getIndexDocument()->GetPrimaryKey();
LOG(INFO) << "classifiedDoc: attr packfield count: " << classified_doc->getAttributeDoc()->GetPackFieldCount();
LOG(INFO) << "classifiedDoc: summary nonempty count: " << classified_doc->getSummaryDoc()->GetNotEmptyFieldCount();
LOG(INFO) << "raw doc type: " << raw_doc->getDocOperateType();
// Build a fresh schema object from the same JSON used elsewhere in this file.
std::string schema_json = GetSchema();
indexlib::config::IndexPartitionSchemaPtr schema(new indexlib::config::IndexPartitionSchema("myschema"));
FromJsonString(*schema, schema_json);
// schema->SetDefaultTTL(1000000); //TTL
// schema->SetEnableTTL(true, DEFAULT_REGIONID, "__ts__");
// Create a document parser for this schema through the factory wrapper.
indexlib::document::DocumentFactoryWrapper wrapper(schema);
wrapper.Init();
indexlib::document::DocumentParserPtr parser(wrapper.CreateDocumentParser());
// The parser is expected to be a NormalDocumentParser.
auto normal_doc_parser = dynamic_cast<indexlib::document::NormalDocumentParser*>(parser.get());
ASSERT_TRUE(normal_doc_parser != nullptr);
// Parse the extend document and check that the result is a NormalDocument.
indexlib::document::DocumentPtr doc = parser->Parse(extend_doc);
ASSERT_TRUE(doc);
indexlib::document::NormalDocumentPtr normal_doc = DYNAMIC_POINTER_CAST( indexlib::document::NormalDocument, doc);
ASSERT_TRUE(normal_doc);
// Publish for the builder tests that run later.
global_doc = doc;
}
// Offline build: feeds global_doc 1000 times into an IndexBuilder rooted at
// ./tdata/builder, then merges and ends the index.
// Depends on load_schema and CreateDoc having run first (global_schema and
// global_doc are asserted non-null).
TEST(TestBuilder, TestOfflineBuilder){
LOG(INFO) << "-----------------------begin offline---------------------";
ASSERT_TRUE(global_schema);
ASSERT_TRUE(global_doc);
// Offline partition options with a default split-segment merge config.
indexlib::config::IndexPartitionOptionsPtr options(new indexlib::config::IndexPartitionOptions);
options->SetIsOnline(false);
std::string mergeConfigStr = "{\"class_name\":\"default\",\"parameters\":{\"split_num\":\"1\"}}";
autil::legacy::FromJsonString(options->GetMergeConfig().GetSplitSegmentConfig(), mergeConfigStr);
// Build settings: no package file, maxDocCount of 1024, ttl of 1000000.
auto& buildConfig = options->GetBuildConfig();
buildConfig.enablePackageFile = false;
buildConfig.maxDocCount = 1024;
buildConfig.ttl = 1000000;
// Quota of 100 * 1024 * 1024 handed to the builder.
indexlib::util::QuotaControlPtr memoryQuotaControl(new indexlib::util::QuotaControl(1024*1024*100));
// The output directory is not cleaned up (the DeleteDir call below is
// disabled), so data from a previous run may still be present.
// indexlib::storage::FileSystemWrapper::DeleteDir("./tdata/builder");
indexlib::partition::IndexBuilderPtr builder(new indexlib::partition::IndexBuilder("./tdata/builder", *options, global_schema, memoryQuotaControl));
ASSERT_TRUE(builder->Init());
// The same document is built 1000 times.
for(int i = 0; i < 1000; i++) {
ASSERT_TRUE(builder->Build(global_doc));
}
// NOTE(review): Merge is called before EndIndex here -- confirm this is the
// ordering IndexBuilder expects.
ASSERT_TRUE(builder->Merge(*options));
builder->EndIndex();
}
// Online build and query: opens ./tdata/builder as an OnlinePartition, builds
// global_doc 100 times in realtime, then looks up primary key "123" and reads
// the first three summary fields of the matching document.
// Depends on load_schema and CreateDoc having run first (global_schema and
// global_doc are asserted non-null).
TEST(TestBuilder, TestOnlineBuild){
    LOG(INFO) << "-----------------------begin online---------------------";
    ASSERT_TRUE(global_schema);
    ASSERT_TRUE(global_doc);
    // _logger is otherwise only assigned in test_json; fetch the root logger
    // here so running this test alone does not dereference a null pointer.
    if (_logger == nullptr) {
        _logger = alog::Logger::getRootLogger();
    }

    // Online partition options with a default split-segment merge config.
    indexlib::config::IndexPartitionOptionsPtr options(new indexlib::config::IndexPartitionOptions);
    options->SetIsOnline(true);
    std::string mergeConfigStr = "{\"class_name\":\"default\",\"parameters\":{\"split_num\":\"1\"}}";
    autil::legacy::FromJsonString(options->GetMergeConfig().GetSplitSegmentConfig(), mergeConfigStr);
    auto& buildConfig = options->GetBuildConfig();
    buildConfig.enablePackageFile = false;
    buildConfig.maxDocCount = 1024;
    buildConfig.ttl = 1000000;

    // Create the online partition and set the realtime options before opening.
    indexlib::util::MemoryQuotaControllerPtr quotaControl(new indexlib::util::MemoryQuotaController(100*1024*1024));
    indexlib::partition::OnlinePartitionPtr part(new indexlib::partition::OnlinePartition("online", quotaControl));
    options->GetOnlineConfig().onDiskFlushRealtimeIndex = true;
    options->GetOnlineConfig().maxRealtimeDumpInterval = 10000;
    options->GetOnlineConfig().maxRealtimeMemSize = 100*1024*1024;
    auto rs = part->Open("./tdata/builder", "", global_schema, *options);
    ASSERT_EQ(rs, indexlib::partition::IndexPartition::OS_OK);

    //todo use IndexPartitionCreator::Create create partition
    indexlib::util::QuotaControlPtr memoryQuotaControlOnline(new indexlib::util::QuotaControl(100*1024*1024));
    indexlib::partition::IndexBuilderPtr online_builder(new indexlib::partition::IndexBuilder(part, memoryQuotaControlOnline));
    ASSERT_TRUE(online_builder->Init());

    // Realtime build: the same document is added 100 times.
    for (int i = 0; i < 100; i++) {
        ASSERT_TRUE(online_builder->Build(global_doc));
    }
    online_builder->EndIndex();
    LOG(INFO) << "online add doc ok";

    // Query: look up term "123" in the "pk" index.
    auto partReader = part->GetReader();
    ASSERT_TRUE(partReader);
    auto indexReader = partReader->GetIndexReader();
    ASSERT_TRUE(indexReader);
    indexlib::util::Term t("123", "pk");
    auto ite = indexReader->Lookup(t);
    ASSERT_TRUE(ite != nullptr);
    auto docid = ite->SeekDoc(INVALID_DOCID);
    LOG(INFO) << "type: " << ite->GetMatchValueType() << " dockid: " << docid;
    // Fail with a clear message if the lookup matched nothing, instead of
    // passing an invalid docid on to the summary reader.
    ASSERT_TRUE(docid != INVALID_DOCID);

    // Exercise the three logging paths (glog, IE_LOG, alog).
    LOG(INFO) << "enable: " << _logger->isLevelEnabled(alog::LOG_LEVEL_INFO);
    IE_LOG(ERROR, "hello ie log");
    ALOG_ERROR(_logger, "hello :%s", "world");

    // Fetch the summary of the matched document.
    auto summaryReader = partReader->GetSummaryReader();
    ASSERT_TRUE(summaryReader != nullptr);
    indexlib::document::SearchSummaryDocument summaryDoc(nullptr, 40960);
    ASSERT_TRUE(summaryReader->GetDocument(docid, &summaryDoc));

    // Only the first three summary fields are inspected.
    const int count = 3;
    LOG(INFO) << "field count: " << count;
    for (int i = 0; i < count; i++) {
        const autil::ConstString* field = summaryDoc.GetFieldValue(i);
        ASSERT_TRUE(field != nullptr);
        LOG(INFO) << "value is: " << *field;
    }
}
LOG配置
logger.conf
alog.rootLogger=INFO, indexlibAppender
alog.max_msg_len=2000000
alog.appender.indexlibAppender=ConsoleAppender
#alog.appender.indexlibAppender=FileAppender
#alog.appender.indexlibAppender.fileName=TestLog.log
alog.appender.indexlibAppender.flush=true
alog.appender.indexlibAppender.layout=PatternLayout
#alog.appender.indexlibAppender.layout.LogPattern=[%%d] [%%t], %%f() [%%n] [%%l] [%%m]
alog.appender.indexlibAppender.layout.LogPattern=[%%h][xxxx][%%d][%%l][%%t][%%p][%%F:%%n %%f] : [%%m]
alog.logger.indexlib=INFO
inherit.indexlib.test=false
alog.logger.local.LocalFileSystem=INFO
alog.logger.ErrorLogCollector=TRACE1,ErrorLogCollectorAppender
inherit.ErrorLogCollector=false
alog.appender.ErrorLogCollectorAppender=FileAppender
alog.appender.ErrorLogCollectorAppender.fileName=error_log_collector.log
alog.appender.ErrorLogCollectorAppender.flush=true
alog.appender.ErrorLogCollectorAppender.max_file_size=100
alog.appender.ErrorLogCollectorAppender.layout=PatternLayout
alog.appender.ErrorLogCollectorAppender.layout.LogPattern=[%%h][xxxx][%%d][%%l][%%t][%%p][%%F:%%n %%f] : [%%m]
alog.appender.ErrorLogCollectorAppender.compress=true
alog.appender.ErrorLogCollectorAppender.log_keep_count=100
# alog.logger.indexlib.test=INFO, indexlibTestAppender
# alog.appender.indexlibTestAppender=ConsoleAppender
# alog.appender.indexlibTestAppender.layout=PatternLayout
# alog.appender.indexlibTestAppender.layout.LogPattern=[%%d] [%%t/%%p,PSM:%%f():%%n] [%%m]
# inherit.indexlib.test=false
生成数据在磁盘的结构
├── deploy_meta.0
├── deploy_meta.1
├── index_format_version
├── __indexlib_fs_root_link__@1684478171 -> ./tdata/builder
├── join_index_partition
├── merge_resource
│ └── version.1
├── rt_index_partition
│ ├── segment_1073741824_level_0
│ │ ├── attribute
│ │ │ ├── pack_attr
│ │ │ │ ├── data
│ │ │ │ ├── data_info
│ │ │ │ └── offset
│ │ │ ├── provcity
│ │ │ │ ├── data
│ │ │ │ ├── data_info
│ │ │ │ └── offset
│ │ │ ├── quantity
│ │ │ │ └── data
│ │ │ ├── uniq_pack_attr
│ │ │ │ ├── data
│ │ │ │ ├── data_info
│ │ │ │ └── offset
│ │ │ └── vip
│ │ │ ├── data
│ │ │ ├── data_info
│ │ │ └── offset
│ │ ├── counter
│ │ ├── deletionmap
│ │ │ └── data_1073741824
│ │ ├── deploy_index
│ │ ├── index
│ │ │ ├── pk
│ │ │ │ └── data
│ │ │ └── virtual_timestamp_index
│ │ │ ├── dictionary
│ │ │ ├── index_format_option
│ │ │ └── posting
│ │ ├── operation_log
│ │ │ ├── data
│ │ │ └── meta
│ │ ├── segment_file_list
│ │ ├── segment_info
│ │ ├── segment_metrics
│ │ └── summary
│ │ ├── data
│ │ └── offset
│ └── version.0
├── schema.json
├── segment_0_level_0
│ ├── attribute
│ │ ├── pack_attr
│ │ │ ├── data
│ │ │ ├── data_info
│ │ │ └── offset
│ │ ├── provcity
│ │ │ ├── data
│ │ │ ├── data_info
│ │ │ └── offset
│ │ ├── quantity
│ │ │ └── data
│ │ ├── uniq_pack_attr
│ │ │ ├── data
│ │ │ ├── data_info
│ │ │ └── offset
│ │ └── vip
│ │ ├── data
│ │ ├── data_info
│ │ └── offset
│ ├── counter
│ ├── deletionmap
│ │ └── data_0
│ ├── deploy_index
│ ├── index
│ │ └── pk
│ │ └── data
│ ├── segment_file_list
│ ├── segment_info
│ ├── segment_metrics
│ └── summary
│ ├── data
│ └── offset
├── segment_1_level_0
│ ├── attribute
│ │ ├── pack_attr
│ │ │ ├── category
│ │ │ ├── data
│ │ │ ├── data_info
│ │ │ ├── int32_multi
│ │ │ └── offset
│ │ ├── provcity
│ │ │ ├── data
│ │ │ ├── data_info
│ │ │ └── offset
│ │ ├── quantity
│ │ │ └── data
│ │ ├── uniq_pack_attr
│ │ │ ├── data
│ │ │ ├── data_info
│ │ │ └── offset
│ │ └── vip
│ │ ├── data
│ │ ├── data_info
│ │ └── offset
│ ├── counter
│ ├── deletionmap
│ ├── deploy_index
│ ├── index
│ │ └── pk
│ │ └── data
│ ├── segment_file_list
│ ├── segment_info
│ ├── segment_metrics
│ └── summary
│ ├── data
│ └── offset
├── summary_info
│ ├── index_summary.0
│ └── index_summary.1
├── version.0
└── version.1