阿里havenask之indexlib

GitHub - alibaba/havenask

havenask/aios/indexlib at main · alibaba/havenask · GitHub

indexlib是其中核心的索引库。本文将用脑图详解其设计。

倒排索引:term(content)-->docid
正排索引:docid-->(Attribute/Schema)
KV索引:  key-->value

测试用例

#include "gtest/gtest.h"
#include <chrono>
#include "autil/legacy/any_jsonizable.h"
#include "glog/logging.h"
#include "autil/legacy/jsonizable.h"
#include "config/index_partition_schema.h"
#include "index_base/schema_adapter.h"
#include "storage/file_system_wrapper.h"
#include "document/raw_document.h"
#include "document/raw_document/default_raw_document.h"
#include "document/extend_document/indexlib_extend_document.h"
#include "document/index_document/normal_document/index_document.h"
#include "document/index_document/normal_document/normal_document.h"
#include "document/index_document/normal_document/attribute_document.h"
#include "document/index_document/normal_document/summary_document.h"
#include "document/document_factory_wrapper.h"
#include "document/document_parser.h"
#include "document/document_parser/normal_parser/normal_document_parser.h"
#include "config/field_schema.h"
#include "partition/index_builder.h"
#include "util/memory_control/quota_control.h"
#include "util/memory_control/memory_quota_controller.h"
#include "partition/online_partition.h"
#include "util/term.h"
#include "partition/index_partition_reader.h"
#include "index/normal/summary/summary_reader.h"
#include "document/index_document/normal_document/search_summary_document.h"
#include "alog/Configurator.h"
static alog::Logger *_logger;
namespace ajson = autil::legacy;
class MySchema : public autil::legacy::Jsonizable
{
public:
    
    void Jsonize(autil::legacy::Jsonizable::JsonWrapper& json) override
    {
        json.Jsonize("name", name);
        json.Jsonize("age", age); 
    }
    void Print(){
        LOG(INFO) << "name is: " << name << " age: " << age;
    }
private:
    std::string name;
    int age;
};
// Smoke test: configures alog from ./logger.conf, stores the root logger in
// the file-level _logger, then deserializes a small JSON literal into MySchema
// and logs the result.
TEST(TestBuilder, test_json) {
    alog::Configurator::configureLogger("./logger.conf");
    _logger = alog::Logger::getRootLogger();
    // IE_ROOT_LOG_SETLEVEL(DEBUG);
    LOG(INFO) << "test json...";

    std::string payload = R"(
    {
        "name" : "hello",
        "age" : 3
    }
    )";
    LOG(INFO) << "raw json: " << payload;

    MySchema parsed;
    autil::legacy::FromJsonString(parsed, payload);
    parsed.Print();
}
/// Returns the JSON text of the sample "mainse_summary" table schema used by
/// the tests: a head section (fields, indexs, attributes), a "summarys"
/// section and the closing brace, concatenated in that order.
static std::string GetSchema() {
    // Head of the document. It ends with a trailing comma after "attributes",
    // which is completed by the summary section appended below.
    static const char* const kSchemaHead = R"(
    {
        "table_name": "mainse_summary",
        "table_type": "normal",
        "fields": [
            { "field_name": "quantity", "field_type": "INTEGER" },
            { "field_name": "provcity", "compress_type": "uniq|equal", "field_type": "STRING" },
            { "field_name": "category", "field_type": "INTEGER" },
            { "field_name": "nid", "field_type": "STRING" },
            { "field_name": "zk_time", "field_type": "STRING" },
            { "field_name": "title", "field_type": "STRING" },
            { "field_name": "user", "field_type": "STRING" },
            { "field_name": "user_id", "field_type": "STRING" },
            { "field_name": "vip", "field_type": "STRING" },
            { "field_name": "ends", "field_type": "STRING" },
            { "field_name": "pid", "field_type": "STRING" },
            { "field_name": "nick", "field_type": "STRING" },
            { "field_name":"int32_multi", "field_type":"int32", "multi_value": true}
        ],
        "indexs": [
            {
                "index_fields": "nid",
                "index_name": "pk",
                "index_type": "PRIMARYKEY64",
                "pk_hash_type": "default_hash",
                "pk_storage_type": "hash_table"
            }
        ],
        "attributes" : [
            {
                "pack_name" : "pack_attr",
                "sub_attributes" : ["category", "int32_multi"]
            },
            {
                "pack_name" : "uniq_pack_attr",
                "sub_attributes" : ["nick", "pid"],
                "compress_type" : "uniq"
            },
            "quantity", "provcity", "vip"
        ],
    )";

    // Summary section: the fields stored in the summary.
    static const char* const kSummarySection = R"(
        "summarys": {
            "summary_fields": [ "nid", "title", "pid", "provcity", "category" ]
        }
        )";

    // Closing brace of the top-level object.
    static const char* const kSchemaTail = R"(
    }
    )";

    std::string schema_json(kSchemaHead);
    schema_json.append(kSummarySection);
    schema_json.append(kSchemaTail);
    return schema_json;
}
static indexlib::config::IndexPartitionSchemaPtr global_schema;
// Loads the sample schema three ways (direct JSON deserialization,
// SchemaAdapter from a JSON string, SchemaAdapter from a file under /tmp),
// logs its field/index/attribute contents and stores the result in
// global_schema for the tests that run afterwards.
TEST(TestBuilder, load_schema) {
    LOG(INFO) << "test load schema";
    std::string schema_text = GetSchema();
    LOG(INFO) << schema_text;

    // 1. Deserialize directly into a freshly constructed schema object.
    indexlib::config::IndexPartitionSchemaPtr schema(new indexlib::config::IndexPartitionSchema("myschema"));
    FromJsonString(*schema, schema_text);

    // 2. Load from the JSON string through SchemaAdapter.
    ASSERT_NO_THROW(indexlib::index_base::SchemaAdapter::LoadSchema(schema_text, schema));

    // 3. Write the JSON to disk and load it back by directory + file name.
    std::string schema_path = "/tmp/myschema.json";
    indexlib::storage::FileSystemWrapper::Delete(schema_path, true);
    indexlib::storage::FileSystemWrapper::AtomicStore(schema_path, schema_text);
    schema = indexlib::index_base::SchemaAdapter::LoadSchema("/tmp", "myschema.json");
    ASSERT_TRUE(schema);

    // Log the top-level counts of the loaded schema.
    LOG(INFO) << "field count:" << schema->GetFieldSchema()->GetFieldCount();
    LOG(INFO) << "index count:" << schema->GetIndexSchema()->GetIndexCount();
    // LOG(INFO) << "index count:" << schema->GetSummarySchema()->GetSummaryCount();
    LOG(INFO) << "attr count:" << schema->GetAttributeSchema()->GetAttributeCount();

    // Log every attribute config by position.
    auto attrs = schema->GetAttributeSchema();
    const auto attr_total = attrs->GetAttributeCount();
    for (int attr_pos = 0; attr_pos < attr_total; ++attr_pos) {
        auto attr_cfg = attrs->GetAttributeConfig(attr_pos);
        LOG(INFO) << "attr name: " << attr_cfg->GetAttrName()
                  << " id: " << attr_cfg->GetAttrId();
    }

    // Log every pack attribute together with the names of its sub attributes.
    const auto pack_total = attrs->GetPackAttributeCount();
    for (int pack_pos = 0; pack_pos < pack_total; ++pack_pos) {
        auto pack_cfg = attrs->GetPackAttributeConfig(pack_pos);
        LOG(INFO) << "pack attr: " << pack_cfg->GetAttrName();
        std::vector<std::string> sub_names;
        pack_cfg->GetSubAttributeNames(sub_names);
        for (auto& sub_name : sub_names) {
            LOG(INFO) << "sub name: " << sub_name;
        }
    }

    // Only fetched here; the summary dump below is left disabled.
    auto summary_schema = schema->GetSummarySchema();
    // LOG(INFO) << "compress: " << summary_schema->GetSummaryConfig("quantity")->GetFieldConfig()->GetCompressType().GetCompressStr();
    // for(int i = 0; i < summary_schema->GetSummaryGroupConfigCount(); i++) {
    //     auto group = summary_schema->GetSummaryGroupConfig(i);
    //     LOG(INFO) << "summary group: " << group->GetGroupName()
    //               << " compress_type: " << group->GetCompressType()
    //               << " filed_count: " << group->GetSummaryFieldsCount();
    // }

    // Publish the schema for the builder tests.
    global_schema = schema;
}

/// Parses a flat "key=value" document string into a DefaultRawDocument.
///
/// The string is first split on "|"; if that yields at most one piece it is
/// split on "," instead. Each piece is then split on "=": the first part
/// (trimmed) becomes the field name, and the second part (trimmed) becomes the
/// value when the split produced exactly two parts. In every other case the
/// field is set with an empty value.
///
/// The ";" command separator is not interpreted here, so a trailing ";" stays
/// part of the last value.
///
/// @param docStr document text, e.g. "cmd=add,nid=123,title=hello"
/// @return a newly allocated raw document holding the parsed fields
static indexlib::document::RawDocumentPtr ParseDocStr(const std::string& docStr)
{
    const std::string DP_SPATIAL_KEY_VALUE_SEPARATOR = "|";
    const std::string DP_KEY_VALUE_SEPARATOR        = ",";
    const std::string DP_KEY_VALUE_EQUAL_SYMBOL     = "=";

    std::vector<std::string> keyValues = autil::StringUtil::split(docStr, DP_SPATIAL_KEY_VALUE_SEPARATOR);
    if (keyValues.size() <= 1)
    {
        // No "|" separated pairs found: fall back to the "," separated form.
        keyValues = autil::StringUtil::split(docStr, DP_KEY_VALUE_SEPARATOR);
    }

    indexlib::document::RawDocumentPtr rawDoc(new indexlib::document::DefaultRawDocument);
    for (size_t i = 0; i < keyValues.size(); ++i)
    {
        std::vector<std::string> keyValue = autil::StringUtil::split(keyValues[i],
                DP_KEY_VALUE_EQUAL_SYMBOL);
        if (keyValue.empty())
        {
            // The split may yield no parts (for example a piece that is just
            // "="); indexing keyValue[0] would then be out of bounds.
            continue;
        }

        std::string key = keyValue[0];
        autil::StringUtil::trim(key);

        std::string value;
        if(keyValue.size() == 2)
        {
            value = keyValue[1];
            autil::StringUtil::trim(value);
        }
        rawDoc->setField(key, value);
    }
    return rawDoc;
}
/// Returns the current std::chrono::system_clock time as a count of
/// microseconds since that clock's epoch.
static int64_t GetTsMirco() {
    using std::chrono::duration_cast;
    using std::chrono::microseconds;

    const auto since_epoch = std::chrono::system_clock::now().time_since_epoch();
    return duration_cast<microseconds>(since_epoch).count();
}
static indexlib::document::DocumentPtr global_doc;
// Turns a flat "key=value" document string into a raw document, wraps it in
// an extend document, runs it through the parser created for the sample
// schema and stores the parsed document in global_doc for the builder tests.
TEST(TestBuilder, CreateDoc){
    namespace idoc = indexlib::document;

    // Sample documents; only the first one is parsed below.
    std::string sample_doc1 =
    "cmd=add,quantity=10,provcity=shanghai,category=1,nid=123,zk_time=1684305956,title=hello1,user=huozai,user_id=98765,vip=svip,ends=some_ends,pid=203,nick=world,int32_multi=2 3;";
    std::string sample_doc2 =
    "cmd=add,quantity=11,provcity=shanghaj,category=2,nid=124,zk_time=1684305957,title=hello2,user=huozaj,user_id=98766,vip=tvip,ends=some_fnds,pid=204,nick=worle,int32_multi=4 5;";
    std::string sample_doc3 =
    "cmd=add,quantity=12,provcity=shanghak,category=3,nid=125,zk_time=1684305958,title=hello3,user=huozak,user_id=98767,vip=ivip,ends=some_gnds,pid=205,nick=worlf,int32_multi=6 7;";

    // Step 1: document string -> raw document, then stamp it with the time.
    auto raw = ParseDocStr(sample_doc1);
    ASSERT_TRUE(raw);
    raw->setDocTimestamp(GetTsMirco());
    LOG(INFO) << raw->toString();
    raw->SetTimestamp(GetTsMirco());
    // "__ts__" holds the timestamp converted to seconds plus a fixed offset.
    const auto ts_field_value = raw->GetTimestamp()/1000000 + 10000000;
    raw->setField("__ts__", std::to_string(ts_field_value));
    raw->setDocTimestamp(raw->GetTimestamp());

    // Step 2: wrap the raw document and log the still-unparsed classified view.
    idoc::IndexlibExtendDocumentPtr extended(new idoc::IndexlibExtendDocument());
    extended->setRawDocument(raw);
    auto classified = extended->getClassifiedDocument();
    LOG(INFO) << "classifiedDoc: pk: " << classified->getPrimaryKey();
    LOG(INFO) << "classifiedDoc: index: " << classified->getIndexDocument()->GetPrimaryKey();
    LOG(INFO) << "classifiedDoc: attr packfield count: " << classified->getAttributeDoc()->GetPackFieldCount();
    LOG(INFO) << "classifiedDoc: summary nonempty count: " << classified->getSummaryDoc()->GetNotEmptyFieldCount();
    LOG(INFO) << "raw doc type: " << raw->getDocOperateType();

    // Step 3: build the schema and obtain a document parser for it.
    std::string schema_text = GetSchema();
    indexlib::config::IndexPartitionSchemaPtr schema(new indexlib::config::IndexPartitionSchema("myschema"));
    FromJsonString(*schema, schema_text);
    // schema->SetDefaultTTL(1000000); //TTL
    // schema->SetEnableTTL(true, DEFAULT_REGIONID, "__ts__");
    idoc::DocumentFactoryWrapper factory(schema);
    factory.Init();
    idoc::DocumentParserPtr parser(factory.CreateDocumentParser());
    auto as_normal_parser = dynamic_cast<idoc::NormalDocumentParser*>(parser.get());
    ASSERT_TRUE(as_normal_parser != nullptr);

    // Step 4: parse and check that the result is a NormalDocument.
    idoc::DocumentPtr parsed = parser->Parse(extended);
    ASSERT_TRUE(parsed);
    idoc::NormalDocumentPtr as_normal_doc = DYNAMIC_POINTER_CAST( indexlib::document::NormalDocument, parsed);
    ASSERT_TRUE(as_normal_doc);
    global_doc = parsed;
}

// Offline build: feeds the document prepared by CreateDoc into an
// IndexBuilder rooted at ./tdata/builder 1000 times, then merges and ends the
// index. Requires global_schema and global_doc to have been set beforehand.
TEST(TestBuilder, TestOfflineBuilder){
    namespace icfg = indexlib::config;
    namespace ipart = indexlib::partition;

    LOG(INFO) << "-----------------------begin offline---------------------";
    ASSERT_TRUE(global_schema);
    ASSERT_TRUE(global_doc);

    // Offline options with a split-segment merge configuration.
    icfg::IndexPartitionOptionsPtr options(new icfg::IndexPartitionOptions);
    options->SetIsOnline(false);
    std::string split_config_json = R"({"class_name":"default","parameters":{"split_num":"1"}})";
    autil::legacy::FromJsonString(options->GetMergeConfig().GetSplitSegmentConfig(), split_config_json);

    auto& build_cfg = options->GetBuildConfig();
    build_cfg.enablePackageFile = false;
    build_cfg.maxDocCount = 1024;
    build_cfg.ttl = 1000000;

    indexlib::util::QuotaControlPtr quota(new indexlib::util::QuotaControl(1024*1024*100));
    // indexlib::storage::FileSystemWrapper::DeleteDir("./tdata/builder");
    ipart::IndexBuilderPtr builder(new ipart::IndexBuilder("./tdata/builder", *options, global_schema, quota));
    ASSERT_TRUE(builder->Init());

    // Build the same document repeatedly.
    for (int built = 0; built != 1000; ++built) {
        ASSERT_TRUE(builder->Build(global_doc));
    }

    ASSERT_TRUE(builder->Merge(*options));
    builder->EndIndex();
}

// Online build + query: opens an OnlinePartition on ./tdata/builder, adds the
// document prepared by CreateDoc 100 times through an IndexBuilder attached to
// that partition, then looks the document up by primary key and reads the
// first summary fields. Requires global_schema and global_doc to be set.
TEST(TestBuilder, TestOnlineBuild){
    LOG(INFO) << "-----------------------begin online---------------------";
    ASSERT_TRUE(global_schema);
    ASSERT_TRUE(global_doc);

    // Online options; the build settings mirror the offline test.
    indexlib::config::IndexPartitionOptionsPtr options(new indexlib::config::IndexPartitionOptions);
    options->SetIsOnline(true);
    std::string mergeConfigStr = "{\"class_name\":\"default\",\"parameters\":{\"split_num\":\"1\"}}";
    autil::legacy::FromJsonString(options->GetMergeConfig().GetSplitSegmentConfig(), mergeConfigStr);
    auto& buildConfig = options->GetBuildConfig();
    buildConfig.enablePackageFile = false;
    buildConfig.maxDocCount = 1024;
    buildConfig.ttl = 1000000;
    indexlib::util::MemoryQuotaControllerPtr quotaControl(new indexlib::util::MemoryQuotaController(100*1024*1024));
    indexlib::partition::OnlinePartitionPtr part(new indexlib::partition::OnlinePartition("online", quotaControl));
    options->GetOnlineConfig().onDiskFlushRealtimeIndex = true;
    options->GetOnlineConfig().maxRealtimeDumpInterval = 10000;
    options->GetOnlineConfig().maxRealtimeMemSize = 100*1024*1024;
    auto rs =  part->Open("./tdata/builder", "", global_schema, *options);
    ASSERT_EQ(rs,  indexlib::partition::IndexPartition::OS_OK);

    //todo  use IndexPartitionCreator::Create create partition
    indexlib::util::QuotaControlPtr memoryQuotaControlOnline(new indexlib::util::QuotaControl(100*1024*1024));
    indexlib::partition::IndexBuilderPtr online_builder(new indexlib::partition::IndexBuilder(part, memoryQuotaControlOnline));
    ASSERT_TRUE(online_builder->Init());

    // Add documents through the online builder.
    for(int i = 0; i < 100; i++) {
        ASSERT_TRUE(online_builder->Build(global_doc));
    }
    online_builder->EndIndex();
    LOG(INFO) << "online add doc ok";

    // Query: look up nid "123" in the "pk" index.
    auto partReader = part->GetReader();
    ASSERT_TRUE(partReader);
    auto indexReader = partReader->GetIndexReader();
    ASSERT_TRUE(indexReader);
    indexlib::util::Term t("123", "pk");
    // NOTE(review): if Lookup() hands back an owning raw pointer, the iterator
    // is never released in this test - confirm ownership against the indexlib
    // IndexReader API.
    auto ite = indexReader->Lookup(t);
    ASSERT_TRUE(ite != nullptr);
    auto docid = ite->SeekDoc(INVALID_DOCID);
    LOG(INFO) << "type: " << ite->GetMatchValueType() << " dockid: " << docid;
    // Fail here with a clear message if the key was not found, instead of
    // passing INVALID_DOCID on to the summary reader below.
    ASSERT_TRUE(docid != INVALID_DOCID);

    // Exercise the three logging paths used in this file (glog, IE_LOG, alog).
    LOG(INFO) << "enable: " << _logger->isLevelEnabled(alog::LOG_LEVEL_INFO);
    IE_LOG(ERROR, "hello ie log");
    ALOG_ERROR(_logger, "hello :%s", "world");

    // Read the summary of the matched document.
    auto summaryReader = partReader->GetSummaryReader();
    ASSERT_TRUE(summaryReader != nullptr);
    indexlib::document::SearchSummaryDocument summaryDoc(NULL, 40960);
    ASSERT_TRUE(summaryReader->GetDocument(docid, &summaryDoc));
    // Only the first three summary fields are printed.
    int count = 3;
    LOG(INFO) << "field count: " << count;
    for(int i = 0; i < count; i++) {
        const autil::ConstString* field = summaryDoc.GetFieldValue(i);
        ASSERT_TRUE(field != nullptr);
        LOG(INFO) << "value is: " << *field;
    }

}

LOG配置

logger.conf

alog.rootLogger=INFO, indexlibAppender
alog.max_msg_len=2000000
alog.appender.indexlibAppender=ConsoleAppender
#alog.appender.indexlibAppender=FileAppender
#alog.appender.indexlibAppender.fileName=TestLog.log
alog.appender.indexlibAppender.flush=true
alog.appender.indexlibAppender.layout=PatternLayout
#alog.appender.indexlibAppender.layout.LogPattern=[%%d] [%%t], %%f() [%%n] [%%l] [%%m]
alog.appender.indexlibAppender.layout.LogPattern=[%%h][xxxx][%%d][%%l][%%t][%%p][%%F:%%n %%f] : [%%m]

alog.logger.indexlib=INFO
inherit.indexlib.test=false
alog.logger.local.LocalFileSystem=INFO

alog.logger.ErrorLogCollector=TRACE1,ErrorLogCollectorAppender
inherit.ErrorLogCollector=false
alog.appender.ErrorLogCollectorAppender=FileAppender
alog.appender.ErrorLogCollectorAppender.fileName=error_log_collector.log
alog.appender.ErrorLogCollectorAppender.flush=true
alog.appender.ErrorLogCollectorAppender.max_file_size=100
alog.appender.ErrorLogCollectorAppender.layout=PatternLayout
alog.appender.ErrorLogCollectorAppender.layout.LogPattern=[%%h][xxxx][%%d][%%l][%%t][%%p][%%F:%%n %%f] : [%%m]
alog.appender.ErrorLogCollectorAppender.compress=true
alog.appender.ErrorLogCollectorAppender.log_keep_count=100

# alog.logger.indexlib.test=INFO, indexlibTestAppender
# alog.appender.indexlibTestAppender=ConsoleAppender
# alog.appender.indexlibTestAppender.layout=PatternLayout
# alog.appender.indexlibTestAppender.layout.LogPattern=[%%d] [%%t/%%p,PSM:%%f():%%n] [%%m]
# inherit.indexlib.test=false

生成数据在磁盘的结构

├── deploy_meta.0
├── deploy_meta.1
├── index_format_version
├── __indexlib_fs_root_link__@1684478171 -> ./tdata/builder
├── join_index_partition
├── merge_resource
│   └── version.1
├── rt_index_partition
│   ├── segment_1073741824_level_0
│   │   ├── attribute
│   │   │   ├── pack_attr
│   │   │   │   ├── data
│   │   │   │   ├── data_info
│   │   │   │   └── offset
│   │   │   ├── provcity
│   │   │   │   ├── data
│   │   │   │   ├── data_info
│   │   │   │   └── offset
│   │   │   ├── quantity
│   │   │   │   └── data
│   │   │   ├── uniq_pack_attr
│   │   │   │   ├── data
│   │   │   │   ├── data_info
│   │   │   │   └── offset
│   │   │   └── vip
│   │   │       ├── data
│   │   │       ├── data_info
│   │   │       └── offset
│   │   ├── counter
│   │   ├── deletionmap
│   │   │   └── data_1073741824
│   │   ├── deploy_index
│   │   ├── index
│   │   │   ├── pk
│   │   │   │   └── data
│   │   │   └── virtual_timestamp_index
│   │   │       ├── dictionary
│   │   │       ├── index_format_option
│   │   │       └── posting
│   │   ├── operation_log
│   │   │   ├── data
│   │   │   └── meta
│   │   ├── segment_file_list
│   │   ├── segment_info
│   │   ├── segment_metrics
│   │   └── summary
│   │       ├── data
│   │       └── offset
│   └── version.0
├── schema.json
├── segment_0_level_0
│   ├── attribute
│   │   ├── pack_attr
│   │   │   ├── data
│   │   │   ├── data_info
│   │   │   └── offset
│   │   ├── provcity
│   │   │   ├── data
│   │   │   ├── data_info
│   │   │   └── offset
│   │   ├── quantity
│   │   │   └── data
│   │   ├── uniq_pack_attr
│   │   │   ├── data
│   │   │   ├── data_info
│   │   │   └── offset
│   │   └── vip
│   │       ├── data
│   │       ├── data_info
│   │       └── offset
│   ├── counter
│   ├── deletionmap
│   │   └── data_0
│   ├── deploy_index
│   ├── index
│   │   └── pk
│   │       └── data
│   ├── segment_file_list
│   ├── segment_info
│   ├── segment_metrics
│   └── summary
│       ├── data
│       └── offset
├── segment_1_level_0
│   ├── attribute
│   │   ├── pack_attr
│   │   │   ├── category
│   │   │   ├── data
│   │   │   ├── data_info
│   │   │   ├── int32_multi
│   │   │   └── offset
│   │   ├── provcity
│   │   │   ├── data
│   │   │   ├── data_info
│   │   │   └── offset
│   │   ├── quantity
│   │   │   └── data
│   │   ├── uniq_pack_attr
│   │   │   ├── data
│   │   │   ├── data_info
│   │   │   └── offset
│   │   └── vip
│   │       ├── data
│   │       ├── data_info
│   │       └── offset
│   ├── counter
│   ├── deletionmap
│   ├── deploy_index
│   ├── index
│   │   └── pk
│   │       └── data
│   ├── segment_file_list
│   ├── segment_info
│   ├── segment_metrics
│   └── summary
│       ├── data
│       └── offset
├── summary_info
│   ├── index_summary.0
│   └── index_summary.1
├── version.0
└── version.1

分层架构

脑图详解 

  • 2
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 6
    评论
评论 6
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值