DoubleArray词典管理

49 class PosWeightDict
 50 {
 51 public:
 52     ~PosWeightDict();
 53     bool GetWeightInfo(std::string literal,WeightInfo* wei);
 54 
 55 private:
 56     uint8_t *m_map_ptr;
 57     size_t m_map_size;
 58     Darts::DoubleArray m_darts_dict;
 59     WeightInfo* m_values;
 60 
 61 private:
 62     PosWeightDict():
 63         m_map_ptr(NULL),m_map_size(0),m_values(NULL) {
 64         Init();
 65     }
 66     void Init();
 67     void Test();
 68     friend struct DefaultSingletonTraits<PosWeightDict>;
 69     DISALLOW_COPY_AND_ASSIGN(PosWeightDict);
 70 
 71 };
 72 

 73 }

 

#include <vector>
#include <string>
#include <sys/mman.h>
#include <fcntl.h>
#include "include/zlog/zlog.h"
#include "src/qrewrite/include/pos_weight_dict.h"
#include "google/protobuf/text_format.h"
#include "include/qquery/qquery_common.h"
#include "src/util/logging.h"
#include "src/util/flags.h"




// Command-line flag holding the dictionary path; PosWeightDict::Init() reads it
// through FLAGS_pos_weight_dict. The default is the empty string.
DEFINE_string(pos_weight_dict, 
    "",
    "Path of the dict file for pos ngram based term weight");


namespace qrewrite {


void PosWeightDict::Init() {
    const char* file_name = FLAGS_pos_weight_dict.c_str();
    if (!file_name || file_name[0] =='\0'){
        dzlog_notice("input file name is empty.");
        return;
    }
    struct stat st_buf;
    if (stat(file_name, &st_buf) < 0) {
        dzlog_error("stat file \"%s\" failed. [%d]:%m.",
                file_name, errno);
        return;
    }
    size_t file_len;
    void *buffer = mmapFile(file_name, true, file_len);
    if (buffer == NULL) {
        dzlog_error("mmap file \"%s\" failed",file_name);
         return;
    }
    size_t darts_len = this->m_darts_dict.open_mmap(buffer);
    if (darts_len == 0) {
        dzlog_error("darts mmap open failed\n");
        return;
    }
    if((st_buf.st_size - darts_len) % sizeof(WeightInfo) != 0){
        dzlog_error("darts mmap open failed\n");
        return;
    }
    this->m_map_ptr = (uint8_t *)buffer;
    this-> m_map_size = st_buf.st_size;
    this->m_values = (WeightInfo*)((uint8_t*)buffer + darts_len);
    Test();
}


void PosWeightDict::Test(){
    WeightInfo  wei;
    if(!this->GetWeightInfo("000054号",&wei)){
         dzlog_notice("pos weight dict test failed");
    }
    else{
        dzlog_notice("pos weight dict test ok %f:%f:%f:%f:%f:%f",
                    wei.pos_idf,
                    wei.pos_ridf,
                    wei.pos_bs,
                    wei.pos_boolean,
                    wei.pos_weighted,
                    wei.pos_weighted_norm);
    }
}


/**
 * Releases the file mapping when one exists, then returns every member to its
 * "nothing loaded" value.
 */
PosWeightDict::~PosWeightDict() {
    if (m_map_ptr != NULL) {
        ::munmap(m_map_ptr, m_map_size);
        m_map_ptr = NULL;
    }
    m_map_size = 0;
    m_values = NULL;
    m_darts_dict.clear();
}


bool PosWeightDict::GetWeightInfo(std::string literal,WeightInfo* wei) {
    if(m_map_ptr == NULL || m_values == NULL){
        return false;
    }
    Darts::DoubleArray::result_pair_type result;
    m_darts_dict.exactMatchSearch(literal.c_str(), result);
    if (result.length == 0) {
        return false;
    }
    else{
        wei->pos_idf = m_values[result.value].pos_idf;
        wei->pos_ridf = m_values[result.value].pos_ridf;
        wei->pos_bs = m_values[result.value].pos_bs;
        wei->pos_boolean = m_values[result.value].pos_boolean;
        wei->pos_weighted = m_values[result.value].pos_weighted;
        wei->pos_weighted_norm = m_values[result.value].pos_weighted_norm;
        return true;             
    }



}

/* vim: set ts=4 sw=4 sts=4 tw=100 et: */

 

 

#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <unistd.h>

#include <algorithm>
#include <string>
#include <vector>

#include "include/share/darts.h"
#include "src/util/at_exit.h"
#include "src/util/flags.h"
#include "src/util/singleton.h"
#include "src/util/string_util.h"


using namespace std;


// Value record of the dictionary: six floats stored back to back.
// The builder in this file writes the raw bytes of these structs after the
// double-array and readers index them in place through a pointer cast, so the
// field order and the 1-byte packing below define the on-disk layout.
#pragma pack(1)
struct WeightInfo {
    float  pos_idf;            // filled from tab-separated column 2 of an input line
    float  pos_ridf;           // column 3
    float  pos_bs;             // column 4
    float  pos_boolean;        // column 5
    float  pos_weighted;       // column 6
    float  pos_weighted_norm;  // column 7
};
#pragma pack()


// One dictionary key together with the index of its value record.
// Ordering is by key only, which lets std::sort arrange the keys while each
// entry keeps pointing at its original record.
struct item_dar
{
    std::string key;
    int32_t value_idx;

    // Starts as an empty key pointing at record 0.
    item_dar() : key(), value_idx(0) {}

    // Replaces the key with a copy of the C string `k`.
    void set_key(char* k) { key = k; }

    // True when this key sorts strictly before v's key (std::string::compare order).
    bool operator < (const item_dar &v) const
    {
        return key.compare(v.key) < 0;
    }
};
 
/**
 * Splits `line` in place on any character of `delimit`.
 *
 * The delimiters inside `line` are overwritten with NUL and `split_array`
 * receives pointers into `line`, so `line` must outlive the result. Runs of
 * consecutive delimiters are treated as one separator; empty tokens are never
 * produced.
 *
 * Uses strtok_r, so no hidden global tokenizer state is involved.
 *
 * @param line        writable, NUL-terminated buffer (NULL yields an empty result)
 * @param delimit     set of delimiter characters (NULL yields an empty result)
 * @param split_array output vector; cleared first. NULL makes the call a no-op.
 */
void split(char* line, const char* delimit, std::vector<char*>* split_array) {
    if (split_array == NULL) {
        return;
    }
    split_array->clear();
    if (line == NULL || delimit == NULL) {
        return;
    }

    char* cursor = NULL;
    for (char* token = strtok_r(line, delimit, &cursor);
         token != NULL;
         token = strtok_r(NULL, delimit, &cursor)) {
        split_array->push_back(token);
    }
}


bool parse_line(char* line, item_dar& key_dar,WeightInfo &weight_info) {
    if(strlen(line) <= 1){
        fprintf(stderr, "empty line\n");
        return false;
    }
    //remove tail white
    if(line[strlen(line)-1] == '\t') {
        line[strlen(line)-1] = '\0';
    }
         
    std::vector<char*> parts;
    parts.clear();
    split(line,"\t",&parts);
 
    if(parts.size() != 7){
        return false;
    }
    
    key_dar.key = string(parts[0]);
    weight_info.pos_idf = strtod(parts[1],NULL);
    weight_info.pos_ridf = strtod(parts[2],NULL);
    weight_info.pos_bs = strtod(parts[3],NULL);
    weight_info.pos_boolean = strtod(parts[4],NULL);
    weight_info.pos_weighted = strtod(parts[5],NULL);
    weight_info.pos_weighted_norm = strtod(parts[6],NULL);
    
    return true;
}


int test(char* patternfile, char* test_keys) {
    int fd = ::open(patternfile, O_RDONLY);
    if (fd < 0) {
        return -1;
    }


    struct stat s;
    if (fstat(fd, &s) == -1) {
        return -1;
    }


    off_t size = s.st_size;
    void* ptr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
    if (ptr == MAP_FAILED) {
        fprintf(stderr, "MAP_FAILED.\n");
        return -1;
    }
    
    ::madvise(ptr, s.st_size, MADV_WILLNEED);
    Darts::DoubleArray darts_dict;
    size_t darts_len = darts_dict.open_mmap(ptr);
    if (darts_len == 0) {
        fprintf(stderr, "darts mmap open failed\n");
        return -1;
    }
    
    CHECK((size - darts_len) % sizeof(WeightInfo) == 0);
    fprintf(stderr,"sizeof(WeightInfo)=%lu\n",sizeof(WeightInfo));
    fprintf(stderr,"%lu\t%lu\n",size,darts_len);
    WeightInfo* values = (WeightInfo*)((uint8_t*)ptr + darts_len);


    FILE* testInput = fopen(test_keys, "r");
    char buf[1024];
    std::string value_string;
    Darts::DoubleArray::result_pair_type result;
    while (fgets(buf, 1024, testInput) != NULL) {
        buf[strlen(buf)-1] = '\0';
        darts_dict.exactMatchSearch(buf, result);
        if (result.length == 0) {
            fprintf(stderr, "not found %s\n",buf);
        }
        else{
            fprintf(stderr, "found %s, %lu:%d has  match in darts\n", buf,result.length,result.value);
            WeightInfo tmp;
            tmp = values[result.value];
            fprintf(stderr,"%f %f %f %f %f %f\n",tmp.pos_idf,tmp.pos_ridf,tmp.pos_bs,tmp.pos_boolean,tmp.pos_weighted,tmp.pos_weighted_norm);
         }
    }
    fclose(testInput);
    return 0;
}


int main(int argc, char** argv) {
    util::ParseCommandLineFlags(&argc, &argv, true);
    util::AtExitManager exit_manager;
    
    if (argc != 3) {
        if (argc == 4) {
            fprintf(stderr, "test mode, bins:%s, keys:%s\n", argv[1], argv[2]);
            return test(argv[1], argv[2]);
        }
        fprintf(stderr, "need argument as output binary darts\n");
        return -1;
    }
    
    int fd = ::open(argv[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (fd < 0) {
        fprintf(stderr, "open %s failed\n", argv[1]);
        return -1;
    }


    //source file for input
    FILE* fin = fopen(argv[2], "r");
    if (fin == NULL) {
        fprintf(stderr, "open %s error\n", argv[2]);
        return -1;
    }
    
    std::vector<item_dar> keys_vector;
    std::vector<WeightInfo> values_vector;
    char buf[4096];
        buf[0] = '\0';
    
    lseek(fd, sizeof(size_t), SEEK_SET);
    while (fgets(buf, 4096, fin) != NULL) {
        buf[strlen(buf) - 1] = 0;
                if(strlen(buf) <= 1){ 
            fprintf(stderr,"low length"); 
            continue;
        }
        item_dar dar_key;
        WeightInfo tmp_weight_info;
        if (parse_line(buf, dar_key, tmp_weight_info)) {
            dar_key.value_idx = values_vector.size();
            keys_vector.push_back(dar_key);
            values_vector.push_back(tmp_weight_info);
        } else {
            fprintf(stderr,"error in:%s\n", buf);
        }
        buf[0] = 0;
    }   
    
    std::sort(keys_vector.begin(), keys_vector.end());
    std::vector<const char*> sorted_keys;
    std::vector<int> sorted_values;
    std::vector<size_t> sorted_keylens;
    
    for (uint32_t i = 0; i < keys_vector.size(); i++) {
        sorted_keys.push_back(keys_vector[i].key.c_str());
        sorted_keylens.push_back(keys_vector[i].key.length());
        sorted_values.push_back(keys_vector[i].value_idx);
    }
    
    Darts::DoubleArray darts;
    struct timeval start, end;
    gettimeofday(&start, NULL);
    if (darts.build(sorted_keys.size(), &sorted_keys[0], &sorted_keylens[0], &sorted_values[0])) {
        fprintf(stderr, "build darts failed\n");
        return -1;
    }


    gettimeofday(&end, NULL);
    fprintf(stderr, "%lu keys, used %lu s\n", sorted_keys.size(), end.tv_sec - start.tv_sec);
    darts.save(fd,0);
    uint8_t* value_start = (uint8_t*)&values_vector[0];
    size_t total_size = values_vector.size() * sizeof(WeightInfo);
    size_t wn = 0;
    while (wn < total_size) {
        wn += ::write(fd, value_start + wn, total_size - wn);
    }
    ::close(fd);
    fclose(fin);
    return 0;
}

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值