// Read-only dictionary that maps a string key to a WeightInfo record. The
// data lives in a memory-mapped file laid out as a Darts double array followed
// by an array of WeightInfo values.
// The constructor is private and DefaultSingletonTraits is a friend, so
// instances are presumably obtained through the project's singleton helper
// (not visible here -- confirm).
49 class PosWeightDict
50 {
51 public:
// Releases the file mapping, if one was established by Init().
52 ~PosWeightDict();
// Looks up `literal` by exact match. On a hit the stored record is copied into
// *wei and true is returned; false is returned when the dictionary is not
// loaded or the key is absent.
53 bool GetWeightInfo(std::string literal,WeightInfo* wei);
54
55 private:
// Base address of the mapped dictionary file; NULL while nothing is loaded.
56 uint8_t *m_map_ptr;
// Length passed to munmap() in the destructor.
57 size_t m_map_size;
// Double array opened directly on top of the mapping (open_mmap).
58 Darts::DoubleArray m_darts_dict;
// Start of the WeightInfo array inside the mapping, right after the double
// array; indexed by the value returned from the double-array lookup.
59 WeightInfo* m_values;
60
61 private:
// Starts from the "unloaded" state and immediately tries to load the file.
62 PosWeightDict():
63 m_map_ptr(NULL),m_map_size(0),m_values(NULL) {
64 Init();
65 }
// Maps the file named by the pos_weight_dict flag and wires up the members.
66 void Init();
// Logs the result of one fixed probe lookup after loading.
67 void Test();
68 friend struct DefaultSingletonTraits<PosWeightDict>;
69 DISALLOW_COPY_AND_ASSIGN(PosWeightDict);
70
71 };
72
73 }
#include <vector>
#include <string>
#include <sys/mman.h>
#include <fcntl.h>
#include "include/zlog/zlog.h"
#include "src/qrewrite/include/pos_weight_dict.h"
#include "google/protobuf/text_format.h"
#include "include/qquery/qquery_common.h"
#include "src/util/logging.h"
#include "src/util/flags.h"
// Flag naming the dictionary file that PosWeightDict::Init() maps. When it is
// left empty (the default) Init() only logs a notice and the dictionary stays
// unloaded.
DEFINE_string(pos_weight_dict,
"",
"Path of the dict file for pos ngram based term weight");
namespace qrewrite {
void PosWeightDict::Init() {
const char* file_name = FLAGS_pos_weight_dict.c_str();
if (!file_name || file_name[0] =='\0'){
dzlog_notice("input file name is empty.");
return;
}
struct stat st_buf;
if (stat(file_name, &st_buf) < 0) {
dzlog_error("stat file \"%s\" failed. [%d]:%m.",
file_name, errno);
return;
}
size_t file_len;
void *buffer = mmapFile(file_name, true, file_len);
if (buffer == NULL) {
dzlog_error("mmap file \"%s\" failed",file_name);
return;
}
size_t darts_len = this->m_darts_dict.open_mmap(buffer);
if (darts_len == 0) {
dzlog_error("darts mmap open failed\n");
return;
}
if((st_buf.st_size - darts_len) % sizeof(WeightInfo) != 0){
dzlog_error("darts mmap open failed\n");
return;
}
this->m_map_ptr = (uint8_t *)buffer;
this-> m_map_size = st_buf.st_size;
this->m_values = (WeightInfo*)((uint8_t*)buffer + darts_len);
Test();
}
void PosWeightDict::Test(){
WeightInfo wei;
if(!this->GetWeightInfo("000054号",&wei)){
dzlog_notice("pos weight dict test failed");
}
else{
dzlog_notice("pos weight dict test ok %f:%f:%f:%f:%f:%f",
wei.pos_idf,
wei.pos_ridf,
wei.pos_bs,
wei.pos_boolean,
wei.pos_weighted,
wei.pos_weighted_norm);
}
}
// Releases the file mapping (when one exists) and resets every member to the
// "unloaded" state.
PosWeightDict::~PosWeightDict() {
    if (m_map_ptr != NULL) {
        ::munmap(m_map_ptr, m_map_size);
        m_map_ptr = NULL;
    }
    m_map_size = 0;
    m_values = NULL;
    m_darts_dict.clear();
}
bool PosWeightDict::GetWeightInfo(std::string literal,WeightInfo* wei) {
if(m_map_ptr == NULL || m_values == NULL){
return false;
}
Darts::DoubleArray::result_pair_type result;
m_darts_dict.exactMatchSearch(literal.c_str(), result);
if (result.length == 0) {
return false;
}
else{
wei->pos_idf = m_values[result.value].pos_idf;
wei->pos_ridf = m_values[result.value].pos_ridf;
wei->pos_bs = m_values[result.value].pos_bs;
wei->pos_boolean = m_values[result.value].pos_boolean;
wei->pos_weighted = m_values[result.value].pos_weighted;
wei->pos_weighted_norm = m_values[result.value].pos_weighted_norm;
return true;
}
}
}
/* vim: set ts=4 sw=4 sts=4 tw=100 et: */
#include <stdlib.h>
#include <sys/mman.h>
#include <stdio.h>
#include <vector>
#include <unistd.h>
#include <stdint.h>
#include "src/util/at_exit.h"
#include "include/share/darts.h"
#include "src/util/string_util.h"
#include "src/util/flags.h"
#include "src/util/singleton.h"
using namespace std;
// Record stored in the dictionary file for each key. main() writes the raw
// bytes of these structs after the double array and test() reads them back
// through a pointer cast, so the field order and the pack(1) layout are part
// of the file format and must not change.
// The six members are filled by parse_line() from input columns 2 to 7, in
// this order.
#pragma pack(1)
struct WeightInfo {
float pos_idf;
float pos_ridf;
float pos_bs;
float pos_boolean;
float pos_weighted;
float pos_weighted_norm;
};
#pragma pack()
// One dictionary entry while building: the key text plus the index of its
// record in the value array. Ordered by key so entries can be sorted before
// the double array is built.
struct item_dar
{
    std::string key;
    int32_t value_idx;

    // Starts with an empty key and index 0.
    item_dar() : key(), value_idx(0) {}

    // Replaces the key with a copy of the C string `k`.
    void set_key(char* k) {
        key.assign(k);
    }

    // Strict ordering by std::string::compare on the keys.
    bool operator < (const item_dar &v) const
    {
        return key.compare(v.key) < 0;
    }
};
// Tokenizes `line` in place with strtok(): every delimiter character is
// overwritten with '\0' and pointers to the resulting tokens are stored in
// *split_array, which is cleared first. Runs of delimiters produce no empty
// tokens.
void split(char* line, const char* delimit, vector<char*>* split_array) {
    split_array->clear();
    for (char* token = strtok(line, delimit);
         token != NULL;
         token = strtok(NULL, delimit)) {
        split_array->push_back(token);
    }
}
// Converts one numeric column with strtod(). Returns false when no number
// could be read at all, instead of silently yielding 0.0.
static bool parse_weight_field(const char* text, float* out) {
    char* end = NULL;
    const double parsed = strtod(text, &end);
    if (end == text) {
        return false;
    }
    *out = static_cast<float>(parsed);
    return true;
}

// Parses one tab-separated input line of the form
//   key \t idf \t ridf \t bs \t boolean \t weighted \t weighted_norm
// into `key_dar.key` and `weight_info`.
//
// `line` is modified in place (tokenized). Returns true only when the line has
// exactly seven columns and all six weight columns start with a number; on
// failure neither output is touched.
bool parse_line(char* line, item_dar& key_dar,WeightInfo &weight_info) {
    const size_t line_len = strlen(line);
    if (line_len <= 1) {
        fprintf(stderr, "empty line\n");
        return false;
    }
    // Drop a single trailing tab.
    if (line[line_len - 1] == '\t') {
        line[line_len - 1] = '\0';
    }

    std::vector<char*> parts;
    split(line, "\t", &parts);
    if (parts.size() != 7) {
        return false;
    }

    // Parse into a plain array first: WeightInfo is packed, so no pointers to
    // its members are taken, and the outputs stay untouched on failure.
    float weights[6];
    for (size_t i = 0; i < 6; ++i) {
        if (!parse_weight_field(parts[i + 1], &weights[i])) {
            fprintf(stderr, "column %lu is not a number: %s\n",
                    static_cast<unsigned long>(i + 2), parts[i + 1]);
            return false;
        }
    }

    key_dar.key = parts[0];
    weight_info.pos_idf = weights[0];
    weight_info.pos_ridf = weights[1];
    weight_info.pos_bs = weights[2];
    weight_info.pos_boolean = weights[3];
    weight_info.pos_weighted = weights[4];
    weight_info.pos_weighted_norm = weights[5];
    return true;
}
int test(char* patternfile, char* test_keys) {
int fd = ::open(patternfile, O_RDONLY);
if (fd < 0) {
return -1;
}
struct stat s;
if (fstat(fd, &s) == -1) {
return -1;
}
off_t size = s.st_size;
void* ptr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
if (ptr == MAP_FAILED) {
fprintf(stderr, "MAP_FAILED.\n");
return -1;
}
::madvise(ptr, s.st_size, MADV_WILLNEED);
Darts::DoubleArray darts_dict;
size_t darts_len = darts_dict.open_mmap(ptr);
if (darts_len == 0) {
fprintf(stderr, "darts mmap open failed\n");
return -1;
}
CHECK((size - darts_len) % sizeof(WeightInfo) == 0);
fprintf(stderr,"sizeof(WeightInfo)=%lu\n",sizeof(WeightInfo));
fprintf(stderr,"%lu\t%lu\n",size,darts_len);
WeightInfo* values = (WeightInfo*)((uint8_t*)ptr + darts_len);
FILE* testInput = fopen(test_keys, "r");
char buf[1024];
std::string value_string;
Darts::DoubleArray::result_pair_type result;
while (fgets(buf, 1024, testInput) != NULL) {
buf[strlen(buf)-1] = '\0';
darts_dict.exactMatchSearch(buf, result);
if (result.length == 0) {
fprintf(stderr, "not found %s\n",buf);
}
else{
fprintf(stderr, "found %s, %lu:%d has match in darts\n", buf,result.length,result.value);
WeightInfo tmp;
tmp = values[result.value];
fprintf(stderr,"%f %f %f %f %f %f\n",tmp.pos_idf,tmp.pos_ridf,tmp.pos_bs,tmp.pos_boolean,tmp.pos_weighted,tmp.pos_weighted_norm);
}
}
fclose(testInput);
return 0;
}
int main(int argc, char** argv) {
util::ParseCommandLineFlags(&argc, &argv, true);
util::AtExitManager exit_manager;
if (argc != 3) {
if (argc == 4) {
fprintf(stderr, "test mode, bins:%s, keys:%s\n", argv[1], argv[2]);
return test(argv[1], argv[2]);
}
fprintf(stderr, "need argument as output binary darts\n");
return -1;
}
int fd = ::open(argv[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
if (fd < 0) {
fprintf(stderr, "open %s failed\n", argv[1]);
return -1;
}
//source file for input
FILE* fin = fopen(argv[2], "r");
if (fin == NULL) {
fprintf(stderr, "open %s error\n", argv[2]);
return -1;
}
std::vector<item_dar> keys_vector;
std::vector<WeightInfo> values_vector;
char buf[4096];
buf[0] = '\0';
lseek(fd, sizeof(size_t), SEEK_SET);
while (fgets(buf, 4096, fin) != NULL) {
buf[strlen(buf) - 1] = 0;
if(strlen(buf) <= 1){
fprintf(stderr,"low length");
continue;
}
item_dar dar_key;
WeightInfo tmp_weight_info;
if (parse_line(buf, dar_key, tmp_weight_info)) {
dar_key.value_idx = values_vector.size();
keys_vector.push_back(dar_key);
values_vector.push_back(tmp_weight_info);
} else {
fprintf(stderr,"error in:%s\n", buf);
}
buf[0] = 0;
}
std::sort(keys_vector.begin(), keys_vector.end());
std::vector<const char*> sorted_keys;
std::vector<int> sorted_values;
std::vector<size_t> sorted_keylens;
for (uint32_t i = 0; i < keys_vector.size(); i++) {
sorted_keys.push_back(keys_vector[i].key.c_str());
sorted_keylens.push_back(keys_vector[i].key.length());
sorted_values.push_back(keys_vector[i].value_idx);
}
Darts::DoubleArray darts;
struct timeval start, end;
gettimeofday(&start, NULL);
if (darts.build(sorted_keys.size(), &sorted_keys[0], &sorted_keylens[0], &sorted_values[0])) {
fprintf(stderr, "build darts failed\n");
return -1;
}
gettimeofday(&end, NULL);
fprintf(stderr, "%lu keys, used %lu s\n", sorted_keys.size(), end.tv_sec - start.tv_sec);
darts.save(fd,0);
uint8_t* value_start = (uint8_t*)&values_vector[0];
size_t total_size = values_vector.size() * sizeof(WeightInfo);
size_t wn = 0;
while (wn < total_size) {
wn += ::write(fd, value_start + wn, total_size - wn);
}
::close(fd);
fclose(fin);
return 0;
}