#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <unistd.h>

#include <algorithm>
#include <string>
#include <vector>

#include <openssl/md5.h>

#include "include/share/darts.h"
#include "qmodule/transcode/include/transcode.h"
#include "include/google/protobuf/text_format.h"
#include "src/qrewrite/feature/include/proto_value.h"
#include "src/qrewrite/feature/src/phrase_info.pb.h"
#include "src/qfed/proto_to_array.h"
using namespace std;
using namespace qrewrite::phrase_feature;
// Largest count that trans_to_uint16() stores verbatim in a 16-bit slot;
// counts above it are stored as CUTOFF plus a scaled base-2 logarithm.
static const uint16_t CUTOFF = 30000;
// Base-2 logarithm of x, computed as ln(x) / ln(2).
// NOTE(review): this shares its name with the C99 log2() declared by
// <math.h>; confirm the redefinition is intentional on the target toolchain.
double log2 (double x){
    const double natural_log_of_x = log(x);
    const double natural_log_of_two = log(2);
    return natural_log_of_x / natural_log_of_two;
}
// Squeezes a 32-bit count into 16 bits.
//   val <= CUTOFF : stored unchanged.
//   val >  CUTOFF : stored as CUTOFF + integer part of (log2(val) * 1000),
//                   where the integer part is taken from the "%.6f" rendering.
uint16_t trans_to_uint16(uint32_t val){
    if (val <= CUTOFF) {
        return val;
    }

    // Render the scaled logarithm as text and let atoi() keep the digits
    // before the decimal point.
    char digits[100] = {'\0'};
    sprintf(digits, "%.6f", log2(val)*1000);
    const int scaled_log = atoi(digits);
    return CUTOFF + scaled_log;
}
// Inverse of the 16-bit packing: values up to CUTOFF are returned as-is,
// larger values are decoded as 2 ^ ((val - CUTOFF) / 1000).
float trans_from_uint16(uint16_t val){
    if (val > CUTOFF) {
        const double exponent = (val - CUTOFF) * 1.0 / 1000;
        return pow(2, exponent);
    }
    return val;
}
// Tokenizes `line` in place with strtok() and stores a pointer to every token
// in `split_array` (which is emptied first). The tokens alias `line`, which is
// modified: delimiters are overwritten with NUL bytes.
void split(char* line, const char* delimit, vector<char*>* split_array) {
    split_array->clear();
    for (char* token = strtok(line, delimit);
         token != NULL;
         token = strtok(NULL, delimit)) {
        split_array->push_back(token);
    }
}
uint64_t md5(char* buf, int len) {
static uint8_t __smd5[16];
MD5((const unsigned char*)buf, len, __smd5);
return *(uint64_t*)__smd5;
}
#pragma pack(1)
// One byte-packed dictionary record: a 64-bit key followed by four 16-bit
// frequency slots. The frequency setters compress their argument with
// trans_to_uint16().
struct item_md5
{
    uint64_t key;
    uint16_t unigram_freq;
    uint16_t trigram_freq;
    uint16_t document_freq;
    uint16_t sentence_freq;

    // All fields start at zero.
    item_md5()
        : key(0),
          unigram_freq(0),
          trigram_freq(0),
          document_freq(0),
          sentence_freq(0) {}

    void set_key(uint64_t k) { key = k; }

    // Orders records by DESCENDING key: *this sorts before v when its key is
    // the larger one.
    bool operator < (const item_md5 &v) const { return v.key < key; }

    void set_unigram_freq(uint32_t k)  { unigram_freq  = trans_to_uint16(k); }
    void set_trigram_freq(uint32_t k)  { trigram_freq  = trans_to_uint16(k); }
    void set_document_freq(uint32_t k) { document_freq = trans_to_uint16(k); }
    void set_sentence_freq(uint32_t k) { sentence_freq = trans_to_uint16(k); }
};
#pragma pack()
int compare_md5(const void *p, const void *q) {
const item_md5* p1 = (const item_md5*)p;
const item_md5* p2 = (const item_md5*)q;
if (p1->key == p2->key) {
return 0;
}
else if (p1->key > p2->key ){
return -1;
}
else {
return 1;
}
}
uint8_t* find_md5_value(uint64_t key, uint8_t* md5_start, uint8_t* md5_end){
item_md5 tmp;
tmp.key = key;
return (uint8_t*)bsearch(&tmp, md5_start, (md5_end - md5_start)/sizeof(item_md5) + 1, sizeof(item_md5), compare_md5);
}
// Parses one input line of the form
//     <phrase> '\t' <id>:<count> <id>:<count> ...
// into `item`. `line` is modified in place (separators become NUL bytes).
//
// item.key is the md5() of the phrase. Feature ids 1, 3, 5 and 7 fill the
// unigram, trigram, document and sentence frequency slots; any other id
// (including 2, for which the record has no slot) is ignored.
//
// Returns false for a line of at most one character or a line with no tab
// separator; true otherwise. The parsed record is echoed to stderr.
bool parse_line(char* line, item_md5& item) {
    const size_t line_len = strlen(line);
    if (line_len <= 1) {
        fprintf(stderr, "empty line\n");
        return false;
    }

    // Drop a single trailing tab so it is not mistaken for the separator.
    if (line[line_len - 1] == '\t') {
        line[line_len - 1] = '\0';
    }

    char* seg = strchr(line, '\t');
    if (seg == NULL) {
        return false;
    }
    *seg = '\0';
    item.key = md5(line, strlen(line));

    std::vector<char*> parts;
    split(seg + 1, " ", &parts);

    for (size_t i = 0; i < parts.size(); ++i) {
        const char* token = parts[i];

        // The id must be the single character in front of the first ':'.
        // Matching "1:" anywhere in the token (as strstr did) would treat a
        // token such as "11:5" or "21:5" as feature 1.
        if (token[0] == '\0' || token[1] != ':') {
            continue;
        }

        // Saturate instead of silently wrapping when the count does not fit
        // in 32 bits.
        const unsigned long raw = strtoul(token + 2, NULL, 10);
        const uint32_t count = (raw > 0xFFFFFFFFul) ? 0xFFFFFFFFu : (uint32_t)raw;

        switch (token[0]) {
            case '1': item.set_unigram_freq(count);  break;
            case '3': item.set_trigram_freq(count);  break;
            case '5': item.set_document_freq(count); break;
            case '7': item.set_sentence_freq(count); break;
            default:  break;
        }
    }

    fprintf(stderr,"set k:v=%s:%lu %u:%u:%u:%u\n",line,item.key,
            item.unigram_freq,item.trigram_freq,
            item.document_freq,item.sentence_freq);
    return true;
}
int test(char* dartsfile, char* test_keys) {
int fd = ::open(dartsfile, O_RDONLY);
if (fd < 0) {
return -1;
}
struct stat s;
if (fstat(fd, &s) == -1) {
return -1;
}
off_t size = s.st_size;
void* ptr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
if (ptr == MAP_FAILED) {
return -1;
}
FILE* testInput = fopen(test_keys, "r");
char buf[1024];
std::string value_string;
uint8_t* md5_key_start = (uint8_t*)ptr + sizeof(size_t);// Start of md5 part
uint8_t* md5_key_end = (uint8_t*)ptr + size;
assert( (md5_key_end-md5_key_start)% sizeof(item_md5) == 0);
while (fgets(buf, 1024, testInput) != NULL) {
buf[strlen(buf)-1] = '\0';
uint64_t md5_key = md5(buf, strlen(buf));
uint8_t* mv = find_md5_value(md5_key, md5_key_start, md5_key_end);
if (mv != NULL) {
fprintf(stderr, "%s,\t%lu:%u\n",
buf, md5_key,
*(mv + sizeof(uint64_t)));
}
else {
fprintf(stderr, "Not Find %s\n", buf);
}
}
return 0;
}
int main(int argc, char** argv) {
if (argc != 3) {
if (argc == 4) {
fprintf(stderr, "test mode, darts:%s, keys:%s\n", argv[1], argv[2]);
return test(argv[1], argv[2]);
}
fprintf(stderr, "need argument as output binary darts\n");
return -1;
}
int fd = ::open(argv[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
if (fd < 0) {
fprintf(stderr, "open %s failed\n", argv[1]);
return -1;
}
FILE* fin = fopen(argv[2], "r");
if (fin == NULL) {
fprintf(stderr, "open %s error\n", argv[2]);
return -1;
}
vector<item_md5> inputs_md5;//items stores as md5
char buf[4096];
buf[0] = '\0';
int c = 0;
size_t key_num = 0;
while (fgets(buf, 4096, fin) != NULL) {
if(strlen(buf) <= 1)continue;
buf[strlen(buf) - 1] = 0;
c++;
item_md5 item;
if(parse_line(buf, item)){
inputs_md5.push_back(item);
key_num += 1;
}
else{
fprintf(stderr,"error in:%s\n", buf);
}
buf[0] = 0;
if (c % 1000 == 0) {
fprintf(stderr, "handle %d lines\r", c);
}
}
std::sort(inputs_md5.begin(), inputs_md5.end());
fprintf(stderr, "sort md5 key %lu finished\n",key_num);
write(fd, &key_num,sizeof(size_t));
uint8_t* md5_ptr = (uint8_t*)&inputs_md5[0];
uint64_t md5_len = inputs_md5.size() * sizeof(item_md5);
uint64_t wlen = 0;
while (wlen < md5_len) {
wlen += ::write(fd, md5_ptr + wlen, md5_len - wlen);
}
fprintf(stderr, "finished md5keynum:%lu, len:%lu\n", inputs_md5.size(), md5_len);
::close(fd);
return 0;
}
86 class NgramDict
87 {
88 public:
89 ~NgramDict();
90 bool GetNgramInfo(char* literal,NgramInfo& info);
91 bool GetNgramInfo(string literal,NgramInfo& info);
92
93 public:
94 uint8_t *m_map_ptr;
95 size_t m_map_size;
96 size_t m_key_num;
97 MemNgramInfo* m_values;
98
99 private:
100 NgramDict():
101 m_map_ptr(NULL),m_map_size(0),m_key_num(0),m_values(NULL) {
102 Init();
103 }
104 void Init();
105 void Test();
106 friend struct DefaultSingletonTraits<NgramDict>;
107 DISALLOW_COPY_AND_ASSIGN(NgramDict);
108
109 };
110
24 void NgramDict::Init() {
25 const char* file_name = FLAGS_ngram_dict.c_str();
26 if (!file_name || file_name[0] =='\0'){
27 dzlog_notice("input file name is empty.");
28 return;
29 }
30 struct stat st_buf;
31 if (stat(file_name, &st_buf) < 0) {
32 dzlog_notice("stat file \"%s\" failed. [%d]:%m.",
33 file_name, errno);
34 return;
35 }
36 size_t file_len;
37 void *buffer = mmapFile(file_name, true, file_len);
38 if (buffer == NULL) {
39 dzlog_notice("mmap file \"%s\" failed",file_name);
40 return;
41 }
42
43 this->m_map_ptr = (uint8_t *)buffer;
44 this-> m_map_size = st_buf.st_size;
45 this->m_values = (MemNgramInfo*)((uint8_t*)buffer + sizeof(size_t));
46 this->m_key_num = *(size_t*)buffer;
47 dzlog_notice("ngram.darts key_num=%lu",this->m_key_num);
48
49 assert((this->m_map_size-sizeof(size_t)) %sizeof(MemNgramInfo) == 0);
50 Test();
51 }
98 bool NgramDict::GetNgramInfo(char* literal,NgramInfo& info) {
99 if(m_map_ptr == NULL || m_values == NULL){
100 return false;
101 }
102
103 uint64_t md5_key = md5_sum(literal, strlen(literal));
104 NgramInfo tmp;
105 tmp.key = md5_key;
106 MemNgramInfo* res = (MemNgramInfo*)bsearch(&tmp, this->m_values, this->m_key_num, sizeof(MemNgramInfo), compare_ngram);
107
108 if (res == NULL) {
109 return false;
110 }
111 else{
112 res->trans_to_normal(info);
113 return true;
114 }
115 }