#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <unistd.h>

#include <algorithm>
#include <string>
#include <vector>
using namespace std;
// Fixed-size value record. Packing is forced to 1 so the in-memory layout is
// exactly the 1024 bytes of `phrases`, with no padding added by the compiler.
#pragma pack(push, 1)
struct PhraseGroup {
  char phrases[1024];  // text buffer of the record
};
#pragma pack(pop)
// A key string paired with an integer index; instances order by key.
struct item_dar {
  std::string key;    // key text
  int32_t value_idx;  // index associated with the key (assigned by the user of this struct)

  // Starts with an empty key and index 0.
  item_dar() : key(), value_idx(0) {}

  // Replaces the key with a copy of the NUL-terminated string k.
  void set_key(char* k) { key.assign(k); }

  // True when this key sorts strictly before v's key (std::string::compare order).
  bool operator<(const item_dar& v) const { return key.compare(v.key) < 0; }
};
// Tokenizes `line` in place with strtok and stores a pointer to each token in
// `split_array`, replacing whatever the vector held before.
//
// line        - writable NUL-terminated buffer; strtok overwrites delimiters in it
// delimit     - set of delimiter characters handed to strtok
// split_array - receives the token pointers, which point into `line`
void split(char* line, const char* delimit, std::vector<char*>* split_array) {
  split_array->clear();
  for (char* token = strtok(line, delimit); token != NULL;
       token = strtok(NULL, delimit)) {
    split_array->push_back(token);
  }
}
// Validates one input line and splits it into its tab-separated fields.
//
// line - writable NUL-terminated buffer; it is tokenized in place by split()
// keys - on success receives one pointer per field, each pointing into `line`;
//        left untouched on failure
//
// Returns false (after logging for the two length checks) when the line has at
// most one character, is too long to be stored, has more than three fields, or
// has a field of at most one character. Returns true otherwise.
bool parse_line(char* line, std::vector<char*>* keys) {
  const size_t line_len = strlen(line);
  if (line_len <= 1) {
    fprintf(stderr, "empty line\n");
    return false;
  }
  // The raw line is later copied into a 1024-byte PhraseGroup::phrases buffer,
  // which also has to hold the terminating NUL, so 1023 characters is the
  // longest line that can be stored safely.
  if (line_len >= 1024) {
    fprintf(stderr, "long line\n");
    return false;
  }
  std::vector<char*> parts;
  split(line, "\t", &parts);
  if (parts.size() > 3) {
    return false;
  }
  for (size_t i = 0; i < parts.size(); ++i) {
    if (strlen(parts[i]) <= 1) {
      return false;
    }
  }
  keys->assign(parts.begin(), parts.end());
  return true;
}
int test(char* patternfile, char* test_keys) {
int fd = ::open(patternfile, O_RDONLY);
if (fd < 0) {
return -1;
}
struct stat s;
if (fstat(fd, &s) == -1) {
return -1;
}
off_t size = s.st_size;
void* ptr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
if (ptr == MAP_FAILED) {
fprintf(stderr, "MAP_FAILED.\n");
return -1;
}
::madvise(ptr, s.st_size, MADV_WILLNEED);
Darts::DoubleArray darts_dict;
size_t darts_len = darts_dict.open_mmap(ptr);
if (darts_len == 0) {
fprintf(stderr, "darts mmap open failed\n");
return -1;
}
CHECK((size - darts_len) % sizeof(PhraseGroup) == 0);
fprintf(stderr,"sizeof(PhraseGroup)=%lu\n",sizeof(PhraseGroup));
fprintf(stderr,"%lu\t%lu\n",size,darts_len);
PhraseGroup* values = (PhraseGroup*)((uint8_t*)ptr + darts_len);
FILE* testInput = fopen(test_keys, "r");
char buf[1024];
std::string value_string;
Darts::DoubleArray::result_pair_type result;
while (fgets(buf, 1024, testInput) != NULL) {
buf[strlen(buf)-1] = '\0';
darts_dict.exactMatchSearch(buf, result);
if (result.length == 0) {
fprintf(stderr, "not found %s\n",buf);
}
else{
fprintf(stderr, "found %s, %lu:%d has match in darts\n", buf,result.length,result.value);
PhraseGroup tmp;
tmp = values[result.value];
fprintf(stderr,"%s\n",tmp.phrases);
}
}
fclose(testInput);
return 0;
}
int main(int argc, char** argv) {
util::ParseCommandLineFlags(&argc, &argv, true);
util::AtExitManager exit_manager;
if (argc != 3) {
if (argc == 4) {
fprintf(stderr, "test mode, bins:%s, keys:%s\n", argv[1], argv[2]);
return test(argv[1], argv[2]);
}
fprintf(stderr, "need argument as output binary darts\n");
return -1;
}
int fd = ::open(argv[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
if (fd < 0) {
fprintf(stderr, "open %s failed\n", argv[1]);
return -1;
}
//source file for input
FILE* fin = fopen(argv[2], "r");
if (fin == NULL) {
fprintf(stderr, "open %s error\n", argv[2]);
return -1;
}
std::vector<item_dar> keys_vector;
std::vector<PhraseGroup> values_vector;
char buf[1024];
buf[0] = '\0';
lseek(fd, sizeof(size_t), SEEK_SET);
while (fgets(buf, 1024, fin) != NULL) {
buf[strlen(buf) - 1] = 0;
if(strlen(buf) <= 1){
fprintf(stderr,"low length");
continue;
}
std::vector<char*> keys;
std::string raw = std::string(buf);
if (parse_line(buf, &keys)) {
for(size_t i = 0 ; i < keys.size(); ++i){
item_dar dar_key;
PhraseGroup tmp_phrase_group;
strncpy(tmp_phrase_group.phrases,raw.c_str(),1024);
dar_key.key = std::string(keys[i]);
dar_key.value_idx = values_vector.size();
keys_vector.push_back(dar_key);
values_vector.push_back(tmp_phrase_group);
}
} else {
fprintf(stderr,"error in:%s\n", raw.c_str());
}
buf[0] = 0;
}
std::sort(keys_vector.begin(), keys_vector.end());
std::vector<const char*> sorted_keys;
std::vector<int> sorted_values;
std::vector<size_t> sorted_keylens;
for (uint32_t i = 0; i < keys_vector.size(); i++) {
sorted_keys.push_back(keys_vector[i].key.c_str());
sorted_keylens.push_back(keys_vector[i].key.length());
sorted_values.push_back(keys_vector[i].value_idx);
}
Darts::DoubleArray darts;
struct timeval start, end;
gettimeofday(&start, NULL);
if (darts.build(sorted_keys.size(), &sorted_keys[0], &sorted_keylens[0], &sorted_values[0])) {
fprintf(stderr, "build darts failed\n");
return -1;
}
gettimeofday(&end, NULL);
fprintf(stderr, "%lu keys, used %lu s\n", sorted_keys.size(), end.tv_sec - start.tv_sec);
darts.save(fd,0);
uint8_t* value_start = (uint8_t*)&values_vector[0];
size_t total_size = values_vector.size() * sizeof(PhraseGroup);
size_t wn = 0;
while (wn < total_size) {
wn += ::write(fd, value_start + wn, total_size - wn);
}
::close(fd);
fclose(fin);
return 0;
}
#include <sys/mman.h>
#include <stdio.h>
#include <vector>
#include <unistd.h>
#include <stdint.h>
using namespace std;
// Fixed-size value record. Packing is forced to 1 so the in-memory layout is
// exactly the 1024 bytes of `phrases`, with no padding added by the compiler.
#pragma pack(push, 1)
struct PhraseGroup {
  char phrases[1024];  // text buffer of the record
};
#pragma pack(pop)
// A key string paired with an integer index; instances order by key.
struct item_dar {
  std::string key;    // key text
  int32_t value_idx;  // index associated with the key (assigned by the user of this struct)

  // Starts with an empty key and index 0.
  item_dar() : key(), value_idx(0) {}

  // Replaces the key with a copy of the NUL-terminated string k.
  void set_key(char* k) { key.assign(k); }

  // True when this key sorts strictly before v's key (std::string::compare order).
  bool operator<(const item_dar& v) const { return key.compare(v.key) < 0; }
};
// Tokenizes `line` in place with strtok and stores a pointer to each token in
// `split_array`, replacing whatever the vector held before.
//
// line        - writable NUL-terminated buffer; strtok overwrites delimiters in it
// delimit     - set of delimiter characters handed to strtok
// split_array - receives the token pointers, which point into `line`
void split(char* line, const char* delimit, std::vector<char*>* split_array) {
  split_array->clear();
  for (char* token = strtok(line, delimit); token != NULL;
       token = strtok(NULL, delimit)) {
    split_array->push_back(token);
  }
}
// Validates one input line and splits it into its tab-separated fields.
//
// line - writable NUL-terminated buffer; it is tokenized in place by split()
// keys - on success receives one pointer per field, each pointing into `line`;
//        left untouched on failure
//
// Returns false (after logging for the two length checks) when the line has at
// most one character, is too long to be stored, has more than three fields, or
// has a field of at most one character. Returns true otherwise.
bool parse_line(char* line, std::vector<char*>* keys) {
  const size_t line_len = strlen(line);
  if (line_len <= 1) {
    fprintf(stderr, "empty line\n");
    return false;
  }
  // The raw line is later copied into a 1024-byte PhraseGroup::phrases buffer,
  // which also has to hold the terminating NUL, so 1023 characters is the
  // longest line that can be stored safely.
  if (line_len >= 1024) {
    fprintf(stderr, "long line\n");
    return false;
  }
  std::vector<char*> parts;
  split(line, "\t", &parts);
  if (parts.size() > 3) {
    return false;
  }
  for (size_t i = 0; i < parts.size(); ++i) {
    if (strlen(parts[i]) <= 1) {
      return false;
    }
  }
  keys->assign(parts.begin(), parts.end());
  return true;
}
int test(char* patternfile, char* test_keys) {
int fd = ::open(patternfile, O_RDONLY);
if (fd < 0) {
return -1;
}
struct stat s;
if (fstat(fd, &s) == -1) {
return -1;
}
off_t size = s.st_size;
void* ptr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
if (ptr == MAP_FAILED) {
fprintf(stderr, "MAP_FAILED.\n");
return -1;
}
::madvise(ptr, s.st_size, MADV_WILLNEED);
Darts::DoubleArray darts_dict;
size_t darts_len = darts_dict.open_mmap(ptr);
if (darts_len == 0) {
fprintf(stderr, "darts mmap open failed\n");
return -1;
}
CHECK((size - darts_len) % sizeof(PhraseGroup) == 0);
fprintf(stderr,"sizeof(PhraseGroup)=%lu\n",sizeof(PhraseGroup));
fprintf(stderr,"%lu\t%lu\n",size,darts_len);
PhraseGroup* values = (PhraseGroup*)((uint8_t*)ptr + darts_len);
FILE* testInput = fopen(test_keys, "r");
char buf[1024];
std::string value_string;
Darts::DoubleArray::result_pair_type result;
while (fgets(buf, 1024, testInput) != NULL) {
buf[strlen(buf)-1] = '\0';
darts_dict.exactMatchSearch(buf, result);
if (result.length == 0) {
fprintf(stderr, "not found %s\n",buf);
}
else{
fprintf(stderr, "found %s, %lu:%d has match in darts\n", buf,result.length,result.value);
PhraseGroup tmp;
tmp = values[result.value];
fprintf(stderr,"%s\n",tmp.phrases);
}
}
fclose(testInput);
return 0;
}
int main(int argc, char** argv) {
util::ParseCommandLineFlags(&argc, &argv, true);
util::AtExitManager exit_manager;
if (argc != 3) {
if (argc == 4) {
fprintf(stderr, "test mode, bins:%s, keys:%s\n", argv[1], argv[2]);
return test(argv[1], argv[2]);
}
fprintf(stderr, "need argument as output binary darts\n");
return -1;
}
int fd = ::open(argv[1], O_WRONLY | O_CREAT | O_TRUNC, 0644);
if (fd < 0) {
fprintf(stderr, "open %s failed\n", argv[1]);
return -1;
}
//source file for input
FILE* fin = fopen(argv[2], "r");
if (fin == NULL) {
fprintf(stderr, "open %s error\n", argv[2]);
return -1;
}
std::vector<item_dar> keys_vector;
std::vector<PhraseGroup> values_vector;
char buf[1024];
buf[0] = '\0';
lseek(fd, sizeof(size_t), SEEK_SET);
while (fgets(buf, 1024, fin) != NULL) {
buf[strlen(buf) - 1] = 0;
if(strlen(buf) <= 1){
fprintf(stderr,"low length");
continue;
}
std::vector<char*> keys;
std::string raw = std::string(buf);
if (parse_line(buf, &keys)) {
for(size_t i = 0 ; i < keys.size(); ++i){
item_dar dar_key;
PhraseGroup tmp_phrase_group;
strncpy(tmp_phrase_group.phrases,raw.c_str(),1024);
dar_key.key = std::string(keys[i]);
dar_key.value_idx = values_vector.size();
keys_vector.push_back(dar_key);
values_vector.push_back(tmp_phrase_group);
}
} else {
fprintf(stderr,"error in:%s\n", raw.c_str());
}
buf[0] = 0;
}
std::sort(keys_vector.begin(), keys_vector.end());
std::vector<const char*> sorted_keys;
std::vector<int> sorted_values;
std::vector<size_t> sorted_keylens;
for (uint32_t i = 0; i < keys_vector.size(); i++) {
sorted_keys.push_back(keys_vector[i].key.c_str());
sorted_keylens.push_back(keys_vector[i].key.length());
sorted_values.push_back(keys_vector[i].value_idx);
}
Darts::DoubleArray darts;
struct timeval start, end;
gettimeofday(&start, NULL);
if (darts.build(sorted_keys.size(), &sorted_keys[0], &sorted_keylens[0], &sorted_values[0])) {
fprintf(stderr, "build darts failed\n");
return -1;
}
gettimeofday(&end, NULL);
fprintf(stderr, "%lu keys, used %lu s\n", sorted_keys.size(), end.tv_sec - start.tv_sec);
darts.save(fd,0);
uint8_t* value_start = (uint8_t*)&values_vector[0];
size_t total_size = values_vector.size() * sizeof(PhraseGroup);
size_t wn = 0;
while (wn < total_size) {
wn += ::write(fd, value_start + wn, total_size - wn);
}
::close(fd);
fclose(fin);
return 0;
}