#include <bits/stdc++.h>
#include <pthread.h>
using namespace std;
// Number of worker threads main() starts; also the length of the sent1/sent2 arrays.
#define NUM_THREADS 1
// Byte capacity of one token buffer (second dimension of sentence::st).
// Line buffers for reading input are sized sent_length * max_size.
const long long max_size = 2000; // max length of strings
// Number of entries kept in the ranking that sentence::top1000() fills.
const long long N = 10; // number of closest words that will be shown
// Bytes reserved per vocabulary slot in word2vec.vocab; also used as the size of
// the file-name buffers in PrintHello().
const long long max_w = 50; // max length of vocabulary entries
// NOTE(review): not referenced anywhere in the visible code.
const float w2v_sim = 10.0;
// Word-embedding table read from a file laid out as: two integers (entry count,
// floats per entry), then for every entry a space-terminated word followed by
// that many raw floats. A single global instance named `word2vec` is defined
// at the end of the struct.
struct word2vec {
    FILE *f;               // handle used only while setup() runs; NULL afterwards
    long long words, size; // entry count and floats per entry, as read from the header
    float *M;              // words * size floats; row b is entry b scaled to unit length
    char *vocab;           // words slots of max_w bytes, each holding a NUL-terminated word

    // Releases the buffers and returns the table to the empty state.
    // Only valid once setup() has initialised the pointers.
    void discard() {
        free(M);
        free(vocab);
        M = NULL;
        vocab = NULL;
        words = 0;
        size = 0;
    }

    // Loads the table from `file_name`.
    // On any failure a message is printed and the table is left empty
    // (words == 0, M == NULL, vocab == NULL), so callers can test M/vocab.
    void setup(char* file_name) {
        words = 0;
        size = 0;
        M = NULL;
        vocab = NULL;
        f = fopen(file_name, "rb");
        if (f == NULL) {
            printf("Input file not found\n");
            return;
        }
        long long n_words = 0, n_dims = 0;
        if (fscanf(f, "%lld", &n_words) != 1 || fscanf(f, "%lld", &n_dims) != 1 ||
            n_words <= 0 || n_dims <= 0) {
            printf("Invalid header in input file\n");
            fclose(f);
            f = NULL;
            return;
        }
        vocab = (char *)malloc((size_t)n_words * (size_t)max_w * sizeof(char));
        M = (float *)malloc((size_t)n_words * (size_t)n_dims * sizeof(float));
        if (M == NULL || vocab == NULL) {
            printf("Cannot allocate memory: %lld MB %lld %lld\n",
                   n_words * n_dims * (long long)sizeof(float) / 1048576, n_words, n_dims);
            discard();
            fclose(f);
            f = NULL;
            return;
        }
        words = n_words;
        size = n_dims;
        for (long long b = 0; b < words; ++b) {
            char *entry = &vocab[b * max_w];
            long long a = 0;
            while (1) {
                const int ch = fgetc(f);
                if (ch == EOF || ch == ' ') break;
                if (ch == '\n') continue; // newline left over from the previous record
                // Keep one byte for the terminator so a long word can never
                // spill into the next slot (or past the end of the buffer).
                if (a < max_w - 1) entry[a++] = (char)ch;
            }
            entry[a] = 0;
            float *row = &M[b * size];
            if (fread(row, sizeof(float), (size_t)size, f) != (size_t)size) {
                printf("Unexpected end of input file at entry %lld\n", b);
                discard();
                fclose(f);
                f = NULL;
                return;
            }
            float len = 0;
            for (long long k = 0; k < size; k++) len += row[k] * row[k];
            len = sqrt(len);
            // An all-zero vector has no direction; leave it as zeros instead of
            // turning it into NaNs.
            if (len > 0) {
                for (long long k = 0; k < size; k++) row[k] /= len;
            }
        }
        fclose(f);
        f = NULL;
    }

    // Copies the vector of `word` into `vec` (which must hold `size` floats).
    // Returns true when the word is in the vocabulary, false otherwise; `vec`
    // is left untouched on false. The lookup is a linear scan over all slots.
    bool query(char* word, float* vec) {
        if (word == NULL || vec == NULL || vocab == NULL || M == NULL) return false;
        for (long long b = 0; b < words; b++) {
            if (strcmp(&vocab[b * max_w], word) != 0) continue;
            for (long long a = 0; a < size; a++) vec[a] = M[a + b * size];
            return true;
        }
        return false;
    }
}word2vec;
// Maximum number of tokens stored per sentence (first dimension of sentence::st).
const long long sent_length = 100;
// Capacity, in floats, of sentence::tmp, the buffer that receives a word vector
// from word2vec.query(). NOTE(review): presumably must match the dimension of
// the loaded embeddings (word2vec.size) - confirm against the model file.
const long long vec_size = 100;
// Working state for one sentence: its tokens, a per-vocabulary-word score
// vector, and the N best-scoring vocabulary words. Two arrays of instances,
// sent1 and sent2 (one slot per thread), are defined at the end of the struct.
struct sentence {
    char st[sent_length][max_size]; // tokens of the sentence, NUL-terminated
    bool word_in_sent[sent_length]; // not referenced by the visible code
    int cn;                         // number of tokens stored in st
    bool in;                        // scratch: whether the last queried token was in the vocabulary
    float *M;                       // word2vec.words scores, allocated by init()
    float tM;                       // scratch accumulator for one dot product
    float tmp[vec_size];            // vector of the token currently being scored
    // Ranking filled by top1000(): indices into the vocabulary and their scores,
    // best first. These are one-dimensional; every use in the file indexes them
    // with a single subscript.
    int bestw[N];
    float bestd[N];

    // Allocates the score vector. Must run after word2vec.setup().
    void init() {
        M = (float *)malloc((size_t)word2vec.words * sizeof(float));
        if (M == NULL && word2vec.words > 0) {
            printf("Cannot allocate memory for %lld sentence scores\n", word2vec.words);
        }
        if (word2vec.size > vec_size) {
            printf("Embedding dimension %lld exceeds vec_size %lld; sentences will not be scored\n",
                   word2vec.size, vec_size);
        }
    }

    // Splits `st1` on spaces into st[0..cn) and then sets M[c], for every
    // vocabulary word c, to the largest dot product between word c and any
    // in-vocabulary token of the sentence (never below 0).
    //
    // Tokens are maximal runs of non-space characters, so leading, trailing and
    // repeated spaces produce no empty tokens and an empty string gives cn == 0.
    // Tokens beyond sent_length are ignored; characters beyond max_size - 1 of
    // a token are dropped.
    void input(char* st1) {
        cn = 0;
        if (st1 != NULL) {
            long long pos = 0;
            while (cn < sent_length) {
                while (st1[pos] == ' ') pos++;
                if (st1[pos] == 0) break;
                long long len = 0;
                while (st1[pos] != 0 && st1[pos] != ' ') {
                    if (len < max_size - 1) st[cn][len++] = st1[pos];
                    pos++;
                }
                st[cn][len] = 0;
                cn++;
            }
        }
        // Two strategies were considered:
        // 1. for each vocabulary word, keep its best match among the words of
        //    the sentence (max pooling, captures the sentence content) - the
        //    one implemented below;
        // 2. for each word of the sentence, find the most similar vocabulary
        //    word (query expansion relating the two sentences).
        if (M == NULL) return;
        for (long long c = 0; c < word2vec.words; c++) M[c] = 0.0;
        // Rows of word2vec.M are word2vec.size floats apart, and query() writes
        // that many floats into tmp, so tmp must be large enough.
        const long long dim = word2vec.size;
        if (word2vec.M == NULL || dim > vec_size) return;
        for (int b = 0; b < cn; b++) {
            in = word2vec.query(st[b], tmp);
            if (in == false) continue;
            for (long long c = 0; c < word2vec.words; c++) {
                tM = 0.0;
                for (long long d = 0; d < dim; d++) {
                    tM += tmp[d] * word2vec.M[d + c * dim];
                }
                if (M[c] < tM) M[c] = tM;
            }
        }
        //norm();
    }

    // Scales M to unit Euclidean length; an all-zero M is left unchanged.
    void norm() {
        if (M == NULL) return;
        float s = 0.0;
        for (long long c = 0; c < word2vec.words; c++) s += M[c] * M[c];
        s = sqrt(s);
        if (!(s > 0)) return;
        for (long long c = 0; c < word2vec.words; c++) M[c] = M[c] / s;
    }

    // Fills bestw/bestd with the N highest-scoring vocabulary words, best first
    // (despite the name, N entries are kept). Slots that cannot be filled keep
    // bestw == -1 and bestd == -1.
    void top1000() {
        for (int a = 0; a < N; a++) {
            bestd[a] = -1;
            bestw[a] = -1;
        }
        if (M == NULL) return;
        for (long long c = 0; c < word2vec.words; c++) {
            for (int a = 0; a < N; a++) {
                if (M[c] > bestd[a]) {
                    // Shift the tail down by one to make room at position a.
                    for (int d = N - 1; d > a; d--) {
                        bestd[d] = bestd[d - 1];
                        bestw[d] = bestw[d - 1];
                    }
                    bestd[a] = M[c];
                    bestw[a] = (int)c;
                    break;
                }
            }
        }
    }
}sent1[NUM_THREADS], sent2[NUM_THREADS];
// Returns the product of the two signed pointer distances (a - b) and (b - a),
// i.e. minus the squared element distance between the two addresses.
// NOTE(review): this works on the pointers themselves, not on the floats they
// point to; both must point into the same array. Confirm this is intended.
float func(float* a, float *b) {
    const auto forward = a - b;
    const auto backward = b - a;
    return forward * backward;
}
float sent2dist(int id, char* str1, char* str2) {
int a, b, c, d;float s;
float p, r, f1;
sent1[id].input(str1);
sent2[id].input(str2);
sent1[id].top1000();
c = 0, d = 0;
for(b = 0; b < sent2[id].cn; b++) {
for (a = 0; a < N; a++) {
if (!strcmp(&word2vec.vocab[sent1[id].bestw[a] * max_w], sent2[id].st[b]))break;
}
if (a != N) c ++;
d ++;
}
p = 1.0 * c / d;
printf("%d %d %.2f\n", c, d, p);
sent2[id].top1000();
c = 0, d = 0;
for(b = 0; b < sent1[id].cn; b++) {
for (a = 0; a < N; a++) {
if (!strcmp(&word2vec.vocab[sent2[id].bestw[a] * max_w], sent1[id].st[b]))break;
}
if (a != N) c ++;
d ++;
}
r = 1.0*c/d;
printf("%d %d %.2f\n", c, d, r);
if (fabs(p+r) > 1e-8)f1 = 2.0*p*r/(p+r);
else f1 = 0.0;
//for (c = 0; c < word2vec.words; c++) s += sent1.M[c] * sent2.M[c];//pow(sent1.M[c]-sent2.M[c], 2.0);
//1.top1000 represent sentence
//p(sent1, sent2) = ? in all sentence_pair()? wrong!
//p(sent1, sent2) > p(sent3, sent4) => ranking problem => maybe confict => how to get the total order(delete patial order)
//=>dist measure
//=>can we solve it by probability?
//-> yes
//2.p(sent2 | sent1) = ?, p(sent1 | sent2) = ?, DIST IS p(sent2 | sent1) * p(sent1 | sent2) P REPRESENT THE PROBABILITY WHICH SENT2 CAN INFER SENT1
//2.1 p(word2_1, word2_2, ... | word1_1, word1_2, )
//2.2 p(word2_1, word2_2, ... | word1_1) * p(word2_1, word2_2, ...., word1_2 |word1_1) WORD ORDER IS NO USE
//2.3 p(sent2 | sent1) = p(sent2 | word1_1) * p(word2_1 | word1_1) * p(sent2 | word2_1) question is p(y|x) is not the order but the sim
//known Sim(word1_1, word2_1) = .. ,Sim(word1_1, word2_2) = .. =>
//p(word2_1 | word1_1) = p(word2_1, word1_1) / sum(p(word, word1_1))
/***************************************************************************/
//2.2 P(SENT2|SENT1) = P(WORD2_1 | WORD1_1, WORD1_2, WORD1_3, ...) * P(WORD2_2 | WORD1_1, WORD1_2, WORD1_3, ...) * ..
// -> P(WORD2_1, WORD1_1, ....) / P(WORD1_1, WORD1_2, WORD1_3, ...) ..
// -> P(WORD)
//2.3 P(SENT2|SENT1) = P(SENT2_1 | WORD1_1, WORD1_2, WORD1_3, ...) * P(WORD2_2 | WORD1_1, WORD1_2, WORD1_3, ...)
// -> P(SENT2_1, WORD1_1, ....) / P(WORD1_1, WORD1_2, WORD1_3, ...) ..
// -> P(SENT2_1) / P(WORD1_1, WORD1_2, WORD1_3, ...) ...
//top 1000 F1 = p*r
return f1;
}
// Removes a trailing '\n' and/or '\r' from `line`; other text is left untouched.
static void strip_line_end(char *line) {
    size_t len = strlen(line);
    if (len > 0 && line[len - 1] == '\n') line[--len] = 0;
    if (len > 0 && line[len - 1] == '\r') line[--len] = 0;
}

// Thread entry point. `threadid` carries the thread index as an integer value.
// Reads "input.align.clean.txt" two lines at a time, scores each pair with
// sent2dist() using this thread's sentence slots, and writes one score per
// pair to "out1.plus.align.<index * 4000>". Always returns NULL.
void *PrintHello(void *threadid)
{
    const long long index = (long long)threadid;
    const long long tid = index * 4000; // only used to name the output file
    char filename[max_w] = "input.align.clean.txt";
    char output[max_w];
    snprintf(output, sizeof(output), "out1.plus.align.%lld", tid);

    FILE *fp = fopen(filename, "r");
    if (fp == NULL) {
        printf("Cannot open input file %s\n", filename);
        return NULL;
    }
    FILE *fw = fopen(output, "w");
    if (fw == NULL) {
        printf("Cannot open output file %s\n", output);
        fclose(fp);
        return NULL;
    }

    // The line buffers are large, so they live on the heap rather than on the
    // thread's stack.
    std::vector<char> line1(sent_length * max_size);
    std::vector<char> line2(sent_length * max_size);
    char *str1 = &line1[0];
    char *str2 = &line2[0];
    long long finished = 0;
    while (fgets(str1, (int)line1.size(), fp)) {
        if (!fgets(str2, (int)line2.size(), fp)) {
            // A sentence without its partner cannot be scored.
            printf("Input ends with an unpaired line; ignoring it\n");
            break;
        }
        strip_line_end(str1);
        strip_line_end(str2);
        printf("\r %lld has finished %lld\n", index, finished);
        const float sim = sent2dist((int)index, str1, str2);
        fprintf(fw, "%.8lf\n", sim);
        finished ++;
    }
    fclose(fp);
    fclose(fw);
    return NULL;
}
// Loads the word vectors named by argv[1], prepares the per-thread sentence
// slots, then runs NUM_THREADS workers (PrintHello) and waits for all of them.
// Returns 0 on success or when printing the usage text, 1 when the vectors
// could not be loaded; exits with -1 when a thread cannot be created.
// Note: the usage text mentions <INPUT> and <OUTPUT>, but only argv[1] is read here.
int main(int argc, char **argv) {
    pthread_t threads[NUM_THREADS];
    if (argc < 2) {
        printf("Usage: ./distance <DICT> <INPUT> <OUTPUT>\nwhere DICT contains word projections in the BINARY FORMAT\n");
        return 0;
    }
    // argv[1] is passed straight through; copying it into a fixed-size buffer
    // would overflow on a long path.
    word2vec.setup(argv[1]);
    if (word2vec.M == NULL || word2vec.vocab == NULL) {
        printf("Failed to load word vectors from %s\n", argv[1]);
        return 1;
    }
    for (long t = 0; t < NUM_THREADS; t ++) {
        sent1[t].init();
        sent2[t].init();
    }
    for (long t = 0; t < NUM_THREADS; t++) {
        printf("In main: creating thread %ld\n", t);
        const int rc = pthread_create(&threads[t], NULL, PrintHello, (void *)t);
        if (rc) {
            printf("ERROR; return code from pthread_create() is %d\n", rc);
            exit(-1);
        }
    }
    for (long t = 0; t < NUM_THREADS; t++) {
        pthread_join(threads[t], NULL);
    }
    // Every worker has been joined, so a plain return ends the process.
    return 0;
}
// 03-20
// 2448
// 08-24
// 400