w2v for sent_pair

rgtjf
于 2016-02-02 01:40:51 发布
阅读量475
点赞数
分类专栏：学习
本文链接：https://blog.csdn.net/rgtjf/article/details/50621281
版权
学习专栏收录该内容
15 篇文章 0 订阅
订阅专栏
#include <bits/stdc++.h>
#include <pthread.h>
using namespace std;
#define NUM_THREADS 1

// Global size limits shared by the model loader and the sentence code below.
const long long max_size = 2000;          // bytes reserved for one token in sentence::st; line buffers are sized sent_length * max_size
const long long N = 10;                  // how many best-scoring vocabulary entries sentence::top1000() keeps
const long long max_w = 50;              // bytes reserved per vocabulary entry, i.e. the stride of word2vec::vocab

// NOTE(review): w2v_sim is not referenced anywhere in the visible code.
const float w2v_sim = 10.0;

/*
 * In-memory copy of a word2vec model read from a binary file laid out as:
 * a text header "<words> <size>", then for every word its token terminated
 * by one space, followed by <size> raw floats.
 *
 * A single global instance named `word2vec` is declared together with the
 * type. Whenever setup() fails the instance is left empty (words == 0,
 * size == 0, M == NULL, vocab == NULL), so callers can test `M == NULL`.
 */
struct word2vec {
  FILE *f;                // open only while setup() is reading; NULL afterwards
  long long words, size;  // number of vocabulary entries / floats per vector
  float *M;               // words * size floats; row b is the L2-normalised vector of word b
  char *vocab;            // words slots of max_w bytes, each a NUL-terminated token

  // Releases everything setup() has acquired so far and marks the model empty.
  void discard() {
    if (f != NULL) fclose(f);
    free(M);
    free(vocab);
    f = NULL;
    M = NULL;
    vocab = NULL;
    words = 0;
    size = 0;
  }

  /*
   * Loads the model stored in `file_name`.
   * Prints a diagnostic and leaves the model empty when the file cannot be
   * opened, the header is unusable, memory runs out or the data ends early.
   */
  void setup(char* file_name) {
    M = NULL;
    vocab = NULL;
    words = 0;
    size = 0;
    f = fopen(file_name, "rb");
    if (f == NULL) {
      printf("Input file not found\n");
      return;
    }
    // Reject headers that are missing, non-positive, or so large that the
    // byte counts passed to malloc below would wrap around.
    if (fscanf(f, "%lld", &words) != 1 || fscanf(f, "%lld", &size) != 1 ||
        words <= 0 || size <= 0 ||
        words > (long long)(SIZE_MAX / (size_t)max_w) ||
        size > (long long)(SIZE_MAX / sizeof(float)) / words) {
      printf("Invalid model header\n");
      discard();
      return;
    }
    vocab = (char *)malloc((size_t)words * (size_t)max_w);
    M = (float *)malloc((size_t)words * (size_t)size * sizeof(float));
    if (vocab == NULL || M == NULL) {
      printf("Cannot allocate memory: %lld MB %lld %lld\n",
             (long long)((size_t)words * (size_t)size * sizeof(float) / 1048576), words, size);
      discard();
      return;
    }
    for (long long b = 0; b < words; ++b) {
      char *entry = vocab + b * max_w;
      float *row = M + b * size;
      long long a = 0;
      while (1) {
        int ch = fgetc(f);
        if (ch == EOF || ch == ' ') break;
        if (ch == '\n') continue;  // line break left over from the previous record
        // Keep one byte for the terminator; longer tokens are truncated
        // instead of spilling into the next entry's slot.
        if (a < max_w - 1) entry[a++] = (char)ch;
      }
      entry[a] = 0;
      if (fread(row, sizeof(float), (size_t)size, f) != (size_t)size) {
        printf("Model file is truncated\n");
        discard();
        return;
      }
      float len = 0;
      for (a = 0; a < size; a++) len += row[a] * row[a];
      len = sqrt(len);
      // An all-zero vector is left untouched rather than turned into NaNs.
      if (len > 0) {
        for (a = 0; a < size; a++) row[a] /= len;
      }
    }
    fclose(f);
    f = NULL;
  }

  /*
   * Looks `word` up by exact string comparison (linear scan over the
   * vocabulary). On a hit copies its `size` floats into `vec`, which must
   * have room for them, and returns true; returns false otherwise and
   * leaves `vec` untouched.
   */
  bool query(char* word, float* vec) {
    for (long long b = 0; b < words; b++) {
      if (strcmp(&vocab[b * max_w], word) != 0) continue;
      const float *row = &M[b * size];
      for (long long a = 0; a < size; a++) vec[a] = row[a];
      return true;
    }
    return false;
  }
}word2vec;


// Capacity limits of struct sentence.
const long long sent_length = 100;  // number of token slots in sentence::st
// Number of floats in the scratch vector sentence::tmp that word2vec::query()
// fills; the loaded model's vector dimension must not exceed it.
const long long vec_size = 100;

/*
 * Working state for one sentence: its tokens, one score per vocabulary word
 * and the N best-scoring vocabulary words. `sent1[i]` / `sent2[i]` are the
 * two sentences handled by worker i.
 */
struct sentence {
  char st[sent_length][max_size];  // tokens of the current sentence, NUL-terminated
  bool word_in_sent[sent_length];  // not used by any method of this struct
  int cn;                          // number of tokens stored in st
  bool in;                         // result of the most recent vocabulary lookup
  float *M;                        // word2vec.words scores, see input()
  float tM;                        // scratch: dot product being accumulated
  float tmp[vec_size];             // scratch: vector of the token being processed
  // Indices and scores of the N best vocabulary words, best first. They are
  // one-dimensional because top1000() and sent2dist() index them with a
  // single subscript; the former [sent_length][N] declaration did not
  // compile against those uses.
  int bestw[N];
  float bestd[N];

  /*
   * Allocates the score buffer. Must be called after word2vec.setup().
   * Terminates the program when the model's vectors do not fit into `tmp`
   * or the buffer cannot be allocated.
   */
  void init() {
    if (word2vec.size > vec_size) {
      printf("Model vectors have %lld dimensions, at most %lld are supported\n", word2vec.size, vec_size);
      exit(1);
    }
    M = (float *)malloc((size_t)word2vec.words * sizeof(float));
    if (M == NULL && word2vec.words > 0) {
      printf("Cannot allocate memory for %lld sentence scores\n", word2vec.words);
      exit(1);
    }
  }

  /*
   * Splits `st1` on blanks into st[0..cn-1] and fills M.
   *
   * Runs of blanks (leading, trailing or repeated) never produce tokens, so
   * cn is 0 for an empty or all-blank string. Tokens beyond sent_length are
   * ignored and tokens longer than max_size - 1 bytes are truncated.
   *
   * Two strategies were considered:
   *  1. for every vocabulary word, keep its best match among the sentence's
   *     tokens; this captures the sentence information (max pooling);
   *  2. for every token, find the most similar vocabulary word; this is
   *     query expansion that relates the two sentences.
   * The code implements the first: M[c] becomes the largest dot product
   * between vocabulary vector c and any in-vocabulary token, floored at 0.
   */
  void input(char* st1) {
    const long long dim = word2vec.size;
    const char *p = st1;

    cn = 0;
    while (1) {
      while (*p == ' ') p++;
      if (*p == 0 || cn >= sent_length) break;
      long long len = 0;
      while (*p != 0 && *p != ' ') {
        if (len < max_size - 1) st[cn][len++] = *p;
        p++;
      }
      st[cn][len] = 0;
      cn++;
    }

    for (long long c = 0; c < word2vec.words; c++) M[c] = 0.0;
    for (int b = 0; b < cn; b++) {
      in = word2vec.query(st[b], tmp);
      if (in == false) continue;  // out-of-vocabulary tokens contribute nothing
      for (long long c = 0; c < word2vec.words; c++) {
        // Rows are `dim` floats apart, whatever dimension the model has.
        const float *row = &word2vec.M[c * dim];
        tM = 0.0;
        for (long long d = 0; d < dim; d++) tM += tmp[d] * row[d];
        if (M[c] < tM) M[c] = tM;
      }
    }
    //norm();
  }

  // Scales M to unit Euclidean length; an all-zero M is left unchanged.
  void norm(){
    float s = 0.0;
    for (long long c = 0; c < word2vec.words; c++) s += M[c] * M[c];
    s = sqrt(s);
    if (!(s > 0)) return;
    for (long long c = 0; c < word2vec.words; c++) M[c] = M[c] / s;
  }


  /*
   * Despite its name, keeps the N best entries: bestw receives the indices
   * of the N largest scores in M and bestd the scores, in descending order.
   * Slots that cannot be filled (fewer than N vocabulary words) keep -1.
   */
  void top1000() {
    for (int a = 0; a < N; a++) {
      bestd[a] = -1;
      bestw[a] = -1;
    }

    for (long long c = 0; c < word2vec.words; c++) {
      for (int a = 0; a < N; a++) {
        if (!(M[c] > bestd[a])) continue;
        // Shift the worse entries down by one and insert at rank a.
        for (int d = N - 1; d > a; d--) {
          bestd[d] = bestd[d - 1];
          bestw[d] = bestw[d - 1];
        }
        bestd[a] = M[c];
        bestw[a] = (int)c;
        break;
      }
    }
  }
}sent1[NUM_THREADS], sent2[NUM_THREADS];

/*
 * Returns the product of the two opposite pointer differences, i.e.
 * -(a - b)^2 measured in float elements.
 * NOTE(review): this works on the addresses, not on the values pointed to,
 * and nothing in the visible code calls it - confirm the intent.
 */
float func(float* a, float *b) {
  const ptrdiff_t forward = a - b;
  const ptrdiff_t backward = b - a;
  return forward * backward;
}

// Counts the tokens of `tokens` that are spelled exactly like one of the N
// vocabulary words listed in `ranked.bestw`. Written as a template so it
// only relies on the `bestw`, `cn` and `st` members of its arguments.
template <class S>
static int count_ranked_hits(const S &ranked, const S &tokens) {
  int hits = 0;
  for (int b = 0; b < tokens.cn; b++) {
    for (int a = 0; a < N; a++) {
      const long long w = ranked.bestw[a];
      // top1000() leaves -1 in slots it could not fill; using that as an
      // index would read in front of the vocabulary buffer.
      if (w < 0 || w >= word2vec.words) continue;
      if (!strcmp(&word2vec.vocab[w * max_w], tokens.st[b])) {
        hits++;
        break;
      }
    }
  }
  return hits;
}

/*
 * Scores the sentence pair (str1, str2) using the working slots sent1[id]
 * and sent2[id].
 *
 *   p = share of str2's tokens found among the N top vocabulary words of str1
 *   r = share of str1's tokens found among the N top vocabulary words of str2
 *
 * Returns 2*p*r / (p + r), or 0 when p + r is (almost) zero. A sentence
 * without tokens yields a share of 0. Prints "<hits> <tokens> <share>" for
 * each direction as a side effect.
 */
float sent2dist(int id, char* str1, char* str2) {
  int c, d;
  float p, r, f1;
  sent1[id].input(str1);
  sent2[id].input(str2);

  sent1[id].top1000();
  c = count_ranked_hits(sent1[id], sent2[id]);
  d = sent2[id].cn;
  p = d > 0 ? 1.0 * c / d : 0.0;
  printf("%d %d %.2f\n", c, d, p);

  sent2[id].top1000();
  c = count_ranked_hits(sent2[id], sent1[id]);
  d = sent1[id].cn;
  r = d > 0 ? 1.0 * c / d : 0.0;
  printf("%d %d %.2f\n", c, d, r);

  if (fabs(p+r) > 1e-8) f1 = 2.0*p*r/(p+r);
  else f1 = 0.0;

  // Design notes kept from the original author:
  // An earlier variant summed sent1.M[c] * sent2.M[c] (or the squared
  // difference) over the whole vocabulary.
  // 1. Represent each sentence by its top-ranked vocabulary words.
  //    p(sent1, sent2) over all sentence pairs? Wrong.
  //    p(sent1, sent2) > p(sent3, sent4) is a ranking problem, which may
  //    conflict: how to obtain a total order (remove the partial order)?
  //    => use a distance measure. Can probability solve it? Yes:
  // 2. p(sent2 | sent1) = ?, p(sent1 | sent2) = ?; the distance is
  //    p(sent2 | sent1) * p(sent1 | sent2), where p is the probability
  //    that sent2 can infer sent1.
  //    2.1 p(word2_1, word2_2, ... | word1_1, word1_2, ...)
  //    2.2 p(word2_1, word2_2, ... | word1_1) *
  //        p(word2_1, word2_2, ..., word1_2 | word1_1); word order is of no use.
  //    2.3 p(sent2 | sent1) = p(sent2 | word1_1) * p(word2_1 | word1_1) *
  //        p(sent2 | word2_1); the issue is that p(y|x) is a similarity,
  //        not an order.
  //    Known: Sim(word1_1, word2_1), Sim(word1_1, word2_2), ... =>
  //    p(word2_1 | word1_1) = p(word2_1, word1_1) / sum(p(word, word1_1))
  //    ---------------------------------------------------------------
  //    2.2 P(SENT2|SENT1) = P(WORD2_1 | WORD1_1, WORD1_2, WORD1_3, ...) *
  //                         P(WORD2_2 | WORD1_1, WORD1_2, WORD1_3, ...) * ...
  //        -> P(WORD2_1, WORD1_1, ...) / P(WORD1_1, WORD1_2, WORD1_3, ...) ...
  //        -> P(WORD)
  //    2.3 P(SENT2|SENT1) = P(SENT2_1 | WORD1_1, WORD1_2, WORD1_3, ...) *
  //                         P(WORD2_2 | WORD1_1, WORD1_2, WORD1_3, ...)
  //        -> P(SENT2_1, WORD1_1, ...) / P(WORD1_1, WORD1_2, WORD1_3, ...) ...
  //        -> P(SENT2_1) / P(WORD1_1, WORD1_2, WORD1_3, ...) ...
  // Top-ranked-words F1 = p*r

  return f1;
}


// Removes the line terminator (LF or CRLF) from the end of `line`, if any.
static void strip_line_end(char *line) {
  size_t len = strlen(line);
  while (len > 0 && (line[len - 1] == '\n' || line[len - 1] == '\r')) line[--len] = 0;
}

/*
 * Thread entry point. `threadid` carries the worker index as an integer
 * smuggled through the pointer value.
 *
 * Reads "input.align.clean.txt" two lines at a time, scores every pair with
 * sent2dist() and writes one score per line to "out1.plus.align.<index*4000>".
 * A trailing line without a partner is ignored. Reports and returns early
 * when a buffer or file cannot be obtained. Always returns NULL.
 */
void *PrintHello(void *threadid)
{
  const long long worker = (long long)(intptr_t)threadid;
  const long long tid = worker * 4000;
  // The line buffers are too large to live safely on a thread stack.
  const size_t line_cap = (size_t)(sent_length * max_size);
  char filename[max_w] = "input.align.clean.txt";
  char output[max_w];
  char *str1 = (char *)malloc(line_cap);
  char *str2 = (char *)malloc(line_cap);
  FILE *fp = NULL;
  FILE *fw = NULL;
  long long finished = 0;
  float sim;

  snprintf(output, sizeof(output), "out1.plus.align.%lld", tid);

  if (str1 == NULL || str2 == NULL) {
    printf("Cannot allocate line buffers\n");
  } else if ((fp = fopen(filename, "r")) == NULL) {
    printf("Cannot open %s\n", filename);
  } else if ((fw = fopen(output, "w")) == NULL) {
    printf("Cannot open %s\n", output);
  } else {
    while (fgets(str1, (int)line_cap, fp)) {
      // Without a second line there is no pair left to score.
      if (!fgets(str2, (int)line_cap, fp)) break;
      strip_line_end(str1);
      strip_line_end(str2);

      printf("\r %lld has finished %lld\n", worker, finished);
      sim = sent2dist((int)worker, str1, str2);
      fprintf(fw, "%.8lf\n", sim);
      finished ++;
    }
  }

  if (fp != NULL) fclose(fp);
  if (fw != NULL) fclose(fw);
  free(str1);
  free(str2);
  return NULL;
}

/*
 * Program entry point.
 *
 * argv[1] is the word2vec model in binary format. Any further arguments are
 * accepted but not used: each worker reads and writes fixed file names.
 * Loads the model, prepares the per-worker sentence slots, runs NUM_THREADS
 * workers and waits for all of them.
 *
 * Returns 0 on success (and after printing the usage text), 1 when the
 * model could not be loaded; exits with -1 when a thread cannot be created.
 */
int main(int argc, char **argv) {
  pthread_t threads[NUM_THREADS];
  int rc;
  long t;

  if (argc < 2) {
    printf("Usage: ./distance <DICT> <INPUT> <OUTPUT>\nwhere DICT contains word projections in the BINARY FORMAT\n");
    return 0;
  }
  // The path is handed over as is; copying it into a fixed-size buffer could
  // overflow on long paths.
  word2vec.setup(argv[1]);
  if (word2vec.M == NULL) {
    // setup() has already reported why the model is unavailable.
    return 1;
  }

  for (t = 0; t < NUM_THREADS; t ++) {
    sent1[t].init();
    sent2[t].init();
  }

  for (t = 0; t < NUM_THREADS; t++) {
    printf("In main: creating thread %ld\n", t);
    rc = pthread_create(&threads[t], NULL, PrintHello, (void *)t);
    if (rc) {
      printf("ERROR; return code from pthread_create() is %d\n", rc);
      exit(-1);
    }
  }
  for (t = 0; t < NUM_THREADS; t++) {
    pthread_join(threads[t], NULL);
  }
  // Every worker has been joined, so a plain return ends the process.
  return 0;
}
rgtjf
关注
0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
w2v for sent_pair

#include <bits/stdc++.h>#include <pthread.h>using namespace std;#define NUM_THREADS 1const long long max_size = 2000; // max length of stringsconst long long N = 10; // nu
复制链接

扫一扫
专栏目录