word2vec

 

# -*- coding: utf-8 -*-
import pandas as pd
from bs4 import BeautifulSoup   
import re  # used to strip punctuation (non-letter characters)
# stopwords: frequent words that carry little meaning
import nltk
from nltk.corpus import stopwords
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.ensemble import RandomForestClassifier
import nltk.data
import logging
from gensim.models import word2vec
#to train Word2Vec it is better not to remove stopwords 
#because the algorithm relies on the broader context of 
#the sentence in order to produce high-quality word vectors.


def review_to_wordlist(review, remove_stopwords=False):
    # Strip HTML markup, keep letters only, lowercase, and split into words
    review_text = BeautifulSoup(review, "html.parser").get_text()
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    words = review_text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return words
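
# A quick illustration on an invented snippet of review HTML:
#   review_to_wordlist("<b>An AMAZING movie, 10/10!</b>")
#   -> ['an', 'amazing', 'movie']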

#Word2Vec does not need labels in order to create meaningful representations.
#Words with similar meanings appear in clusters,
#and clusters are spaced such that some word relationships,
#such as analogies, can be reproduced using vector math
#(the classic example: king - man + woman = queen).


#Word2Vec expects single sentences, each one as a list of words. 
#In other words, the input format is a list of lists.
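#For example, two invented reviews in the expected format:
#   sentences = [["the", "movie", "was", "great"],
#                ["i", "loved", "the", "acting"]]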



def review_to_sentences(review, tokenizer, remove_stopwords=False):
    # Split the paragraph into raw sentences with the punkt tokenizer
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    # Convert each non-empty raw sentence into a word list
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
    return sentences
            




def loadData():
    # quoting=3 is csv.QUOTE_NONE: the reviews contain embedded quote characters
    train = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3, encoding="utf-8")
    unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3, encoding="utf-8")
    test = pd.read_csv("testData.tsv", header=0, delimiter="\t", quoting=3, encoding="utf-8")
    return train, unlabeled_train, test

def reviewsToSentences():
    train, unlabeled_train, test = loadData()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = []
    print("Parsing sentences from training set")
    for review in train["review"]:
        sentences += review_to_sentences(review, tokenizer)

    print("Parsing sentences from unlabeled set")
    for review in unlabeled_train["review"]:
        sentences += review_to_sentences(review, tokenizer)
    return sentences



    
def explore_model(model):
    # Quick sanity checks on the trained model
    # (named explore_model so it does not shadow the builtin apply)
    print(model.doesnt_match(["man", "woman", "child", "kitchen"]))  # expected to pick out "kitchen"
    print(model.most_similar("man"))
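
#The vector space also supports analogy queries through gensim's most_similar;
#a small sketch (the exact neighbors depend on the trained model):
def analogy_demo(model):
    # A well-trained model tends to rank "queen" near the top here
    print(model.most_similar(positive=["woman", "king"], negative=["man"], topn=3))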
    
    
    
    
#-----------------------average_vector------------------------------
from gensim.models import Word2Vec
def makeFeatureVec(words, model, num_features):
    # Average the vectors of every word that is in the model's vocabulary
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0.
    index2word_set = set(model.index2word)

    for word in words:
        if word in index2word_set:
            nwords = nwords + 1.
            featureVec = np.add(featureVec, model[word])

    # Guard against reviews whose words are all out of vocabulary
    if nwords > 0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec
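
# A toy check of the averaging idea on invented 3-d vectors, independent
# of any trained model: a review's vector is the mean of its word vectors.
def _average_demo():
    toy = np.array([[1., 0., 2.], [3., 2., 0.]], dtype="float32")
    print(np.mean(toy, axis=0))  # -> [2. 1. 1.]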

def getReviewsVec(reviews, model, num_features):
    # Preallocate a 2-D array: one averaged feature vector per review
    reviewSetfeatureVec = np.zeros((len(reviews), num_features), dtype="float32")
    for counter, words in enumerate(reviews):
        reviewSetfeatureVec[counter] = makeFeatureVec(words, model, num_features)
    return reviewSetfeatureVec
    


def run(sentences, train, test):
    # Configure logging so gensim prints nice progress messages
    logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',
                        level=logging.INFO)
    num_features = 400     # word vector dimensionality
    min_word_count = 40    # ignore words that occur fewer times than this
    num_workers = 4        # number of parallel worker threads
    context = 10           # context window size
    downsampling = 1e-3    # downsample setting for frequent words

    print("Training model...")
    model = word2vec.Word2Vec(sentences, workers=num_workers,
                              size=num_features, min_count=min_word_count,
                              window=context, sample=downsampling)
    # Training is finished, so keep only the normalized vectors to save memory
    model.init_sims(replace=True)
    model.save("FirstWord2Vec")
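    # The saved model can later be reloaded with Word2Vec.load("FirstWord2Vec")
    # and reused without retraining.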

    print("Creating average feature vecs for training reviews")
    clean_train_reviews = []
    for review in train["review"]:
        clean_train_reviews.append(review_to_wordlist(review, remove_stopwords=True))
    trainDataVecs = getReviewsVec(clean_train_reviews, model, num_features)

    print("Creating average feature vecs for test reviews")
    clean_test_reviews = []
    for review in test["review"]:
        clean_test_reviews.append(review_to_wordlist(review, remove_stopwords=True))
    testDataVecs = getReviewsVec(clean_test_reviews, model, num_features)
    

    forest = RandomForestClassifier(n_estimators=100)
    print("Fitting a random forest to labeled training data...")
    forest = forest.fit( trainDataVecs, train["sentiment"] )
    
    # Test & extract results 
    result = forest.predict( testDataVecs )
    
    # Write the test results 
    output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
    output.to_csv( "Word2Vec_AverageVectors.csv", index=False, quoting=3 )
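
#A minimal sketch of wiring the average-vector pipeline together, assuming
#the three Kaggle .tsv files are present (reviewsToSentences() reloads the
#data internally):
if __name__ == "__main__":
    train, unlabeled_train, test = loadData()
    sentences = reviewsToSentences()
    run(sentences, train, test)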
    
#-----------------------word-centroid------------------------------

from sklearn.cluster import KMeans
import time

def kmeans_run(model):
    word_vectors = model.syn0
    # Use one cluster per five vocabulary words, on average
    num_clusters = word_vectors.shape[0] // 5

    # Initialize a k-means object and use it to extract centroids
    start = time.time()
    kmeans_clustering = KMeans(n_clusters=num_clusters)
    centroid_ids = kmeans_clustering.fit_predict(word_vectors)
    print("Time taken for K Means clustering: %.2f seconds" % (time.time() - start))

    # Map each vocabulary word to the id of the cluster it belongs to
    # (the vectors themselves are unhashable, so key the dict by word)
    word_centroid_map = dict(zip(model.index2word, centroid_ids))
    return word_centroid_map
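
# A small sketch for inspecting the clustering: list the vocabulary words
# assigned to one centroid (contents depend entirely on the trained model).
def show_cluster(word_centroid_map, cluster=0):
    words = [w for w, c in word_centroid_map.items() if c == cluster]
    print("Cluster %d: %s" % (cluster, words[:20]))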
    

def create_review_centroids_histogram(clean_train_reviews, clean_test_reviews,
                                      train, test, word_centroid_map):
    # Build one "bag of centroids" per review: a histogram counting how many
    # of the review's words fall into each k-means cluster
    num_centroids = max(word_centroid_map.values()) + 1

    def bag_of_centroids(wordlist):
        bag = np.zeros(num_centroids, dtype="float32")
        for word in wordlist:
            if word in word_centroid_map:
                bag[word_centroid_map[word]] += 1
        return bag

    train_centroids = np.zeros((len(clean_train_reviews), num_centroids),
                               dtype="float32")
    for i, review_wordlist in enumerate(clean_train_reviews):
        train_centroids[i] = bag_of_centroids(review_wordlist)

    test_centroids = np.zeros((len(clean_test_reviews), num_centroids),
                              dtype="float32")
    for i, review_wordlist in enumerate(clean_test_reviews):
        test_centroids[i] = bag_of_centroids(review_wordlist)

    # Fit a random forest on the training histograms and predict the test set
    forest = RandomForestClassifier(n_estimators=100)
    forest = forest.fit(train_centroids, train["sentiment"])
    result = forest.predict(test_centroids)
    output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
    output.to_csv("BagOfCentroids.csv", index=False, quoting=3)
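
#A hedged sketch of wiring the bag-of-centroids pipeline together, assuming
#the word2vec model was already trained and saved as "FirstWord2Vec" above:
def centroid_main():
    train, unlabeled_train, test = loadData()
    model = Word2Vec.load("FirstWord2Vec")
    word_centroid_map = kmeans_run(model)
    clean_train = [review_to_wordlist(r, remove_stopwords=True)
                   for r in train["review"]]
    clean_test = [review_to_wordlist(r, remove_stopwords=True)
                  for r in test["review"]]
    create_review_centroids_histogram(clean_train, clean_test,
                                      train, test, word_centroid_map)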
    
    
    








