# word2vec

import pandas as pd
import numpy as np
from gensim.models import Word2Vec
import gensim
import re

# Load the segmented job postings; the 'cixing' column is the text cleaned below.
data = pd.read_csv('liepin_fenci.csv')

# Characters removed from every row before the text is split into tokens.
drop_words_list = ['(',')','⒈','(',')','/','*','【','】','.','>','•','-','一',';',',','&',"'"]


def _clean_and_split(text):
    """Strip the unwanted characters from *text* and split it on single spaces.

    Runs of whitespace are collapsed to one space and both ends are trimmed
    before splitting, so an empty row yields ``['']``.
    """
    for unwanted in drop_words_list:
        text = text.replace(unwanted, '')
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip().split(' ')


# One list of tokens per row.
cixing_data = data['cixing'].map(_clean_and_split)
from gensim.models import Word2Vec
class TextLoader(object):
    """Re-iterable wrapper around a collection of sentences.

    Every call to ``iter()`` starts a fresh pass over the wrapped
    collection, so the same loader can be consumed more than once.
    """

    def __init__(self, j):
        """Store *j*, the iterable of sentences, as ``self.data``."""
        self.data = j

    def __iter__(self):
        """Return a new iterator over the wrapped sentences."""
        sentences = self.data
        return iter(sentences)

# Wrap the tokenised rows in a re-iterable loader; each sentence is a list of tokens.
line = TextLoader(cixing_data)

# min_count=1 keeps every token in the vocabulary, however rare it is.
model = gensim.models.Word2Vec(line, workers=8, min_count=1)

# Persist the trained model, then reload it from disk.
model.save('word2vector2.model')
model = Word2Vec.load('word2vector2.model')

# Similarity queries live on the KeyedVectors object (model.wv); calling
# model.similarity directly was deprecated and is gone in gensim 4.
# The label now names the two words that are actually compared.
print('"java" 和 "python" 的相似度:' + str(model.wv.similarity('java', 'python')))
#%%
#1
# Load the topic table; its 'cixing' column is split on ',' further down, so
# it presumably holds comma-separated words per row (confirm against the CSV).
topic_data = pd.read_csv('liepin_cixing.csv')
def cixing_wv(word):
    """Return the embedding of *word*, or a zero vector if it is unknown.

    Args:
        word: Token to look up in the module-level ``model``.

    Returns:
        A 1-D NumPy array of length ``model.wv.vector_size``: the trained
        vector when *word* is in the vocabulary, all zeros otherwise.
    """
    try:
        return model.wv[word]
    except KeyError:
        # Out-of-vocabulary token: contribute nothing to the summed vector.
        # The length follows the model rather than a hard-coded 100, so a
        # model trained with another vector size still produces matching shapes.
        return np.zeros(model.wv.vector_size)
# Sum the word vectors of each comma-separated 'cixing' entry into one vector per row.
# The vectors are collected in a list first: on Python 3 `map` returns a lazy
# iterator, which NumPy cannot reduce along axis=0.
topic_data['wv'] = topic_data['cixing'].map(
    lambda x: list(np.sum([cixing_wv(y) for y in x.split(',')], axis=0)))
#%%
#2
# Same aggregation for the tokenised postings: one summed vector per token list.
data['wv'] = cixing_data.map(
    lambda x: list(np.sum([cixing_wv(y) for y in x], axis=0)))
#%%
from sklearn.cluster import KMeans
# Number of clusters to fit.
n = 20

# Stack the per-row summed vectors into a 2-D array, one row per posting.
# tolist() replaces the earlier Series.map call that was used only for its
# append side effect.
X = np.array(data['wv'].tolist())

# random_state is fixed so repeated runs give the same clustering.
kmeans = KMeans(n_clusters=n, random_state=0).fit(X)

# Attach each row's cluster index to the frame.
data['label'] = kmeans.labels_


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值