import pandas as pd
import numpy as np
from gensim.models import Word2Vec
import gensim
import re
# Load the pre-segmented corpus; the 'cixing' column holds space-separated tokens.
data = pd.read_csv('liepin_fenci.csv')

# Characters removed from every row before tokenizing.
drop_words_list = ['(',')','⒈','(',')','/','*','【','】','.','>','•','-','一',';',',','&',"'"]

# Every entry above is a single character, so deleting them all in one
# translate() pass gives the same result as replacing them one after another.
_DROP_TABLE = str.maketrans('', '', ''.join(drop_words_list))
_WHITESPACE_RUN = re.compile(r'\s+')


def _clean_and_split(text):
    """Strip unwanted characters from *text* and split it into tokens.

    Steps, in order: delete every character listed in ``drop_words_list``,
    collapse each whitespace run to one space, trim both ends, then split on
    single spaces. An empty string therefore yields ``['']`` (not ``[]``).
    """
    without_symbols = text.translate(_DROP_TABLE)
    collapsed = _WHITESPACE_RUN.sub(' ', without_symbols)
    return collapsed.strip().split(' ')


# read_csv turns empty cells into float NaN, which has no str methods; treat
# such rows as empty text instead of failing with AttributeError.
cixing_data = data['cixing'].fillna('').map(_clean_and_split)
from gensim.models import Word2Vec
class TextLoader(object):
    """Thin iterable wrapper around a collection of sentences.

    Each call to ``iter()`` on an instance starts a fresh pass over the
    wrapped object, so a consumer can walk it more than once as long as the
    wrapped object itself can be iterated repeatedly.
    """

    def __init__(self, j):
        # Only a reference is stored; the wrapped object is not copied.
        self.data = j

    def __iter__(self):
        sentences = self.data
        return iter(sentences)
# Each sentence handed to Word2Vec is a list of tokens.
line = TextLoader(cixing_data)
# min_count=1 keeps every token, including ones that occur only once.
model = gensim.models.Word2Vec(line, workers=8, min_count=1)
# Persist the trained model, then continue with the copy reloaded from disk.
model.save('word2vector2.model')
model = Word2Vec.load('word2vector2.model')
# Similarity is queried through model.wv (as the vector lookups in this file
# already do); the method on the model object itself is gone in gensim 4.
# The label is built from the compared words so it cannot drift out of sync.
word_a, word_b = 'java', 'python'
print('"%s" 和 "%s" 的相似度:' % (word_a, word_b) + str(model.wv.similarity(word_a, word_b)))
#%%
#1
topic_data = pd.read_csv('liepin_cixing.csv')
def cixing_wv(word, keyed_vectors=None, size=None):
    """Return the embedding of *word*, or a zero vector when it is unknown.

    Args:
        word: Token to look up.
        keyed_vectors: Mapping-like object supporting ``keyed_vectors[word]``
            and raising ``KeyError`` for unknown words. Defaults to the
            module-level ``model.wv``.
        size: Length of the zero vector returned for unknown words. Defaults
            to ``keyed_vectors.vector_size`` when that attribute exists,
            otherwise 100 (the value previously hard-coded here).

    Returns:
        A 1-D numpy array.
    """
    if keyed_vectors is None:
        keyed_vectors = model.wv
    try:
        return keyed_vectors[word]
    except KeyError:
        # Only "word not in vocabulary" is expected; any other error is a
        # real problem and should surface instead of being turned into zeros.
        if size is None:
            size = getattr(keyed_vectors, 'vector_size', 100)
        return np.zeros(size)
def _sum_word_vectors(words):
    """Return the element-wise sum of the vectors of *words* as a plain list.

    The vectors are collected into a list before summing: on Python 3
    ``map`` returns a lazy iterator, which ``np.sum(..., axis=0)`` cannot
    reduce along an axis.
    """
    vectors = [cixing_wv(word) for word in words]
    return list(np.sum(vectors, axis=0))


# Topic rows store their words comma-separated in a single string.
topic_data['wv'] = topic_data['cixing'].map(lambda cell: _sum_word_vectors(cell.split(',')))
#%%
#2
# Corpus rows are already token lists at this point.
data['wv'] = cixing_data.map(_sum_word_vectors)
#%%
from sklearn.cluster import KMeans

# Number of clusters.
n = 20
# One matrix row per entry of data['wv']; built directly from the column
# rather than by appending inside a map() that is used only for side effects.
X = np.array(data['wv'].tolist())
# n_init is pinned to 10 because scikit-learn changed its default from 10 to
# 'auto' in later releases; pinning it keeps results stable across versions.
kmeans = KMeans(n_clusters=n, n_init=10, random_state=0).fit(X)
# labels_ holds one cluster id per row of X, in row order.
data['label'] = kmeans.labels_
# word2vec
# (Web-page footer captured with the code: "latest recommended article published 2022-12-15 10:14:51".)