# -*- coding: utf-8 -*-
from pandas import read_csv
import numpy as np
from sklearn.datasets.base import Bunch
import pickle  # pickle replaces cPickle here; used for persistence
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba
import operator  # used for sorting
from sklearn import metrics
from sklearn.externals import joblib
import xlwt


def importSmallContentdata(file, data, art, label, f, Slast, Snew):
    dataset = read_csv(file)
    Sdata = dataset.values[:, :]
    print(type(Sdata))
    if f == 1:
        for line in Sdata:
            ls = []
            ls.append(line[14])
            ls.append(line[15])
            ls.append(line[16])
            ls.append(line[17])
            Slast.append(ls)
            # print(len(Slast))
        # print("Reference small-category data is ready")
    '''Split the data: rows whose small-category column (smalli) is non-zero go into the training set'''
    for smalli in range(14, 18):
        # print(smalli)
        count = 0
        for line in Sdata:
            count = count + 1
            if line[smalli] != '0' and line[smalli] != 0:
                k = 1
                ls = []
                for i in line:
                    if k == 1:
                        # art.append(i)
                        k = k + 1
                        continue
                    if k == 11:  # k == 14 would not mean line[14], because line is indexed from 0
                        break
                    ls.append(float(i))
                    k = k + 1
                data.append(ls)
                label.append(line[smalli])
                if f == 1:
                    Snew.append(count)
    for line in Sdata:
        art.append(line[0])
    # print("Why do they all exceed the limit?", len(Snew))
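
# Usage sketch, not from the original source: the call below assumes a hypothetical CSV
# whose layout matches the indexing above (article text in column 0, numeric features in
# columns 1-9, the four small-category label columns at indices 14-17).
# data, art, label, Slast, Snew = [], [], [], [], []
# importSmallContentdata("small_data.csv", data, art, label, 1, Slast, Snew)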


def getKvector(train_set, vec, n):
    class obj:
        def __init__(self):
            self.key = 0
            self.weight = 0.0

    nonzero = train_set.tdm.nonzero()
    k = 0
    lis = []
    gather = []
    p = -1
    for i in nonzero[0]:
        p = p + 1
        if k == i:
            a = obj()
            a.key = nonzero[1][p]
            a.weight = train_set.tdm[i, nonzero[1][p]]
            lis.append(a)
        else:
            lis.sort(key=lambda obj: obj.weight, reverse=True)  # sort the objects in the list by weight
            gather.append(lis)
            while k < i:  # advance the row counter to the current sample index
                k = k + 1
            lis = []
            a = obj()
            a.key = nonzero[1][p]
            a.weight = train_set.tdm[i, nonzero[1][p]]
            lis.append(a)
    gather.append(lis)
    # gather holds one feature list per sample (its fact description), already sorted by weight;
    # each entry still carries both key and weight, but only the key is needed from here on
    sj = 1
    for i in gather:
        ls = []
        for j in i:
            sj = sj + 1
            ls.append(float(j.key))
        while sj <= n:
            sj = sj + 1
            ls.append(-1)
        sj = 1
        vec.append(ls)
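
# Usage sketch, not from the original source: getKvector assumes `train_set` is a Bunch
# whose `tdm` field is the sparse TF-IDF matrix produced by vector_space() below; `n`
# caps the length of each output vector, and short rows are padded with -1.
# vec = []
# getKvector(_readbunchobj("train_space.dat"), vec, 10)  # "train_space.dat" is a placeholder path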


'''Read the stop-word file'''
def _readfile(path):
    with open(path, "rb") as fp:
        content = fp.read()
    return content


'''Read a bunch object'''
def _readbunchobj(path):
    with open(path, "rb") as file_obj:
        bunch = pickle.load(file_obj)
    return bunch


'''Write a bunch object'''
def _writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)
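
# Round-trip sketch with a placeholder path: the two helpers above simply pickle and
# unpickle whole Bunch objects.
# _writebunchobj("demo_bunch.dat", Bunch(label=[], contents=[]))
# demo = _readbunchobj("demo_bunch.dat")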


def buildtrainbunch(bunch_path, art_train, trainlabel):
    bunch = Bunch(label=[], contents=[])
    for item1 in trainlabel:
        bunch.label.append(item1)
    # trainContentdatasave = []  # stores the segmented words of all training and test data
    for item2 in art_train:
        item2 = str(item2)
        item2 = item2.replace("\r\n", "")
        item2 = item2.replace(" ", "")
        content_seg = jieba.cut(item2)
        save2 = ''
        for item3 in content_seg:
            if len(item3) > 1 and item3 != '\r\n':
                # trainContentdatasave.append(item3)
                save2 = save2 + "," + item3
        bunch.contents.append(save2)
    with open(bunch_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("Finished building the training text bunch!")


def buildtestbunch(bunch_path, art_test, testlabel):
    bunch = Bunch(label=[], contents=[])
    for item1 in testlabel:
        bunch.label.append(item1)
    # testContentdatasave = []  # stores the segmented words of all training and test data
    for item2 in art_test:
        item2 = str(item2)
        item2 = item2.replace("\r\n", "")
        item2 = item2.replace(" ", "")
        content_seg = jieba.cut(item2)
        save2 = ''
        for item3 in content_seg:
            if len(item3) > 1 and item3 != '\r\n':
                # testContentdatasave.append(item3)
                save2 = save2 + "," + item3
        bunch.contents.append(save2)
    with open(bunch_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("Finished building the test text bunch!")
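
# Usage sketch, not from the original source: both build*bunch functions store, per text,
# a comma-joined string of the multi-character tokens jieba produced (e.g. ",词A,词B,词C").
# The path and the toy text/label below are placeholders.
# buildtestbunch("test_bunch.dat", ["这是一条测试文本"], ["1"])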


def vector_space(stopword_path, bunch_path, space_path):
    stpwrdlst = _readfile(stopword_path).splitlines()  # read the stop-word list
    bunch = _readbunchobj(bunch_path)  # load the segmented-text bunch object
    # Build the TF-IDF vector space object. The weight matrix tdm is two-dimensional:
    # tdm[i][j] is the TF-IDF value of word j (its index in the vocabulary) in document i.
    tfidfspace = Bunch(label=bunch.label, tdm=[], vocabulary={})
    # Initialise the vector space model with TfidfVectorizer
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5, min_df=0.0001, use_idf=True,
                                 max_features=15000)
    # print(vectorizer)
    # Convert the texts to a term-weight matrix and keep the vocabulary separately
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    tfidfspace.vocabulary = vectorizer.vocabulary_
    # Persist the bag-of-words space
    _writebunchobj(space_path, tfidfspace)
    print("TF-IDF vector space created successfully!")


def testvector_space(stopword_path, bunch_path, space_path, train_tfidf_path):
    stpwrdlst = _readfile(stopword_path).splitlines()  # turn the stop words into a list
    bunch = _readbunchobj(bunch_path)
    tfidfspace = Bunch(label=bunch.label, tdm=[], vocabulary={})
    # Load the training set's TF-IDF vector space so the test set reuses its vocabulary ★★
    trainbunch = _readbunchobj(train_tfidf_path)
    tfidfspace.vocabulary = trainbunch.vocabulary
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.7, vocabulary=trainbunch.vocabulary,
                                 min_df=0.001)
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    _writebunchobj(space_path, tfidfspace)
    print("TF-IDF vector space created successfully!")
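

# Hedged end-to-end driver sketch: the original section does not show how these functions
# are wired together, so the call order and every file path below ("small_data.csv",
# "stopwords.txt", "*.dat") are assumptions rather than the author's pipeline.
if __name__ == "__main__":
    data, art, label, Slast, Snew = [], [], [], [], []
    importSmallContentdata("small_data.csv", data, art, label, 1, Slast, Snew)

    # Train side: segment the texts, then fit the TF-IDF space.
    buildtrainbunch("train_bunch.dat", art, label)
    vector_space("stopwords.txt", "train_bunch.dat", "train_space.dat")

    # Test side: a real run would pass held-out texts/labels here; the training vocabulary
    # is reused so train and test vectors share the same feature indices.
    buildtestbunch("test_bunch.dat", art, label)
    testvector_space("stopwords.txt", "test_bunch.dat", "test_space.dat", "train_space.dat")

    # Keep only the top-n TF-IDF feature indices per sample (padded with -1).
    vec = []
    getKvector(_readbunchobj("train_space.dat"), vec, 10)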