# ---------------------------------------------------------------------------
# NOTE(review): the original used C-style `//` separator lines (a Python
# syntax error) and Python-2-only tuple-parameter lambdas; both fixed.
# Load the raw tab-separated records (single partition) and split each line.
text = sc.textFile("file:///home/mysql1/word_text_new1/part-00000", 1).map(lambda x: x.split("\t"))
# text.first()
# --------------------------------------------------------------------------- data initialisation
# Collect every distinct feature value. Column 2 holds a comma-separated
# list of searched apps, column 4 a comma-separated list of genre ids.
searchApp = text.map(lambda x: x[2]).map(lambda x: x.split(",")).flatMap(lambda x: x).distinct()
genreID = text.map(lambda x: x[4]).map(lambda x: x.split(",")).flatMap(lambda x: x).distinct()
# K = searchApp.count()  # total number of features
# Map each feature value to a stable integer index (feature -> column id).
mapings_App = searchApp.zipWithIndex().collectAsMap()
mapings_genre = genreID.zipWithIndex().collectAsMap()
# RDD transformations/actions may only be invoked from the driver, so the
# lookup tables are broadcast to the executors instead of capturing RDDs
# inside closures.
all_app_ters_bcast = sc.broadcast(mapings_App)
all_genre_ters_bcast = sc.broadcast(mapings_genre)
# <cName, genreID> pairs from MySQL; genreID cast to int.
cg = sc.parallelize(list(mysql.getWordPriority('select cName,genreID from _category'))).map(lambda p: (p[0], int(p[1])))  # .saveAsTextFile("file:///home/mysql1/_category")
genreMap = cg.collectAsMap()
# Invert genreMap: genreID -> cName.
genreMap_v_k = {value: key for key, value in genreMap.items()}
sw = sc.parallelize(list(mysql.getWordPriority('select word, searchApp from searchApp limit 1'))).map(lambda p: (p[0], p[1]))  # .saveAsTextFile("file:///home/mysql1/_category")
searchApp = sw.collectAsMap()
# Invert the feature-index maps: index -> feature value.
mapings_App_v_k = {value: key for key, value in mapings_App.items()}
mapings_genre_v_k = {value: key for key, value in mapings_genre.items()}
# ----------------------------| searchApp |
extract_App = text.map(lambda x: extract_searchApp(x[2].split(",")))
# extract_App.first()
def extract_searchApp(record):
    """Encode a list of searched-app names as a binary indicator vector.

    Each vector position corresponds to one app in the broadcast index map
    (all_app_ters_bcast); positions of apps present in *record* are set to
    1.0. The numpy vector is flattened to a plain Python list via print_List.
    """
    app_index = all_app_ters_bcast.value
    indicator = np.zeros(len(app_index))
    for app in record:
        indicator[app_index[app]] = 1.0
    return print_List(indicator)
# ----------------------------| genreID |  user_vectors = extract_genreID.map(lambda (id, vec): vec)
# Binary-encode the genre-id list (column 4) of every record.
extract_genreID = text.map(lambda x: extract_genre(x[4].split(",")))
# Example genre ids: [u'6006', u'6010', u'6005', u'6002', u'6017', u'6016', u'6018', u'6012', u'6024']
# extract_genreID.first()  / take(5)
def extract_genre(record):
    """Encode a list of genre ids as a binary indicator vector.

    Looks each genre id up in the broadcast genre index map
    (all_genre_ters_bcast) and sets the matching position to 1.0; the
    result is flattened to a plain Python list through print_List.
    """
    genre_index = all_genre_ters_bcast.value
    indicator = np.zeros(len(genre_index))
    for genre in record:
        indicator[genre_index[genre]] = 1.0
    return print_List(indicator)
# --------------------------------------------------------------------------
def print_List(list_nums):
    """Flatten *list_nums* (possibly containing nested lists) into a flat list.

    Non-list elements are appended as-is, so a numpy vector becomes a plain
    Python list of its scalars.

    BUG FIX: the original recursed into nested lists but discarded the
    recursive result, silently dropping all nested elements; the flattened
    sub-list is now extended into the accumulator.
    """
    list_tmp = []
    for each_item in list_nums:
        if isinstance(each_item, list):
            list_tmp.extend(print_List(each_item))
        else:
            list_tmp.append(each_item)
    return list_tmp
# --------------------------------------------------------------------------- clustering
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.clustering import KMeans

# Build (id, feature-tuple) records: numeric column 1, app indicator
# vector, numeric column 3, genre indicator vector, numeric column 5.
data = text.map(lambda x: (x[0], (float(x[1]), extract_searchApp(x[2].split(",")), float(x[3]), extract_genre(x[4].split(",")), float(x[5]))))
# model_data.first()
model_data = data.map(lambda x: Vectors.dense(x[1]))
# NOTE(review): rewritten from the Python-2-only `lambda (item, factor):`.
factors = data.map(lambda p: (p[0], Vectors.dense(p[1])))
factors.first()


def parseVector(line):
    """Parse an iterable of numeric strings into a numpy float array."""
    return np.array([float(x) for x in line])


# NOTE(review): `lines` is undefined in this script (leftover from an
# upstream example); this rebinding of `data` could never run and is
# disabled to avoid a NameError.
# data = lines.map(parseVector)
numClusters = 5
numIterations = 50  # maximum number of iterations (default 100)
numRuns = 3  # parallel runs from different start points; best model wins
initializationMode = "random"  # centre-initialisation method (defined but unused, as in original)
# BUG FIX: the original trained and predicted on an undefined name `vct`;
# the dense feature vectors live in `model_data`.
clusterModel = KMeans.train(model_data, numClusters, numIterations, numRuns)
predictions = clusterModel.predict(model_data)
print('对前十个样本的预测标签为:' + ",".join([str(i) for i in predictions.take(10)]))
# Convert a clustered binary vector back into human-readable category names.
def genre_binary_to_items(line):
    """Map a binary genre vector (RDD of floats) back to category names.

    BUG FIXES vs. the original:
      * `zipWithIndex` was referenced without calling it;
      * elements were compared with the string "1.0" although the vector
        holds floats, so the filter could never match;
      * the computed RDD was dropped and the function returned None.
    """
    genres = line.zipWithIndex().filter(lambda p: p[0] == 1.0).map(lambda p: genreMap_v_k[int(mapings_genre_v_k[p[1]])])
    return genres
def searchApp_binary_to_items(line):
    """Map a binary searched-app vector (RDD of floats) back to app names.

    BUG FIXES vs. the original:
      * `zipWithIndex` was referenced without calling it;
      * elements were compared with the string "1.0" (floats never match);
      * the computed RDD was dropped and the function returned None;
      * indices were looked up in the *genre* tables although this decodes
        an app vector -- presumably a copy-paste slip; now uses
        mapings_App_v_k. TODO(review): confirm against callers.
    """
    apps = line.zipWithIndex().filter(lambda p: p[0] == 1.0).map(lambda p: mapings_App_v_k[p[1]])
    return apps
# Sanity check: decode the first genre vector back to genre ids.
# NOTE(review): rewritten from Python-2-only tuple-parameter lambdas.
aa = sc.parallelize(extract_genreID.first()).zipWithIndex().filter(lambda p: p[0] == 1.0).map(lambda p: mapings_genre_v_k[p[1]])
# aa.first()
# --------------------------------------------------------------------------
# Sparse-matrix representation (original author judged the output
# unreadable; not used downstream).
# BUG FIX: the original passed an undefined name `mapings`; column 4 values
# are genre ids, so the genre index map is presumably the intended table --
# TODO(review) confirm.
extract = text.map(lambda x: (x[2], x[3], create_vector(x[4].split(","), mapings_genre)))
def create_vector(terms, term_dict):
    """Return a 1 x len(term_dict) sparse binary row vector (CSC format).

    Positions of terms found in *term_dict* are set to 1; unknown terms
    are silently ignored.

    IMPROVEMENT: the original assigned element-wise into a csc_matrix,
    which scipy flags as inefficient (SparseEfficiencyWarning); the vector
    is now assembled in LIL format and converted to CSC once, keeping the
    same return type.
    """
    from scipy import sparse as sp
    num_terms = len(term_dict)
    x = sp.lil_matrix((1, num_terms))
    for t in terms:
        if t in term_dict:
            x[0, term_dict[t]] = 1
    return x.tocsc()
# encoding=utf-8
# _*_ coding:utf-8 _*_
# Writer: byz
# dateTime: 2016-07-30
# NOTE(review): this is a second script pasted into the same file; it sets
# up its own SparkContext / HiveContext.
import sys
sys.path.append("/home/mysql1/anqu/python/code")
# Python-2-only hack: reload(sys) re-exposes setdefaultencoding so the
# interpreter's default string encoding can be forced to utf-8 below.
reload(sys)
import config
import MySQLdb
sys.setdefaultencoding('utf8')
import mysql_op
import numpy as np
import time
from chinese import chinese
from pyspark import SparkContext, SparkConf
SparkAppName = "Anqu数据处理"
# master = "192.168.40.128:7077"
conf = SparkConf().setAppName(SparkAppName).setMaster("local")
sc = SparkContext(conf=conf)
from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)
from pyspark.sql.functions import *
from pyspark.mllib.linalg import Vectors
class cluster_vector_data():
    """Builds dense binary genre vectors from the `ansearchapp` Hive table."""

    def __init__(self, database="mysql_anqu_chi"):
        # Reuse the module-level HiveContext and switch to the target database.
        self.sqlContext = sqlContext
        self.sqlContext.sql("use " + database)
        # self.all_genre_ters_bcast = sc.broadcast(self.mapings_genre)

    def extract_vector(self):
        """Return a list of dense binary genre vectors, one per table row."""
        genres = self.sqlContext.sql("select genre from ansearchapp ")
        genres_rdd = genres.map(list).collect()
        genres_list = []
        for g in range(len(genres_rdd)):  # was Python-2 xrange
            vct = self.extract_genre(genres_rdd[g][0])
            genres_list.append(Vectors.dense(vct))
        print(genres_list)  # was a Python-2 print statement
        return genres_list

    def extract_genre(self, record):
        """Binary-encode one genre list against the table-wide genre index.

        NOTE(review): the genre -> index map is rebuilt with a fresh SQL
        query on every call; hoisting it to __init__ would avoid repeated
        table scans, but it is left as-is to keep behaviour unchanged.
        """
        genress = self.sqlContext.sql("select genres from ansearchapp LATERAL VIEW OUTER explode(genre) s AS genres ")
        mapings_genre = genress.flatMap(lambda x: x).distinct().zipWithIndex().collectAsMap()
        binary_vec = np.zeros(len(mapings_genre))
        for g in record:
            binary_vec[mapings_genre[g]] = 1.0
        return self.print_List(binary_vec)

    def print_List(self, list_nums):
        """Flatten possibly-nested lists into one flat Python list.

        BUG FIX: the original recursed via the *global* print_List and
        discarded the recursive result, dropping all nested elements;
        recursion now goes through self and the sub-result is kept.
        """
        list_tmp = []
        for each_item in list_nums:
            if isinstance(each_item, list):
                list_tmp.extend(self.print_List(each_item))
            else:
                list_tmp.append(each_item)
        return list_tmp
def main():
    """Entry point: build the genre vectors, then shut the Spark context down."""
    extractor = cluster_vector_data()
    extractor.extract_vector()
    sc.stop()


if __name__ == '__main__':
    main()
# Source (translated from "转自"): http://blog.csdn.net/lovebyz/article/details/52185707