Question 1: RDD programming in Spark with the given movie dataset
1) Compute and display the total number of movies
import re
from pyspark import SparkContext

# Extract the two requested columns from one CSV line and return them as a pair;
# the header line is mapped to (-1, -1) so it can be accounted for later
def get_col(line, column_index1, column_index2):
    if 'userId' in line or 'movieId' in line:
        return -1, -1
    elif '"' in line:
        # quoted titles contain commas, so split on the quote boundaries instead
        parts = re.split(',"|",', line)
        return int(parts[column_index1]), parts[column_index2]
    parts = line.split(',')
    return int(parts[column_index1]), parts[column_index2]

if __name__ == '__main__':
    sc = SparkContext(appName="movies")
    movie_id_index_movie = 0
    movie_title_index = 1
    # load the file and create the RDD
    movie_lines = sc.textFile("../data/movies.csv")
    movie_rdd = movie_lines\
        .map(lambda x: get_col(x, movie_id_index_movie, movie_title_index))
    print("movies.csv: \n")
    print(movie_rdd.take(5))
    # subtract 1 to exclude the (-1, -1) pair produced by the header line
    movie_count = movie_rdd.count() - 1
    print("Total number of movies: %s" % movie_count)
    sc.stop()
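For comparison, the same count can be obtained through the DataFrame API. This is a minimal sketch under the assumption of the same file layout; spark.read.csv parses the header and the quoted titles itself, so no manual "count() - 1" correction is needed.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("movies-df").getOrCreate()
# header=True consumes the header row, so count() is already the number of movies
movies_df = spark.read.csv("../data/movies.csv", header=True, inferSchema=True)
print("Total number of movies: %s" % movies_df.count())
spark.stop()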
2) Compute and display the total number of user ratings
import re
from pyspark import SparkContext

# Same helper as in part 1): return the two requested columns as a pair,
# mapping the header line to (-1, -1)
def get_col(line, column_index1, column_index2):
    if 'userId' in line or 'movieId' in line:
        return -1, -1
    elif '"' in line:
        parts = re.split(',"|",', line)
        return int(parts[column_index1]), parts[column_index2]
    parts = line.split(',')
    return int(parts[column_index1]), parts[column_index2]

if __name__ == '__main__':
    sc = SparkContext(appName="ratings1")
    movie_id_index = 1      # movieId column in ratings.csv
    movie_rating_index = 2  # rating column in ratings.csv
    ratings_lines = sc.textFile("../data/ratings.csv")
    ratings_rdd = ratings_lines\
        .map(lambda x: get_col(x, movie_id_index, movie_rating_index))
    print("ratings.csv: \n")
    print(ratings_rdd.take(5))
    # subtract 1 to exclude the header line
    ratings_count = ratings_rdd.count() - 1
    print("Total number of ratings: %s" % ratings_count)
    sc.stop()
3) Compute and display the total number of movie genres
from pyspark import SparkContext

# Return (movieId, genres) for one line of movies.csv; the header maps to (-1, -1).
# The genres string is always the last field, even when the title contains commas.
def get_column(line, column_index1, column_index2):
    if 'userId' in line or 'movieId' in line:
        return -1, -1
    parts = line.split(',')
    return int(parts[column_index1]), parts[len(parts)-1]

if __name__ == "__main__":
    sc = SparkContext(appName="movies")
    movie_id_index_movie = 0
    movie_genres_index = 2
    movie_lines = sc.textFile("../data/movies.csv")
    movie_rdd = movie_lines\
        .map(lambda x: get_column(x, movie_id_index_movie, movie_genres_index))
    # split each movie's genre string on "|" and deduplicate across the whole RDD;
    # a driver-side global updated inside foreach() would not be visible on the executors
    genres_rdd = movie_rdd.filter(lambda x: -1 not in x)\
        .flatMap(lambda x: x[1].split("|"))\
        .distinct()
    genres_list = genres_rdd.collect()
    print("Total number of genres: %s" % len(genres_list))
    print(genres_list)
    sc.stop()
4) Compute and display the number of movies whose average rating is below 3
import re
from pyspark import SparkContext

# Return the two requested columns as a pair; the header line maps to (-1, -1)
def get_col(line, column_index1, column_index2):
    if 'userId' in line or 'movieId' in line:
        return -1, -1
    elif '"' in line:
        parts = re.split(',"|",', line)
        return int(parts[column_index1]), parts[column_index2]
    parts = line.split(',')
    return int(parts[column_index1]), parts[column_index2]

# Compute the average rating of one (movieId, <iterable of ratings>) group
def avg_rating(line):
    rating_list = list(line[1])
    temp = 0
    for item in rating_list:
        temp += float(item)
    temp = temp/len(rating_list)
    return int(line[0]), temp

if __name__ == '__main__':
    movie_id_index = 1      # movieId column in ratings.csv
    movie_rating_index = 2  # rating column in ratings.csv
    filter_rating = 3
    sc = SparkContext(appName="ratings")
    ratings_line = sc.textFile("../data/ratings.csv")
    ratings_rdd = ratings_line\
        .map(lambda x: get_col(x, movie_id_index, movie_rating_index))\
        .filter(lambda x: -1 not in x)   # drop the header pair before grouping
    rdd1 = ratings_rdd.groupByKey()
    rdd2 = rdd1.map(lambda x: avg_rating(x))
    print(rdd2.top(5))
    rdd3 = rdd2.filter(lambda x: x[1] < filter_rating)
    print("Number of movies with an average rating below 3:", rdd3.count())
    sc.stop()
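groupByKey ships every individual rating across the network before the average is computed. A lighter-weight alternative folds the ratings into (sum, count) accumulators with aggregateByKey. This is only a sketch, reusing get_col and the variable names from the script above, and it would have to be placed before the sc.stop() call:

# Alternative sketch: per-movie (sum, count) accumulators instead of grouping all ratings
pair_rdd = ratings_line\
    .map(lambda x: get_col(x, movie_id_index, movie_rating_index))\
    .filter(lambda x: -1 not in x)\
    .mapValues(float)
sum_count = pair_rdd.aggregateByKey(
    (0.0, 0),                                  # zero value: (running sum, running count)
    lambda acc, v: (acc[0] + v, acc[1] + 1),   # fold one rating into a partition-local accumulator
    lambda a, b: (a[0] + b[0], a[1] + b[1]))   # merge accumulators from different partitions
avg_rdd = sum_count.mapValues(lambda s: s[0] / s[1])
print("Number of movies with an average rating below 3:",
      avg_rdd.filter(lambda x: x[1] < filter_rating).count())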
5) Compute and display the average rating given by each user
import re
from pyspark import SparkContext

# Return the two requested columns as a pair; the header line maps to (-1, -1)
def get_col(line, column_index1, column_index2):
    if 'userId' in line or 'movieId' in line:
        return -1, -1
    elif '"' in line:
        parts = re.split(',"|",', line)
        return int(parts[column_index1]), parts[column_index2]
    parts = line.split(',')
    return int(parts[column_index1]), parts[column_index2]

# Compute the average rating of one (userId, <iterable of ratings>) group
def avg_rating(line):
    rating_list = list(line[1])
    temp = 0
    for item in rating_list:
        temp += float(item)
    temp = temp/len(rating_list)
    return int(line[0]), temp

if __name__ == '__main__':
    user_id_index = 0       # userId column in ratings.csv
    movie_rating_index = 2  # rating column in ratings.csv
    sc = SparkContext(appName="ratings")
    ratings_line = sc.textFile("../data/ratings.csv")
    ratings_rdd = ratings_line\
        .map(lambda x: get_col(x, user_id_index, movie_rating_index))\
        .filter(lambda x: -1 not in x)   # drop the header pair before grouping
    rdd1 = ratings_rdd.groupByKey()
    rdd2 = rdd1.map(lambda x: avg_rating(x)).sortByKey()
    print(rdd2.take(5))
    sc.stop()
6) Compute and display the number of ratings each movie has received
import re
from pyspark import SparkContext

# Return the two requested columns as a pair; the header line maps to (-1, -1)
def get_col(line, column_index1, column_index2):
    if 'userId' in line or 'movieId' in line:
        return -1, -1
    elif '"' in line:
        parts = re.split(',"|",', line)
        return int(parts[column_index1]), parts[column_index2]
    parts = line.split(',')
    return int(parts[column_index1]), parts[column_index2]

# Count the ratings in one (movieId, <iterable of ratings>) group
def rating_count(line):
    rating_list = list(line[1])
    num = len(rating_list)
    return int(line[0]), num

if __name__ == '__main__':
    movie_id_index_ratings = 1  # movieId column in ratings.csv
    rating_index = 2            # rating column in ratings.csv
    movie_id_index_movies = 0   # movieId column in movies.csv
    movie_title_index = 1       # title column in movies.csv
    sc = SparkContext(appName="ratings")
    ratings_line = sc.textFile("../data/ratings.csv")
    ra_num_rdd = ratings_line\
        .map(lambda x: get_col(x, movie_id_index_ratings, rating_index))\
        .filter(lambda x: -1 not in x)
    movie_title_line = sc.textFile("../data/movies.csv")
    movie_title_rdd = movie_title_line\
        .map(lambda x: get_col(x, movie_id_index_movies, movie_title_index))\
        .filter(lambda x: -1 not in x)
    rdd1 = ra_num_rdd.groupByKey()
    rdd2 = rdd1.map(lambda x: rating_count(x))
    # join (movieId, count) with (movieId, title)
    rdd3 = rdd2.join(movie_title_rdd)
    print(rdd3.take(5))
    sc.stop()
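Because only the number of ratings is needed, the grouping step can also be replaced by a straight count per key. Again just a sketch, reusing ra_num_rdd and movie_title_rdd from the script above and placed before sc.stop():

from operator import add

# map every rating to 1 and sum per movieId, then attach the title
count_rdd = ra_num_rdd.map(lambda x: (x[0], 1)).reduceByKey(add)
print(count_rdd.join(movie_title_rdd).take(5))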
Question 2: Based on a given text file containing a number of English words
1) Write a Python program that randomly produces words from this text
import re
import numpy as np

def get_random_word():
    # punctuation and symbols to strip from the chosen line
    remove_chars = '[·’!"\#$%&\'()#!()*+,-./:;<=>?\@,:?¥★、….>【】[]《》?“”‘’\[\\]^_`{|}~]+'
    with open("D:/学习资料/spark/期末课程设计/words.txt", 'r', encoding='utf-8') as file:
        words = file.readlines()
    # pick a random line, clean it, and split it into words
    random_row = words[np.random.randint(0, len(words))]
    random_row = re.sub(remove_chars, "", random_row)
    random_words = random_row.split(' ')
    if '' in random_words:
        random_words.remove('')
    if '\n' in random_words:
        random_words.remove('\n')
    # pick a random word from that line
    random_index = np.random.randint(0, len(random_words))
    word = random_words[random_index]
    return word
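A quick way to sanity-check the generator (assuming words.txt exists at the path used above):

# each call picks a random line and then a random word from that line
for _ in range(5):
    print(get_random_word())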
2) Write a Spark Streaming program to:
a) count how many times each word appears in every (user-defined) time interval
b) count the total number of words appearing in every time interval
c) find the hot words in every time interval
Streaming data is simulated here by monitoring a directory for new files. Building on part 1), files are written into that directory continuously:
import time
import numpy as np
# get_random_word() is the function from part 1)

for i in range(20):
    # wait a random interval of up to one second between files
    time.sleep(np.random.uniform(low=0, high=1))
    ww = get_random_word()
    with open("D:/学习资料/spark/期末课程设计/new_words/log" + str(i+1) + ".txt", "w", encoding='utf-8') as fp:
        fp.write(ww)
    print(ww)
The Spark Streaming program:
from operator import add
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

conf = SparkConf()
conf.setAppName('TestDStream')
conf.setMaster('local[2]')
sc = SparkContext(conf=conf)
# create the streaming context with a 5-second batch interval
ssc = StreamingContext(sc, 5)
lines = ssc.textFileStream('D:/学习资料/spark/期末课程设计/new_words')
# take the words arriving in each interval and count the occurrences of every word
words = lines.flatMap(lambda line: line.split(' '))\
    .map(lambda x: (x, 1))\
    .reduceByKey(add)
# total number of words in the interval
total = words.map(lambda x: x[1]).reduce(add)
# hot words: here just a simple threshold on the per-word count
hot_words = words.filter(lambda x: x[1] >= 2)
# print the results of each batch
words.pprint()
total.pprint()
hot_words.pprint()
ssc.start()
ssc.awaitTermination()
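The hot-word rule above is only a fixed threshold (count >= 2). A slightly more informative variant, sketched below, ranks the words of each batch and keeps the three most frequent ones; top_words is a name introduced here rather than part of the original program, and like all DStream definitions it has to be declared before ssc.start() is called.

# rank words within each batch by count and keep the three most frequent ones
top_words = words.transform(
    lambda rdd: rdd.sortBy(lambda x: x[1], ascending=False)
                   .zipWithIndex()
                   .filter(lambda x: x[1] < 3)
                   .map(lambda x: x[0]))
top_words.pprint()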
Question 3: Use TF-IDF and a random forest in Spark ML to classify documents
The classification dataset chosen is the San Francisco crime records, which can be downloaded from the official Kaggle site.
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier
sc = SparkContext()
sqlContext = SQLContext(sc)
data = sqlContext.read.format('com.databricks.spark.csv')\
.options(header='true', inferschema='true')\
.load('D:/学习资料/spark/期末课程设计/data/train/train.csv')
drop_list = ['Dates', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']
data = data.select([column for column in data.columns if column not in drop_list])
data.show(5)
data.printSchema()
# tokenize the Descript text into words
regexTokenizer = RegexTokenizer(inputCol="Descript", outputCol="words", pattern="\\W")
# remove stop words
add_stopwords = ["http", "https", "amp", "rt", "t", "c", "the", "a", "an"]
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)
# convert the Category strings into numeric labels
label_stringIdx = StringIndexer(inputCol="Category", outputCol="label")
# TF-IDF
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=500)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)
# assemble the pipeline
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
dataset.show(5)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=999)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))
# build and train the model
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100, maxDepth=8, maxBins=32)
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testData)
predictions.filter(predictions["prediction"] == 0)\
.select("Descript", "Category", "probability", "label", "prediction")\
.orderBy("probability", ascending=False)\
.show(n=10, truncate=30)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
score = evaluator.evaluate(predictions)
print(score)
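MulticlassClassificationEvaluator defaults to the f1 metric, so the number printed above is an F1 score. A small follow-up sketch, reusing the predictions DataFrame from above, reports accuracy explicitly:

# same evaluator class, with the metric named explicitly
accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
print("accuracy = %s" % accuracy_evaluator.evaluate(predictions))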
As the saying goes, one generation plants the trees and the next enjoys the shade; I am merely a computer-science slacker who has enjoyed half of that shade.
Technical support: @宇宙中心张店
Reference:
【干货】Python大数据处理库PySpark实战——使用PySpark处理文本多分类问题 - 腾讯云开发者社区-腾讯云 (tencent.com)