PySpark course project (RDD programming on a movie dataset, Spark Streaming, TF-IDF and random forest classification)


Contents

Task 1: RDD programming in Spark on the given movie dataset

1) Compute and display the total number of movies

2) Compute and display the total number of user ratings

3) Compute and display the total number of movie genres

4) Compute and display the number of movies with an average rating below 3

5) Compute and display each user's average rating

6) Compute and display the number of ratings for each movie

Task 2: Based on a given text file containing English words

1) Write a Python program that randomly generates words from the text

2) Write a Spark Streaming program

Task 3: Use TF-IDF and a random forest in Spark ML to classify documents


Task 1: RDD programming in Spark on the given movie dataset

1) Compute and display the total number of movies

import re
from pyspark import SparkContext

# Extract the two requested columns from a CSV line and return a (key, value) pair
def get_col(line, column_index1, column_index2):
    if 'userId' in line or 'movieId' in line:
        # Header line: return a sentinel pair
        return -1, -1
    elif '"' in line:
        # Quoted title containing commas, e.g. 1,"Movie, The (1995)",Comedy
        parts = re.split(',"|",', line)
        return int(parts[column_index1]), parts[column_index2]

    parts = line.split(',')
    return int(parts[column_index1]), parts[column_index2]


if __name__ == '__main__':

    sc = SparkContext(appName="movies")
    movie_id_index_movie = 0
    movie_title_index = 1
    # Load the file and create an RDD
    movie_lines = sc.textFile("../data/movies.csv")
    movie_rdd = movie_lines\
        .map(lambda x: get_col(x, movie_id_index_movie, movie_title_index))

    print("movies.csv: \n")
    print(movie_rdd.take(5))

    # Subtract 1 to exclude the header line, which was mapped to (-1, -1)
    movie_count = movie_rdd.count() - 1
    print("Total number of movies: %s" % movie_count)

    sc.stop()

2) Compute and display the total number of user ratings

import re
from pyspark import SparkContext


def get_col(line, column_index1, column_index2):
    if 'userId' in line or 'movieId' in line:
        return -1, -1
    elif '"' in line:
        parts = re.split(',"|",', line)
        return int(parts[column_index1]), parts[column_index2]

    parts = line.split(',')
    return int(parts[column_index1]), parts[column_index2]


if __name__ == '__main__':

    sc = SparkContext(appName="ratings1")
    movie_id_index = 1  # movieId column in ratings.csv
    rating_index = 2    # rating column in ratings.csv
    ratings_lines = sc.textFile("../data/ratings.csv")
    ratings_rdd = ratings_lines\
        .map(lambda x: get_col(x, movie_id_index, rating_index))

    print("ratings.csv: \n")
    print(ratings_rdd.take(5))
    

    # Subtract 1 to exclude the header line, which was mapped to (-1, -1)
    ratings_count = ratings_rdd.count() - 1
    print("Total number of ratings: %s" % ratings_count)
    

    sc.stop()

3) Compute and display the total number of movie genres

from pyspark import SparkContext


def get_column(line, column_index1, column_index2):
    if 'userId' in line or 'movieId' in line:
        # Header line: return a sentinel pair
        return -1, -1
    parts = line.split(',')

    # The genres field is always the last column, even when the quoted title contains commas
    return int(parts[column_index1]), parts[len(parts)-1]


def genres_count(line):
    if -1 in line:  # skip the header pair (-1, -1)
        return

    # Note: mutating a global inside foreach only behaves as expected in local mode,
    # because this function runs in the executor's Python worker process
    global genres_list

    genres = line[1]
    print(genres.split("|"))
    genres_list = genres_list + genres.split("|")
    # De-duplicate
    genres_list = list(set(genres_list))

    print("Total number of genres so far: %s" % len(genres_list))
    print(genres_list)
    print("\n")


if __name__ == "__main__":
    genres_list = []

    # movies
    sc = SparkContext(appName="movies")

    movie_id_index_movie = 0
    movie_genres_index = 2
    movie_lines = sc.textFile("../data/movies.csv")
    movie_rdd = movie_lines\
        .map(lambda x: get_column(x, movie_id_index_movie, movie_genres_index))

    movie_rdd.foreach(genres_count)

    sc.stop()
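Because the total above is only printed from inside foreach, a more robust option is to aggregate on the RDD itself and count on the driver. A minimal sketch, reusing movie_rdd from the block above and running before sc.stop():

# Sketch: split the genres field, flatten, de-duplicate, then count on the driver
genre_rdd = movie_rdd \
    .filter(lambda x: x[0] != -1) \
    .flatMap(lambda x: x[1].split("|")) \
    .distinct()
print("Total number of genres: %s" % genre_rdd.count())
print(genre_rdd.collect())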

4) Compute and display the number of movies with an average rating below 3

import re
from pyspark import SparkContext


def get_col(line, column_index1, column_index2):
    if 'userId' in line or 'movieId' in line:
        return -1, -1
    elif '"' in line:
        parts = re.split(',"|",', line)
        return int(parts[column_index1]), parts[column_index2]

    parts = line.split(',')
    return int(parts[column_index1]), parts[column_index2]


def avg_rating(line):
    # line is (movieId, iterable of rating strings) produced by groupByKey
    rating_list = list(line[1])
    temp = 0
    for item in rating_list:
        temp += float(item)
    temp = temp / len(rating_list)
    return int(line[0]), temp


if __name__ == '__main__':

    movie_id_index = 1
    movie_rating_index = 2
    filter_rating = 3

    sc = SparkContext(appName="ratings")
    ratings_line = sc.textFile("../data/ratings.csv")
    ratings_rdd = ratings_line\
        .map(lambda x: get_col(x, movie_id_index, movie_rating_index))\
        .filter(lambda x: x[0] != -1)  # drop the header pair (-1, -1) so no None values reach later steps
    rdd1 = ratings_rdd.groupByKey()
    rdd2 = rdd1.map(lambda x: avg_rating(x))
    print(rdd2.top(5))
    rdd3 = rdd2.filter(lambda x: x[1] < filter_rating)
    print("Number of movies with an average rating below 3:", rdd3.count())
    sc.stop()
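groupByKey ships every individual rating across the shuffle before averaging. A minimal sketch of an alternative, reusing ratings_rdd and filter_rating from the block above (before sc.stop()), that combines partial (sum, count) pairs with reduceByKey:

# Sketch: per-movie average from (sum, count) pairs instead of grouped rating lists
sum_count = ratings_rdd \
    .mapValues(lambda r: (float(r), 1)) \
    .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
avg_by_movie = sum_count.mapValues(lambda p: p[0] / p[1])
print("Number of movies with an average rating below 3:",
      avg_by_movie.filter(lambda x: x[1] < filter_rating).count())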

5) Compute and display each user's average rating

import re
from pyspark import SparkContext


def get_col(line, column_index1, column_index2):
    if 'userId' in line or 'movieId' in line:
        return -1, -1
    elif '"' in line:
        parts = re.split(',"|",', line)
        return int(parts[column_index1]), parts[column_index2]

    parts = line.split(',')
    return int(parts[column_index1]), parts[column_index2]


def avg_rating(line):
    # line is (userId, iterable of rating strings) produced by groupByKey
    rating_list = list(line[1])
    temp = 0
    for item in rating_list:
        temp += float(item)
    temp = temp / len(rating_list)
    return int(line[0]), temp


if __name__ == '__main__':

    user_id_index = 0
    movie_rating_index = 2

    sc = SparkContext(appName="ratings")
    ratings_line = sc.textFile("../data/ratings.csv")
    ratings_rdd = ratings_line\
        .map(lambda x: get_col(x, user_id_index, movie_rating_index))\
        .filter(lambda x: x[0] != -1)  # drop the header pair (-1, -1)
    rdd1 = ratings_rdd.groupByKey()
    rdd2 = rdd1.map(lambda x: avg_rating(x)).sortByKey()
    print(rdd2.take(5))

    sc.stop()

6) Compute and display the number of ratings for each movie

import re
from pyspark import SparkContext


def get_col(line, column_index1, column_index2):
    if 'userId' in line or 'movieId' in line:
        return -1, -1
    elif '"' in line:
        parts = re.split(',"|",', line)
        return int(parts[column_index1]), parts[column_index2]

    parts = line.split(',')
    return int(parts[column_index1]), parts[column_index2]


def rating_count(line):
    # line is (movieId, iterable of ratings) produced by groupByKey
    rating_list = list(line[1])
    num = len(rating_list)
    return int(line[0]), num


if __name__ == '__main__':

    movie_id_index_ratings = 1  # movieId column in ratings.csv
    rating_index = 2            # rating column in ratings.csv
    movie_id_index_movies = 0   # movieId column in movies.csv
    movie_title_index = 1       # title column in movies.csv

    sc = SparkContext(appName="ratings")
    ratings_line = sc.textFile("../data/ratings.csv")
    ra_num_rdd = ratings_line\
        .map(lambda x: get_col(x, movie_id_index_ratings, rating_index))\
        .filter(lambda x: x[0] != -1)  # drop the header pair (-1, -1)

    movie_title_line = sc.textFile("../data/movies.csv")
    movie_title_rdd = movie_title_line\
        .map(lambda x: get_col(x, movie_id_index_movies, movie_title_index))\
        .filter(lambda x: x[0] != -1)  # drop the header pair (-1, -1)

    rdd1 = ra_num_rdd.groupByKey()
    rdd2 = rdd1.map(lambda x: rating_count(x))
    # Join on movieId so each count is paired with the movie title
    rdd3 = rdd2.join(movie_title_rdd)
    print(rdd3.take(5))
    sc.stop()
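For a plain count, the groupByKey step can also be skipped. A minimal sketch, reusing ra_num_rdd and movie_title_rdd from the block above (before sc.stop()):

from operator import add

# Sketch: count ratings per movie by summing 1s, then attach the titles
counts = ra_num_rdd.map(lambda x: (x[0], 1)).reduceByKey(add)
print(counts.join(movie_title_rdd).take(5))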

Task 2: Based on a given text file containing English words

1) Write a Python program that randomly generates words from the text

import re

import numpy as np


def get_random_word():
    # Punctuation and symbols to strip out; the square brackets are escaped so the character class stays intact
    remove_chars = r'[·’!"#$%&\'()#!()*+,\-./:;<=>?@,:?¥★、….>【】\[\]《》?“”‘’\\^_`{|}~]+'

    with open("D:/学习资料/spark/期末课程设计/words.txt", 'r', encoding='utf-8') as file:
        words = file.readlines()
    # Pick a random line, strip punctuation, then pick a random word from that line
    random_row = words[np.random.randint(1, len(words))]
    random_row = re.sub(remove_chars, "", random_row)
    random_words = random_row.split(' ')
    if '' in random_words:
        random_words.remove('')
    if '\n' in random_words:
        random_words.remove('\n')
    random_index = np.random.randint(1, len(random_words))
    word = random_words[random_index]

    return word

2) Write a Spark Streaming program

        a) Count how many times each word appears in every (user-defined) time interval

        b) Count the total number of word occurrences in every time interval

        c) Find the hot words in every time interval

File monitoring is used here to simulate the data stream. Building on part 1, files are continuously written into a watched directory:

import time
import numpy as np

# get_random_word() is the function defined in part 1 above
for i in range(20):
    time.sleep(np.random.uniform(low=0, high=1))
    # time.sleep(1)
    ww = get_random_word()
    with open("D:/学习资料/spark/期末课程设计/new_words/log" + str(i+1) + ".txt", "w", encoding='utf-8') as fp:
        fp.write(ww)
    print(ww)

The Spark Streaming program:

from operator import add
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext

conf = SparkConf()
conf.setAppName('TestDStream')
conf.setMaster('local[2]')
sc = SparkContext(conf=conf)

# Create the StreamingContext with a 5-second batch interval
ssc = StreamingContext(sc, 5)
lines = ssc.textFileStream('D:/学习资料/spark/期末课程设计/new_words')

# Split each batch into words and count the occurrences of each word
words = lines.flatMap(lambda line: line.split(' '))\
    .map(lambda x: (x, 1))\
    .reduceByKey(add)

# Count the total number of word occurrences in the batch
total = words.map(lambda x: x[1]).reduce(add)

# Hot words: here simply the words that appear at least twice in a batch
hot_words = words.filter(lambda x: x[1] >= 2)

# Print the results
words.pprint()
total.pprint()
hot_words.pprint()
ssc.start()
ssc.awaitTermination()
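The counts above cover one 5-second batch at a time. For a longer "time interval", a windowed variant can be used. A minimal sketch, assuming a 30-second window sliding every 10 seconds and reusing lines and add from the block above (these transformations must be declared before ssc.start()):

# Sketch: word counts over a sliding window instead of a single batch;
# with no inverse function the window is recomputed on each slide, so no checkpoint is required
windowed_counts = lines.flatMap(lambda line: line.split(' '))\
    .map(lambda x: (x, 1))\
    .reduceByKeyAndWindow(add, None, 30, 10)

# Sort each window by count so the hottest words come first
top_words = windowed_counts.transform(
    lambda rdd: rdd.sortBy(lambda x: x[1], ascending=False))

windowed_counts.pprint()
top_words.pprint()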

Task 3: Use TF-IDF and a random forest in Spark ML to classify documents

The chosen classification dataset is the San Francisco crime records, which can be downloaded from the official Kaggle site.

from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier

sc = SparkContext()
sqlContext = SQLContext(sc)
data = sqlContext.read.format('com.databricks.spark.csv')\
    .options(header='true', inferschema='true')\
    .load('D:/学习资料/spark/期末课程设计/data/train/train.csv')

drop_list = ['Dates', 'DayOfWeek', 'PdDistrict', 'Resolution', 'Address', 'X', 'Y']
data = data.select([column for column in data.columns if column not in drop_list])
data.show(5)
data.printSchema()

# Tokenize the Descript column into words
regexTokenizer = RegexTokenizer(inputCol="Descript", outputCol="words", pattern="\\W")

# Remove stop words using a small custom list
add_stopwords = ["http", "https", "amp", "rt", "t", "c", "the", "a", "an"]
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

# Convert the Category string label into a numeric label index
label_stringIdx = StringIndexer(inputCol="Category", outputCol="label")

# TF-IDF
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=500)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)

# Assemble the feature pipeline
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
dataset.show(5)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=999)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

# Build and train the random forest model
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100, maxDepth=8, maxBins=32)
rfModel = rf.fit(trainingData)
predictions = rfModel.transform(testData)
predictions.filter(predictions["prediction"] == 0)\
    .select("Descript", "Category", "probability", "label", "prediction")\
    .orderBy("probability", ascending=False)\
    .show(n=10, truncate=30)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
score = evaluator.evaluate(predictions)
print(score)
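The evaluator reports F1 by default. To read the predictions more easily, the predicted label indices can be mapped back to category names; a minimal sketch, using IndexToString with the labels of the fitted StringIndexer (the last stage of pipelineFit), where predictedCategory is an arbitrary column name:

from pyspark.ml.feature import IndexToString

# The StringIndexer is the last pipeline stage, so its labels give the
# index-to-category mapping learned from the data
label_model = pipelineFit.stages[-1]
converter = IndexToString(inputCol="prediction", outputCol="predictedCategory",
                          labels=label_model.labels)
converter.transform(predictions)\
    .select("Descript", "Category", "predictedCategory")\
    .show(5, truncate=30)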

As the saying goes, those who came before planted the trees and those who come after enjoy the shade; I am just a computer-science slacker who has only caught part of that shade.

Technical support: @宇宙中心张店

Reference:

【干货】Python大数据处理库PySpark实战——使用PySpark处理文本多分类问题 - 腾讯云开发者社区-腾讯云 (tencent.com)
