# Spark education project — practice code
import os
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType
# Interpreter used by both the Spark driver and the executors.
PYSPARK_PYTHON = "/root/anaconda3/envs/pyspark_env/bin/python"

# Point Spark at the local installation and pin the Python interpreter
# before any SparkSession is created.
os.environ.update({
    "SPARK_HOME": "/export/server/spark-2.3.0-bin-hadoop2.7",
    "PYSPARK_PYTHON": PYSPARK_PYTHON,
    "PYSPARK_DRIVER_PYTHON": PYSPARK_PYTHON,
})
if __name__ == '__main__':
    # Entry point: build (or reuse) a SparkSession for all queries below.
    spark = SparkSession.builder \
        .appName('test') \
        .getOrCreate()
    # NOTE(review): `sc` is never used below — presumably kept for
    # interactive experiments; confirm before removing.
    sc = spark.sparkContext

    # Load the tab-separated exercise dataset from the local filesystem;
    # the header row supplies column names and inferSchema derives types.
    jdbcDF = spark.read.format("csv")\
        .option("header", True)\
        .option("sep", "\t")\
        .option("inferSchema", "true")\
        .load("file:///tmp/pycharm_project_553/EduAnalysis/data/eduxxx.csv")
    jdbcDF.printSchema()
    jdbcDF.show(2)

    # The string below states the exercise requirement (in Chinese):
    # find the subjects of the Top-50 hottest questions, then count, per
    # subject, how many of those hot questions it contains.
    """
    要求: 找到Top50热点题对应的科目,然后统计这些科目中,分别包含这几道热点题的条目数
    热点题
    题号 热度(数量) 学科
    1 100 数学
    2 99 数学
    3 98 语文
    最终结果:
    学科 热点题数量
    数学 2
    语文 1
    """
    # Register the DataFrame as a temp view so the SQL variants in q1/q2
    # can query it as `t_answer`.
    allInfoDS = jdbcDF
    allInfoDS.createOrReplaceTempView("t_answer")
def q1():
    """Top-50 hottest questions: count how many each subject contains.

    Computes the answer twice — once with Spark SQL against the
    `t_answer` temp view, once with the DataFrame API — and prints both
    results. The two flavours are equivalent.
    """
    # SQL flavour: rank questions by answer frequency, keep the top 50,
    # then join back to t_answer to attribute each hot question to its
    # subject and count per subject.
    spark.sql("""
    select subject_id, count(t_answer.question_id) as hot_question_count
    from
    (select question_id, count(*) as frequency
    from t_answer
    group by question_id
    order by frequency desc limit 50) t1
    join t_answer on t1.question_id = t_answer.question_id
    group by subject_id
    order by hot_question_count desc
    """).show()

    # DataFrame flavour of the same query, with named intermediate steps.
    hot_questions = (
        allInfoDS
        .groupBy("question_id")
        .count()
        .orderBy(F.desc("count"))
        .limit(50)
    )
    per_subject = (
        hot_questions
        .join(allInfoDS, "question_id")
        .groupBy("subject_id")
        .count()
        .orderBy(F.desc("count"))
    )
    per_subject.show()
    """
    +----------+------------------+
    |subject_id|hot_question_count|
    +----------+------------------+
    | 科目ID_1_数学| 311|
    | 科目ID_2_语文| 276|
    | 科目ID_3_英语| 267|
    +----------+------------------+
    """
def q2():
    """For the Top-20 hottest questions, explode their recommendation
    lists and count, per subject, how many distinct recommended
    questions belong to that subject. Computed twice: Spark SQL first,
    then the DataFrame API.
    """
    # SQL flavour, innermost to outermost:
    #   t1 -> top 20 question_ids by answer frequency
    #   t2 -> their comma-separated `recommendations` strings
    #   t3 -> one row per recommended question_id (split + explode)
    #   t4 -> distinct (question_id, subject_id) pairs for those ids
    # Finally, count pairs per subject, most frequent first.
    spark.sql("""
    select t4.subject_id, count(*) as frequency
    from
    (select distinct(t3.question_id), t_answer.subject_id
    from
    (select explode(split(t2.recommendations, ',')) as question_id
    from
    (select recommendations
    from
    (select question_id, count(*) as frequency
    from t_answer
    group by question_id
    order by frequency
    desc limit 20) t1
    join t_answer
    on t1.question_id=t_answer.question_id) t2) t3
    join t_answer
    on t3.question_id=t_answer.question_id) t4
    group by t4.subject_id
    order by frequency desc
    """).show()
    """
    +----------+---------+
    |subject_id|frequency|
    +----------+---------+
    | 科目ID_3_英语| 262|
    | 科目ID_2_语文| 240|
    | 科目ID_1_数学| 239|
    +----------+---------+
    """
    # DataFrame flavour of the same pipeline.
    # Top 20 question_ids by answer count.
    top20DS = allInfoDS.groupBy("question_id").count().orderBy("count", ascending=False).limit(20)
    # Rows of t_answer for those hot questions (carries `recommendations`).
    recommendListDF = top20DS.join(allInfoDS, "question_id")
    # One row per recommended question id.
    questionIdDF = recommendListDF.select(F.explode(F.split("recommendations", ",")).alias("question_id"))
    # NOTE(review): the SQL takes distinct (question_id, subject_id)
    # pairs, while this takes distinct question_ids and joins against a
    # question-deduplicated t_answer — the two agree only if every
    # question belongs to exactly one subject. TODO confirm with data.
    questionIdAndSubjectIdDF = questionIdDF.distinct().\
        join(allInfoDS.dropDuplicates(["question_id"]), on="question_id").select("question_id", "subject_id")
    questionIdAndSubjectIdDF.groupBy("subject_id").count().orderBy("count", ascending=False).show()