PySpark select, filter, agg, and join

  1. Initialization
from __future__ import print_function, division
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StringType, MapType, StructType, StructField, DoubleType
  2. Reading data

def parse_log_schema(txt):
    """Parse one tab-separated log line into a dict; return None for malformed lines."""
    try:
        fields = txt.strip().split("\t")
        return {
            "userid": fields[0],
            "docid": fields[1],
        }
    except Exception:
        return None

# Register the parser as a UDF that returns map<string,string>
udf_parse_log = F.udf(parse_log_schema, MapType(StringType(), StringType()))

# 1. text
spark = SparkSession.builder.master("local[2]").appName("test").enableHiveSupport().getOrCreate()
df = spark.read.text("path") \
    .select(udf_parse_log(F.col("value")).alias("info")) \
    .filter(~F.isnull("info")) \
    .select(F.col("info")["docid"].alias("docid"),
            F.col("info")["userid"].alias("userid")) \
    .dropDuplicates(["userid", "docid"])
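If the log line really only carries these two tab-separated fields, the Python UDF can be avoided with the built-in split function, which keeps parsing inside the JVM. A minimal sketch under that assumption, reusing the hypothetical input path above:

# UDF-free variant: split the raw line once and pick fields out by index
parts = F.split(F.col("value"), "\t")
df_no_udf = (spark.read.text("path")
             .select(parts.getItem(0).alias("userid"),
                     parts.getItem(1).alias("docid"))
             .dropDuplicates(["userid", "docid"]))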
# 2. csv
df = spark.read.csv('../data/rating.csv', sep=',', header=True)  # add inferSchema=True to infer column types automatically
df.show()
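A quick sketch of the inferSchema option mentioned in the comment above (same hypothetical rating.csv path); printSchema confirms which types Spark inferred:

# Let Spark sample the file and infer column types instead of reading everything as strings
df_typed = spark.read.csv('../data/rating.csv', sep=',', header=True, inferSchema=True)
df_typed.printSchema()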

# csv with an explicit schema
data_schema = StructType([
    StructField("docid", StringType(), True),
    StructField("userid", StringType(), True),
    StructField("ctr", DoubleType(), True),
])
df = spark.read.csv("input_paths", schema=data_schema, sep="\t", header=False)
df.show()
# 3. write
df.coalesce(100).write.format("csv").mode("overwrite").option("delimiter", "\t").save("out_path")
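When the output will be read back by Spark anyway, Parquet is a common alternative to delimited text because the schema travels with the files. A sketch using a hypothetical out_path_parquet path:

# Columnar output keeps the schema with the data, so no schema or delimiter is needed on read
df.coalesce(100).write.mode("overwrite").parquet("out_path_parquet")
df_back = spark.read.parquet("out_path_parquet")
df_back.printSchema()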

  3. select
# SQL: select userid, docid from data
df.select('userid','docid').show() 

# select userid as uid from data
df.selectExpr('userid as uid').show()
# cast percent to double
df.selectExpr('cast(percent as double)').printSchema()  # cast converts the column type
# select docid, ctr * 2 as ctr from data
df.selectExpr('docid', 'ctr * 2 as ctr').show()
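The same projections can also be written with the Column API instead of SQL expression strings; a small sketch assuming the same userid/percent/ctr columns:

# Column-API equivalents of the selectExpr calls above
df.select(F.col('userid').alias('uid')).show()
df.select(F.col('percent').cast('double')).printSchema()
df.select('docid', (F.col('ctr') * 2).alias('ctr')).show()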


  4. sum, agg
# select userid, docid, sum(expose) as expose, sum(click) as click from data group by userid, docid
df.groupBy("userid", "docid") \
  .agg({"expose": "sum", "click": "sum"}) \
  .withColumnRenamed("sum(expose)", "expose") \
  .withColumnRenamed("sum(click)", "click")
  5. filter
# select docid, ctr from data where docid = "123" and ctr > 0.3
df.select("docid", "ctr").filter('docid == "123" and ctr > 0.3').show()
  6. join

profile = [{'name': 'Alice', 'age': 1}, {'name': 'Bryan', 'age': 3}, {'name': 'Cool', 'age': 2}]
df_profile = spark.createDataFrame(profile)  # convert to a DataFrame
df_profile.show()
# create the df_parents DataFrame
parents = [{'name': 'Jason', 'child': 'Alice'},
           {'name': 'Bill', 'child': 'Bryan'},
           {'name': 'Sera', 'child': 'Bryan'},
           {'name': 'Jill', 'child': 'Ken'}]
df_parents = spark.createDataFrame(parents)  # convert to a DataFrame
df_parents.show()
#inner join 
df_profile.join(df_parents, df_profile.name == df_parents.child).show()
#left join
df_profile.join(df_parents, df_profile.name == df_parents.child, 'left').show()
#right join
df_profile.join(df_parents, df_profile.name == df_parents.child, 'right').show()
#full outer join
df_profile.join(df_parents, df_profile.name == df_parents.child, 'outer').show()
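Because both DataFrames carry a name column, it helps to project and rename explicitly after the join; left_anti is also useful for left-side rows with no match. A small sketch over the same two DataFrames:

# Disambiguate the two "name" columns by selecting from each side explicitly
df_profile.join(df_parents, df_profile.name == df_parents.child) \
    .select(df_profile.name.alias("child_name"),
            df_profile.age,
            df_parents.name.alias("parent_name")).show()
# left_anti keeps only profiles with no matching parent record
df_profile.join(df_parents, df_profile.name == df_parents.child, 'left_anti').show()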
Glad to answer your question! Here are my answers:

(1) The 10 most-rated movies and their rating counts (title, count)

```python
ratings = spark.read.format("csv").option("header", "true").load("ratings.csv")
movie_counts = ratings.groupBy("movieId").count()
top_10_movies = movie_counts.join(movies, movie_counts.movieId == movies.movieId).select("title", "count").orderBy("count", ascending=False).limit(10)
```

(2) The 20 highest average-rated movies among male and among female users (gender, title, rating)

```python
from pyspark.sql.functions import avg
ratings = spark.read.format("csv").option("header", "true").load("ratings.csv")
ratings_gender = ratings.join(users, ratings.userId == users.userId).select("gender", "movieId", "rating")
top_20_male = ratings_gender.filter(ratings_gender.gender == "M").groupBy("movieId").agg(avg("rating").alias("avg_rating")).join(movies, "movieId").orderBy("avg_rating", ascending=False).limit(20)
top_20_female = ratings_gender.filter(ratings_gender.gender == "F").groupBy("movieId").agg(avg("rating").alias("avg_rating")).join(movies, "movieId").orderBy("avg_rating", ascending=False).limit(20)
```

(3) The average rating of movie 2116 by age group (age, rating)

```python
ratings = spark.read.format("csv").option("header", "true").load("ratings.csv")
movie_2116_ratings = ratings.filter(ratings.movieId == "2116")
movie_2116_ratings_age = movie_2116_ratings.join(users, movie_2116_ratings.userId == users.userId).select("age", "rating")
movie_2116_ratings_avg = movie_2116_ratings_age.groupBy("age").agg(avg("rating").alias("avg_rating"))
```

(4) The highest-rated movie genre per year (year, genre, rating)

```python
from pyspark.sql.functions import year
ratings = spark.read.format("csv").option("header", "true").load("ratings.csv")
movies = spark.read.format("csv").option("header", "true").load("movies.csv")
joined_data = ratings.join(movies, ratings.movieId == movies.movieId)
year_ratings = joined_data.withColumn("year", year(joined_data.timestamp)).select("year", "genres", "rating")
top_genre_year = year_ratings.groupBy("year", "genres").agg(avg("rating").alias("avg_rating")).orderBy(["year", "avg_rating"], ascending=[True, False])
```

(5) The 10 highest-rated Comedy movies (title, rating)

```python
ratings = spark.read.format("csv").option("header", "true").load("ratings.csv")
movies = spark.read.format("csv").option("header", "true").load("movies.csv")
comedy_ratings = ratings.join(movies, ratings.movieId == movies.movieId).filter(movies.genres.contains("Comedy"))
top_10_comedy = comedy_ratings.groupBy("title").agg(avg("rating").alias("avg_rating")).orderBy("avg_rating", ascending=False).limit(10)
```

(6) For the year with the most highly-rated movies (average rating >= 4.0), the 10 highest-rated movies of that year (year, title, rating)

```python
ratings = spark.read.format("csv").option("header", "true").load("ratings.csv")
movies = spark.read.format("csv").option("header", "true").load("movies.csv")
high_ratings = ratings.groupBy("movieId").agg(avg("rating").alias("avg_rating")).filter("avg_rating>=4.0")
high_ratings_data = high_ratings.join(movies, high_ratings.movieId == movies.movieId).withColumn("year", year(movies.title)).select("year", "title", "avg_rating")
top_10_movies = high_ratings_data.groupBy("year", "title").agg(avg("avg_rating").alias("avg_rating")).orderBy(["year", "avg_rating"], ascending=[False, False]).limit(10)
```
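These queries can also be expressed in plain SQL after registering temp views; a sketch for question (1), assuming the ratings and movies DataFrames loaded above:

```python
# Register the DataFrames as temp views and run query (1) as SQL
ratings.createOrReplaceTempView("ratings")
movies.createOrReplaceTempView("movies")
top_10_movies_sql = spark.sql("""
    SELECT m.title, COUNT(*) AS cnt
    FROM ratings r JOIN movies m ON r.movieId = m.movieId
    GROUP BY m.title
    ORDER BY cnt DESC
    LIMIT 10
""")
top_10_movies_sql.show()
```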
