PySpark 环境准备
准备:【Spark2.3+ python版本需要python3.5+ | spark1.6 对应的python版本是python3.5】
1. 在 window 环境下解压 spark 程序,并配置 SPARK_HOME 环境变量
2. 安装 anaconda 并配置环境变量
ANACONDA_HOME
ANACONDA_HOME\Scripts
ANACONDA_HOME\Libary\bin
3. 安装模块 py4j 和 pyspark
第一种方式:
pip install
第二种方式:
从 %SPARK_HOME%\python\lib 拷贝 py4j 和 pyspark
到 ANACONDA_HOME\lib\site-packages
特别注意:配置好环境变量,一定要重启 IDE。
为 Anaconda 添加清华源
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/
# 以上两条是Anaconda官方库的镜像
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge/
# 以上是Anaconda第三方库 Conda Forge的镜像
# for linux
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/pytorch/
# for legacy win-64
conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/peterjc123/
以上两条是Pytorch的Anaconda第三方镜像
conda config --set show_channel_urls yes
参考: 为 Anaconda 添加清华源
WordCount
"""
Python 版本 Spark WordCount
"""
from pyspark import SparkConf, SparkContext
import os
if __name__ == "__main__":
# os.environ['SPARK_HOME'] = "F:\\usr\\spark-2.3.1-bin-hadoop2.6"
# 创建 SparkConf
conf = SparkConf()
conf.setMaster("local[1]")
conf.setAppName("Python_Spark_WordCount")
sc = SparkContext(conf=conf)
file = sc.textFile("./words.txt")
file.foreach(print)
flap_words = file.flatMap(lambda line: line.split(" "))
map_words = flap_words.map(lambda word: (word, 1))
result_words = map_words.reduceByKey(lambda v1, v2: v1 + v2)
result = result_words.sortBy(lambda t: t[1])
result.foreach(print)
PVUV
数据格式:2020-02-05 192.168.143.199 uid98105 shanghai www.jd.com logout
计算每个网址访问量最高的 top2 地区
def every_site_visit_top2(file):
"""
计算每个网址访问量最高的 top2 地区
:param file:
:return:
"""
file.map(lambda one: (one.split("\t")[4], one.split("\t")[3])) \
.groupByKey() \
.map(lambda one: get_site_local_Top2(one)) \
.foreach(print)
def get_site_local_Top2(one):
"""
获取单个网站访问量最高的地区
:param one:
:return:
"""
site = one[0]
local_iterable = one[1]
# 创建字典,用于统计地区树
local_dict = {}
# 遍历地区
for local in local_iterable:
if local in local_dict:
local_dict[local] += 1
else:
local_dict[local] = 1
# 对字典进行排序
new_list = sorted(local_dict.items(), key=lambda pair: pair[1], reverse=True)
# 存放结果的列表
result_list = []
if len(new_list) > 2:
for i in range(2):
result_list.append(new_list[i])
else:
result_list = new_list
return site, result_list
计算每个网址最活跃的 top3 用户
def every_site_user_top3(file):
"""
计算每个网址最活跃的 top3 用户
:param file:
:return:
"""
file.map(lambda line: (line.split("\t")[2], line.split("\t")[4])) \
.groupByKey() \
.flatMap(lambda line: get_site2user_count(line)) \
.groupByKey() \
.map(lambda one: get_site2user_count_top3(one)) \
.foreach(print)
def get_site2user_count(line):
"""
数据格式转化:user,[site, site, site] => site,(user,count)
:param line:
:return:
"""
user = line[0]
sites = line[1]
site_dict = {}
for site in sites:
if site in site_dict:
site_dict[site] += 1
else:
site_dict[site] = 1
result_list = []
for site, count in site_dict.items():
result_list.append((site, (user, count)))
return result_list
def get_site2user_count_top3(one):
"""
site,[(user,count), (user,count), (user,count), ...]
求 top3
3 5 1 8 6 4 7
[3] [5, 3] [5, 3 ,1] 2 0 2 1 2 2
[8, 5, 3] [8,]
:param one:
:return:
"""
site = one[0]
user_count_iter = one[1]
top3_list = ['', '', '']
for user_count in user_count_iter:
count = user_count[1]
for i in range(len(top3_list)):
if top3_list[i] == '':
top3_list[i] = user_count
break
elif top3_list[i][1] < count:
for j in range(2, i, -1):
top3_list[j] = top3_list[j-1]
top3_list[i] = user_count
break
return site, top3_list
完整代码
"""
pv: page view 网站访问量
uv: unique view 单个用户访问网站的次数
"""
from pyspark import SparkConf, SparkContext
def pv(file):
"""
计算 pv
:param file:
:return:
"""
file.map(lambda line: (line.split("\t")[4], 1)) \
.reduceByKey(lambda v1, v2: v1 + v2) \
.sortBy(lambda t: t[1], ascending=False) \
.foreach(print)
def uv(file):
"""
计算 uv
:param file:
:return:
"""
file.map(lambda line: (line.split("\t")[1] + "#" + line.split("\t")[4], 1)) \
.distinct() \
.map(lambda pair: (pair[0].split("#")[1], 1)) \
.reduceByKey(lambda v1, v2: v1 + v2) \
.sortBy(lambda t: t[1], ascending=False) \
.foreach(print)
def bj_pv(file, location):
"""
计算北京地区的 pv
:param file:
:param location:
:return:
"""
file.filter(lambda one: one.split("\t")[3] == location) \
.map(lambda one: (one.split("\t")[4], 1)) \
.reduceByKey(lambda v1, v2: v1 + v2) \
.sortBy(lambda t: t[1], ascending=False) \
.foreach(print)
def every_site_user_top3(file):
"""
计算每个网址最活跃的 top3 用户
:param file:
:return:
"""
file.map(lambda line: (line.split("\t")[2], line.split("\t")[4])) \
.groupByKey() \
.flatMap(lambda line: get_site2user_count(line)) \
.groupByKey() \
.map(lambda one: get_site2user_count_top3(one)) \
.foreach(print)
def every_site_visit_top2(file):
"""
计算每个网址访问量最高的 top2 地区
:param file:
:return:
"""
file.map(lambda one: (one.split("\t")[4], one.split("\t")[3])) \
.groupByKey() \
.map(lambda one: get_site_local_Top2(one)) \
.foreach(print)
def get_site_local_Top2(one):
"""
获取单个网站访问量最高的地区
:param one:
:return:
"""
site = one[0]
local_iterable = one[1]
# 创建字典,用于统计地区树
local_dict = {}
# 遍历地区
for local in local_iterable:
if local in local_dict:
local_dict[local] += 1
else:
local_dict[local] = 1
# 对字典进行排序
new_list = sorted(local_dict.items(), key=lambda pair: pair[1], reverse=True)
# 存放结果的列表
result_list = []
if len(new_list) > 2:
for i in range(2):
result_list.append(new_list[i])
else:
result_list = new_list
return site, result_list
def get_site2user_count(line):
"""
数据格式转化:user,[site, site, site] => site,(user,count)
:param line:
:return:
"""
user = line[0]
sites = line[1]
site_dict = {}
for site in sites:
if site in site_dict:
site_dict[site] += 1
else:
site_dict[site] = 1
result_list = []
for site, count in site_dict.items():
result_list.append((site, (user, count)))
return result_list
def get_site2user_count_top3(one):
"""
site,[(user,count), (user,count), (user,count), ...]
求 top3
3 5 1 8 6 4 7
[3] [5, 3] [5, 3 ,1] 2 0 2 1 2 2
[8, 5, 3] [8,]
:param one:
:return:
"""
site = one[0]
user_count_iter = one[1]
top3_list = ['', '', '']
for user_count in user_count_iter:
count = user_count[1]
for i in range(len(top3_list)):
if top3_list[i] == '':
top3_list[i] = user_count
break
elif top3_list[i][1] < count:
for j in range(2, i, -1):
top3_list[j] = top3_list[j-1]
top3_list[i] = user_count
break
return site, top3_list
if __name__ == "__main__":
# 0 1 2 3 4 5
# 2020-02-05 192.168.118.234 uid39474 shanghai www.baidu.com comment
conf = SparkConf().setAppName("puvu").setMaster("local")
sc = SparkContext(conf=conf)
file = sc.textFile("./pvuvdata")
# 计算 pv
# pv(file)
# 计算 uv
# uv(file)
# 计算北京地区的 pv
# bj_pv(file, "beijing")
# 计算每个网址访问量最高的 top2 地区
# every_site_visit_top2(file)
# 计算每个网址最活跃的 top3 用户
every_site_user_top3(file)