0. Data format
act_num_by_type: 1:63,3:1,4:2,2:5 feed_fenci: 一/m 到/v 五颗/m 星/n 就/d 开始/v 输/v
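A minimal sketch (not part of either job below) of how these two sample fields break apart; the act-type codes follow the mapping used by get_labels in section 2 (1 = display, 2 = click, 3 = like, 4 = comment):
sample_act = "1:63,3:1,4:2,2:5"
sample_fenci = "一/m 到/v 五颗/m 星/n 就/d 开始/v 输/v"

# act_type -> count (1 = display, 2 = click, 3 = like, 4 = comment)
act_counts = dict(item.split(':') for item in sample_act.split(','))
print(act_counts)   # {'1': '63', '3': '1', '4': '2', '2': '5'}

# segmented words, dropping the part-of-speech tags
print([token.split('/')[0] for token in sample_fenci.split()])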
1. Using accumulators
Python program (submitted as ./test.py by data_process.sh below)
import sys
import os
from pyspark.sql import SparkSession
from pyspark.accumulators import AccumulatorParam
import time
import logging
# logging configuration
logging.basicConfig(filename="./logs", level=logging.INFO, format="[%(levelname)s]\t%(asctime)s\tLINENO:%(lineno)d\t%(message)s", datefmt="%Y-%m-%d %H:%M:%S")
spark = SparkSession\
    .builder\
    .appName("**")\
    .config("spark.dynamicAllocation.enabled", "False")\
    .getOrCreate()
input_path = sys.argv[1]
class DictParam(AccumulatorParam):
    """Accumulator parameter that merges dicts by summing values key by key."""
    def zero(self, value):
        return {}

    def addInPlace(self, acc1, acc2):
        for key in acc2:
            acc1[key] = acc1.get(key, 0) + acc2[key]
        return acc1
sc = spark.sparkContext
# One dict accumulator per statistic: word occurrence count plus the three summed values.
dict1 = sc.accumulator(dict(), DictParam())
dict2 = sc.accumulator(dict(), DictParam())
dict3 = sc.accumulator(dict(), DictParam())
dict4 = sc.accumulator(dict(), DictParam())
def func(line_iter):
    # Runs once per partition; results are reported only through the accumulators,
    # so the function returns an empty iterator.
    global dict1
    global dict2
    global dict3
    global dict4
    for line in line_iter:
        line = line.strip()
        line_items = line.split('\001')  # fields are \x01-separated
        if len(line_items) < 8:
            continue
        value_list = line_items[7].split('|')
        words_list = line_items[3].split()
        try:
            val0 = float(value_list[0])
            val1 = float(value_list[1])
            val2 = float(value_list[2])
        except (ValueError, IndexError):
            continue
        for word in words_list:
            dict1 += {word: 1}      # occurrence count
            dict2 += {word: val0}   # sum of the first value
            dict3 += {word: val1}   # sum of the second value
            dict4 += {word: val2}   # sum of the third value
    return []
ss = spark.sparkContext.textFile(input_path)\
    .mapPartitions(func)
ss.count()  # action: forces evaluation so the accumulators get filled in

dict1 = dict1.value
dict2 = dict2.value
dict3 = dict3.value
dict4 = dict4.value

with open('./result', 'w') as fw:
    for word in dict1:
        num = dict1[word]
        fw.write('\t'.join(['{}'.format(k) for k in [word, dict1[word], dict2[word]/num, dict3[word]/num, dict4[word]/num]]) + '\n')
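A quick local sanity check (not part of the submitted job): addInPlace simply sums values key by key, so merging two partial dicts behaves like this:
merger = DictParam()
acc = merger.zero(None)
acc = merger.addInPlace(acc, {'星': 1, '输': 1})
acc = merger.addInPlace(acc, {'星': 1})
print(acc)   # {'星': 2, '输': 1}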
data_process.sh
#!/bin/bash
set +e
source ./config.sh
loglevel=0
logfile=logs
hadoop='/home/**/work_space/tools/hadoop-2.6.0/bin/hadoop'
spark_submit='/home/**/work_space/tools/spark-2.0.1-bin-2.6.0/bin/spark-submit'
info "starting the statistics job"
#HINPUT="/user/recommend/warehouse/recommend.db/ml_nearby_feed_user_profile/partition_date=20181126"
HINPUT="/user/recommend/warehouse/recommend_test.db/feed_static_score_data/partition_date=20181118"
HOUTPUT="/user/recommend/**/**"
$hadoop fs -rm -r $HOUTPUT
$spark_submit \
    --master yarn \
    --deploy-mode client \
    --queue root.bigdata.recommend \
    --archives hdfs://nameservice3/user/recommend/**/python/python35.tar.gz#python \
    --driver-memory "10g" \
    --executor-memory "10g" \
    --num-executors "40" \
    --executor-cores "2" \
    ./test.py $HINPUT $HOUTPUT
if [ $? -ne 0 ]
then
error "$job_name failed $DATE - $0"
exit 255
fi
info "shell job finished"
config.sh
#!/bin/bash
function log {
local msg
local logtype
logtype=$1
msg=$2
lineno=$3
datetime=`date +"%F %H:%M:%S"`
logformat="[${logtype}]\t${datetime}\tLINENO:${lineno}\t${msg}"
{
case $logtype in
DEBUG)
[[ $loglevel -le 0 ]] && echo -e "${logformat}" ;;
INFO)
[[ $loglevel -le 1 ]] && echo -e "${logformat}" ;;
WARNING)
[[ $loglevel -le 2 ]] && echo -e "${logformat}" ;;
ERROR)
[[ $loglevel -le 3 ]] && echo -e "${logformat}" ;;
esac
} | tee -a $logfile
}
debug () {
message=$1
lineno=`caller 0 | awk '{print$1}'`
log DEBUG "${message}" ${lineno}
}
info() {
message=$1
lineno=`caller 0 | awk '{print$1}'`
log INFO "${message}" ${lineno}
}
warn() {
message=$1
lineno=`caller 0 | awk '{print$1}'`
log WARNING "${message}" ${lineno}
}
error() {
message=$1
lineno=`caller 0 | awk '{print$1}'`
log ERROR "${message}" ${lineno}
}
checkHadoopFile()
{
if [ $# -lt 4 ]
then
return 1
fi
HADOOP_CLIENT=$1
CHECK_PATH=$2
TRY_NUM=$3
SLEEP_TIME=$4
while [ "$TRY_NUM" -ge 1 ]
do
$HADOOP_CLIENT fs -test -e $CHECK_PATH
if [ $? -eq 0 ]
then
return 0
fi
echo "try time $TRY_NUM"
TRY_NUM=`expr $TRY_NUM - 1`
sleep $SLEEP_TIME
done
return 1
}
wechat_alert()
{
msg=$1
wget -SO /dev/null http://common.platform.adt.sogou/weixin.php?desc=**%23""%23"${msg}"
}
Spark automatically re-executes failed or slow tasks to cope with faulty or slow machines, so the same function may run more than once on the same data. Accumulator updates made inside transformations such as mapPartitions() can therefore be applied multiple times and the final result may be wrong; the exactly-once guarantee only holds for updates made inside actions, so operations like foreach() should be used instead (see 《Spark快速大数据分析》, pp. 90-91).
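A minimal sketch of that safer pattern, reusing sc and input_path from the program above: the accumulator is updated inside foreach(), which is an action, so each task's update is applied exactly once even if a task is retried.
line_counter = sc.accumulator(0)
sc.textFile(input_path).foreach(lambda line: line_counter.add(1))  # action, not a transformation
print(line_counter.value)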
2. map/reduce
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import udf
def get_labels(act_num_by_type):
    # Turn the act_type:count string into "display_num,click_rate,like_rate,comment_rate".
    act_num_by_type = str(act_num_by_type)
    act_map = {}
    result = ''
    for act_item in act_num_by_type.split(','):
        act, num = act_item.split(':')
        act_map[str(act)] = float(num)
    result += str(act_map.get('1', 0)) + ','                              # display_num
    result += str(act_map.get('2', 0) * 1.0 / act_map.get('1', 1)) + ','  # click_rate
    result += str(act_map.get('3', 0) * 1.0 / act_map.get('1', 1)) + ','  # like_rate
    result += str(act_map.get('4', 0) * 1.0 / act_map.get('1', 1))        # comment_rate
    return result
def splitFeed(line):
    # Emit one (word, [display_flag, click_rate, like_rate, comment_rate]) pair
    # for every word in the feed; feeds shown fewer than 50 times are dropped.
    result = []
    try:
        rate_and_weight = str(line[1]).split(',')
        if float(rate_and_weight[0]) < 50:   # display_num < 50
            return result
        else:
            rate_and_weight[0] = 1           # count this feed once during the reduce
            for word in line[0].split(' '):
                result.append((word, rate_and_weight))
    except Exception:
        return result
    return result
def wordScoreSum(w1, w2):
    # Custom reduce function: element-wise sum of two score lists,
    # falling back to whichever side is well-formed.
    result = []
    try:
        if len(w1) < 4 and len(w2) < 4:
            return result
        elif len(w1) < 4:
            return w2
        elif len(w2) < 4:
            return w1
        else:
            for i in range(len(w1)):
                result.append(float(w1[i]) + float(w2[i]))
    except Exception:
        return result
    return result
def getFeedScore(line):
    # Divide the summed rates by the number of feeds that contained the word.
    rate_arr = line[1]
    rate_arr[1] = float(rate_arr[1]) / int(rate_arr[0])
    rate_arr[2] = float(rate_arr[2]) / int(rate_arr[0])
    rate_arr[3] = float(rate_arr[3]) / int(rate_arr[0])
    return (line[0], str(rate_arr))
if __name__ == "__main__":
    spark = SparkSession.builder.appName("feature_process").getOrCreate()
    file_name = 'hdfs://nameservice3/user/recommend/warehouse/recommend_test.db/feed_static_score_data/partition_date=20181118/part-00000-2b3329cc-a5fa-4099-9241-bbce4fed9fcb-c000'
    col_schema = StructType([
        StructField("feed_id", StringType(), True),
        StructField("owner_momo_id", StringType(), True),
        StructField("num_by_act_type", StringType(), True),
        StructField("feed_fenci", StringType(), True),
        StructField("feed_context", StringType(), True),
        StructField("feed_basic_attr", StringType(), True),
        StructField("feed_owner_basic_attr", StringType(), True),
        StructField("feed_owner_statistic_attr", StringType(), True)])
    ori_data = spark.read.csv(file_name, schema=col_schema, sep='\001')
    #ori_data = ori_data.sample(False, 0.001, 12)
    print("feed_num: ", str(ori_data.count()))
    ud_get_labels = udf(get_labels, StringType())
    ori_data_with_label = ori_data.withColumn("act_type_rate", ud_get_labels("num_by_act_type"))
    word_score_dic = ori_data_with_label.select("feed_fenci", "act_type_rate").rdd\
        .flatMap(splitFeed)\
        .reduceByKey(wordScoreSum)\
        .map(getFeedScore)
    print("result_example: ", word_score_dic.take(20))
    word_score_dic = spark.createDataFrame(word_score_dic)
    print("word_num: ", str(word_score_dic.count()))
    word_score_dic.write.csv('hdfs://nameservice3/user/recommend/warehouse/recommend_test.db/temp_czz/word_score_dic.csv', mode='overwrite')
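As a quick local check (not part of the submitted job), get_labels applied to the sample record from section 0 gives roughly the values below: 63 displays, with 5 clicks, 1 like and 2 comments.
print(get_labels("1:63,3:1,4:2,2:5"))
# 63.0,0.0794...,0.0159...,0.0317...  (display_num, click_rate, like_rate, comment_rate)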
run0.sh
#!/usr/bin/env bash
/opt/spark-2.2.1-bin-2.6.0/bin/spark-submit \
--master yarn \
--deploy-mode cluster \
--name feed_static_score_feature_etl \
--queue root.bigdata.recommend \
--executor-cores 4 \
--executor-memory 15g \
--num-executors 150 \
--driver-memory 16g \
--conf spark.yarn.executor.memoryOverhead=10G \
--conf spark.kryoserializer.buffer.max=1536M \
--conf spark.driver.maxResultSize=8G \
--conf spark.shuffle.file.buffer=48K \
--conf spark.default.parallelism=5000 \
/data10/recommend/**/feature_process_pyspark.py