Weighted word frequency counting with PySpark

0. Data format

act_num_by_type:  1:63,3:1,4:2,2:5    (action_type:count pairs; 1 = display, 2 = click, 3 = like, 4 = comment)
feed_fenci:  一/m 到/v 五颗/m 星/n 就/d 开始/v 输/v    (space-separated segmented words with part-of-speech tags)
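To make the weighting concrete, here is a minimal sketch (plain Python, variable names are illustrative) that turns the sample act_num_by_type string into the display count and the click/like/comment rates that are later used as weights:

act_num_by_type = "1:63,3:1,4:2,2:5"   # sample record from above

# parse "type:count" pairs into a dict (1=display, 2=click, 3=like, 4=comment)
act_map = {}
for item in act_num_by_type.split(','):
    act_type, num = item.split(':')
    act_map[act_type] = float(num)

display_num  = act_map.get('1', 0)                        # 63.0
click_rate   = act_map.get('2', 0) / act_map.get('1', 1)  # 5/63
like_rate    = act_map.get('3', 0) / act_map.get('1', 1)  # 1/63
comment_rate = act_map.get('4', 0) / act_map.get('1', 1)  # 2/63
print(display_num, click_rate, like_rate, comment_rate)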

 

1. Using an accumulator

Python program (test.py)

import sys
import os

from pyspark.sql import SparkSession
from pyspark.accumulators import AccumulatorParam
import time
import logging

# log 配置
logging.basicConfig(filename="./logs", level=logging.INFO, format="[%(levelname)s]\t%(asctime)s\tLINENO:%(lineno)d\t%(message)s", datefmt="%Y-%m-%d %H:%M:%S")

spark = SparkSession\
        .builder\
        .appName("**")\
        .config("spark.dynamicAllocation.enabled", "False")\
        .getOrCreate()

input_path = sys.argv[1]

# Accumulator type that merges dicts by summing the values of matching keys
class DictParam(AccumulatorParam):
    def zero(self, value):
        return {}

    def addInPlace(self, acc1, acc2):
        for key in acc2:
            acc1[key] = acc1.get(key, 0) + acc2[key]
        return acc1

sc = spark.sparkContext

dict1 = sc.accumulator(dict(), DictParam())  # word -> occurrence count
dict2 = sc.accumulator(dict(), DictParam())  # word -> sum of the first weight
dict3 = sc.accumulator(dict(), DictParam())  # word -> sum of the second weight
dict4 = sc.accumulator(dict(), DictParam())  # word -> sum of the third weight


def func(line_iter):
    global dict1
    global dict2
    global dict3
    global dict4
    for line in line_iter:
        line = line.strip()
        line_items = line.split('\1')          # fields are separated by \x01
        if len(line_items) < 8:
            continue
        value_list = line_items[7].split('|')  # '|'-separated weight values
        words_list = line_items[3].split()     # space-separated segmented words
        try:
            val0 = float(value_list[0])
            val1 = float(value_list[1])
            val2 = float(value_list[2])
        except (ValueError, IndexError):       # skip malformed records
            continue
        for word in words_list:
            dict1 += {word: 1}
            dict2 += {word: val0}
            dict3 += {word: val1}
            dict4 += {word: val2}
    return []

ss = spark.sparkContext.textFile(input_path)\
        .mapPartitions(func)
ss.count()  # an action is needed to force evaluation so the accumulators get populated
dict1 = dict1.value
dict2 = dict2.value
dict3 = dict3.value
dict4 = dict4.value

with open('./result', 'w') as fw:
    for word in dict1:
        num = dict1[word]
        fw.write('\t'.join('{}'.format(k) for k in
                           [word, num, dict2[word]/num, dict3[word]/num, dict4[word]/num]) + '\n')

data_process.sh

#!/bin/bash
set +e
source ./config.sh
loglevel=0
logfile=logs
hadoop='/home/**/work_space/tools/hadoop-2.6.0/bin/hadoop'
spark_submit='/home/**/work_space/tools/spark-2.0.1-bin-2.6.0/bin/spark-submit'

info "启动统计任务"
#HINPUT="/user/recommend/warehouse/recommend.db/ml_nearby_feed_user_profile/partition_date=20181126"
HINPUT="/user/recommend/warehouse/recommend_test.db/feed_static_score_data/partition_date=20181118"
HOUTPUT="/user/recommend/**/**"

$hadoop fs -rm -r $HOUTPUT
$spark_submit\
        --master yarn\
        --deploy-mode client\
        --queue root.bigdata.recommend\
        --archives hdfs://nameservice3/user/recommend/**/python/python35.tar.gz#python\
        --driver-memory "10g"\
        --executor-memory "10g"\
        --num-executors "40" \
        --executor-cores "2" \
        ./test.py  $HINPUT $HOUTPUT

if [ $? -ne 0 ]
then
    error "$job_name failed $DATE - $0"
    exit 255
fi

info "SHELL任务结束"

config.sh

#!/bin/bash

function log {
        local msg
        local logtype
        logtype=$1
        msg=$2
        lineno=$3
        datetime=`date +"%F %H:%M:%S"`
        logformat="[${logtype}]\t${datetime}\tLINENO:${lineno}\t${msg}"
        {
        case $logtype in
                DEBUG)
                        [[ $loglevel -le 0 ]] && echo -e "${logformat}" ;;
                INFO)
                        [[ $loglevel -le 1 ]] && echo -e "${logformat}" ;;
                WARNING)
                        [[ $loglevel -le 2 ]] && echo -e "${logformat}" ;;
                ERROR)
                        [[ $loglevel -le 3 ]] && echo -e "${logformat}" ;;
        esac
        } | tee -a $logfile
}

debug () {
        message=$1
        lineno=`caller 0 | awk '{print$1}'`
        log DEBUG "${message}" ${lineno}
}
info() {
        message=$1
        lineno=`caller 0 | awk '{print$1}'`
        log INFO "${message}" ${lineno}
}
warn() {
        message=$1
        lineno=`caller 0 | awk '{print$1}'`
        log WARNING "${message}" ${lineno}
}
error() {
        message=$1
        lineno=`caller 0 | awk '{print$1}'`
        log ERROR "${message}" ${lineno}
}


checkHadoopFile()
{
    if [ $# -lt 4 ] 
    then
        return 255  # invalid argument count ("return -1" is not valid in bash)
    fi  
    HADOOP_CLIENT=$1
    CHECK_PATH=$2
    TRY_NUM=$3
    SLEEP_TIME=$4
    while [ "$TRY_NUM" -ge 1 ] 
    do
        $HADOOP_CLIENT fs -test -e $CHECK_PATH
        if [ $? -eq 0 ]
        then
            return 0
        fi
        echo "try time $TRY_NUM"
        TRY_NUM=`expr $TRY_NUM - 1`
        sleep $SLEEP_TIME
    done
    return 1
}

wechat_alert()
{
    msg=$1
    wget -SO /dev/null http://common.platform.adt.sogou/weixin.php?desc=**%23""%23"${msg}"
}

Spark automatically re-executes failed or slow tasks to cope with faulty or slow machines, so the same function may be run on the same data more than once. This can make the final accumulator result incorrect: updates performed inside transformations (such as the mapPartitions above) may be applied multiple times. To avoid this, update accumulators inside an action such as foreach() (from 《Spark快速大数据分析》 (Learning Spark), pp. 90-91).
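A minimal sketch of that recommendation, reusing the DictParam accumulator defined above and assuming the same '\1'-delimited input layout as func() (an illustration, not the script shown above):

word_count = sc.accumulator(dict(), DictParam())

def count_words(line):
    items = line.strip().split('\1')
    if len(items) < 8:
        return
    for word in items[3].split():   # field 3: segmented words, as in func() above
        word_count += {word: 1}

# foreach() is an action, so each task's accumulator updates are applied exactly once
# even if Spark re-runs failed or speculative tasks
sc.textFile(input_path).foreach(count_words)
print(len(word_count.value))        # number of distinct words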

 

2. map/reduce

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import udf


def get_labels(act_num_by_type):
    # Compute display count, click rate, like rate and comment rate from the action counts
    # (action type ids: 1=display, 2=click, 3=like, 4=comment)
    act_num_by_type = str(act_num_by_type)
    act_map = {}
    result = ''
    for act_item in act_num_by_type.split(','):
        act, num = act_item.split(':')
        act_map[str(act)] = float(num)
    result += str(act_map.get('1', 0)) + ','                               # display_num
    result += str(act_map.get('2', 0) * 1.0 / act_map.get('1', 1)) + ','   # click_rate
    result += str(act_map.get('3', 0) * 1.0 / act_map.get('1', 1)) + ','   # like_rate
    result += str(act_map.get('4', 0) * 1.0 / act_map.get('1', 1))         # comment_rate
    return result

def splitFeed(line):
    # Emit a (word, [1, click_rate, like_rate, comment_rate]) pair for every word in the feed,
    # keeping only feeds that were displayed at least 50 times
    result = []
    try:
        rate_and_weight = str(line[1]).split(',')
        if float(rate_and_weight[0]) < 50:   # skip feeds with display_num < 50
            return result
        rate_and_weight[0] = 1               # replace display_num with a per-feed count of 1
        for word in line[0].split(' '):
            result.append((word, rate_and_weight))
    except Exception:                        # malformed or empty rows
        return result
    return result

def wordScoreSum(w1, w2):
    # Custom reduce function: element-wise sum of two [count, click, like, comment] vectors
    result = []
    try:
        if len(w1) < 4 and len(w2) < 4:
            return result
        elif len(w1) < 4:
            return w2
        elif len(w2) < 4:
            return w1
        else:
            for i in range(len(w1)):
                result.append(float(w1[i]) + float(w2[i]))
    except Exception:
        return result
    return result

def getFeedScore(line):
    # Divide the accumulated rate sums by the feed count (element 0) to get per-word average rates
    rate_arr = line[1]
    rate_arr[1] = float(rate_arr[1]) / int(rate_arr[0])
    rate_arr[2] = float(rate_arr[2]) / int(rate_arr[0])
    rate_arr[3] = float(rate_arr[3]) / int(rate_arr[0])
    return (line[0], str(rate_arr))

if __name__ == "__main__":
    spark = SparkSession.builder.appName("feature_process").getOrCreate()

    file_name = 'hdfs://nameservice3/user/recommend/warehouse/recommend_test.db/feed_static_score_data/partition_date=20181118/part-00000-2b3329cc-a5fa-4099-9241-bbce4fed9fcb-c000'
    col_schema = StructType([
        StructField("feed_id", StringType(), True),
        StructField("owner_momo_id", StringType(), True),
        StructField("num_by_act_type", StringType(), True),
        StructField("feed_fenci", StringType(), True),
        StructField("feed_context", StringType(), True),
        StructField("feed_basic_attr", StringType(), True),
        StructField("feed_owner_basic_attr", StringType(), True),
        StructField("feed_owner_statistic_attr", StringType(), True)])
    ori_data = spark.read.csv(file_name, schema=col_schema, sep='\001')
    #ori_data = ori_data.sample(False, 0.001, 12)
    print("feed_num: ", str(ori_data.count()))
    ud_get_labels = udf(get_labels, StringType())
    ori_data_with_label = ori_data.withColumn("act_type_rate", ud_get_labels("num_by_act_type"))

    word_score_dic = ori_data_with_label.select("feed_fenci", "act_type_rate").rdd\
        .flatMap(splitFeed)\
        .reduceByKey(wordScoreSum)\
        .map(getFeedScore)
    print("result_example: ", word_score_dic.take(20))
    word_score_dic = spark.createDataFrame(word_score_dic)
    print("word_num: ", str(word_score_dic.count()))
    word_score_dic.write.csv('hdfs://nameservice3/user/recommend/warehouse/recommend_test.db/temp_czz/word_score_dic.csv', mode='overwrite')

run0.sh

#!/usr/bin/env bash

/opt/spark-2.2.1-bin-2.6.0/bin/spark-submit \
--master yarn \
--deploy-mode cluster \
--name feed_static_score_feature_etl \
--queue root.bigdata.recommend \
--executor-cores 4 \
--executor-memory 15g \
--num-executors 150 \
--driver-memory 16g \
--conf spark.yarn.executor.memoryOverhead=10G \
--conf spark.kryoserializer.buffer.max=1536M \
--conf spark.driver.maxResultSize=8G \
--conf spark.shuffle.file.buffer=48K \
--conf spark.default.parallelism=5000  \
/data10/recommend/**/feature_process_pyspark.py 

 
