基于移动上网数据的预购(购车、购房)行为分析设计。通过数据分析获取有购买需求的用户并对其推荐可能喜欢的产品。
业务模块介绍
- 地址库维护:爬虫+人工(人工分析地址规则。爬虫按规则爬取,并将爬取的结果进行分类)。
注:此部分一般由专门部门负责,不属于大数据业务处理。
- 数据处理:通过MapReduce处理数据(数据过滤、匹配、统计计算)
- 算法分析:用Spark MLlib算法库实现算法业务(推荐,生存回归)
属性文件conf.properties:
filesplit=
outfilesplit=|
fileoutsplit=\\|
firstoutpath=/dx/firstdomainout
domainoutpath=/dx/domainout
urloutpath=/dx/urlout
mysql_username=root
mysql_pwd=xlh123
mysql_connection_url=jdbc:mysql://192.168.0.185:3306/hljxlhdb?useUnicode=true&characterEncoding=UTF-8
fileclear=.*\\.(jpg|png|bmp|jpeg|tif|gif|psd|ico|pdf|css|tmp|js|gz|rar|gzip|zip|txt|csv|xlsx|xls)(\\W.*|$)
fileclearpath=/dx/clearout
matchfileoutall=/dx/matchallout
matchfileout=/dx/matchout
nomatch=000000000000000000
filelength=15
usercountpath=/dx/userpathout
allcountpath=/dx/allpathout
pronomatch=not000001
prooutpath=/dx/proout
usercountpro=/dx/userproout
allcountpro=/dx/allproout
proaddress=t_dx_product_msg_addr.txt
log4j.properties:
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Define some default values that can be overridden by system properties
hadoop.root.logger=INFO,console
hadoop.log.dir=.
hadoop.log.file=hadoop.log
# Define the root logger to the system property "hadoop.root.logger".
log4j.rootLogger=${hadoop.root.logger}, EventCounter
# Logging Threshold
log4j.threshold=ALL
# Null Appender
log4j.appender.NullAppender=org.apache.log4j.varia.NullAppender
#
# Rolling File Appender - cap space usage at 5gb.
#
hadoop.log.maxfilesize=256MB
hadoop.log.maxbackupindex=20
log4j.appender.RFA=org.apache.log4j.RollingFileAppender
log4j.appender.RFA.File=${hadoop.log.dir}/${hadoop.log.file}
log4j.appender.RFA.MaxFileSize=${hadoop.log.maxfilesize}
log4j.appender.RFA.MaxBackupIndex=${hadoop.log.maxbackupindex}
log4j.appender.RFA.layout=org.apache.log4j.PatternLayout
# Pattern format: Date LogLevel LoggerName LogMessage
log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
# Debugging Pattern format
#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
#
# Daily Rolling File Appender
#
log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender
log4j.appender.DRFA.File=${hadoop.log.dir}/${hadoop.log.file}
# Rollover at midnight
log4j.appender.DRFA.DatePattern=.yyyy-MM-dd
log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout
# Pattern format: Date LogLevel LoggerName LogMessage
log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
# Debugging Pattern format
#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n
#
# console
# Add "console" to rootlogger above if you want to use this
#
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n
#
# TaskLog Appender
#
#Default values
hadoop.tasklog.taskid=null
hadoop.tasklog.iscleanup=false
hadoop.tasklog.noKeepSplits=4
hadoop.tasklog.totalLogFileSize=100
hadoop.tasklog.purgeLogSplits=true
hadoop.tasklog.logsRetainHours=12
log4j.appender.TLA=org.apache.hadoop.mapred.TaskLogAppender
log4j.appender.TLA.taskId=${hadoop.tasklog.taskid}
log4j.appender.TLA.isCleanup=${hadoop.tasklog.iscleanup}
log4j.appender.TLA.totalLogFileSize=${hadoop.tasklog.totalLogFileSize}
log4j.appender.TLA.layout=org.apache.log4j.PatternLayout
log4j.appender.TLA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
#
# HDFS block state change log from block manager
#
# Uncomment the following to suppress normal block state change
# messages from BlockManager in NameNode.
#log4j.logger.BlockStateChange=WARN
#
#Security appender
#
hadoop.security.logger=INFO,NullAppender
hadoop.security.log.maxfilesize=256MB
hadoop.security.log.maxbackupindex=20
log4j.category.SecurityLogger=${hadoop.security.logger}
hadoop.security.log.file=SecurityAuth-${user.name}.audit
log4j.appender.RFAS=org.apache.log4j.RollingFileAppender
log4j.appender.RFAS.File=${hadoop.log.dir}/${hadoop.security.log.file}
log4j.appender.RFAS.layout=org.apache.log4j.PatternLayout
log4j.appender.RFAS.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
log4j.appender.RFAS.MaxFileSize=${hadoop.security.log.maxfilesize}
log4j.appender.RFAS.MaxBackupIndex=${hadoop.security.log.maxbackupindex}
#
# Daily Rolling Security appender
#
log4j.appender.DRFAS=org.apache.log4j.DailyRollingFileAppender
log4j.appender.DRFAS.File=${hadoop.log.dir}/${hadoop.security.log.file}
log4j.appender.DRFAS.layout=org.apache.log4j.PatternLayout
log4j.appender.DRFAS.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
log4j.appender.DRFAS.DatePattern=.yyyy-MM-dd
#
# hadoop configuration logging
#
# Uncomment the following line to turn off configuration deprecation warnings.
# log4j.logger.org.apache.hadoop.conf.Configuration.deprecation=WARN
#
# hdfs audit logging
#
hdfs.audit.logger=INFO,NullAppender
hdfs.audit.log.maxfilesize=256MB
hdfs.audit.log.maxbackupindex=20
log4j.logger.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=${hdfs.audit.logger}
log4j.additivity.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=false
log4j.appender.RFAAUDIT=org.apache.log4j.RollingFileAppender
log4j.appender.RFAAUDIT.File=${hadoop.log.dir}/hdfs-audit.log
log4j.appender.RFAAUDIT.layout=org.apache.log4j.PatternLayout
log4j.appender.RFAAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
log4j.appender.RFAAUDIT.MaxFileSize=${hdfs.audit.log.maxfilesize}
log4j.appender.RFAAUDIT.MaxBackupIndex=${hdfs.audit.log.maxbackupindex}
#
# mapred audit logging
#
mapred.audit.logger=INFO,NullAppender
mapred.audit.log.maxfilesize=256MB
mapred.audit.log.maxbackupindex=20
log4j.logger.org.apache.hadoop.mapred.AuditLogger=${mapred.audit.logger}
log4j.additivity.org.apache.hadoop.mapred.AuditLogger=false
log4j.appender.MRAUDIT=org.apache.log4j.RollingFileAppender
log4j.appender.MRAUDIT.File=${hadoop.log.dir}/mapred-audit.log
log4j.appender.MRAUDIT.layout=org.apache.log4j.PatternLayout
log4j.appender.MRAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
log4j.appender.MRAUDIT.MaxFileSize=${mapred.audit.log.maxfilesize}
log4j.appender.MRAUDIT.MaxBackupIndex=${mapred.audit.log.maxbackupindex}
# Custom Logging levels
#log4j.logger.org.apache.hadoop.mapred.JobTracker=DEBUG
#log4j.logger.org.apache.hadoop.mapred.TaskTracker=DEBUG
#log4j.logger.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=DEBUG
# Jets3t library
log4j.logger.org.jets3t.service.impl.rest.httpclient.RestS3Service=ERROR
# AWS SDK & S3A FileSystem
log4j.logger.com.amazonaws=ERROR
log4j.logger.com.amazonaws.http.AmazonHttpClient=ERROR
log4j.logger.org.apache.hadoop.fs.s3a.S3AFileSystem=WARN
#
# Event Counter Appender
# Sends counts of logging messages at different severity levels to Hadoop Metrics.
#
log4j.appender.EventCounter=org.apache.hadoop.log.metrics.EventCounter
#
# Job Summary Appender
#
# Use following logger to send summary to separate file defined by
# hadoop.mapreduce.jobsummary.log.file :
# hadoop.mapreduce.jobsummary.logger=INFO,JSA
#
hadoop.mapreduce.jobsummary.logger=${hadoop.root.logger}
hadoop.mapreduce.jobsummary.log.file=hadoop-mapreduce.jobsummary.log
hadoop.mapreduce.jobsummary.log.maxfilesize=256MB
hadoop.mapreduce.jobsummary.log.maxbackupindex=20
log4j.appender.JSA=org.apache.log4j.RollingFileAppender
log4j.appender.JSA.File=${hadoop.log.dir}/${hadoop.mapreduce.jobsummary.log.file}
log4j.appender.JSA.MaxFileSize=${hadoop.mapreduce.jobsummary.log.maxfilesize}
log4j.appender.JSA.MaxBackupIndex=${hadoop.mapreduce.jobsummary.log.maxbackupindex}
log4j.appender.JSA.layout=org.apache.log4j.PatternLayout
log4j.appender.JSA.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n
log4j.logger.org.apache.hadoop.mapred.JobInProgress$JobSummary=${hadoop.mapreduce.jobsummary.logger}
log4j.additivity.org.apache.hadoop.mapred.JobInProgress$JobSummary=false
#
# Yarn ResourceManager Application Summary Log
#
# Set the ResourceManager summary log filename
yarn.server.resourcemanager.appsummary.log.file=rm-appsummary.log
# Set the ResourceManager summary log level and appender
yarn.server.resourcemanager.appsummary.logger=${hadoop.root.logger}
#yarn.server.resourcemanager.appsummary.logger=INFO,RMSUMMARY
# To enable AppSummaryLogging for the RM,
# set yarn.server.resourcemanager.appsummary.logger to
# <LEVEL>,RMSUMMARY in hadoop-env.sh
# Appender for ResourceManager Application Summary Log
# Requires the following properties to be set
# - hadoop.log.dir (Hadoop Log directory)
# - yarn.server.resourcemanager.appsummary.log.file (resource manager app summary log filename)
# - yarn.server.resourcemanager.appsummary.logger (resource manager app summary log level and appender)
log4j.logger.org.apache.hadoop.yarn.server.resourcemanager.RMAppManager$ApplicationSummary=${yarn.server.resourcemanager.appsummary.logger}
log4j.additivity.org.apache.hadoop.yarn.server.resourcemanager.RMAppManager$ApplicationSummary=false
log4j.appender.RMSUMMARY=org.apache.log4j.RollingFileAppender
log4j.appender.RMSUMMARY.File=${hadoop.log.dir}/${yarn.server.resourcemanager.appsummary.log.file}
log4j.appender.RMSUMMARY.MaxFileSize=256MB
log4j.appender.RMSUMMARY.MaxBackupIndex=20
log4j.appender.RMSUMMARY.layout=org.apache.log4j.PatternLayout
log4j.appender.RMSUMMARY.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
# HS audit log configs
#mapreduce.hs.audit.logger=INFO,HSAUDIT
#log4j.logger.org.apache.hadoop.mapreduce.v2.hs.HSAuditLogger=${mapreduce.hs.audit.logger}
#log4j.additivity.org.apache.hadoop.mapreduce.v2.hs.HSAuditLogger=false
#log4j.appender.HSAUDIT=org.apache.log4j.DailyRollingFileAppender
#log4j.appender.HSAUDIT.File=${hadoop.log.dir}/hs-audit.log
#log4j.appender.HSAUDIT.layout=org.apache.log4j.PatternLayout
#log4j.appender.HSAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
#log4j.appender.HSAUDIT.DatePattern=.yyyy-MM-dd
# Http Server Request Logs
#log4j.logger.http.requests.namenode=INFO,namenoderequestlog
#log4j.appender.namenoderequestlog=org.apache.hadoop.http.HttpRequestLogAppender
#log4j.appender.namenoderequestlog.Filename=${hadoop.log.dir}/jetty-namenode-yyyy_mm_dd.log
#log4j.appender.namenoderequestlog.RetainDays=3
#log4j.logger.http.requests.datanode=INFO,datanoderequestlog
#log4j.appender.datanoderequestlog=org.apache.hadoop.http.HttpRequestLogAppender
#log4j.appender.datanoderequestlog.Filename=${hadoop.log.dir}/jetty-datanode-yyyy_mm_dd.log
#log4j.appender.datanoderequestlog.RetainDays=3
#log4j.logger.http.requests.resourcemanager=INFO,resourcemanagerrequestlog
#log4j.appender.resourcemanagerrequestlog=org.apache.hadoop.http.HttpRequestLogAppender
#log4j.appender.resourcemanagerrequestlog.Filename=${hadoop.log.dir}/jetty-resourcemanager-yyyy_mm_dd.log
#log4j.appender.resourcemanagerrequestlog.RetainDays=3
#log4j.logger.http.requests.jobhistory=INFO,jobhistoryrequestlog
#log4j.appender.jobhistoryrequestlog=org.apache.hadoop.http.HttpRequestLogAppender
#log4j.appender.jobhistoryrequestlog.Filename=${hadoop.log.dir}/jetty-jobhistory-yyyy_mm_dd.log
#log4j.appender.jobhistoryrequestlog.RetainDays=3
#log4j.logger.http.requests.nodemanager=INFO,nodemanagerrequestlog
#log4j.appender.nodemanagerrequestlog=org.apache.hadoop.http.HttpRequestLogAppender
#log4j.appender.nodemanagerrequestlog.Filename=${hadoop.log.dir}/jetty-nodemanager-yyyy_mm_dd.log
#log4j.appender.nodemanagerrequestlog.RetainDays=3
读取配置文件信息的类:
package hadoop_user1;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Properties;
/**
 * Loads conf.properties from the classpath once (at class initialization)
 * and exposes simple key lookups.
 */
public class TProperties {
    private static Properties p = new Properties();

    // Load the properties file via the classloader so it also works from
    // inside a jar. Fixes over the original version:
    //  - the reader is now closed (resource leak);
    //  - a missing /conf.properties no longer throws a NullPointerException
    //    that escaped the catch (IOException) clause and failed class
    //    initialization with ExceptionInInitializerError.
    static {
        InputStream is = TProperties.class.getResourceAsStream("/conf.properties");
        if (is != null) {
            try (BufferedReader br = new BufferedReader(new InputStreamReader(is))) {
                // Keep the Reader-based load (platform default charset),
                // matching the original behavior for non-ASCII values.
                p.load(br);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Returns the configured value for {@code key}, or null when the key is
     * absent (or the properties file could not be loaded).
     */
    public static String getValue(String key)
    {
        return p.getProperty(key);
    }
}
匹配域名或ip地址的类:
package hadoop_user1;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Regex helpers for extracting host names from URLs and classifying a host
 * string as an IPv4 address or a first-level domain.
 */
public class TMatcher {
    /**
     * Extracts the host portion between "://" and the following "/".
     * Returns "" when the input contains no such segment.
     */
    public static String getMatcherStr(String managers){
        String str = "(?<=://)(.*?)(?=/)";
        Pattern pattern = Pattern.compile(str);
        Matcher matcher = pattern.matcher(managers);
        // Only the first match is ever used; 'if' replaces the misleading
        // 'while' that returned on the first iteration anyway.
        if (matcher.find()) {
            return matcher.group(1);
        }
        return "";
    }

    /**
     * Returns the input unchanged when it is a dotted-quad IPv4 address,
     * otherwise the first-level domain matched at the end of the string,
     * or "" when neither matches.
     */
    public static String getDomain(String domain){
        // IPv4 check. Fix: the previous octet pattern
        // ([1-9]|[1-9]\d|1\d{2}|2[0-4]\d|25[0-5]) rejected any octet that
        // contained a 0 (e.g. "10.0.0.1" or "192.168.0.185" were never
        // recognized as IPs); this pattern accepts 0-255 per octet.
        String ip = "(25[0-5]|2[0-4]\\d|1\\d{2}|[1-9]?\\d)(\\.(25[0-5]|2[0-4]\\d|1\\d{2}|[1-9]?\\d)){3}";
        if(domain.matches(ip)){
            return domain;
        }
        // First-level-domain suffix match, anchored at end of string.
        // NOTE(review): several TLD alternatives ("com.cn", "net.cn", ...)
        // use unescaped dots, so '.' matches any character there — kept as-is
        // to preserve existing match behavior; confirm whether stricter
        // escaping is wanted.
        String str= "((\\w*|\\w*-\\w*)\\.?\\.(com.cn|net.cn|gov.cn|org\\.nz|org.cn|com|net|org|gov|cc|biz|info|cn|hk|in|am|im|fm|tv|co|me|us|io|mobi|pw|so|gs|top|la|bin))$";
        Pattern pattern = Pattern.compile(str);
        Matcher matcher = pattern.matcher(domain);
        if (matcher.find()) {
            return matcher.group(1);
        }
        return "";
    }
}
一、数据清洗:过滤不符合规格的数据和字段类型不符的数据
产生结果集:{用户ID,一级域名,URL地址}
----Mapper类:
package hadoop_user1;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Stage 01 (data cleaning) mapper: keeps records with the configured field
 * count and a usable URL, then emits "userId|firstLevelDomain|url" with a
 * NullWritable key.
 */
public class ClearMapper extends Mapper<LongWritable, Text, NullWritable, Text> {
    // Reusable output value.
    private Text outValue = new Text();

    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().split(TProperties.getValue("filesplit"));
        // Field-count guard first, so fields[14] is only touched when present.
        if (!String.valueOf(fields.length).equals(TProperties.getValue("filelength"))) {
            return;
        }
        String url = fields[14];
        // Discard empty/bare-scheme URLs and the configured static-resource
        // extensions (images, css, archives, ...).
        if ("http://".equals(url) || "https://".equals(url) || "".equals(url)
                || url.toLowerCase().matches(TProperties.getValue("fileclear"))) {
            return;
        }
        // Normalize: ensure a scheme prefix so the host always sits at
        // index 2 of a '/'-split.
        if (!url.startsWith("http://") && !url.startsWith("https://")) {
            url = "http://" + url;
        }
        String domain = url.split("/", -1)[2];
        // Drop a ":port" suffix if present.
        if (domain.indexOf(":") >= 0) {
            domain = domain.split("\\:", -1)[0];
        }
        String sep = TProperties.getValue("outfilesplit");
        // userId|firstLevelDomain|fullUrl
        outValue.set(fields[1] + sep + TMatcher.getDomain(domain) + sep + url);
        context.write(NullWritable.get(), outValue);
    }
}
----Driver类:
package hadoop_user1;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class DxFileClearDriver {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
//设置map内存使用
System.setProperty("HADOOP_USER_NAME", "hyxy");
conf.set("mapreduce.map.memory.mb", "2048");
Job job = Job.getInstance(conf,"Dx_FileClear");
job.setJarByClass(DxFileClearDriver.class);
//map类
job.setMapperClass(ClearMapper.class);
//map输出k-v数据类型
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
//不使用reduce
job.setNumReduceTasks(0);
//结果数据输出类型
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.setInputFormatClass(TextInputFormat.class);
//将输入文件切片最小64M,最大128M
TextInputFormat.setMinInputSplitSize(job,1024*1024*64L); // 设置最小分片大小
TextInputFormat.setMaxInputSplitSize(job,1024*1024*128L); // 设置最大分片大小
//输入路径
FileInputFormat.addInputPath(job, new Path("file:///H:/000002_0"));
//输出路径
FileOutputFormat.setOutputPath(job, new Path("file:///H:/useranaly/01.DxFileClear"));
//提交作业 判断退出条件(0正常退出,1非正常退出)
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
二、行为匹配:根据清洗后的数据与地址库做Join操作,形成有效的行为数据
产生结果集:{行为ID,用户ID,是否产品,URL,预购类型}
----Mapper类:
package hadoop_user2;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import hadoop_user1.TProperties;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Stage 02 (behavior match) mapper: a map-side join of cleaned records
 * ("userId|domain|url") against the address library distributed via the
 * job cache. For each record, rules for the record's first-level domain are
 * tried in priority order (see StringComparator) and the first URL-substring
 * match is emitted as "behaviorId|userId|isProduct|url|type".
 */
public class MapJoinMapper extends Mapper<LongWritable, Text, NullWritable, Text> {
private Text ovalue = new Text();
// Map<first-level domain, TreeMap<rule string "url,level,behaviorId,isProduct,type", "">>;
// the TreeMap's comparator orders rules so higher match levels come first.
private Map<String, TreeMap<String, String>> joinData = new HashMap<String, TreeMap<String, String>>();
/**
 * Runs once before map(): loads the address library from the distributed
 * cache into joinData.
 * */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
// Only one file is cached for this job, so the first cache entry is the
// address library; read it line by line with a BufferedReader.
// NOTE(review): FileReader uses the platform default charset — confirm
// the address library is encoded accordingly.
Path file = new Path(context.getCacheFiles()[0].getPath());
BufferedReader reader = new BufferedReader(new FileReader(file.toString()));
String str = "";
try {
// Read the library line by line.
while ((str = reader.readLine()) != null) {
// Split the library row on the configured separator.
String[] splits = str.split(TProperties.getValue("fileoutsplit"));
// splits[1] is the first-level domain key.
if (joinData.containsKey(splits[1])) {
// Append a rule: matchUrl,matchLevel,behaviorId,isProduct,purchaseType
joinData.get(splits[1]).put(splits[2] + "," + splits[splits.length-3] + "," + splits[0] +","+ splits[splits.length-2]+ "," + splits[splits.length-1], "");
} else {
// First rule for this domain: create the priority-ordered TreeMap
// (StringComparator sorts by match level, highest first).
TreeMap<String, String> treemap = new TreeMap<String, String>( new StringComparator());
// matchUrl,matchLevel,behaviorId,isProduct,purchaseType
treemap.put(splits[2] + "," + splits[splits.length-3]+ "," + splits[0] +","+ splits[splits.length-2] + "," + splits[splits.length-1], "");
// Keyed by first-level domain.
joinData.put(splits[1], treemap);
}
}
} catch (Exception e) {
e.printStackTrace();
} finally {
reader.close();
}
}
/**
 * Matches each record against the cached rules; map-only (no reduce) for
 * throughput. Emits at most one line per input record.
 * */
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// values: [userId, firstLevelDomain, url] from the stage 01 output.
String[] values = value.toString().split(TProperties.getValue("fileoutsplit"));
// Only domains present in the address library can match (e.g. "youku.com").
if (joinData.containsKey(values[1])) {
// All rules registered for this first-level domain.
TreeMap<String, String> treeMap1 = joinData.get(values[1]);
Iterator iterator = treeMap1.keySet().iterator();// iterate rules in priority order
// Try rules from highest to lowest match level.
while (iterator.hasNext()) {
String[] krule = ((String) iterator.next()).split(",");
// e.g. values[2] = http://push.m.youku.com/feedback/recv/BT_713852830_436662@436662?pid
// substring match of the rule URL (second/third-level domain) in the record URL
if (values[2].indexOf(krule[0]) >= 0) {
StringBuffer sb = new StringBuffer();
// behaviorId|userId|isProduct|url|purchaseType
sb.append(krule[2]).append(TProperties.getValue("outfilesplit"))
.append(values[0]).append(TProperties.getValue("outfilesplit"))
.append(krule[3]).append(TProperties.getValue("outfilesplit"))
.append(values[2]).append(TProperties.getValue("outfilesplit"))
.append(krule[4]);
ovalue.set(sb.toString());
// Emit the first (highest-priority) match and stop.
context.write(NullWritable.get(), ovalue);
return ;
}
}
}
}
}
----匹配级别比较器类(按地址匹配级别排序,非ip比较):
package hadoop_user2;
import java.util.Comparator;
/**
 * Orders address-library rule strings by their match level (the second
 * comma-separated field), highest level first, so three-level domain rules
 * are tried before two-level and one-level rules.
 *
 * Fix: the previous implementation returned -1 whenever the levels were
 * equal — never 0, and compare(a,b) and compare(b,a) could both be negative.
 * That violates the Comparator contract and corrupts TreeMap lookups and
 * removals. Equal levels now fall back to full-string ordering, which still
 * keeps distinct rules with the same level as separate keys while returning
 * 0 only for identical rule strings.
 */
public class StringComparator implements Comparator<String> {
    @Override
    public int compare(String k1, String k2) {
        int level1 = Integer.parseInt(k1.split(",")[1]);
        int level2 = Integer.parseInt(k2.split(",")[1]);
        if (level2 != level1) {
            // Descending by match level (3 before 2 before 1).
            return level2 - level1;
        }
        // Deterministic tie-break; 0 only for identical keys.
        return k1.compareTo(k2);
    }
}
----Driver类:
package hadoop_user2;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class DxFileMatchDriver {
    /**
     * Stage 02 driver: map-side join of the cleaned records against the
     * address library, keeping the highest-priority match per record.
     *
     * @param args
     *   args[0]: address-library data file location
     *   args[1]: input path (stage 01 cleaning output)
     *   args[2]: result output path
     * (Paths are currently hard-coded below.)
     */
    public static void main(String[] args) {
        try {
            Configuration configuration = new Configuration();
            // Map task memory in MB (default 1024).
            configuration.set("mapreduce.map.memory.mb", "5120");
            // Disable the task timeout (default 600000 ms): the cluster is
            // unstable and tasks were timing out; the code has no unbounded
            // loops, so this is safe.
            configuration.set("mapreduce.task.timeout", "0");
            // Small cluster: do not switch to a replacement datanode when a
            // write fails.
            configuration.set("dfs.client.block.write.replace-datanode-on-failure.enable", "true");
            configuration.set("dfs.client.block.write.replace-datanode-on-failure.policy", "NEVER");
            Job matchJob = Job.getInstance(configuration, "Dx_FileMatch");
            // Required so Hadoop can locate the job jar when run packaged.
            matchJob.setJarByClass(DxFileMatchDriver.class);
            // Minimum input split size: 128MB (default 0).
            TextInputFormat.setMinInputSplitSize(matchJob, 1024 * 1024 * 128L);
            // Distribute the (small) address-library file to every mapper.
            matchJob.addCacheFile(new Path("file:///H:/t_dx_basic_msg_addr.txt").toUri());
            // Mapper and its output types.
            matchJob.setMapperClass(MapJoinMapper.class);
            matchJob.setMapOutputKeyClass(NullWritable.class);
            matchJob.setMapOutputValueClass(Text.class);
            // Map-only job.
            matchJob.setNumReduceTasks(0);
            // Input: stage 01 cleaning output; output: stage 02 directory.
            FileInputFormat.addInputPath(matchJob, new Path("file:///H:/useranaly/01.DxFileClear/part-m-00*"));
            FileOutputFormat.setOutputPath(matchJob, new Path("file:///H:/useranaly/02.DxFileMatch"));
            // Exit 0 on success, 1 on failure.
            System.exit(matchJob.waitForCompletion(true) ? 0 : 1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
三、个人行为数据统计分析:计算PV值
产生结果集:{行为ID,用户ID,PV值(访问量)}
----Mapper类:
package hadoop_user3;
import hadoop_user1.TProperties;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
* 个人行为数据统计:识别用户高频访问地址(pv数)
*/
/**
 * Stage 03 mapper: emits (firstLevelBehaviorId|userId, 1) so the reducer can
 * count page views (PV) per user per first-level behavior.
 */
public class MapPvNumMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final Text outKey = new Text();
    private final IntWritable one = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] columns = value.toString().split(TProperties.getValue("fileoutsplit"));
        // Generalize the behavior ID to its first-level form: keep the 6-char
        // prefix and zero out the remaining 12 characters.
        String firstLevelId = columns[0].substring(0, 6) + "000000000000";
        // key: firstLevelBehaviorId|userId
        outKey.set(firstLevelId + TProperties.getValue("outfilesplit") + columns[1]);
        context.write(outKey, one);
    }
}
----Reducer类:
package hadoop_user3;
import hadoop_user1.TProperties;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Stage 03 reducer: sums the per-record counts and writes one text line
 * "firstLevelBehaviorId|userId|pv" with a NullWritable value.
 */
public class PvNumReduce extends Reducer<Text, IntWritable, Text, NullWritable> {
    private final Text outLine = new Text();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int pv = 0;
        for (IntWritable count : values) {
            pv += count.get();
        }
        outLine.set(key + TProperties.getValue("outfilesplit") + pv);
        context.write(outLine, NullWritable.get());
    }
}
----Driver类:
package hadoop_user3;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class DxPathStatisticDriver {
    /**
     * Stage 03 driver: per-user PV statistics over the stage 02 match output.
     *
     * Fix: the output directory was "031.DxFilePathStatistic/pv", but stage
     * 04 (DriverDxPathStatisticAll) reads "03.DxFilePathStatistic/pv/part-r-*";
     * use the consistent "03." prefix so the pipeline stages connect.
     */
    public static void main(String[] args) {
        try {
            Configuration conf = new Configuration();
            // Map task memory in MB (default 1024).
            conf.set("mapreduce.map.memory.mb", "5120");
            // Disable the task timeout (default 600000 ms): unstable cluster;
            // the code has no unbounded loops.
            conf.set("mapreduce.task.timeout", "0");
            // Small cluster: do not replace datanodes on write failure.
            conf.set("dfs.client.block.write.replace-datanode-on-failure.enable", "true");
            conf.set("dfs.client.block.write.replace-datanode-on-failure.policy", "NEVER");
            Job job = Job.getInstance(conf, "Dx_File_pv_staticstic");
            // Required so Hadoop can locate the job jar when run packaged.
            job.setJarByClass(DxPathStatisticDriver.class);
            // Minimum input split size: 128MB (default 0).
            TextInputFormat.setMinInputSplitSize(job, 1024 * 1024 * 128L);
            // Mapper/reducer and their output types.
            job.setMapperClass(MapPvNumMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);
            job.setReducerClass(PvNumReduce.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(NullWritable.class);
            // job.setNumReduceTasks(3);
            // Input: stage 02 (behavior match) output.
            FileInputFormat.addInputPath(job, new Path("file:///H:/useranaly/02.DxFileMatch/part-m-00*"));
            // Output: stage 03 result (read by stage 04).
            FileOutputFormat.setOutputPath(job, new Path("file:///H:/useranaly/03.DxFilePathStatistic/pv"));
            // Exit 0 on success, 1 on failure.
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
四、整体行为数据统计分析:计算行为数据的UV和PV
产生结果集:{行为ID,UV值(用户数),PV值(总访问量)}
----Mapper类:
package hadoop_user4;
import hadoop_user1.TProperties;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
* 整体行为数据统计:统计热门地址,用户访问最多的地址以及访问人数
* 计算行为id的 uv和pv
*/
/**
 * Stage 04 mapper: re-keys the per-user PV records by behavior ID so the
 * reducer can aggregate UV (distinct users) and total PV per behavior.
 */
public class MapperDxPathStatisticAll extends Mapper<LongWritable, Text, Text, Text> {
    private final Text behaviorId = new Text();
    private final Text userAndPv = new Text();

    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] columns = value.toString().split(TProperties.getValue("fileoutsplit"));
        // key: behavior ID
        behaviorId.set(columns[0]);
        // value: userId|pv
        userAndPv.set(columns[1] + TProperties.getValue("outfilesplit") + columns[2]);
        context.write(behaviorId, userAndPv);
    }
}
----Reducer类:
package hadoop_user4;
import hadoop_user1.TProperties;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
/**
 * Stage 04 reducer: for one behavior ID, counts distinct users (UV) and sums
 * page views (PV); writes "behaviorId|uv|pv".
 */
public class ReducerDxPathStatisticAll extends Reducer<Text, Text, NullWritable, Text> {
    private final Text outLine = new Text();

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // Total page views across all users of this behavior.
        int totalPv = 0;
        // Distinct user IDs seen for this behavior (map used as a set).
        Map<String, Integer> users = new HashMap<String, Integer>();
        for (Text record : values) {
            String[] cols = record.toString().split(TProperties.getValue("fileoutsplit"));
            users.put(cols[0], 1);
            totalPv += Integer.parseInt(cols[1]);
        }
        String sep = TProperties.getValue("outfilesplit");
        // behaviorId|UV(user count)|PV(visit count)
        outLine.set(key.toString() + sep + users.size() + sep + totalPv);
        context.write(NullWritable.get(), outLine);
    }
}
----Driver类:
package hadoop_user4;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class DriverDxPathStatisticAll {
/**
* args[0]:输入路径(个人地址统计结果数据)
* args[1]:结果输出路径
* Description:通过mapreduce程序统计一级域名地址访问人数及访问次数
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
//内存设置
conf.set("mapreduce.map.memory.mb", "3072");
conf.set("mapreduce.reduce.memory.mb", "2048");
Job job = Job.getInstance(conf, "Dx_PathStatisticAll");
job.setJarByClass(DriverDxPathStatisticAll.class);
//map与reduce类设置
job.setMapperClass(MapperDxPathStatisticAll.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
// job.setCombinerClass(JoinReducer.class);
job.setReducerClass(ReducerDxPathStatisticAll.class);
// job.setNumReduceTasks(0);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
//输入输出文件路径
FileInputFormat.addInputPath(job, new Path("file:///H:/useranaly/03.DxFilePathStatistic/pv/part-r-*"));
FileOutputFormat.setOutputPath(job, new Path("file:///H:/useranaly/04.DxPathStatisticAll"));
//提交作业 判断退出条件(0正常退出,1非正常退出)
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
五、产品匹配:通过行为匹配数据和产品库进行Join操作
产生结果集:{用户ID,产品类型,产品ID,产品名称,品牌,价格,型号,车系,手\自,行为ID}
----Mapper类:
package hadoop_user5;
import hadoop_user1.TProperties;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
* 产品匹配:关联产品地址库,标识用户行为匹配中的产品地址
*/
/**
 * Stage 05 mapper: tags two kinds of input lines by behavior ID for a
 * reduce-side join.
 *  - 5-field lines (stage 02 match output: behaviorId|userId|isProduct|url|type)
 *    that are product hits (isProduct == 1) -> emits (behaviorId, "userId|").
 *  - 9-field lines (product address library) -> emits (behaviorId, productInfo).
 * The reducer distinguishes the two by the number of "|"-separated segments.
 */
public class MapperDxProMatch extends Mapper<LongWritable,Text,Text,Text>{
Text ok = new Text();
Text ov = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] vs = value.toString().split(TProperties.getValue("fileoutsplit"));
if(vs.length==5 && Integer.parseInt(vs[2].toString())==1){//stage 02 record that is a product hit
String ku = vs[0]/*+","+"u"*/;//behavior ID
String vu = vs[1]+"|";//user ID; the trailing "|" marks this as a user record for the reducer
ok.set(ku);
ov.set(vu);
context.write(ok,ov);
}else if(vs.length==9){
String kp = vs[0]/*+","+"p"*/;//behavior ID
String vp = value.toString().substring(19);//everything after the behavior ID: assumes a fixed-width 18-char ID plus 1 separator char (cf. nomatch=000000000000000000 in conf.properties) — TODO confirm
ok.set(kp);
ov.set(vp);
context.write(ok,ov);
}
}
}
----Reducer类:
package hadoop_user5;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Stage 05 reducer: cross-joins user records with product records sharing a
 * behavior ID, writing "userId|productInfo...|behaviorId" lines.
 */
public class ReducerDxProMatch extends Reducer<Text, Text, Text, NullWritable> {
    private final Text outLine = new Text();

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        List<String> users = new ArrayList<String>();
        List<String> products = new ArrayList<String>();
        // A user record is "userId|" (a single segment after splitting on
        // "|"); a product record contains several "|"-separated fields.
        for (Text record : values) {
            String text = record.toString();
            if (text.split("\\|").length == 1) {
                users.add(text);
            } else {
                products.add(text);
            }
        }
        // Emit the cartesian product: userId|productInfo...|behaviorId
        // (the user entry already ends with "|").
        for (String user : users) {
            for (String product : products) {
                outLine.set(user + product + "|" + key);
                context.write(outLine, NullWritable.get());
            }
        }
    }
}
----Driver类:
package hadoop_user5;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class DriverDxProMatch {
    /**
     * Stage 05 driver: reduce-side join of the stage 02 match output with
     * the product address library.
     *
     * Fix: job.setJarByClass was missing; without it Hadoop cannot locate
     * the job jar when the driver runs on a cluster (every other driver in
     * this pipeline sets it).
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "DxProMatch");
        // Required so Hadoop can locate the job jar when run packaged.
        job.setJarByClass(DriverDxProMatch.class);
        /*job.setGroupingComparatorClass(ComparatorDPM.class);
        job.setPartitionerClass(DPMPartitioner.class);*/
        // Two inputs: stage 02 output and the product address library.
        Path input1 = new Path("file:///H:/useranaly/02.DxFileMatch/part-m-00*");
        Path input2 = new Path("file:///H:/t_dx_product_msg_addr.txt");
        Path[] inputs = new Path[]{input1, input2};
        job.setMapperClass(MapperDxProMatch.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(ReducerDxProMatch.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, inputs);
        FileOutputFormat.setOutputPath(job, new Path("file:///H:/useranaly/05.DxProMatch"));
        // Exit 0 on success, 1 on failure.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
六、用户产品统计:统计用户访问的产品信息(PV)
产生结果集:{产品ID,用户ID,PV(访问量),产品类型}
----Mapper类:
package hadoop_user6;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
* 用户产品统计:统计用户访问的产品信息(PV)
{产品ID,用户ID,PV(访问量),产品类型}
*/
public class MapperDxProStatistic extends Mapper<LongWritable,Text,Text,IntWritable>{
    // Reused output key; the value is the constant 1 for per-record counting.
    private Text k2 = new Text();
    private IntWritable v2 = new IntWritable(1);

    /**
     * Re-keys each matched record as "productId,userId,productType" and emits
     * a count of 1 so the reducer can sum per-user PV for each product.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().split("\\|");
        StringBuilder composite = new StringBuilder();
        composite.append(fields[2]).append(',')   // product id
                 .append(fields[0]).append(',')   // user id
                 .append(fields[1]);              // product type
        k2.set(composite.toString());
        context.write(k2, v2);  // ({productId,userId,productType}, 1)
    }
}
----Reducer类:
package hadoop_user6;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class ReducerDxProStatistic extends Reducer<Text,IntWritable,Text,NullWritable>{
    private Text k3 = new Text();

    /**
     * Sums the 1-counts for each (productId, userId, productType) key and
     * emits one line per key: "productId|userId|pv|productType".
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int total = 0;
        for (IntWritable count : values) {
            total += count.get();
        }
        String[] parts = key.toString().split(",");
        StringBuilder line = new StringBuilder();
        line.append(parts[0]).append('|')
            .append(parts[1]).append('|')
            .append(total).append('|')
            .append(parts[2]);
        k3.set(line.toString());
        context.write(k3, NullWritable.get());
    }
}
----Driver类:
package hadoop_user6;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class DxProStatisticDriver {
    /**
     * Job 6 driver: per-user product PV statistics.
     * Output records: {productId|userId|pv|productType}.
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // NOTE(review): this glob reads "05PG.DxProMatch", but the visible
        // step-5 driver writes to "05.DxProMatch" — confirm which directory
        // the product-match output actually lives in.
        final String inputGlob = "file:///H:/useranaly/05PG.DxProMatch/part-r-00*";
        final String outputDir = "file:///H:/useranaly/06.DxProStatistic";
        Configuration conf = new Configuration();
        // Task container memory (MB).
        conf.set("mapreduce.map.memory.mb", "3072");
        conf.set("mapreduce.reduce.memory.mb", "2048");
        Job job = Job.getInstance(conf, "Dx_ProStatistic");
        job.setJarByClass(DxProStatisticDriver.class);
        job.setMapperClass(MapperDxProStatistic.class);
        job.setReducerClass(ReducerDxProStatistic.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.addInputPath(job, new Path(inputGlob));
        FileOutputFormat.setOutputPath(job, new Path(outputDir));
        // Exit 0 on success, 1 on failure.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
七、整体的产品统计:统计产品对应的用户访问量(UV),行为数据访问总量(PV)
产生结果集:{产品ID,UV数(用户数),PV数(总访问量),产品类型}
----Mapper类:
package hadoop_user7;
import hadoop_user1.TProperties;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
* 整体的产品统计:统计产品对应的用户访问量(UV),行为数据访问总量(PV)
* 产生结果集:{产品ID,UV数(用户数),PV数(总访问量),产品类型}
*/
public class MapperDxProStatisticAll extends Mapper<LongWritable,Text,Text,Text>{
    private Text k2 = new Text();
    private Text v2 = new Text();

    /**
     * Re-keys the per-user product statistics for product-level aggregation.
     * Input:  {productId|userId|pv|productType}
     * Output: key "productId|productType", value "userId|pv".
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String inSep = TProperties.getValue("fileoutsplit");
        String outSep = TProperties.getValue("outfilesplit");
        String[] fields = value.toString().split(inSep);
        k2.set(fields[0] + outSep + fields[3]);   // product id | product type
        v2.set(fields[1] + outSep + fields[2]);   // user id | visit count
        context.write(k2, v2);
    }
}
----Reducer类:
package hadoop_user7;
import hadoop_user1.TProperties;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
public class ReducerDxProStatisticAll extends Reducer<Text,Text,Text,NullWritable>{
    private Text result = new Text();

    /**
     * Aggregates product-level statistics for one "productId|productType" key:
     * UV = number of distinct users, PV = total visit count.
     * Output: "productId|uv|pv|productType".
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        String inSep = TProperties.getValue("fileoutsplit");
        String outSep = TProperties.getValue("outfilesplit");
        int totalPv = 0;
        // Distinct-user tracker: each user id maps to a dummy 1.
        Map<String, Integer> users = new HashMap<String, Integer>();
        for (Text val : values) {
            String[] fields = val.toString().split(inSep);  // userId | visit count
            users.put(fields[0], 1);
            totalPv += Integer.parseInt(fields[1]);
        }
        String[] keyFields = key.toString().split(inSep);   // productId | productType
        result.set(keyFields[0] + outSep
                + users.size() + outSep
                + totalPv + outSep
                + keyFields[1]);
        context.write(result, NullWritable.get());
    }
}
----Driver类:
package hadoop_user7;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class DxProStatisticAllDriver {
    /**
     * Job 7 driver: overall product statistics (UV and PV per product).
     * Reads job-6 output; writes {productId|uv|pv|productType}.
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        final String inputGlob = "file:///H:/useranaly/06.DxProStatistic/part-r-00*";
        final String outputDir = "file:///H:/useranaly/07.DxProStatisticAll";
        Configuration conf = new Configuration();
        // Task container memory (MB).
        conf.set("mapreduce.map.memory.mb", "3072");
        conf.set("mapreduce.reduce.memory.mb", "2048");
        Job job = Job.getInstance(conf, "Dx_ProStatisticAll");
        job.setJarByClass(DxProStatisticAllDriver.class);
        job.setMapperClass(MapperDxProStatisticAll.class);
        job.setReducerClass(ReducerDxProStatisticAll.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.addInputPath(job, new Path(inputGlob));
        FileOutputFormat.setOutputPath(job, new Path(outputDir));
        // Exit 0 on success, 1 on failure.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
八、用户行为排序TopN:获取用户的高频地址,默认按用户ID分组,以PV进行排序
产生结果集:{行为ID,用户ID,PV(访问次数),排序字段}
----Mapper类:
package hadoop_user8;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
* 8.用户行为排序TopN:获取用户的高频地址,默认按用户ID分组,以PV进行排序
* 产生结果集:{行为ID,用户ID,PV(访问次数),排序字段}
*/
public class MapperDxTopN extends Mapper<LongWritable,Text,Text,Text>{
    private Text k2 = new Text();
    private Text v2 = new Text();

    /**
     * Re-keys PV statistics by user for the per-user TopN reducer.
     * Input:  {behaviorId|userId|pv}
     * Output: key userId, value "behaviorId,pv".
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().split("\\|");
        String behaviorId = fields[0];
        String userId = fields[1];
        String pv = fields[2];
        k2.set(userId);
        v2.set(behaviorId + "," + pv);
        context.write(k2, v2);
    }
}
----Reducer类:
package hadoop_user8;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
public class ReducerDxTopN extends Reducer<Text,Text,Text,NullWritable>{
    private Text k3 = new Text();

    /**
     * For one user, sorts their (behaviorId, pv) records by PV descending and
     * emits the top 5 (or all, if fewer) as "behaviorId|userId|pv|rank",
     * where rank is 1-based.
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        List<String> records = new ArrayList<String>();  // each entry: "behaviorId,pv"
        for (Text v : values) {
            records.add(v.toString());
        }
        // Sort by PV descending. FIX: the original compared by subtraction
        // (parseInt(o2) - parseInt(o1)), which can overflow for large values;
        // Integer.compare is overflow-safe.
        records.sort(new Comparator<String>() {
            @Override
            public int compare(String o1, String o2) {
                return Integer.compare(
                        Integer.parseInt(o2.split(",")[1]),
                        Integer.parseInt(o1.split(",")[1]));
            }
        });
        // Emit at most the top 5, ranked 1..5.
        int limit = Math.min(records.size(), 5);
        for (int i = 0; i < limit; i++) {
            String[] parts = records.get(i).split(",");
            k3.set(parts[0] + "|"      // behavior id
                    + key + "|"        // user id
                    + parts[1] + "|"   // pv
                    + (i + 1));        // rank
            context.write(k3, NullWritable.get());
        }
    }
}
----Driver类:
package hadoop_user8;
import hadoop_user7.DxProStatisticAllDriver;
import hadoop_user7.MapperDxProStatisticAll;
import hadoop_user7.ReducerDxProStatisticAll;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class DxTopNDriver {
    /**
     * Job 8 driver: per-user TopN behavior ranking by PV.
     * Reads job-3 PV statistics; writes {behaviorId|userId|pv|rank}.
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        final String inputGlob = "file:///H:/useranaly/03.DxFilePathStatistic/pv/part-r-00*";
        final String outputDir = "file:///H:/useranaly/082.DxTopN";
        Configuration conf = new Configuration();
        // Task container memory (MB).
        conf.set("mapreduce.map.memory.mb", "3072");
        conf.set("mapreduce.reduce.memory.mb", "2048");
        Job job = Job.getInstance(conf,"Dx_TopNDriver");
        job.setJarByClass(DxTopNDriver.class);
        job.setMapperClass(MapperDxTopN.class);
        job.setReducerClass(ReducerDxTopN.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.addInputPath(job, new Path(inputGlob));
        FileOutputFormat.setOutputPath(job, new Path(outputDir));
        // Exit 0 on success, 1 on failure.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
九、用户画像:通过对行为数据的分类,将用户行为数据进行画像描述:通过六个维度
产生结果集:{用户ID, 分类ID, 用户分类pv, 分类总体pv, Z标准值}
----Mapper类:
package hadoop_user9;
import hadoop_user1.TProperties;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
/**
* 9.用户画像:通过对行为数据的分类,将用户行为数据进行画像描述:
* 通过六个维度:00001:企业中心
* 00002:综合服务
* 00003:观影爱好
* 00004:休闲娱乐
* 00005:生活教育
* 00006:驳杂八卦
* 产生结果集:{用户ID,分类ID,用户分类pv,分类总体pv,Z标准值}
*/
public class MapperDxPersona extends Mapper<LongWritable,Text,Text,Text>{
    private Text okey = new Text();
    private Text ovalue = new Text();
    // Behavior-id -> classification-id lookup, loaded from the cache file in setup().
    private Map<String, String> typeData = new HashMap<String,String>();
    // Per-mapper accumulator (map-side combine).
    // key: "classId,userId"  value: summed PV for that pair.
    private Map<String, Integer> userType = new HashMap<String,Integer>();
    private String typeKey ;

    /**
     * Loads the behavior-id / classification-id link table from the first
     * distributed-cache file before any map() call runs.
     */
    @Override
    protected void setup( Mapper<LongWritable, Text, Text, Text>.Context context) throws IOException, InterruptedException {
        // Only one file is cached, so take the first entry.
        Path file = new Path(context.getCacheFiles()[0].getPath());
        // FIX: try-with-resources closes the reader even if construction of the
        // BufferedReader or a readLine() call throws.
        try (BufferedReader reader = new BufferedReader(new FileReader(file.toString()))) {
            String str;
            while ((str = reader.readLine()) != null) {
                String[] splits = str.split(TProperties.getValue("fileoutsplit"));
                // splits[1] = behavior id (key), splits[0] = classification id (value)
                typeData.put(splits[1], splits[0]);
            }
        } catch (Exception e) {
            // Preserve the original best-effort behavior: log and continue
            // with whatever entries were loaded.
            e.printStackTrace();
        }
    }

    /**
     * Accumulates per-(classId, userId) PV counts in memory.
     * Input records: {behaviorId|userId|pv}. Emits nothing here; cleanup()
     * flushes the accumulated totals once the mapper is done.
     */
    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] values = value.toString().split(TProperties.getValue("fileoutsplit"));
        // Only behaviors with a known classification contribute.
        if(typeData.containsKey(values[0])){
            typeKey = typeData.get(values[0]) +"," + values[1];
            int pv = Integer.parseInt(values[2]);
            Integer current = userType.get(typeKey);
            userType.put(typeKey, current == null ? pv : current + pv);
        }
    }

    /**
     * Runs once after all map() calls; emits one record per accumulated pair:
     * key = classification id, value = "userId|summedPv".
     */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        for (Map.Entry<String, Integer> entry : userType.entrySet()) {
            // entry key layout: "classId,userId"
            String[] parts = entry.getKey().split(",", -1);
            okey.set(parts[0]);
            ovalue.set(parts[1] + TProperties.getValue("outfilesplit") + entry.getValue());
            context.write(okey, ovalue);
        }
    }
}
----Reducer类:
package hadoop_user9;
import hadoop_user1.TProperties;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
/**
* 5063936616|00001|12|25007|4.819144080748042
* 513049960245|00001|126|25007|5.27731240951967
* 533038351015|00001|21|25007|4.855315264598433
*/
public class ReducerDxPersona extends Reducer<Text,Text,NullWritable,Text>{
    private Text result = new Text();

    /**
     * For one classification id, computes each user's z-score of PV relative
     * to all users in that classification, then shifts it by +5 and clamps it
     * into [0, 10] for radar-chart plotting.
     * Input values: "userId|pv". Output: "userId|classId|userPv|classPv|score".
     */
    @Override
    public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        int pv = 0;   // total PV across the whole classification
        int n = 0;    // number of distinct users
        // user id -> summed PV for this classification
        Map<String, Integer> map = new HashMap<String, Integer>();
        for (Text val : values) {
            String[] str = val.toString().split(TProperties.getValue("fileoutsplit"));
            int userPv = Integer.parseInt(str[1]);
            pv += userPv;
            Integer prev = map.get(str[0]);
            if (prev != null) {
                map.put(str[0], prev + userPv);
            } else {
                map.put(str[0], userPv);
                n++;  // first time we see this user
            }
        }
        // z-score = (x - mean) / stddev
        // FIX: "pv / n" was integer division, truncating the mean before the
        // cast to double and skewing every z-score.
        double avg = (double) pv / n;
        // Sum of squared deviations.
        double math2 = 0;
        for (int userPv : map.values()) {
            math2 += Math.pow(userPv - avg, 2);
        }
        // Population standard deviation.
        double fc = Math.sqrt(math2 / n);
        for (Map.Entry<String, Integer> entry : map.entrySet()) {
            // FIX: when every user has the same PV, fc == 0 and the original
            // produced NaN; treat the deviation as 0 instead.
            double b = (fc == 0) ? 0 : (entry.getValue() - avg) / fc;
            // Shift so the radar chart is centered at 5, then clamp to [0, 10].
            b = b + 5;
            if (b > 10) {
                b = 10;
            }
            if (b < 0) {
                b = 0;
            }
            result.set(entry.getKey() + TProperties.getValue("outfilesplit")
                    + key.toString() + TProperties.getValue("outfilesplit")
                    + entry.getValue() + TProperties.getValue("outfilesplit")
                    + pv + TProperties.getValue("outfilesplit")
                    + b);
            context.write(NullWritable.get(), result);
        }
    }
}
----Driver类:
package hadoop_user9;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class DriverDxPersona {
    /**
     * Job 9 driver: user persona (z-score per classification dimension).
     * Ships the behavior->classification link table via the distributed cache.
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("mapreduce.map.memory.mb", "3072");
        conf.set("mapreduce.reduce.memory.mb", "2048");
        // Disable the task timeout for long-running map tasks.
        conf.set("mapred.task.timeout", "0");
        conf.set("dfs.client.block.write.replace-datanode-on-failure.policy","NEVER");
        conf.set("dfs.client.block.write.replace-datanode-on-failure.enable","true");
        /*conf.set("mapred.textoutputformat.ignoreseparator","true");
        conf.set("mapred.textoutputformat.separator","|");*/
        // FIX: delete the SAME directory the job writes to. The original
        // deleted "09.DxPersona" while writing to "092.DxPersona", so the
        // pre-run cleanup never cleared the real output dir and a re-run
        // failed with "output directory already exists".
        Path outputPath = new Path("file:///H:/useranaly/092.DxPersona");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        Job job = Job.getInstance(conf,"DxPersonal_huaciang");
        // Raise the minimum split size to 128 MB to limit the mapper count.
        TextInputFormat.setMinInputSplitSize(job,1024*1024*128L);
        job.setJarByClass(DriverDxPersona.class);
        job.setMapperClass(MapperDxPersona.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(ReducerDxPersona.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        // Link table consumed by MapperDxPersona.setup().
        job.addCacheFile(new Path("file:///H:/t_dx_basic_classify_link.txt").toUri());
        FileInputFormat.addInputPath(job,new Path("file:///H:/useranaly/03.DxFilePathStatistic/pv/part-r-00*"));
        FileOutputFormat.setOutputPath(job, outputPath);
        // Exit 0 on success, 1 on failure.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
十、预购用户群提取:获取预购用户
产生结果集:{用户ID|预购类型1:PV(访问次数),预购类型2:PV(访问次数)}
----Mapper:
package hadoop_user10;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
* 10.预购用户群提取:获取预购用户
* 产生结果集:{用户ID|预购类型1:PV(访问次数),预购类型2:PV(访问次数)}
*/
public class MapperDxSaleUser extends Mapper<LongWritable,Text,Text,Text>{
    private Text k2 = new Text();
    private Text v2 = new Text();

    /**
     * Filters step-2 match records down to pre-purchase behavior.
     * Input: {behaviorId|userId|isProduct|URL|purchaseType},
     * purchaseType: 0 = other, 1 = car, 2 = house.
     * Emits (userId, purchaseType) for every non-"other" record.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().split("\\|");
        String purchaseType = fields[4];
        if ("0".equals(purchaseType)) {
            return;  // not a pre-purchase record — drop it
        }
        k2.set(fields[1]);     // user id
        v2.set(purchaseType);
        context.write(k2, v2);
    }
}
----Reducer类:
package hadoop_user10;
import hadoop_user1.TProperties;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
public class ReducerDxSaleUser extends Reducer<Text,Text,NullWritable,Text>{
    private Text v3 = new Text();

    /**
     * Counts, per user, how many times each pre-purchase type was visited.
     * Output: "userId|type1:count,type2:count".
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // purchase type -> visit count
        Map<String,Integer> counts = new HashMap<>();
        for (Text val : values) {
            String type = val.toString();
            Integer previous = counts.get(type);
            counts.put(type, previous == null ? 1 : previous + 1);
        }
        // Join the pairs as "type:count" separated by commas.
        StringBuilder pairs = new StringBuilder();
        for (Map.Entry<String,Integer> entry : counts.entrySet()) {
            if (pairs.length() > 0) {
                pairs.append(',');
            }
            pairs.append(entry.getKey()).append(':').append(entry.getValue());
        }
        v3.set(key.toString() + TProperties.getValue("outfilesplit") + pairs);
        context.write(NullWritable.get(), v3);
    }
}
----Driver类:
package hadoop_user10;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class DriverDxSaleUser {
    /**
     * Job 10 driver: extracts pre-purchase users.
     * Output: "userId|type:pv,type:pv".
     */
    public static void main(String[] args) throws Exception{
        final String inputGlob = "file:///H:/useranaly/02.DxFileMatch/part-m-*";
        final String outputDir = "file:///H:/useranaly/10.DxSaleUser";
        Configuration conf = new Configuration();
        // Task container memory (MB).
        conf.set("mapreduce.map.memory.mb", "3072");
        conf.set("mapreduce.reduce.memory.mb", "2048");
        Job job = Job.getInstance(conf, "Dx_SaleUser");
        job.setJarByClass(DriverDxSaleUser.class);
        job.setMapperClass(MapperDxSaleUser.class);
        job.setReducerClass(ReducerDxSaleUser.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(inputGlob));
        FileOutputFormat.setOutputPath(job, new Path(outputDir));
        // Exit 0 on success, 1 on failure.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
十一、ALS协同过滤算法:
协同过滤ALS算法:
----------------------------------
ALS说明:
a.在spark MLlib库中,在【org.apache.spark.mllib.recommendation】推荐路径下
【ALS.scala】伴生对象类:
说明:用于调用交替最小二乘(ALS)矩阵分解的顶级方法。在ALS方法中,提供了以下函数:
1).训练方法:train()
说明:训练一个矩阵分解模型,给出用户对一个产品子集的评价的RDD。
评级矩阵(ratings)近似为给定秩(rank)的两个低秩矩阵的乘积
求解时,ALS 以可配置的迭代次数(iterations)和并行度级别并行运行。
2).train()方法返回结果:MatrixFactorizationModel(矩阵分解模型)
【org.apache.spark.mllib.recommendation.MatrixFactorizationModel】
说明:表示矩阵分解结果的模型;提供的方法如下:
1).预测方法:【def predict(user: Int, product: Int): Double】
说明:预测一个用户对一个产品的评价
predict方法示例:在迭代次数为10训练出的模型上,预测2号用户对13号产品的评分
2).推荐产品:【def recommendProducts(user: Int, num: Int): Array[Rating]】
说明:向指定用户推荐产品
3).推荐用户:【def recommendUsers(product: Int, num: Int): Array[Rating]】
说明:推荐用户使用产品。 也就是说,这会返回最有可能对产品感兴趣的用户
4).推荐顶级产品:【def recommendProductsForUsers(num: Int): RDD[(Int, Array[Rating])]】
说明:为所有用户推荐顶级产品。
5).推荐顶级用户:【def recommendUsersForProducts(num: Int): RDD[(Int, Array[Rating])]】
说明:推荐所有产品的顶级用户。
b.评级(Rating)说明:
在ALS中定义的样例类:case class Rating(user: Int,product: Int,rating: Double)
一个比Tuple3更紧凑的类[Int, Int, Double]来表示一个评级
1.编写思路:
step1.定义Rating,生成评分矩阵:Rating(user: Int,product: Int,rating: Double)
注意:类型转换
step2.通过ALS.train()方法进行训练,返回MatrixFactorizationModel(矩阵分解模型)
注意:train()方法的相关参数
step3.通过相对应的预测和推荐方法获取推荐数据
2.协同过滤功能:
要求:为用户喜好的产品进行推荐,推荐的评分最高的5个产品ID
输入数据:来自【用户产品统计数据】
{产品ID,用户ID,PV(访问量),产品类型}
car000466|513049888252|1|car
house000028|513041784009|1|house
house000044|4510174246 |2|house
house000044|513042437238|1|house
house000144|513040816140|11|house
house000144|523047653102|1|house
house000365|513041784009|1|house
结果数据分析如下:
用户ID 产品类型 推荐的评分最高的5个产品ID
513049094454|1|car000118,car000349,car000453,car000228,car000320
533035350388|1|car000287,car000336,car000248,car000319,car000321
5056715877 |1|car000248,car000334,car000336,car000321,car000317
513043175350|2|house000358,house000202,house000365
513049662857|1|car000079,car000439,car000011,car000334,car000202
513046140916|1|car000287,car000336,car000248,car000319,car000321
513046403383|1|car000317,car000082,car000349,car000264,car000228
----MyALS.scala
import java.util
import org.apache.spark.mllib.recommendation.{ALS, Rating}
import org.apache.spark.{SparkConf, SparkContext}
object MyALS {
  /**
   * ALS collaborative filtering over the per-user product statistics
   * (job-6 output: {productId|userId|pv|productType}).
   * For each (user, saleType) pair, writes the 5 highest-predicted products:
   * "userId|typeCode|prod1,prod2,..." (typeCode: 1 = car, 2 = house).
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("MYALS")
    val sc = new SparkContext(conf)

    // Input: per-user product statistics, e.g. "car000466|513049888252|1|car".
    val protj_rdd = sc.textFile("file:///H:/useranaly/06.DxProStatistic/part-r-00*")

    // ALS requires integer ids, so pair each string id with its hashCode.
    // Tuple layout: (userId|userHash, productId|productHash, pv|productType)
    val basicId = protj_rdd.map { line =>
      val f = line.split("\\|")
      (f(1) + "|" + f(1).hashCode, f(0) + "|" + f(0).hashCode, f(2) + "|" + f(3))
    }.cache()

    // Rating matrix: Rating(userHash, productHash, pv).
    val ratings = basicId.map { x =>
      Rating(x._1.split("\\|")(1).toInt, x._2.split("\\|")(1).toInt, x._3.split("\\|")(0).toDouble)
    }

    // Train the matrix-factorization model.
    val rank = 5            // latent factors (more = better fit, more memory)
    val numIterations = 10  // ALS iterations (default 5; ~10 recommended)
    val model = ALS.train(ratings, rank, numIterations, 0.01) // 0.01 = regularization

    // Predict a score for every (user, product) hash pair.
    val user_rdd = basicId.map(_._1.split("\\|")(1).toInt).distinct()
    val pro_rdd = basicId.map(_._2.split("\\|")(1).toInt).distinct()
    val userProScore = user_rdd.cartesian(pro_rdd)
    // ("userHash|productHash", predictedRating)
    val userProPair = model.predict(userProScore).map(r => ("" + r.user + "|" + r.product, r.rating))

    // (productType, userId|userHash) and (productType, productId|productHash)
    val full_user_rdd = basicId.map(x => (x._3.split("\\|")(1), x._1)).distinct
    val full_pro_rdd = basicId.map(x => (x._3.split("\\|")(1), x._2)).distinct

    // Join users and products of the same sale type:
    // ("userHash|productHash", "userId|productId|typeCode")
    val full_user_pro = full_user_rdd.join(full_pro_rdd).map { case (saleType, (user, product)) =>
      // FIX: the original match had no default case and threw a MatchError
      // for any type other than "car"/"house"; fall back to 1 (the original
      // initializer's default).
      val typeCode = saleType match {
        case "house" => 2
        case _ => 1
      }
      (user.split("\\|")(1) + "|" + product.split("\\|")(1),
        user.split("\\|")(0) + "|" + product.split("\\|")(0) + "|" + typeCode)
    }

    // Re-key the predictions by "userId|typeCode" with value "productId|score".
    val userProMsg = userProPair.join(full_user_pro).map { case (_, (score, msg)) =>
      val m = msg.split("\\|")
      (m(0) + "|" + m(2), m(1) + "|" + score)
    }

    // Per user and sale type, keep the 5 highest-scored products.
    val userTypeTop = userProMsg.groupByKey.map { case (userKey, prods) =>
      // TreeMap keeps entries sorted by score ascending; evict the smallest
      // once the size exceeds 5.
      // NOTE(review): products with identical scores overwrite each other in
      // the map — confirm whether score ties matter for this use case.
      val topMap: util.TreeMap[Double, String] = new util.TreeMap[Double, String]
      for (entry <- prods) {
        val parts = entry.split("\\|")
        val score = parts(1).toDouble
        // Keep only positively-scored products; raise the threshold for a
        // stricter notion of "liked".
        if (score > 0) {
          topMap.put(score, parts(0))
        }
        if (topMap.size > 5) {
          topMap.remove(topMap.firstKey)
        }
      }
      // FIX: use the non-deprecated JavaConverters (the original relied on the
      // deprecated implicit JavaConversions) and mkString, which also avoids
      // the StringIndexOutOfBoundsException the original substring call threw
      // for a user with no positively-scored product.
      import scala.collection.JavaConverters._
      val topProducts = topMap.keySet.asScala.iterator.map(topMap.get).mkString(",")
      userKey + "|" + topProducts
    }

    userTypeTop.saveAsTextFile("file:///H:/useranaly/11.DxMyALS")
    sc.stop()
  }
}