Project: Analysis of Mobile Users' Online Pre-Purchase Behavior

A design for analyzing pre-purchase behavior (car and home buying) based on mobile Internet usage data. The analysis identifies users with purchase intent and recommends products they are likely to be interested in.

 

Business module overview

  1. Address library maintenance: crawler + manual work (analysts define the address rules; the crawler fetches pages by those rules and classifies the results).

Note: this part is usually handled by a dedicated team and is not part of the big-data processing covered here.

  2. Data processing: process the data with MapReduce (filtering, matching, statistical computation).
  3. Algorithm analysis: implement the algorithmic business logic with the Spark MLlib library (recommendation, survival regression).

 

Properties file conf.properties:

filesplit=
outfilesplit=|
fileoutsplit=\\|
firstoutpath=/dx/firstdomainout
domainoutpath=/dx/domainout
urloutpath=/dx/urlout
mysql_username=root
mysql_pwd=xlh123
mysql_connection_url=jdbc:mysql://192.168.0.185:3306/hljxlhdb?useUnicode=true&characterEncoding=UTF-8
fileclear=.*\\.(jpg|png|bmp|jpeg|tif|gif|psd|ico|pdf|css|tmp|js|gz|rar|gzip|zip|txt|csv|xlsx|xls)(\\W.*|$)
fileclearpath=/dx/clearout
matchfileoutall=/dx/matchallout
matchfileout=/dx/matchout
nomatch=000000000000000000
filelength=15
usercountpath=/dx/userpathout
allcountpath=/dx/allpathout
pronomatch=not000001
prooutpath=/dx/proout
usercountpro=/dx/userproout
allcountpro=/dx/allproout
proaddress=t_dx_product_msg_addr.txt
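
Two details in this file are easy to trip over: outfilesplit is the literal "|" used when concatenating output fields, while fileoutsplit is the regex-escaped "\\|" that String.split needs, and fileclear is the pattern that drops static-resource URLs during cleansing. The following is a minimal standalone sketch (the sample record and URLs are made up, not project data):

import java.util.Arrays;

public class ConfSanityCheck {
    public static void main(String[] args) {
        // fileclear: URLs ending in static-resource extensions are filtered out during cleansing
        String fileclear = ".*\\.(jpg|png|bmp|jpeg|tif|gif|psd|ico|pdf|css|tmp|js|gz|rar|gzip|zip|txt|csv|xlsx|xls)(\\W.*|$)";
        System.out.println("http://img.example.com/a/b/logo.png?x=1".matches(fileclear)); // true  -> dropped
        System.out.println("http://car.example.com/price/123.html".matches(fileclear));   // false -> kept

        // outfilesplit ("|") is appended between output fields;
        // fileoutsplit ("\\|") is the escaped form required because String.split takes a regex
        String record = "13800000000" + "|" + "example.com" + "|" + "http://car.example.com/price/123.html";
        System.out.println(Arrays.toString(record.split("\\|")));
    }
}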

log4j.properties:

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Define some default values that can be overridden by system properties
hadoop.root.logger=INFO,console
hadoop.log.dir=.
hadoop.log.file=hadoop.log

# Define the root logger to the system property "hadoop.root.logger".
log4j.rootLogger=${hadoop.root.logger}, EventCounter

# Logging Threshold
log4j.threshold=ALL

# Null Appender
log4j.appender.NullAppender=org.apache.log4j.varia.NullAppender

#
# Rolling File Appender - cap space usage at 5gb.
#
hadoop.log.maxfilesize=256MB
hadoop.log.maxbackupindex=20
log4j.appender.RFA=org.apache.log4j.RollingFileAppender
log4j.appender.RFA.File=${hadoop.log.dir}/${hadoop.log.file}

log4j.appender.RFA.MaxFileSize=${hadoop.log.maxfilesize}
log4j.appender.RFA.MaxBackupIndex=${hadoop.log.maxbackupindex}

log4j.appender.RFA.layout=org.apache.log4j.PatternLayout

# Pattern format: Date LogLevel LoggerName LogMessage
log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
# Debugging Pattern format
#log4j.appender.RFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n


#
# Daily Rolling File Appender
#

log4j.appender.DRFA=org.apache.log4j.DailyRollingFileAppender
log4j.appender.DRFA.File=${hadoop.log.dir}/${hadoop.log.file}

# Rollover at midnight
log4j.appender.DRFA.DatePattern=.yyyy-MM-dd

log4j.appender.DRFA.layout=org.apache.log4j.PatternLayout

# Pattern format: Date LogLevel LoggerName LogMessage
log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
# Debugging Pattern format
#log4j.appender.DRFA.layout.ConversionPattern=%d{ISO8601} %-5p %c{2} (%F:%M(%L)) - %m%n


#
# console
# Add "console" to rootlogger above if you want to use this 
#

log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n

#
# TaskLog Appender
#

#Default values
hadoop.tasklog.taskid=null
hadoop.tasklog.iscleanup=false
hadoop.tasklog.noKeepSplits=4
hadoop.tasklog.totalLogFileSize=100
hadoop.tasklog.purgeLogSplits=true
hadoop.tasklog.logsRetainHours=12

log4j.appender.TLA=org.apache.hadoop.mapred.TaskLogAppender
log4j.appender.TLA.taskId=${hadoop.tasklog.taskid}
log4j.appender.TLA.isCleanup=${hadoop.tasklog.iscleanup}
log4j.appender.TLA.totalLogFileSize=${hadoop.tasklog.totalLogFileSize}

log4j.appender.TLA.layout=org.apache.log4j.PatternLayout
log4j.appender.TLA.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n

#
# HDFS block state change log from block manager
#
# Uncomment the following to suppress normal block state change
# messages from BlockManager in NameNode.
#log4j.logger.BlockStateChange=WARN

#
#Security appender
#
hadoop.security.logger=INFO,NullAppender
hadoop.security.log.maxfilesize=256MB
hadoop.security.log.maxbackupindex=20
log4j.category.SecurityLogger=${hadoop.security.logger}
hadoop.security.log.file=SecurityAuth-${user.name}.audit
log4j.appender.RFAS=org.apache.log4j.RollingFileAppender 
log4j.appender.RFAS.File=${hadoop.log.dir}/${hadoop.security.log.file}
log4j.appender.RFAS.layout=org.apache.log4j.PatternLayout
log4j.appender.RFAS.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
log4j.appender.RFAS.MaxFileSize=${hadoop.security.log.maxfilesize}
log4j.appender.RFAS.MaxBackupIndex=${hadoop.security.log.maxbackupindex}

#
# Daily Rolling Security appender
#
log4j.appender.DRFAS=org.apache.log4j.DailyRollingFileAppender 
log4j.appender.DRFAS.File=${hadoop.log.dir}/${hadoop.security.log.file}
log4j.appender.DRFAS.layout=org.apache.log4j.PatternLayout
log4j.appender.DRFAS.layout.ConversionPattern=%d{ISO8601} %p %c: %m%n
log4j.appender.DRFAS.DatePattern=.yyyy-MM-dd

#
# hadoop configuration logging
#

# Uncomment the following line to turn off configuration deprecation warnings.
# log4j.logger.org.apache.hadoop.conf.Configuration.deprecation=WARN

#
# hdfs audit logging
#
hdfs.audit.logger=INFO,NullAppender
hdfs.audit.log.maxfilesize=256MB
hdfs.audit.log.maxbackupindex=20
log4j.logger.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=${hdfs.audit.logger}
log4j.additivity.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=false
log4j.appender.RFAAUDIT=org.apache.log4j.RollingFileAppender
log4j.appender.RFAAUDIT.File=${hadoop.log.dir}/hdfs-audit.log
log4j.appender.RFAAUDIT.layout=org.apache.log4j.PatternLayout
log4j.appender.RFAAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
log4j.appender.RFAAUDIT.MaxFileSize=${hdfs.audit.log.maxfilesize}
log4j.appender.RFAAUDIT.MaxBackupIndex=${hdfs.audit.log.maxbackupindex}

#
# mapred audit logging
#
mapred.audit.logger=INFO,NullAppender
mapred.audit.log.maxfilesize=256MB
mapred.audit.log.maxbackupindex=20
log4j.logger.org.apache.hadoop.mapred.AuditLogger=${mapred.audit.logger}
log4j.additivity.org.apache.hadoop.mapred.AuditLogger=false
log4j.appender.MRAUDIT=org.apache.log4j.RollingFileAppender
log4j.appender.MRAUDIT.File=${hadoop.log.dir}/mapred-audit.log
log4j.appender.MRAUDIT.layout=org.apache.log4j.PatternLayout
log4j.appender.MRAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
log4j.appender.MRAUDIT.MaxFileSize=${mapred.audit.log.maxfilesize}
log4j.appender.MRAUDIT.MaxBackupIndex=${mapred.audit.log.maxbackupindex}

# Custom Logging levels

#log4j.logger.org.apache.hadoop.mapred.JobTracker=DEBUG
#log4j.logger.org.apache.hadoop.mapred.TaskTracker=DEBUG
#log4j.logger.org.apache.hadoop.hdfs.server.namenode.FSNamesystem.audit=DEBUG

# Jets3t library
log4j.logger.org.jets3t.service.impl.rest.httpclient.RestS3Service=ERROR

# AWS SDK & S3A FileSystem
log4j.logger.com.amazonaws=ERROR
log4j.logger.com.amazonaws.http.AmazonHttpClient=ERROR
log4j.logger.org.apache.hadoop.fs.s3a.S3AFileSystem=WARN

#
# Event Counter Appender
# Sends counts of logging messages at different severity levels to Hadoop Metrics.
#
log4j.appender.EventCounter=org.apache.hadoop.log.metrics.EventCounter

#
# Job Summary Appender 
#
# Use following logger to send summary to separate file defined by 
# hadoop.mapreduce.jobsummary.log.file :
# hadoop.mapreduce.jobsummary.logger=INFO,JSA
# 
hadoop.mapreduce.jobsummary.logger=${hadoop.root.logger}
hadoop.mapreduce.jobsummary.log.file=hadoop-mapreduce.jobsummary.log
hadoop.mapreduce.jobsummary.log.maxfilesize=256MB
hadoop.mapreduce.jobsummary.log.maxbackupindex=20
log4j.appender.JSA=org.apache.log4j.RollingFileAppender
log4j.appender.JSA.File=${hadoop.log.dir}/${hadoop.mapreduce.jobsummary.log.file}
log4j.appender.JSA.MaxFileSize=${hadoop.mapreduce.jobsummary.log.maxfilesize}
log4j.appender.JSA.MaxBackupIndex=${hadoop.mapreduce.jobsummary.log.maxbackupindex}
log4j.appender.JSA.layout=org.apache.log4j.PatternLayout
log4j.appender.JSA.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n
log4j.logger.org.apache.hadoop.mapred.JobInProgress$JobSummary=${hadoop.mapreduce.jobsummary.logger}
log4j.additivity.org.apache.hadoop.mapred.JobInProgress$JobSummary=false

#
# Yarn ResourceManager Application Summary Log 
#
# Set the ResourceManager summary log filename
yarn.server.resourcemanager.appsummary.log.file=rm-appsummary.log
# Set the ResourceManager summary log level and appender
yarn.server.resourcemanager.appsummary.logger=${hadoop.root.logger}
#yarn.server.resourcemanager.appsummary.logger=INFO,RMSUMMARY

# To enable AppSummaryLogging for the RM, 
# set yarn.server.resourcemanager.appsummary.logger to 
# <LEVEL>,RMSUMMARY in hadoop-env.sh

# Appender for ResourceManager Application Summary Log
# Requires the following properties to be set
#    - hadoop.log.dir (Hadoop Log directory)
#    - yarn.server.resourcemanager.appsummary.log.file (resource manager app summary log filename)
#    - yarn.server.resourcemanager.appsummary.logger (resource manager app summary log level and appender)

log4j.logger.org.apache.hadoop.yarn.server.resourcemanager.RMAppManager$ApplicationSummary=${yarn.server.resourcemanager.appsummary.logger}
log4j.additivity.org.apache.hadoop.yarn.server.resourcemanager.RMAppManager$ApplicationSummary=false
log4j.appender.RMSUMMARY=org.apache.log4j.RollingFileAppender
log4j.appender.RMSUMMARY.File=${hadoop.log.dir}/${yarn.server.resourcemanager.appsummary.log.file}
log4j.appender.RMSUMMARY.MaxFileSize=256MB
log4j.appender.RMSUMMARY.MaxBackupIndex=20
log4j.appender.RMSUMMARY.layout=org.apache.log4j.PatternLayout
log4j.appender.RMSUMMARY.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n

# HS audit log configs
#mapreduce.hs.audit.logger=INFO,HSAUDIT
#log4j.logger.org.apache.hadoop.mapreduce.v2.hs.HSAuditLogger=${mapreduce.hs.audit.logger}
#log4j.additivity.org.apache.hadoop.mapreduce.v2.hs.HSAuditLogger=false
#log4j.appender.HSAUDIT=org.apache.log4j.DailyRollingFileAppender
#log4j.appender.HSAUDIT.File=${hadoop.log.dir}/hs-audit.log
#log4j.appender.HSAUDIT.layout=org.apache.log4j.PatternLayout
#log4j.appender.HSAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
#log4j.appender.HSAUDIT.DatePattern=.yyyy-MM-dd

# Http Server Request Logs
#log4j.logger.http.requests.namenode=INFO,namenoderequestlog
#log4j.appender.namenoderequestlog=org.apache.hadoop.http.HttpRequestLogAppender
#log4j.appender.namenoderequestlog.Filename=${hadoop.log.dir}/jetty-namenode-yyyy_mm_dd.log
#log4j.appender.namenoderequestlog.RetainDays=3

#log4j.logger.http.requests.datanode=INFO,datanoderequestlog
#log4j.appender.datanoderequestlog=org.apache.hadoop.http.HttpRequestLogAppender
#log4j.appender.datanoderequestlog.Filename=${hadoop.log.dir}/jetty-datanode-yyyy_mm_dd.log
#log4j.appender.datanoderequestlog.RetainDays=3

#log4j.logger.http.requests.resourcemanager=INFO,resourcemanagerrequestlog
#log4j.appender.resourcemanagerrequestlog=org.apache.hadoop.http.HttpRequestLogAppender
#log4j.appender.resourcemanagerrequestlog.Filename=${hadoop.log.dir}/jetty-resourcemanager-yyyy_mm_dd.log
#log4j.appender.resourcemanagerrequestlog.RetainDays=3

#log4j.logger.http.requests.jobhistory=INFO,jobhistoryrequestlog
#log4j.appender.jobhistoryrequestlog=org.apache.hadoop.http.HttpRequestLogAppender
#log4j.appender.jobhistoryrequestlog.Filename=${hadoop.log.dir}/jetty-jobhistory-yyyy_mm_dd.log
#log4j.appender.jobhistoryrequestlog.RetainDays=3

#log4j.logger.http.requests.nodemanager=INFO,nodemanagerrequestlog
#log4j.appender.nodemanagerrequestlog=org.apache.hadoop.http.HttpRequestLogAppender
#log4j.appender.nodemanagerrequestlog.Filename=${hadoop.log.dir}/jetty-nodemanager-yyyy_mm_dd.log
#log4j.appender.nodemanagerrequestlog.RetainDays=3

Class that reads the configuration file:

package hadoop_user1;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Properties;

public class TProperties {
    private static Properties p = new Properties();

    /**
     * Load the properties configuration file
     */
    static{
        try {
//    		String path = Thread.currentThread().getContextClassLoader().getResource("conf.properties").getPath();
//    		path = path.substring(1);
//    		System.out.println(path);
//    		p.load(new FileInputStream(path));

            //load the configuration file from inside the jar
            InputStream is=TProperties.class.getResourceAsStream("/conf.properties");
            BufferedReader br=new BufferedReader(new InputStreamReader(is));
            p.load(br);


        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    /**
     * Look up the value for the given key
     */
    public static String getValue(String key)
    {
        return p.getProperty(key);
    }
}
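
Usage is a single static call; a minimal sketch, assuming conf.properties is on the classpath (the demo class itself is not part of the project):

package hadoop_user1;

public class TPropertiesDemo {
    public static void main(String[] args) {
        // values come from /conf.properties on the classpath, loaded once in the static block
        System.out.println(TProperties.getValue("firstoutpath"));   // /dx/firstdomainout
        System.out.println(TProperties.getValue("nomatch"));        // 000000000000000000
        System.out.println(TProperties.getValue("missing-key"));    // null when the key does not exist
    }
}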

Class that matches a domain name or IP address:

package hadoop_user1;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class TMatcher {
    //extract the host part of a URL with a regex
    public static String getMatcherStr(String managers){

        String str = "(?<=://)(.*?)(?=/)";
        Pattern pattern = Pattern.compile(str);
        Matcher matcher = pattern.matcher(managers);
        while(matcher.find()){
            return matcher.group(1); //matcher.group();
        }
        return "";
    }

    //reduce a host to its first-level domain, or return it unchanged if it is an IP address
    public static String getDomain(String domain){
        //check for an IP address
        String ip = "([1-9]|[1-9]\\d|1\\d{2}|2[0-4]\\d|25[0-5])(\\.([1-9]|[1-9]\\d|1\\d{2}|2[0-4]\\d|25[0-5])){3}";
        if(domain.matches(ip)){
            return domain;
        }
        //check for a domain name
        String str= "((\\w*|\\w*-\\w*)\\.?\\.(com.cn|net.cn|gov.cn|org\\.nz|org.cn|com|net|org|gov|cc|biz|info|cn|hk|in|am|im|fm|tv|co|me|us|io|mobi|pw|so|gs|top|la|bin))$";
        Pattern pattern = Pattern.compile(str);
        Matcher matcher = pattern.matcher(domain);
        while(matcher.find()){
            return matcher.group(1);
        }
        return "";
    }
}
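
A small test of the two helpers; the URLs below are made-up examples, and note that the IP pattern as written does not accept an octet equal to 0:

package hadoop_user1;

public class TMatcherDemo {
    public static void main(String[] args) {
        // getMatcherStr pulls out the host between "://" and the first following "/"
        System.out.println(TMatcher.getMatcherStr("http://push.m.youku.com/feedback/recv"));
        // -> push.m.youku.com (returns "" when there is no "/" after the host)

        // getDomain reduces a host to its first-level domain, or returns an IP unchanged
        System.out.println(TMatcher.getDomain("push.m.youku.com")); // -> youku.com
        System.out.println(TMatcher.getDomain("192.168.1.185"));    // -> 192.168.1.185
        // a host such as 192.168.0.185 would NOT match the IP pattern, because "0" is outside [1-9]...
    }
}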

1. Data cleansing: drop records that do not match the expected format or whose field types are invalid.
       Result set: {user ID, first-level domain, URL}

----Mapper class:

package hadoop_user1;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class ClearMapper extends Mapper<LongWritable, Text, NullWritable, Text> {
    //output value
    private Text ovalue = new Text();
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] str = value.toString().split( TProperties.getValue("filesplit"));
        // check the field count, make sure the URL field is usable, and drop static-resource file types
        if ((str.length + "").equals(TProperties.getValue("filelength"))
                && !"http://".equals(str[14])
                && !"https://".equals(str[14])
                && !"".equals(str[14])
                && !(str[14].toLowerCase()).matches(TProperties.getValue("fileclear"))) {

            StringBuffer sb = new StringBuffer();
            if (!str[14].startsWith("http://") && !str[14].startsWith("https://")) {
                str[14] = "http://" + str[14];
            }
            // extract the host
            String domain = str[14].split("/", -1)[2];
            // strip the port
            if (domain.indexOf(":") >= 0) {
                domain = domain.split("\\:", -1)[0];
            }
            // user ID | first-level domain | URL
            sb.append(str[1]).append(TProperties.getValue("outfilesplit"))
                    .append(TMatcher.getDomain(domain)).append(TProperties.getValue("outfilesplit"))
                    .append(str[14]);

            ovalue.set(sb.toString());
            context.write(NullWritable.get(), ovalue);
        }
    }
}
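
The heart of the mapper is the URL normalization applied to str[14]; the following standalone sketch walks through the same steps outside MapReduce, using a made-up URL:

package hadoop_user1;

public class ClearLogicDemo {
    public static void main(String[] args) {
        String url = "m.autohome.com.cn:8080/price/list?id=1"; // hypothetical raw URL field (str[14])

        // same normalization as ClearMapper.map()
        if (!url.startsWith("http://") && !url.startsWith("https://")) {
            url = "http://" + url;
        }
        String domain = url.split("/", -1)[2];      // host (and possibly port): m.autohome.com.cn:8080
        if (domain.indexOf(":") >= 0) {
            domain = domain.split("\\:", -1)[0];    // strip the port: m.autohome.com.cn
        }
        // reduce to the first-level domain stored in the output record
        System.out.println(TMatcher.getDomain(domain)); // -> autohome.com.cn
    }
}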

----Driver class:

package hadoop_user1;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class DxFileClearDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        //run as user "hyxy" and raise the map memory limit
        System.setProperty("HADOOP_USER_NAME", "hyxy");
        conf.set("mapreduce.map.memory.mb", "2048");
        Job job = Job.getInstance(conf,"Dx_FileClear");

        job.setJarByClass(DxFileClearDriver.class);
        //Mapper class
        job.setMapperClass(ClearMapper.class);
        //map output key/value types (the mapper emits NullWritable keys and Text values)
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);
        //map-only job, no reducer
        job.setNumReduceTasks(0);
        //final output key/value types
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(TextInputFormat.class);
        //split the input files into chunks between 64 MB and 128 MB
        TextInputFormat.setMinInputSplitSize(job,1024*1024*64L); // minimum split size
        TextInputFormat.setMaxInputSplitSize(job,1024*1024*128L); // maximum split size

        //input path
        FileInputFormat.addInputPath(job, new Path("file:///H:/000002_0"));
        //output path
        FileOutputFormat.setOutputPath(job, new Path("file:///H:/useranaly/01.DxFileClear"));
        //submit the job and exit with its status (0 = success, 1 = failure)
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

2. Behavior matching: join the cleansed data with the address library to produce valid behavior records.
        Result set: {behavior ID, user ID, product flag, URL, pre-purchase type}

----Mapper class:

package hadoop_user2;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;

import hadoop_user1.TProperties;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MapJoinMapper extends Mapper<LongWritable, Text, NullWritable, Text> {
    private Text ovalue = new Text();
    //Map<first-level domain, TreeMap<concatenated rule string, "">>; the TreeMap keeps the rules sorted by match priority
    private Map<String, TreeMap<String, String>> joinData = new HashMap<String, TreeMap<String, String>>();

    /**
     * setup() runs once before map() and loads the address library
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // load the file to be joined from the distributed cache
        // only one file is cached here, so take the first entry and read it with a BufferedReader
        Path file = new Path(context.getCacheFiles()[0].getPath());
        BufferedReader reader = new BufferedReader(new FileReader(file.toString()));

        String str = "";
        try {
            // read line by line
            while ((str = reader.readLine()) != null) {
                // split the cached address-library record
                String[] splits = str.split(TProperties.getValue("fileoutsplit"));
                //does the map already contain this first-level domain?
                if (joinData.containsKey(splits[1])) {
                    //add another rule to the TreeMap: match address, match level, behavior ID, product flag, pre-purchase type
                    joinData.get(splits[1]).put(splits[2] + "," + splits[splits.length-3] + "," + splits[0] +","+ splits[splits.length-2]+ "," + splits[splits.length-1], "");
                } else {
                    //first rule for this domain: create the TreeMap; StringComparator changes its sort order
                    TreeMap<String, String> treemap = new TreeMap<String, String>( new StringComparator());
                    //match address, match level, behavior ID, product flag, pre-purchase type
                    treemap.put(splits[2] + "," + splits[splits.length-3]+ "," + splits[0] +","+ splits[splits.length-2] + "," + splits[splits.length-1], "");
                    //key: first-level domain
                    joinData.put(splits[1], treemap);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            reader.close();
        }
    }

    /**
     * map() matches each record against the rules; skipping the reduce phase improves throughput.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // split one record of the cleansed input data loaded from HDFS
        String[] values = value.toString().split(TProperties.getValue("fileoutsplit"));
        //e.g. values[1] = youku.com
        if (joinData.containsKey(values[1])) {
            //the rules registered for this first-level domain (the value stored in joinData)
            TreeMap<String, String> treeMap1 = joinData.get(values[1]);
            Iterator iterator = treeMap1.keySet().iterator();//iterate over all rule keys in treeMap1
            //try the rules in priority order
            while (iterator.hasNext()) {
                String[] krule = ((String) iterator.next()).split(",");
                //e.g. values[2] = http://push.m.youku.com/feedback/recv/BT_713852830_436662@436662?pid
                //compare values[2] against every rule URL (second- or third-level domain) in the TreeMap
                if (values[2].indexOf(krule[0]) >= 0) {
                    StringBuffer sb = new StringBuffer();
                    //behavior ID, user ID, product flag, URL, pre-purchase type
                    sb.append(krule[2]).append(TProperties.getValue("outfilesplit"))
                            .append(values[0]).append(TProperties.getValue("outfilesplit"))
                            .append(krule[3]).append(TProperties.getValue("outfilesplit"))
                            .append(values[2]).append(TProperties.getValue("outfilesplit"))
                            .append(krule[4]);
                    ovalue.set(sb.toString());
                    //emit the match and stop trying further rules
                    context.write(NullWritable.get(), ovalue);
                    return ;
                }
            }
        }
    }
}

----Match-priority comparator class:

package hadoop_user2;

import java.util.Comparator;

public class StringComparator implements Comparator<String> {
    //match level: level 3 (most specific) is tried first, then level 2, then level 1
    @Override
    public int compare(String k1, String k2) {
        k1 = k1.split(",")[1];
        k2 = k2.split(",")[1];
        if (Integer.parseInt(k2) - Integer.parseInt(k1) == 0) {
            // never return 0: rules of the same level are kept as distinct TreeMap keys instead of overwriting each other
            return -1;
        } else {
            return Integer.parseInt(k2)- Integer.parseInt(k1);//k2 - k1: descending by match level
        }
    }
}
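
Because the comparator sorts on the second comma-separated field (the match level) in descending order and never returns 0, more specific rules are tried first and rules of the same level coexist as separate keys. A minimal sketch with made-up rule strings:

package hadoop_user2;

import java.util.TreeMap;

public class StringComparatorDemo {
    public static void main(String[] args) {
        // key layout used in MapJoinMapper: matchAddress,matchLevel,behaviorId,productFlag,saleType
        TreeMap<String, String> rules = new TreeMap<String, String>(new StringComparator());
        rules.put("youku.com,1,100000000000000001,0,0", "");
        rules.put("car.youku.com,3,100000000000000003,1,1", "");
        rules.put("m.youku.com,2,100000000000000002,0,0", "");
        rules.put("tv.youku.com,2,100000000000000004,0,0", ""); // same level as m.youku.com, kept as its own key

        // iteration order: the level-3 rule, then the two level-2 rules, then the level-1 rule
        for (String rule : rules.keySet()) {
            System.out.println(rule);
        }
    }
}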

----Driver class:

package hadoop_user2;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class DxFileMatchDriver {
    /**
     *@param args
     * args[0]: path of the address-library data file
     * args[1]: input path of the data to parse (the output of the cleansing step)
     * args[2]: output path for the results
     * Description: a map-side join that matches the parsed data against the address library by priority
     */
    public static void main(String[] args) {
        try {
            //create the configuration
            Configuration conf = new Configuration();
            //map memory in MB (default 1024)
            conf.set("mapreduce.map.memory.mb", "5120");
            //task timeout (default 600000 ms); the cluster is unstable and tasks were timing out, so disable the check (the code must then be free of infinite loops)
            conf.set("mapreduce.task.timeout", "0");
            //useful on small clusters: do not switch to another datanode when a write fails
            conf.set("dfs.client.block.write.replace-datanode-on-failure.enable","true");
            conf.set("dfs.client.block.write.replace-datanode-on-failure.policy","NEVER");
            // create the job
            Job job = Job.getInstance(conf, "Dx_FileMatch");
            // required when the job is packaged and run as a jar
            job.setJarByClass(DxFileMatchDriver.class);

            // minimum input split size (default 0), here 128 MB
            TextInputFormat.setMinInputSplitSize(job,1024*1024*128L);
            //cache file (the small table, i.e. the address library)
            job.addCacheFile(new Path("file:///H:/t_dx_basic_msg_addr.txt").toUri());
            // set the custom Mapper class and the map output key/value types
            job.setMapperClass(MapJoinMapper.class);
            job.setMapOutputKeyClass(NullWritable.class);
            job.setMapOutputValueClass(Text.class);
            // map-only job, no reducer
            job.setNumReduceTasks(0);
            //input path: the output of step 01 (data cleansing)
            FileInputFormat.addInputPath(job, new Path("file:///H:/useranaly/01.DxFileClear/part-m-00*"));
            //output path
            FileOutputFormat.setOutputPath(job, new Path("file:///H:/useranaly/02.DxFileMatch"));
            //submit the job and exit with its status (0 = success, 1 = failure)
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

3. Per-user behavior statistics: compute PV values.
        Result set: {behavior ID, user ID, PV (visit count)}

----Mapper class:

package hadoop_user3;

import hadoop_user1.TProperties;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Per-user behavior statistics: identify each user's frequently visited addresses (PV counts)
 */
public class MapPvNumMapper extends Mapper<LongWritable,Text,Text,IntWritable> {
    Text tk = new Text();
    IntWritable iw = new IntWritable(1);
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] vs = value.toString().split(TProperties.getValue("fileoutsplit"));
        tk.set(vs[0].substring(0,6)+"000000000000"+TProperties.getValue("outfilesplit")
                +vs[1]);//first-level-domain behavior ID | user ID
        context.write(tk,iw);
    }
}
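
The mapper rolls a specific behavior ID up to its first-level-domain behavior ID by keeping the 6-character prefix and padding with twelve zeros; a quick illustration with a made-up 18-character behavior ID:

public class PvKeyDemo {
    public static void main(String[] args) {
        String behaviorId = "100200030000000123"; // hypothetical 18-character behavior ID
        // same rollup as MapPvNumMapper: keep the 6-character prefix, zero out the rest
        String firstLevelId = behaviorId.substring(0, 6) + "000000000000";
        System.out.println(firstLevelId); // -> 100200000000000000 (the first-level-domain behavior ID)
    }
}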

----Reducer class:

package hadoop_user3;

import hadoop_user1.TProperties;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class PvNumReduce extends Reducer<Text,IntWritable,Text,NullWritable>{
    Text tk = new Text();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable i:values) {
            sum += i.get();
        }
        tk.set(key+TProperties.getValue("outfilesplit")+sum);
        context.write(tk,NullWritable.get());
    }
}

----Driver class:

package hadoop_user3;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class DxPathStatisticDriver {
    public static void main(String[] args){
        try {
            //create the configuration
            Configuration conf = new Configuration();
            //map memory in MB (default 1024)
            conf.set("mapreduce.map.memory.mb", "5120");
            //task timeout (default 600000 ms); disabled because the cluster is unstable, so the code must be free of infinite loops
            conf.set("mapreduce.task.timeout", "0");
            //useful on small clusters: do not switch to another datanode when a write fails
            conf.set("dfs.client.block.write.replace-datanode-on-failure.enable","true");
            conf.set("dfs.client.block.write.replace-datanode-on-failure.policy","NEVER");
            // create the job
            Job job = Job.getInstance(conf, "Dx_File_pv_statistic");
            // required when the job is packaged and run as a jar
            job.setJarByClass(DxPathStatisticDriver.class);

            // minimum input split size (default 0), here 128 MB
            TextInputFormat.setMinInputSplitSize(job,1024*1024*128L);

            // set the custom Mapper class and the map output key/value types
            job.setMapperClass(MapPvNumMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);

            job.setReducerClass(PvNumReduce.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(NullWritable.class);

//            job.setNumReduceTasks(3);
            //input path: the output of step 02 (behavior matching)
            FileInputFormat.addInputPath(job, new Path("file:///H:/useranaly/02.DxFileMatch/part-m-00*"));
            //output path
            FileOutputFormat.setOutputPath(job, new Path("file:///H:/useranaly/03.DxFilePathStatistic/pv"));
            //submit the job and exit with its status (0 = success, 1 = failure)
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

4. Overall behavior statistics: compute UV and PV for the behavior data.
        Result set: {behavior ID, UV (number of users), PV (total visits)}

----Mapper class:

package hadoop_user4;

import hadoop_user1.TProperties;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Overall behavior statistics: find the popular addresses, i.e. the most-visited addresses and how many users visited them.
 *      Computes UV and PV per behavior ID.
 */
public class MapperDxPathStatisticAll extends Mapper<LongWritable,Text,Text,Text>{
    private Text okey = new Text();
    private Text ovalue = new Text();
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] values = value.toString().split(TProperties.getValue("fileoutsplit"));
        //behavior ID
        okey.set(values[0]);
        //user ID, visit count (PV)
        ovalue.set( values[1] + TProperties.getValue("outfilesplit") + values[2]);
        context.write(okey, ovalue);
    }

}

----Reducer class:

package hadoop_user4;

import hadoop_user1.TProperties;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

public class ReducerDxPathStatisticAll extends Reducer<Text,Text,NullWritable,Text>{
    private Text result = new Text();
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        //total visit count
        int sum = 0;
        //distinct user IDs
        Map<String,Integer> map = new HashMap();
        //reduce loop
        for (Text val : values) {
            String[] str = val.toString().split(TProperties.getValue("fileoutsplit"));
            //record the visiting user
            map.put(str[0], 1);
            //accumulate the visit count
            sum += Integer.parseInt(str[1]);
        }
        //behavior ID, UV (number of users), PV (total visits)
        result.set(key.toString()+ TProperties.getValue("outfilesplit")
                + map.size() + TProperties.getValue("outfilesplit")
                + sum);
        context.write(NullWritable.get(), result);
    }
}
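
The reducer gets UV by collecting user IDs in a HashMap, where duplicates collapse, while PV is a plain sum; a minimal standalone sketch with made-up grouped values:

import java.util.HashMap;
import java.util.Map;

public class UvPvDemo {
    public static void main(String[] args) {
        // hypothetical grouped values for one behavior ID: "userId|pv"
        String[] values = {"13800000001|3", "13800000002|5", "13800000001|2"};

        int pv = 0;
        Map<String, Integer> users = new HashMap<String, Integer>();
        for (String v : values) {
            String[] f = v.split("\\|");
            users.put(f[0], 1);              // duplicates collapse, so size() is the UV
            pv += Integer.parseInt(f[1]);    // PV is the plain sum
        }
        System.out.println("UV=" + users.size() + ", PV=" + pv); // UV=2, PV=10
    }
}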

----Driver class:

package hadoop_user4;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class DriverDxPathStatisticAll {
    /**
     * args[0]: input path (the per-user address statistics)
     * args[1]: output path
     * Description: a MapReduce job that counts, per first-level-domain address, the number of visitors and the number of visits
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        //memory settings
        conf.set("mapreduce.map.memory.mb", "3072");
        conf.set("mapreduce.reduce.memory.mb", "2048");
        Job job = Job.getInstance(conf, "Dx_PathStatisticAll");
        job.setJarByClass(DriverDxPathStatisticAll.class);
        //Mapper and Reducer classes
        job.setMapperClass(MapperDxPathStatisticAll.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // job.setCombinerClass(JoinReducer.class);
        job.setReducerClass(ReducerDxPathStatisticAll.class);
        // job.setNumReduceTasks(0);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        //input and output paths
        FileInputFormat.addInputPath(job, new Path("file:///H:/useranaly/03.DxFilePathStatistic/pv/part-r-*"));
        FileOutputFormat.setOutputPath(job, new Path("file:///H:/useranaly/04.DxPathStatisticAll"));
        //submit the job and exit with its status (0 = success, 1 = failure)
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

5. Product matching: join the matched behavior data with the product library.
        Result set: {user ID, product type, product ID, product name, brand, price, model, car series, manual/automatic, behavior ID}

----Mapper class:

package hadoop_user5;

import hadoop_user1.TProperties;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Product matching: join against the product address library and tag the product addresses inside the matched behavior data
 */
public class MapperDxProMatch extends Mapper<LongWritable,Text,Text,Text>{
    Text ok = new Text();
    Text ov = new Text();
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] vs = value.toString().split(TProperties.getValue("fileoutsplit"));
        if(vs.length==5 && Integer.parseInt(vs[2])==1){//a step-2 record that is flagged as a product
            String ku = vs[0];//behavior ID
            String vu = vs[1]+"|";//user ID
            ok.set(ku);
            ov.set(vu);
            context.write(ok,ov);
        }else if(vs.length==9){
            String kp = vs[0];//behavior ID
            String vp = value.toString().substring(19);//everything after the 18-character behavior ID and its separator (the product details)
            ok.set(kp);
            ov.set(vp);
            context.write(ok,ov);
        }
    }
}

----Reducer class:

package hadoop_user5;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class ReducerDxProMatch extends Reducer<Text,Text,Text,NullWritable>{
    Text k3 = new Text();

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        List<String> l1 = new ArrayList<String>();
        List<String> l2 = new ArrayList<String>();
        for(Text v:values){
            int i = v.toString().split("\\|").length;
            if(i==1){//a bare user ID
                l1.add(v.toString());
            }else {//a product-detail record
                l2.add(v.toString());
            }
        }
        //cross join the two lists
        for(String s1:l1){
            for (String s2:l2){
                k3.set(s1+s2+"|"+key);//user ID | product details... | behavior ID
                context.write(k3,NullWritable.get());
            }
        }
    }
}
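
This is a plain reduce-side join: under one behavior ID the mapper emits both bare user IDs (one field) and product-detail records (several fields), and the reducer pairs every user with every product. A minimal sketch with made-up values:

import java.util.ArrayList;
import java.util.List;

public class ReduceJoinDemo {
    public static void main(String[] args) {
        // hypothetical grouped values for one behavior ID
        String[] values = {"13800000001|", "13800000002|", "1|P0001|AudiA4|Audi|30w|2.0T|A4|auto"};

        List<String> users = new ArrayList<String>();
        List<String> products = new ArrayList<String>();
        for (String v : values) {
            if (v.split("\\|").length == 1) users.add(v);   // "userId|" splits into a single field
            else products.add(v);                            // product details
        }
        // cross join: every user paired with every product of this behavior ID
        for (String u : users)
            for (String p : products)
                System.out.println(u + p + "|" + "100200030000000123"); // userId|productInfo...|behaviorId
    }
}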

----Driver class:

package hadoop_user5;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;


public class DriverDxProMatch {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf,"DxProMatch");
        /*job.setGroupingComparatorClass(ComparatorDPM.class);
        job.setPartitionerClass(DPMPartitioner.class);*/

        Path input1 = new Path("file:///H:/useranaly/02.DxFileMatch/part-m-00*");
        Path input2 = new Path("file:///H:/t_dx_product_msg_addr.txt");
        Path[] inputs = new Path[]{input1,input2};

        job.setMapperClass(MapperDxProMatch.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(ReducerDxProMatch.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job,inputs);
        FileOutputFormat.setOutputPath(job,new Path("file:///H:/useranaly/05.DxProMatch"));
        System.exit(job.waitForCompletion(true)?0:1);

    }
}

6. Per-user product statistics: count each user's visits to the products (PV).
        Result set: {product ID, user ID, PV (visits), product type}

----Mapper class:

package hadoop_user6;


import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Per-user product statistics: count each user's visits to the products (PV)
 *      {product ID, user ID, PV (visits), product type}
 */
public class MapperDxProStatistic extends Mapper<LongWritable,Text,Text,IntWritable>{
    private Text k2 = new Text();
    private IntWritable v2 = new IntWritable(1);
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] ss = value.toString().split("\\|");
        k2.set(ss[2]+","+ss[0]+","+ss[1]);//product ID, user ID, product type
        context.write(k2,v2);//({product ID, user ID, product type}, 1)
    }
}

----Reducer class:

package hadoop_user6;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;



public class ReducerDxProStatistic extends Reducer<Text,IntWritable,Text,NullWritable>{
    private Text k3 = new Text();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for(IntWritable v:values){
            sum+=v.get();
        }
        String[] ss = key.toString().split(",");
        k3.set(ss[0]+"|"+ss[1]+"|"+sum+"|"+ss[2]);
        context.write(k3,NullWritable.get());
    }
}

----Driver class:

package hadoop_user6;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class DxProStatisticDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        //map and reduce memory settings
        conf.set("mapreduce.map.memory.mb", "3072");
        conf.set("mapreduce.reduce.memory.mb", "2048");
        Job job = Job.getInstance(conf, "Dx_ProStatistic");
        job.setJarByClass(DxProStatisticDriver.class);
        //Mapper and Reducer setup
        job.setMapperClass(MapperDxProStatistic.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setReducerClass(ReducerDxProStatistic.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        //input path: the output of step 05 (product matching)
        FileInputFormat.addInputPath(job,new Path("file:///H:/useranaly/05.DxProMatch/part-r-00*"));
        //output path
        FileOutputFormat.setOutputPath(job, new Path("file:///H:/useranaly/06.DxProStatistic"));
        //submit the job and exit with its status (0 = success, 1 = failure)
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

7. Overall product statistics: per product, count the visiting users (UV) and the total visits (PV).
        Result set: {product ID, UV (number of users), PV (total visits), product type}

----Mapper class:

package hadoop_user7;

import hadoop_user1.TProperties;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Overall product statistics: per product, count the visiting users (UV) and the total visits (PV)
 * Result set: {product ID, UV (number of users), PV (total visits), product type}
 */
public class MapperDxProStatisticAll extends Mapper<LongWritable,Text,Text,Text>{
    private Text k2 = new Text();
    private Text v2 = new Text();
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //input (step 06 output): {product ID, user ID, PV (visits), product type}
        String[] values = value.toString().split(TProperties.getValue("fileoutsplit"));
        //key: product ID, product type
        k2.set(values[0] + TProperties.getValue("outfilesplit") + values[3]);
        //value: user ID, visit count
        v2.set( values[1] + TProperties.getValue("outfilesplit") + values[2]);
        context.write(k2, v2);

        /*String[] psa = value.toString().split("\\|");
        k2.set(psa[0]);
        v2.set(psa[2]+","+psa[3]+","+1);
        context.write(k2,v2);*/
    }
}

----Reducer class:

package hadoop_user7;

import hadoop_user1.TProperties;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

public class ReducerDxProStatisticAll extends Reducer<Text,Text,Text,NullWritable>{
//    private Text k3 = new Text();
    private Text result = new Text();
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        Map<String,Integer> map = new HashMap<String, Integer>();
        //loop over the grouped values
        for (Text val : values) {
            //user ID, visit count
            String[] str = val.toString().split(TProperties.getValue("fileoutsplit"));
            //distinct users
            map.put(str[0], 1);
            //total visits (PV)
            sum += Integer.parseInt(str[1]);
        }
        String[] str = key.toString().split(TProperties.getValue("fileoutsplit"));
        //product ID, UV, PV, product type
        result = new Text(str[0] + TProperties.getValue("outfilesplit")
                + map.size() + TProperties.getValue("outfilesplit")
                + sum + TProperties.getValue("outfilesplit")
                + str[1]);
        context.write(result,NullWritable.get());
        /*int sumUV = 0;
        int sumPV = 0;
        String type = "";
        for(Text v:values){
            String[] vs = v.toString().split(",");
            sumUV+=Integer.parseInt(vs[2]);
            sumPV+=Integer.parseInt(vs[0]);
            type = vs[1];
        }

        k3.set(key+"|"+sumUV+"|"+sumPV+"|"+type);
        context.write(k3,NullWritable.get());*/
    }
}

----Driver class:

package hadoop_user7;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class DxProStatisticAllDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        //map and reduce memory settings
        conf.set("mapreduce.map.memory.mb", "3072");
        conf.set("mapreduce.reduce.memory.mb", "2048");
        Job job = Job.getInstance(conf, "Dx_ProStatisticAll");
        job.setJarByClass(DxProStatisticAllDriver.class);
        //Mapper and Reducer setup
        job.setMapperClass(MapperDxProStatisticAll.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(ReducerDxProStatisticAll.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        //input path: the output of step 06
        FileInputFormat.addInputPath(job,new Path("file:///H:/useranaly/06.DxProStatistic/part-r-00*"));
        //output path
        FileOutputFormat.setOutputPath(job, new Path("file:///H:/useranaly/07.DxProStatisticAll"));
        //submit the job and exit with its status (0 = success, 1 = failure)
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

8. Behavior TopN per user: find each user's high-frequency addresses; group by user ID and sort by PV.
        Result set: {behavior ID, user ID, PV (visits), rank}

----Mapper class:

package hadoop_user8;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * 8. Behavior TopN per user: find each user's high-frequency addresses; group by user ID and sort by PV
 *   Result set: {behavior ID, user ID, PV (visits), rank}
 */
public class MapperDxTopN extends Mapper<LongWritable,Text,Text,Text>{
    private Text k2 = new Text();
    private Text v2 = new Text();
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //input: {behavior ID, user ID, PV (visit count)}
        String[] vs = value.toString().split("\\|");
        k2.set(vs[1]);//user ID
        v2.set(vs[0]+","+vs[2]);//behavior ID, PV
        context.write(k2,v2);
    }
}

----Reducer class:

package hadoop_user8;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

public class ReducerDxTopN extends Reducer<Text,Text,Text,NullWritable>{

    private Text k3 = new Text();//output record
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {

        List<String> ls = new ArrayList<String>();//"behaviorId,pv" entries
        //collect behavior ID / PV pairs
        for (Text v:values){
            ls.add(v.toString());
        }
        ls.sort(new Comparator<String>() {
            @Override
            public int compare(String o1, String o2) {
                return Integer.parseInt(o2.split(",")[1])-
                        Integer.parseInt(o1.split(",")[1]);
            }
        });
        List<String> nl = new ArrayList<String>();//keep the five entries with the largest PV in a new list
        if(ls.size()>=5) {
//            nl = ls.subList(ls.size()-5, ls.size());
            nl = ls.subList(0,5);
        }else {
            nl = ls;
        }
        for(int i=0;i<nl.size();i++) {
            k3.set(nl.get(i).split(",")[0] + "|"
                    + key + "|"
                    + nl.get(i).split(",")[1] + "|"
                    + (i+1)/*(nl.size() - i)*/);
            context.write(k3, NullWritable.get());
        }

    }
}

----Driver class:

package hadoop_user8;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class DxTopNDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("mapreduce.map.memory.mb", "3072");
        conf.set("mapreduce.reduce.memory.mb", "2048");
        Job job = Job.getInstance(conf,"Dx_TopNDriver");
        job.setJarByClass(DxTopNDriver.class);
        //Mapper and Reducer setup
        job.setMapperClass(MapperDxTopN.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(ReducerDxTopN.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        //input path: the per-user PV statistics from step 03
        FileInputFormat.addInputPath(job,new Path("file:///H:/useranaly/03.DxFilePathStatistic/pv/part-r-00*"));
        //output path
        FileOutputFormat.setOutputPath(job, new Path("file:///H:/useranaly/082.DxTopN"));
        //submit the job and exit with its status (0 = success, 1 = failure)
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

9. User persona: classify the behavior data and describe each user's behavior along six dimensions.
        Result set: {user ID, category ID, user's category PV, category total PV, z-score}

----Mapper class:

package hadoop_user9;

import hadoop_user1.TProperties;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * 9. User persona: classify the behavior data and describe each user's behavior
 *   along six dimensions: 00001: business / enterprise
 *               00002: general services
 *               00003: movies and video
 *               00004: leisure and entertainment
 *               00005: life and education
 *               00006: miscellaneous / gossip
 *   Result set: {user ID, category ID, user's category PV, category total PV, z-score}
 */
public class MapperDxPersona extends Mapper<LongWritable,Text,Text,Text>{
    private Text okey = new Text();
    private Text ovalue = new Text();
    //category mapping loaded from the classification library
    private Map<String, String> typeData = new HashMap<String,String>();
    //per-map accumulation of each user's category visit counts
    //key: category ID + user ID;  value: the user's total PV for that category
    private Map<String, Integer> userType = new HashMap<String,Integer>();
    private String typeKey ;

    //setup() runs before map() and loads the behavior-ID-to-category-ID mapping into the typeData HashMap
    @Override
    protected void setup( Mapper<LongWritable, Text, Text, Text>.Context context) throws IOException, InterruptedException {
        // load the file to be joined from the distributed cache
        // only one file is cached here, so take the first entry and read it with a BufferedReader
        Path file = new Path(context.getCacheFiles()[0].getPath());
        BufferedReader reader = new BufferedReader(new FileReader(file.toString()));
        String str = null;
        try {
            // read line by line
            while ((str = reader.readLine()) != null) {
                //split the cached record
                String[] splits = str.split(TProperties.getValue("fileoutsplit"));
                //key: behavior ID, value: category ID
                typeData.put(splits[1], splits[0]);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            reader.close();
        }
    }
    //the map side accumulates each user's visit count per category
    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] values = value.toString().split(TProperties.getValue("fileoutsplit"));
        //values = {behavior ID, user ID, PV (visit count)}; is there a category for this behavior ID?
        //the in-map key is category ID + user ID, summing that user's visits for the category
        if(typeData.containsKey(values[0])){
            //userType: key = category ID + user ID, value = the user's total PV for that category
            typeKey = typeData.get(values[0]) +"," + values[1];
            if(userType.containsKey(typeKey)){
                //accumulate the user's category PV
                userType.put(typeKey, userType.get(typeKey) + Integer.parseInt(values[2]));
            }else{
                //first PV value for this user and category
                userType.put(typeKey, Integer.parseInt(values[2]));
            }
        }

        //map() itself emits nothing; the aggregated userType map is flushed in cleanup()
    }

    //the userType keys are "category ID,user ID" and hold the summed category PV per user
    //cleanup() runs after all map() calls and emits the aggregated pairs (key = category ID, value = user ID + category PV)
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        //keys of userType are "categoryID,userID"
        for (String key : userType.keySet()) {
            //category ID
            okey = new Text(key.split(",",-1)[0]);

            //user ID | the user's category visit count
            ovalue= new Text(key.split(",",-1)[1] + TProperties.getValue("outfilesplit")
                    + userType.get(key));

            //map():     key2 = category ID + user ID, value = the user's category PV
            //cleanup(): key3 = category ID,           value = user ID, the user's category PV
            context.write(okey , ovalue);
        }
    }
}

----Reducer class:

package hadoop_user9;

import hadoop_user1.TProperties;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

/**
 * Sample output: user ID | category ID | user's category PV | category total PV | standardized value
 * 5063936616|00001|12|25007|4.819144080748042
 * 513049960245|00001|126|25007|5.27731240951967
 * 533038351015|00001|21|25007|4.855315264598433
 */
public class ReducerDxPersona extends Reducer<Text,Text,NullWritable,Text>{
    private Text result = new Text();
    @Override
    public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        //_______ start: inputs for the mean ___________
        //category total PV
        int pv = 0;
        //number of users
        int n = 0;
        //_______ end ___________

        //iterate the grouped values
        Map<String, Integer> map = new HashMap<String, Integer>();
        //key: category ID; value entries: user ID, visit count (PV)
        //compute the category's total PV
        for (Text val : values) {
            String[] str = val.toString().split(TProperties.getValue("fileoutsplit"));
            //category total visits
            pv = pv + Integer.parseInt(str[1]);

            //accumulate the user's category PV
            if(map.containsKey(str[0])){
                //      user ID      accumulated category PV
                map.put(str[0], map.get(str[0]) + Integer.parseInt(str[1]) );
            }else{
                //       user ID      category PV
                map.put(str[0], Integer.parseInt(str[1]));
                //count the user
                n = n +1;
            }
        }
        //========= z-score standardization (how far each value lies from the mean, in standard deviations: (x - mean) / stddev) ===========

        //sum of squared deviations
        double math2 = 0;

        //==== 1. mean (use floating-point division, otherwise int/int truncates)
        double avg = (double) pv / n;

        //==== 2. sum of squared deviations = sum of (value - mean)^2
        for(String s : map.keySet()){
            //math2 accumulates (user's category PV - mean)^2
            math2 = math2 + Math.pow(map.get(s)-avg,2);
        }


        /*
         * 3. variance, e.g. (9 + 4 + 0 + 0 + 1 + 9)/6 = 24/6 = 4
         * 4. standard deviation, e.g. sqrt(4) = 2
         */
        //==== standard deviation (square root of the variance)
        double fc = Math.sqrt(math2/n);

        //loop over the users
        for(String skey : map.keySet()){
            //==== 5. z-score = (user's category PV - mean) / standard deviation
            double b = (map.get(skey)-avg)/fc;
            //shift the values so they plot nicely on a radar chart
            b = b + 5;
            //clamp to an upper bound of 10
            if( b > 10){
                b = 10 ;
            }
            //and a lower bound of 0
            if(b < 0){
                b = 0;
            }
            //user ID, category ID, user's category PV, category total PV, standardized value
            result = new Text(skey + TProperties.getValue("outfilesplit")
                    + key.toString() + TProperties.getValue("outfilesplit")
                    + map.get(skey) + TProperties.getValue("outfilesplit")
                    + pv + TProperties.getValue("outfilesplit")
                    + b);
            context.write(NullWritable.get(), result);
        }
    }
}
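
To make the standardization concrete, here is the same computation on a made-up category with three users whose category PVs are 12, 126 and 21 (numbers borrowed from the sample output above): mean, population standard deviation, z-score, then the shift by +5 and clamping to [0, 10] used for the radar chart. The results differ from the sample output, which aggregates far more users:

import java.util.LinkedHashMap;
import java.util.Map;

public class ZScoreDemo {
    public static void main(String[] args) {
        // hypothetical per-user PVs within one category
        Map<String, Integer> userPv = new LinkedHashMap<String, Integer>();
        userPv.put("5063936616", 12);
        userPv.put("513049960245", 126);
        userPv.put("533038351015", 21);

        int total = 0;
        for (int v : userPv.values()) total += v;
        double avg = (double) total / userPv.size();           // mean

        double sq = 0;
        for (int v : userPv.values()) sq += Math.pow(v - avg, 2);
        double stddev = Math.sqrt(sq / userPv.size());          // population standard deviation

        for (Map.Entry<String, Integer> e : userPv.entrySet()) {
            double z = (e.getValue() - avg) / stddev;           // z-score
            double shifted = Math.max(0, Math.min(10, z + 5));  // shift by 5 and clamp to [0, 10]
            System.out.println(e.getKey() + " -> " + shifted);
        }
    }
}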

----Driver class:

package hadoop_user9;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class DriverDxPersona {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("mapreduce.map.memory.mb", "3072");
        conf.set("mapreduce.reduce.memory.mb", "2048");
        conf.set("mapred.task.timeout", "0");
        conf.set("dfs.client.block.write.replace-datanode-on-failure.policy","NEVER");
        conf.set("dfs.client.block.write.replace-datanode-on-failure.enable","true");

        /*conf.set("mapred.textoutputformat.ignoreseparator","true");
        conf.set("mapred.textoutputformat.separator","|");*/

        FileSystem fs = FileSystem.get(conf);
        //delete the output directory if it already exists (must match the output path set below)
        Path opath = new Path("file:///H:/useranaly/092.DxPersona");
        if (fs.exists(opath)){
            fs.delete(opath,true);
        }

        Job job = Job.getInstance(conf,"DxPersonal_huaciang");
        TextInputFormat.setMinInputSplitSize(job,1024*1024*128L);
        job.setJarByClass(DriverDxPersona.class);
        //Mapper and Reducer setup
        job.setMapperClass(MapperDxPersona.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(ReducerDxPersona.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        //cache file: the behavior-ID-to-category-ID mapping
        job.addCacheFile(new Path("file:///H:/t_dx_basic_classify_link.txt").toUri());
        //input path: the per-user PV statistics from step 03
        FileInputFormat.addInputPath(job,new Path("file:///H:/useranaly/03.DxFilePathStatistic/pv/part-r-00*"));
        //output path
        FileOutputFormat.setOutputPath(job, new Path("file:///H:/useranaly/092.DxPersona"));
        //submit the job and exit with its status (0 = success, 1 = failure)
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

10. Pre-purchase user extraction: identify users with pre-purchase intent.
        Result set: {user ID | pre-purchase type 1 : PV (visits), pre-purchase type 2 : PV (visits)}

----Mapper class:

package hadoop_user10;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * 10. Pre-purchase user extraction: identify users with pre-purchase intent
 *    Result set: {user ID | pre-purchase type 1 : PV (visits), pre-purchase type 2 : PV (visits)}
 */
public class MapperDxSaleUser extends Mapper<LongWritable,Text,Text,Text>{
    private Text k2 = new Text();
    private Text v2 = new Text();
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //input from step 2: {behavior ID, user ID, product flag, URL, pre-purchase type}
        String[] vs = value.toString().split("\\|");
        // pre-purchase type: 0 = other, 1 = car, 2 = home
        if(!"0".equals(vs[4])){
            k2.set(vs[1]);//user ID
            v2.set(vs[4]);//pre-purchase type
            context.write(k2,v2);
        }
    }
}

----Reducer class:

package hadoop_user10;

import hadoop_user1.TProperties;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

public class ReducerDxSaleUser extends Reducer<Text,Text,NullWritable,Text>{
    private Text v3 = new Text();
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        int pvNum=0;
        Map<String,Integer> map = new HashMap<>();//pre-purchase type -> PV count
        //collect this user's pre-purchase browsing behavior into the map:
        for (Text val : values){
            //accumulate the product visit count per pre-purchase type
            if(map.containsKey(val.toString())){
                pvNum = map.get(val.toString())+1;
                map.put(val.toString(),pvNum);
            }else {
                //first visit for this pre-purchase type
                map.put(val.toString(), 1);
            }
        }

        StringBuffer sb = new StringBuffer();
        for(String str : map.keySet()){
            //pre-purchase type:product visit count
            sb.append(str).append(":")
                    .append(map.get(str))
                    .append(",");
        }
        //user ID|pre-purchase type:visit count,pre-purchase type:visit count
        v3.set(key.toString() + TProperties.getValue("outfilesplit")
                + sb.toString().substring(0, sb.toString().length()-1) );
        context.write(NullWritable.get(), v3);
    }
}

----Driver class:

package hadoop_user10;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class DriverDxSaleUser {
    public static void main(String[] args) throws Exception{
        Configuration conf = new Configuration();
        //set map and reduce task memory
        conf.set("mapreduce.map.memory.mb", "3072");
        conf.set("mapreduce.reduce.memory.mb", "2048");
        Job job =Job.getInstance(conf, "Dx_SaleUser");
        job.setJarByClass(DriverDxSaleUser.class);
        //Mapper and Reducer settings
        job.setMapperClass(MapperDxSaleUser.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(ReducerDxSaleUser.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        //input path
        FileInputFormat.addInputPath(job, new Path("file:///H:/useranaly/02.DxFileMatch/part-m-*"));
        //output path
        FileOutputFormat.setOutputPath(job, new Path("file:///H:/useranaly/10.DxSaleUser"));
        //submit the job; exit code 0 on success, 1 on failure
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

11. ALS collaborative filtering algorithm:

Collaborative filtering with ALS:
----------------------------------

    ALS notes:
         a. In the Spark MLlib library, under the recommendation package
            【org.apache.spark.mllib.recommendation】, the companion object in【ALS.scala】
            exposes the top-level methods for alternating least squares (ALS) matrix factorization.
            The ALS object provides the following:
              1). Training method: train()
                  Trains a matrix factorization model from an RDD of ratings given by users
                  to a subset of products. The ratings matrix is approximated as the product
                  of two low-rank matrices of a given rank (rank), computed by running a
                  configurable number of iterations (iterations) at a configurable level of parallelism.
              2). train() returns a MatrixFactorizationModel (matrix factorization model)
                  【org.apache.spark.mllib.recommendation.MatrixFactorizationModel】
                  A model representing the result of the factorization; it provides the following methods:
                1). Prediction:【def predict(user: Int, product: Int): Double】
                    Predicts one user's rating of one product
                    (for example, with 10 training iterations, the score user 2 would give product 13).

                2). Recommend products:【def recommendProducts(user: Int, num: Int): Array[Rating]】
                    Recommends products to the specified user.

                3). Recommend users:【def recommendUsers(product: Int, num: Int): Array[Rating]】
                    Recommends users for a product; that is, it returns the users most likely
                    to be interested in that product.

                4). Top products for all users:【def recommendProductsForUsers(num: Int): RDD[(Int, Array[Rating])]】
                    Recommends the top products for every user.

                5). Top users for all products:【def recommendUsersForProducts(num: Int): RDD[(Int, Array[Rating])]】
                    Recommends the top users for every product.
         b. Rating:
            The case class defined with ALS: case class Rating(user: Int, product: Int, rating: Double)
            A more compact representation of a single rating than a Tuple3[Int, Int, Double].
            (A minimal usage sketch of these calls follows below.)
             
    1. Implementation outline:
       step1. Define Rating objects to build the rating matrix: Rating(user: Int, product: Int, rating: Double)
          Note: mind the type conversions (the string user/product IDs must become Ints)
       step2. Train with ALS.train(), which returns a MatrixFactorizationModel
          Note: pay attention to the train() parameters (rank, iterations, regularization)
       step3. Use the matching prediction/recommendation methods to obtain the recommendation data

    2. Collaborative filtering task:
      Requirement: recommend products matching each user's preferences, i.e. the 5 product IDs with the highest predicted scores
      Input data: from the [user-product statistics] step
            {product ID,user ID,PV (visit count),product type}
          car000466|513049888252|1|car
        house000028|513041784009|1|house
        house000044|4510174246  |2|house
        house000044|513042437238|1|house
        house000144|513040816140|11|house
        house000144|523047653102|1|house
        house000365|513041784009|1|house
    The resulting data look like this:
          user ID | pre-purchase type (1: car, 2: house) | top 5 recommended product IDs by score
        513049094454|1|car000118,car000349,car000453,car000228,car000320
        533035350388|1|car000287,car000336,car000248,car000319,car000321
        5056715877  |1|car000248,car000334,car000336,car000321,car000317
        513043175350|2|house000358,house000202,house000365
        513049662857|1|car000079,car000439,car000011,car000334,car000202
        513046140916|1|car000287,car000336,car000248,car000319,car000321
        513046403383|1|car000317,car000082,car000349,car000264,car000228
----MyALS.scala

import java.util
import org.apache.spark.mllib.recommendation.{ALS, Rating}
import org.apache.spark.{SparkConf, SparkContext}

object MyALS {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("MYALS")
    val sc = new SparkContext(conf)
    /**
      * Input data: from the [user-product statistics] step: {product ID,user ID,PV (visit count),product type}
      * car000466  |513049888252|1|car
        house000028|513041784009|1|house
        house000044|4510174246|2|house
        house000044|513042437238|1|house
        house000144|513040816140|11|house
        house000144|523047653102|1|house
        house000365|513041784009|1|house
      */
    val protj_rdd = sc.textFile("file:///H:/useranaly/06.DxProStatistic/part-r-00*")

    /* Data preparation: map the string product ID --> an integer product ID (hashCode) and the
	   * string user ID --> an integer user ID (hashCode), since ALS requires Int IDs
	   * (hashCode values can be negative and may in principle collide).
	   * Each record becomes a triple: user ID|user ID hash,product ID|product ID hash,PV|product type
	   * (5064062000|2042345,car000044|000044,5|car)
	   */
    val basicId = protj_rdd.map(x => {
      val thrarray = x.split("\\|")
      (thrarray(1)+"|"+thrarray(1).hashCode,thrarray(0)+"|"+thrarray(0).hashCode,thrarray(2)+"|"+thrarray(3))
    }).cache()

    /* (5064062000|2042345,car000044|000044,5|car)
     * Build the Rating objects the algorithm needs from the user-product visit data.
	   * Sample records: user ID (Int),product ID (Int),PV count
	   *  2074472858,000044,1
		    2101313658,000024,6
		    -2132044919,000014,5
	   */
    val ratings = basicId.map(x => {
      //user ID (Int), product ID (Int), PV count used as the rating
      new Rating(x._1.split("\\|")(1).toInt, x._2.split("\\|")(1).toInt, x._3.split("\\|")(0).toDouble)
    })

    /* Model training: the standard Spark ALS call */
    //rank: dimensionality of the factorization; more latent factors can improve accuracy but use more memory
    val rank = 5
    //maximum number of iterations (default 5; around 10 is a common choice)
    val numIterations = 10
    //train the recommendation model; 0.01 is the regularization parameter (a typical candidate for cross-validation)
    val model = ALS.train(ratings, rank, numIterations, 0.01)

    //------------------------------------------------------------------------------------------------------
    //user_rdd: the distinct hash codes of all user IDs, e.g. 2042345
    val user_rdd = basicId.map(x => x._1.split("\\|")(1).toInt).distinct()
    //pro_rdd: the distinct hash codes of all product IDs, e.g. 000044
    val pro_rdd = basicId.map(x => x._2.split("\\|")(1).toInt).distinct()

    //cartesian product of users and products, e.g. (2042345,000044)
    val userProScore = user_rdd.cartesian(pro_rdd)

    //predict(): use the model to predict every user's score for every product (Int user ID, Int product ID, score)
    //user-product pair, e.g. (2042345|00044,0.5)
    val userProPair = model.predict(userProScore).map(x => ("" + x.user + "|" + x.product, x.rating))

    //user ID with its hash code, keyed by product type
    // input : 5064062000|2042345,car000044|000044,5|car
    // output: car,5064062000|2042345
    val full_user_rdd = basicId.map(x => {
      (x._3.split("\\|")(1), x._1)
    }).distinct

    //product ID with its hash code, keyed by product type
    // input : 5064062000|2042345,car000044|000044,5|car
    // output: car,car000044|000044
    val full_pro_rdd = basicId.map(x => {
      (x._3.split("\\|")(1), x._2)
    }).distinct

    //join users and products on product type
    //car ,(5064062000|2042345,car000044|000044)
    val full_user_pro = full_user_rdd.join(full_pro_rdd).map(x => {
      var saletype = x._1   //product type, e.g. "car"
      var type1 = 1
      saletype match {
        case "car" => type1 = 1
        case "house" => type1 = 2
      }
      //(2042345|00044,5064062000|car00044|1)
      (x._2._1.split("\\|")(1) + "|" + x._2._2.split("\\|")(1),
        x._2._1.split("\\|")(0) + "|" + x._2._2.split("\\|")(0) + "|" + type1)
    })


    //userProPair=(2042345|00044,0.5)
    //full_user_pro=(2042345|00044,5064062000|car00044|1)
    //userProMsg=2042345|00044,(0.5,5064062000|car00044|1)
    //5064062000|1,car00044|0.5
    val userProMsg = userProPair.join(full_user_pro).map(x => {
      (x._2._2.split("\\|")(0) + "|" + x._2._2.split("\\|")(2),
        x._2._2.split("\\|")(1) + "|" + x._2._1)
    })

    /* Take the top 5 products by predicted score
   * Sample output: user ID|pre-purchase type|recommended product list
   *  533047060723|2|house000402,house000098,house000403,house000388,house000382
      513047554444|2|house000296,house000117,house000304,house000100,house000402
      683045963832|1|car000300,car000471,car000097,car000382,car000305
      4510341559|1|car000393,car000107,car000105,car000527,car000004
   * */
    //group by user and pre-purchase type, then keep the 5 highest-scoring products per group
    //userProMsg:  5064062000|1,car00044|0.5
    val userTypeTop = userProMsg.groupByKey.map(x => {
      val ite = x._2.iterator
      val topMap : util.TreeMap[Double, String] = new util.TreeMap[Double, String]
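      // NOTE: the TreeMap is keyed by the predicted score, so products with identical scores
      // overwrite each other; once it holds more than 5 entries, the lowest-scoring one is evicted.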
      while (ite.hasNext) {
        val proPrice = ite.next.split("\\|")
        //keep only positive scores; the threshold can be raised as needed (a higher score means a stronger preference)
        if (proPrice(1).toDouble > 0) {
          topMap.put(proPrice(1).toDouble, proPrice(0))
        }
        if (topMap.size > 5) {
          topMap.remove(topMap.firstKey)
        }
      }
      //concatenate the high-scoring products
      val sb = new StringBuffer
      import scala.collection.JavaConversions._
      for (key <- topMap.keySet) {
        sb.append(topMap.get(key)).append(",")
      }
      x._1 + "|" + sb.toString.substring(0, sb.toString.length - 1)
    })

    /** test: save the recommendation results **/
    userTypeTop.saveAsTextFile("file:///H:/useranaly/11.DxMyALS")
    sc.stop()

  }
}