数据挖掘——商品推荐和大数据处理

zkinglin

已于 2022-02-21 17:31:18 修改

阅读量3.6k

点赞数 6

分类专栏：数据挖掘习题　文章标签：数据挖掘 python 数据分析 大数据 云计算

于 2021-04-16 13:47:52 首次发布

本文链接：https://blog.csdn.net/weixin_43246400/article/details/115756254

版权

本文介绍了使用Hadoop云计算基础进行数据处理，包括WordCount、HDFS操作、倒排索引和PageRank算法。接着通过Numpy进行数据加载和处理，实现了商品推荐的亲和性分析，包括数据转换、支持度和置信度计算，并对规则进行排序，以找到最佳推荐规则。

摘要由CSDN通过智能技术生成

一、Hadoop云计算基础

该部分为转载内容；仅粘贴代码，详情请见该网页

第1关：WordCount词频统计

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {
   
    /*
     * MapReduceBase类:实现Mapper和Reducer接口的基类 Mapper接口：
     * WritableComparable接口：实现WritableComparable的类可以相互比较。所有被用作key的类要实现此接口。
     */
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
   
        /*
         * LongWritable,IntWritable,Text是Hadoop中实现的用于封装Java数据类型的类，
         * 这些类实现了WritableComparable接口，
         * 都能够被串行化，便于在分布式环境中进行数据交换，可以视为long,int,String数据类型的替代。
         */
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();// Text实现了BinaryComparable类，可以作为key值

        /*
         * Mapper接口中的map方法： void map(K1 key, V1 value, OutputCollector<K2,V2> output,
         * Reporter reporter) 映射一个单个的输入<K1,V1>对到一个中间输出<K2,V2>对
         * 中间输出对不需要和输入对是相同的类型，输入对可以映射到0个或多个输出对。
         * OutputCollector接口：收集Mapper和Reducer输出的<K,V>对。 OutputCollector接口的collect(k,
         * v)方法:增加一个(k,v)对到output Reporter 用于报告整个应用的运行进度
         */

        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
   
            /*
             * 原始数据（以test1.txt为例）： tale as old as time true as it can be beauty and the
             * beast map阶段，数据如下形式作为map的输入值：key为偏移量 <0 tale as old as time> <21 world java
             * hello> <39 you me too>
             */

            /**
             * 解析(Spliting)后以得到键值对<K2,V2>（仅以test1.txt为例） 格式如下：前者是键值，后者数字是值 tale 1 as 1 old 1
             * as 1 time 1 true 1 as 1 it 1 can 1 be 1 beauty 1 and 1 the 1 beast 1
             * 这些键值对作为map的输出数据
             */

            // ****请补全map函数内容****//
            /********* begin *********/
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
   
                word.set(itr.nextToken());
                context.write(word, one);
            }
            /********* end **********/

        }
    }

    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
   
        private IntWritable result = new IntWritable();

        /*
         * reduce过程是对输入键值对洗牌（Shuffing）形成<K2,list(V2)>格式数据（仅以test1.txt为例)： (tablie [1])
         * (as [1,1,1]) (old [1]) (time [1]) (true [1]) (it [1]) (can [1]) (be [1])
         * (beauty [1]) (and [1]) (the [1]) (beast [1]) 作为reduce的输入
         * 
         */
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
   
            // ****请补全reduce对<k2， list（v2）> 进行合计得到list（<k3，v3>）过程****//
            /********* begin *********/
            int sum = 0;
            for (IntWritable val : values) {
   
                sum += val.get();
            }
            /********* end **********/

            // ****请将list（<k3，v3>）统计输出****//

            /********* begin *********/
            result.set(sum);
            context.write(key, result);
            /********* end **********/
        }
    }

    public static void main(String[] args) throws Exception {
   
        /**
         * JobConf：map/reduce的job配置类，向hadoop框架描述map-reduce执行的工作
         * 构造方法：JobConf()、JobConf(Class exampleClass)、JobConf(Configuration conf)等
         */
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        /*
         * 需要配置输入和输出的HDFS的文件路径参数 可以使用"Usage: wordcount <in> <out>"实现程序运行时动态指定输入输出
         */
        if (otherArgs.length != 2) {
   
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "word count");// Job(Configuration conf,String jobName)设置job名称
        job.setJarByClass(WordCount.class);// 为job设置Mapper类
        /********* begin *********/
        // ****请为job设置Mapper类****//
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);// 为job设置Combiner类
        // ****请为job设置Reduce类****//
        job.setReducerClass(IntSumReducer.class);
        // ****请设置输出key的参数类型****//
        job.setOutputKeyClass(Text.class);
        // ****请设置输出value的类型****//
        job.setOutputValueClass(IntWritable.class);
        /********* end **********/
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));// 为map-reduce任务设置InputFormat实现类，设置输入路径
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));// 为map-reduce任务设置OutputFormat实现类，设置输出路径
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

第2关：HDFS文件读写

import java.io.IOException;
import java.sql.Date;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class hdfs {
   

    public static void main(String[] args) throws IOException {
   
        // throws IOException捕获异常声明

        // ****请根据提示补全文件创建过程****//
        /********* begin *********/
        Configuration conf = new Configuration(); // 实例化设置文件，configuration类实现hadoop各模块之间值的传递
        FileSystem fs = FileSystem.get(conf); // 是hadoop访问系统的抽象类，获取文件系统，
                                              // FileSystem的get()方法得到实例fs，然后fs调动create()创建文件，open(）打开文件
        System.out.println(fs.getUri());
        // 实现文件读写主要包含以下步骤：
        // 读取hadoop文件系统配置
        // 实例化设置文件，configuration类实现hadoop各模块之间值的传递
        // FileSystem是hadoop访问系统的抽象类，获取文件系统，
        // FileSystem的get()方法得到实例fs，然后fs调动create()创建文件，调用open()打开文件,调用close()关闭文件

        // *****请按照题目填写要创建的路径，其他路径及文件名无法被识别******//

        Path file = new Path("/user/hadoop/myfile");

        /********