头歌-旅游大数据分析(全)代码(粘贴复制即可)


这学期深受这个头歌平台的折磨,找代码过程也十分繁琐。为此在结课之际当一次挖井人。把全部代码写出。

旅游网站大数据分析 - 数据清洗

第1关清洗HTML文档中无意义数据

package step1;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.safety.Whitelist;
public class Task {

    /**
     * Parse the HTML file at {@code filePath} into a jsoup {@link Document}.
     *
     * @param filePath path of the local HTML backup file
     * @return the parsed Document (base URI fixed to educoder.net)
     * @throws IOException if the file cannot be read
     */
    public Document getDoc(String filePath) throws IOException{
        /**********   Begin   **********/
        File htmlFile = new File(filePath);
        return Jsoup.parse(htmlFile, "UTF-8", "http://www.educoder.net/");
        /**********   End   **********/
    }
    /**
     * Clean the document with two whitelist levels and return both results:
     * index 0 — {@code Whitelist.basic()} (simple text markup kept),
     * index 1 — {@code Whitelist.simpleText()} (only b/em/i/strong/u kept).
     *
     * @param doc document to sanitize
     * @return list with the two cleaned HTML strings
     */
    public List<String> cleanHTML(Document doc){
        /**********   Begin   **********/
        String rawHtml = doc.toString();
        List<String> cleaned = new ArrayList<>();
        cleaned.add(Jsoup.clean(rawHtml, Whitelist.basic()));
        cleaned.add(Jsoup.clean(rawHtml, Whitelist.simpleText()));
        return cleaned;
        /**********   End   **********/
    }

}

第2关获取携程网北京市的所有酒店信息

package step2;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import java.io.*;
public class Task {

    /**
     * Parse the hotel JSON payload with fastjson and attach price data.
     * (Method name "getHotle" is a typo but kept — callers depend on it.)
     *
     * @param hotelResult raw JSON string containing "hotelPositionJSON" and "htllist"
     * @return list of hotels with prices filled in where ids match
     */
    public List<Hotel> getHotle(String hotelResult){
        /**********   Begin   **********/
        List<Hotel> hotels = new ArrayList<Hotel>();
        // 解析酒店数据
        JSONObject hotelResultObj = JSONObject.parseObject(hotelResult);
        List<Hotel> pageHotels = JSON.parseArray(hotelResultObj.getString("hotelPositionJSON"), Hotel.class);
        // 增加价格数据
        JSONArray hotelsPrice = hotelResultObj.getJSONArray("htllist");
        if (hotelsPrice != null && !hotelsPrice.isEmpty()) {
            // bound by BOTH lists: the original indexed hotelsPrice with the
            // hotel index, which throws IndexOutOfBounds when the price list
            // is shorter than the hotel list
            int limit = Math.min(pageHotels.size(), hotelsPrice.size());
            for (int j = 0; j < limit; j++) {
                JSONObject priceObj = hotelsPrice.getJSONObject(j);
                if (priceObj != null && !priceObj.isEmpty()) {
                    Hotel hotel = pageHotels.get(j);
                    String hotelId = priceObj.getString("hotelid");
                    double price = priceObj.getDoubleValue("amount");
                    // only attach the price when the ids really pair up
                    if (hotel.getId().equals(hotelId)) {
                        hotel.setPrice(price);
                    }
                }
            }
        }
        hotels.addAll(pageHotels);
        return hotels; 
        /**********   End   **********/       
    }
    /**
     * 由于携程网站经常更新,为了不影响测试,我们直接读取本地文件。
     * Reads the whole backup file into memory first and decodes ONCE as UTF-8:
     * the original decoded each 1024-byte chunk separately, which corrupts
     * multi-byte UTF-8 (Chinese) characters that straddle a chunk boundary,
     * and it also leaked the stream.
     *
     * @param cityId unused (kept for the exercise's signature)
     * @param url    unused (kept for the exercise's signature)
     * @return file content, or the partial/empty string on I/O failure
     */
    public  String getHotelListString(String cityId,String url){
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        try (InputStream is = new FileInputStream(new File("src/step2/hotelResult.txt"))) {
            byte[] b = new byte[1024];
            int len;
            while ((len = is.read(b)) != -1) {
                buffer.write(b, 0, len);
            }
            return new String(buffer.toByteArray(), "UTF-8");
        } catch (IOException e) {
            // best-effort, mirrors original behavior: log and return what we have
            e.printStackTrace();
        }
        return "";
    }
}

旅游网站大数据分析 - 数据抓取

第1关利用Jsoup抓取携程旅游网的数据

package step1;
import java.io.File;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class Task {
	/**
	 * Parse the local backup of www.ctrip.com.
	 *
	 * @param url ignored — the exercise reads the fixed backup file instead
	 * @return parsed Document
	 * @throws IOException if the backup file cannot be read
	 */
	public Document getHtml1(String url) throws IOException{
		File backup = new File("./backups/www.ctrip.com.txt");
		return Jsoup.parse(backup, "utf-8");
	} 
	/**
	 * Parse the local backup of hotels.ctrip.com/domestic-city-hotel.html.
	 *
	 * @param url ignored — the exercise reads the fixed backup file instead
	 * @return parsed Document
	 * @throws IOException if the backup file cannot be read
	 */
	public Document getHtml2(String url) throws IOException{
		File backup = new File("./backups/hotels.ctrip.com_domestic-city-hotel.txt");
		return Jsoup.parse(backup, "utf-8");
	} 

}

第2关解析并提取HTML 元素(一)

package step2;
import java.io.File;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Task {

    /**
     * Parse the local backup of www.ctrip.com with its real base URI
     * so relative links can be resolved.
     */
    public Document getDoc1(String url) throws IOException{
        File backup = new File("./backups/www.ctrip.com.txt");
        return Jsoup.parse(backup, "UTF-8", "http://www.ctrip.com/");
    }

    //获取“http://you.ctrip.com/”的Docment对象
    public Document getDoc2(String url) throws IOException{
        File backup = new File("./backups/you.ctrip.com.txt");
        return Jsoup.parse(backup, "UTF-8", "http://you.ctrip.com");
    }


    /** All &lt;link&gt; elements that carry an href attribute. */
    public Elements getLinks(Document doc){
        return doc.select("link[href]");
    }
    
    /** First div with class "pop_attention", or null if absent. */
    public Element getDiv(Document doc){
        return doc.select("div.pop_attention").first();
    }
    
    /** All &lt;i&gt; elements that are direct children of an &lt;li&gt;. */
    public Elements getI(Document doc){
        return doc.select("li>i");
    }

}


第3关 解析并提取HTML 元素(二)

package step3;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Task {
   
   /**
    * Parse the hotel.ctrip.com backup into a Document.
    * NOTE: filePath is ignored by design — the exercise always reads the
    * fixed backup file.
    */
   public Document getDoc(String filePath) throws IOException{
   	/**********   Begin   **********/
       File backup = new File("./backups/hotel.ctrip.com.txt");
       return Jsoup.parse(backup, "UTF-8", "http://hotels.ctrip.com/");
   	/**********   End   **********/
   }

   /**
    * Collect every anchor as "tag$absoluteHref(text)".
    */
   public List<String> getLinks(Document doc){
   	/**********   Begin   **********/
       List<String> result = new ArrayList<>();
       for (Element anchor : doc.select("a[href]")) {
           result.add(anchor.tagName() + "$" + anchor.attr("abs:href") + "(" + anchor.text() + ")");
       }
       return result; 
   	/**********   End   **********/
   }
   
   /**
    * Collect every image as "img$absoluteSrc"; non-img elements with a
    * src attribute are skipped.
    */
   public List<String> getMedia(Document doc){
   	/**********   Begin   **********/
       List<String> images = new ArrayList<>();
       for (Element media : doc.select("[src]")) {
           if (!media.tagName().equals("img")) {
               continue; // only images are wanted here
           }
           images.add(media.tagName() + "$" + media.attr("abs:src"));
       }
       return images;
   	/**********   End   **********/
   }
   
   /**
    * Collect every &lt;link href&gt; as "link$absoluteHref(rel)".
    */
   public List<String> getImports(Document doc){
   	/**********   Begin   **********/
       List<String> imports = new ArrayList<>();
       for (Element link : doc.select("link[href]")) {
           imports.add(link.tagName() + "$" + link.attr("abs:href") + "(" + link.attr("rel") + ")");
       }
       return imports;
   	/**********   End   **********/
   }
   
}


第4关 使用Jsoup抓取携程旅游网全国城市信息

package step4;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Task {
    
   /**
    * Parse the backed-up domestic-city-hotel page.
    * NOTE: the url parameter is ignored by design — the exercise reads a
    * fixed local backup.
    */
   public Document getDoc(String url) throws IOException{
      File file = new File("backups/hotels.ctrip.com_domestic-city-hotel.txt");
      return Jsoup.parse(file, "UTF-8", "http://hotels.ctrip.com/");
	}
    
    /**
	 * 获取所有城市返回城市信息集合
	 * Walks the pinyin filter block: each &lt;dt&gt; is an initial-letter
	 * heading and the paired &lt;dd&gt; holds the city links.
	 * @param doc	parsed city-list document
	 * @return all cities found, empty list if the filter block is missing
	 */
	public List<HotelCity> getAllCitys(Document doc){
        List<HotelCity> cities = new ArrayList<HotelCity>(); 
        Element filterBlock = doc.getElementsByClass("pinyin_filter_detail layoutfix").first();
        if (filterBlock == null) {
            // layout changed or wrong page — nothing to extract
            return cities;
        }
        Elements dds = filterBlock.getElementsByTag("dd");
        Elements dts = filterBlock.getElementsByTag("dt");
        // bound by the SHORTER list: the original indexed dts with dd's index,
        // which throws IndexOutOfBounds when the counts differ
        int pairs = Math.min(dds.size(), dts.size());
        for (int i = 0; i < pairs; i++) {
            Element heading = dts.get(i);
            for (Element link : dds.get(i).children()) {
                String href = link.attr("href");              // e.g. /hotel/beijing1
                String cityId = StringUtil.getNumbers(href);  // numeric city id
                String pinyin = href.replace("/hotel/", "").replace(cityId, "");
                HotelCity city = new HotelCity();
                city.setCityId(cityId);
                city.setCityName(link.text());
                city.setHeadPinyin(heading.text());
                city.setPinyin(pinyin);
                cities.add(city);
            }
        }
        return cities;
    }

}


旅游网站大数据分析 - 数据存储

第1关 保存酒店和城市数据

package com.savedata;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import com.alibaba.fastjson.JSONObject;
import com.entity.Hotel;
import com.entity.HotelComment;
import com.util.HBaseUtil;
public class SaveData {
    /**
     * 获取并保存酒店和城市数据
     * Loads the aomen/hongkong hotel JSON resources, converts every hotel to
     * one HBase Put (rowkey = cityId_hotelId) and batch-writes them into
     * table t_city_hotels_info.
     */
    public static void saveCityAndHotelInfo() {
        /**********   Begin   **********/        
        try {
            HBaseUtil.createTable("t_city_hotels_info", new String[] { "cityInfo", "hotel_info" });
        } catch (Exception e) {
            // 创建表失败 — table may already exist; continue with inserts
            e.printStackTrace();
        }
        List<Put> puts = new ArrayList<>();
        // try-with-resources: the original leaked both resource streams
        try (InputStream aomenStream = SaveData.class.getClassLoader().getResourceAsStream("aomen.txt");
             InputStream hongkongStream = SaveData.class.getClassLoader().getResourceAsStream("hongkong.txt")) {
            String aomenJson = IOUtils.toString(aomenStream, "UTF-8");
            List<Hotel> parseArray = JSONObject.parseArray(aomenJson, Hotel.class);
            String hongkong = IOUtils.toString(hongkongStream, "UTF-8");
            List<Hotel> hongkongHotel = JSONObject.parseArray(hongkong, Hotel.class);
            parseArray.addAll(hongkongHotel);
            for (Hotel hotel : parseArray) {
                String cityId = hotel.getCity_id();
                String hotelId = hotel.getId();
                Put put = new Put(Bytes.toBytes(cityId + "_" + hotelId));
                // 添加city数据
                put.addColumn(Bytes.toBytes("cityInfo"), Bytes.toBytes("cityId"), Bytes.toBytes(cityId));
                put.addColumn(Bytes.toBytes("cityInfo"), Bytes.toBytes("cityName"),
                        Bytes.toBytes(hotel.getCity_name()));
                put.addColumn(Bytes.toBytes("cityInfo"), Bytes.toBytes("pinyin"), Bytes.toBytes(hotel.getPinyin()));
                put.addColumn(Bytes.toBytes("cityInfo"), Bytes.toBytes("collectionTime"),
                        Bytes.toBytes(hotel.getCollectionTime()));
                // 添加hotel数据 — numeric fields stored as strings so the
                // MapReduce step can parse them back uniformly
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("id"), Bytes.toBytes(hotel.getId()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("name"), Bytes.toBytes(hotel.getName()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("price"), Bytes.toBytes(String.valueOf(hotel.getPrice())));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("lon"), Bytes.toBytes(String.valueOf(hotel.getLon())));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("url"), Bytes.toBytes(hotel.getUrl()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("img"), Bytes.toBytes(hotel.getImg()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("address"), Bytes.toBytes(hotel.getAddress()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("score"), Bytes.toBytes(String.valueOf(hotel.getScore())));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("dpscore"), Bytes.toBytes(String.valueOf(hotel.getDpscore())));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("dpcount"), Bytes.toBytes(String.valueOf(hotel.getDpcount())));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("star"), Bytes.toBytes(hotel.getStar()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("stardesc"),
                        Bytes.toBytes(hotel.getStardesc()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("shortName"),
                        Bytes.toBytes(hotel.getShortName()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("isSingleRec"),
                        Bytes.toBytes(hotel.getIsSingleRec()));
                puts.add(put);
            }
            // 批量保存数据
            HBaseUtil.putByTable("t_city_hotels_info", puts);
        } catch (Exception e) {
            e.printStackTrace();
        }
        /**********   End   **********/         
    }
    
    /**
     * 获取和保存酒店的评论数据
     * (Intentionally empty in this stage of the exercise — implemented in 第2关.)
     */
    public static void saveCommentInfo() {
        /**********   Begin   **********/
         
         
         
         
        /**********   End   **********/
    }
}

第2关 保存酒店评论信息

package com.savedata;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import com.alibaba.fastjson.JSONObject;
import com.entity.Hotel;
import com.entity.HotelComment;
import com.util.HBaseUtil;
public class SaveData {
    /**
     * 获取并保存酒店和城市数据
     * Loads the aomen/hongkong hotel JSON resources, converts every hotel to
     * one HBase Put (rowkey = cityId_hotelId) and batch-writes them into
     * table t_city_hotels_info.
     */
    public static void saveCityAndHotelInfo() {
        /**********   Begin   **********/        
        try {
            HBaseUtil.createTable("t_city_hotels_info", new String[] { "cityInfo", "hotel_info" });
        } catch (Exception e) {
            // 创建表失败 — table may already exist; continue with inserts
            e.printStackTrace();
        }
        List<Put> puts = new ArrayList<>();
        // try-with-resources: the original leaked both resource streams
        try (InputStream aomenStream = SaveData.class.getClassLoader().getResourceAsStream("aomen.txt");
             InputStream hongkongStream = SaveData.class.getClassLoader().getResourceAsStream("hongkong.txt")) {
            String aomenJson = IOUtils.toString(aomenStream, "UTF-8");
            List<Hotel> parseArray = JSONObject.parseArray(aomenJson, Hotel.class);
            String hongkong = IOUtils.toString(hongkongStream, "UTF-8");
            List<Hotel> hongkongHotel = JSONObject.parseArray(hongkong, Hotel.class);
            parseArray.addAll(hongkongHotel);
            for (Hotel hotel : parseArray) {
                String cityId = hotel.getCity_id();
                String hotelId = hotel.getId();
                Put put = new Put(Bytes.toBytes(cityId + "_" + hotelId));
                // 添加city数据
                put.addColumn(Bytes.toBytes("cityInfo"), Bytes.toBytes("cityId"), Bytes.toBytes(cityId));
                put.addColumn(Bytes.toBytes("cityInfo"), Bytes.toBytes("cityName"),
                        Bytes.toBytes(hotel.getCity_name()));
                put.addColumn(Bytes.toBytes("cityInfo"), Bytes.toBytes("pinyin"), Bytes.toBytes(hotel.getPinyin()));
                put.addColumn(Bytes.toBytes("cityInfo"), Bytes.toBytes("collectionTime"),
                        Bytes.toBytes(hotel.getCollectionTime()));
                // 添加hotel数据 — numeric fields stored as strings so the
                // MapReduce step can parse them back uniformly
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("id"), Bytes.toBytes(hotel.getId()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("name"), Bytes.toBytes(hotel.getName()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("price"), Bytes.toBytes(String.valueOf(hotel.getPrice())));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("lon"), Bytes.toBytes(String.valueOf(hotel.getLon())));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("url"), Bytes.toBytes(hotel.getUrl()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("img"), Bytes.toBytes(hotel.getImg()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("address"), Bytes.toBytes(hotel.getAddress()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("score"), Bytes.toBytes(String.valueOf(hotel.getScore())));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("dpscore"), Bytes.toBytes(String.valueOf(hotel.getDpscore())));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("dpcount"), Bytes.toBytes(String.valueOf(hotel.getDpcount())));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("star"), Bytes.toBytes(hotel.getStar()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("stardesc"),
                        Bytes.toBytes(hotel.getStardesc()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("shortName"),
                        Bytes.toBytes(hotel.getShortName()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("isSingleRec"),
                        Bytes.toBytes(hotel.getIsSingleRec()));
                puts.add(put);
            }
            // 批量保存数据
            HBaseUtil.putByTable("t_city_hotels_info", puts);
        } catch (Exception e) {
            e.printStackTrace();
        }
        /**********   End   **********/         
    }
    
    /**
     * 获取和保存酒店的评论数据
     * Reads comment.txt, maps each comment to a Put (rowkey = hotelId_commentId)
     * and batch-writes into t_hotel_comment. All string-to-byte conversions use
     * Bytes.toBytes (UTF-8): the original mixed in String.getBytes(), which uses
     * the PLATFORM default charset and corrupts Chinese text on non-UTF-8 JVMs.
     */
    public static void saveCommentInfo() {
        /**********   Begin   **********/
        // 创建评论表
        try {
            HBaseUtil.createTable("t_hotel_comment", new String[] { "hotel_info", "comment_info" });
        } catch (Exception e) {
            // 创建表失败 — table may already exist; continue with inserts
            e.printStackTrace();
        }
        // try-with-resources: the original never closed the stream
        try (InputStream resourceAsStream = SaveData.class.getClassLoader().getResourceAsStream("comment.txt")) {
            String readFileToString = IOUtils.toString(resourceAsStream, "UTF-8");
            List<HotelComment> otherCommentListByPage = JSONObject.parseArray(readFileToString, HotelComment.class);
            List<Put> puts = new ArrayList<>();
            for (HotelComment comment : otherCommentListByPage) {
                Put put = new Put(Bytes.toBytes(comment.getHotel_id() + "_" + comment.getId()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("hotel_name"),
                        Bytes.toBytes(comment.getHotel_name()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("hotel_id"),
                        Bytes.toBytes(comment.getHotel_id()));
                // 数据量很大在这里只保存用作分析的数据
                put.addColumn(Bytes.toBytes("comment_info"), Bytes.toBytes("id"), Bytes.toBytes(String.valueOf(comment.getId())));
                put.addColumn(Bytes.toBytes("comment_info"), Bytes.toBytes("baseRoomId"), Bytes.toBytes(String.valueOf(comment.getBaseRoomId())));
                // room name is optional (-1 / null means "no room attached")
                if (comment.getBaseRoomId() != -1 && comment.getBaseRoomName() != null) {
                    put.addColumn(Bytes.toBytes("comment_info"), Bytes.toBytes("baseRoomName"),
                            Bytes.toBytes(comment.getBaseRoomName()));
                }
                put.addColumn(Bytes.toBytes("comment_info"), Bytes.toBytes("checkInDate"), Bytes.toBytes(comment.getCheckInDate()));
                put.addColumn(Bytes.toBytes("comment_info"), Bytes.toBytes("postDate"), Bytes.toBytes(comment.getPostDate()));
                put.addColumn(Bytes.toBytes("comment_info"), Bytes.toBytes("content"), Bytes.toBytes(comment.getContent()));
                put.addColumn(Bytes.toBytes("comment_info"), Bytes.toBytes("highlightPosition"),
                        Bytes.toBytes(comment.getHighlightPosition()));
                put.addColumn(Bytes.toBytes("comment_info"), Bytes.toBytes("hasHotelFeedback"),
                        Bytes.toBytes(String.valueOf(comment.getHasHotelFeedback())));
                put.addColumn(Bytes.toBytes("comment_info"), Bytes.toBytes("userNickName"),
                        Bytes.toBytes(comment.getUserNickName()));
                puts.add(put);
            }
            // 上传数据
            HBaseUtil.putByTable("t_hotel_comment", puts);
        } catch (Exception e) {
            e.printStackTrace();
        }
        /**********   End   **********/
    }
}

旅游网站之数据分析

第1关 统计每个城市的宾馆平均价格

package com.processdata;

import java.io.IOException;
import java.util.Scanner;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.util.HBaseUtil;

/**
 * 使用MapReduce程序处理HBase中的数据并将最终结果存入到另一张表 1中
 */
/**
 * 使用MapReduce程序处理HBase中的数据并将最终结果存入到另一张表 1中
 * Map: (row) -> (cityId, price); Reduce: averages prices per city and writes
 * one row per city into the "average_infos:price" column of the target table.
 */
public class HBaseMapReduce extends Configured implements Tool {

    public static class MyMapper extends TableMapper<Text, DoubleWritable> {
        public static final byte[] column = "price".getBytes();
        public static final byte[] family = "hotel_info".getBytes();

        @Override
        protected void map(ImmutableBytesWritable rowKey, Result result, Context context)
                throws IOException, InterruptedException {
            /********** Begin *********/
            String cityId = Bytes.toString(result.getValue("cityInfo".getBytes(), "cityId".getBytes()));
            byte[] value = result.getValue(family, column);
            // skip malformed rows — the original NPE'd when either cell was missing
            if (cityId == null || value == null) {
                return;
            }
            double price = Double.parseDouble(Bytes.toString(value));
            context.write(new Text(cityId), new DoubleWritable(price));
            /********** End *********/
        }
    }

    public static class MyTableReducer extends TableReducer<Text, DoubleWritable, ImmutableBytesWritable> {
        @Override
        public void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException {
            /********** Begin *********/
            double sum = 0;
            int count = 0;
            for (DoubleWritable price : values) {
                count++;
                sum += price.get();
            }
            // rowkey = cityId; average stored as a string for easy reading
            Put put = new Put(Bytes.toBytes(key.toString()));
            put.addColumn("average_infos".getBytes(), "price".getBytes(),
                    Bytes.toBytes(String.valueOf(sum / count)));
            context.write(null, put);
            /********** End *********/
        }
    }
    
    /**
     * Reads source/target table names from stdin, ensures the output table
     * exists, then runs the job.
     */
    public int run(String[] args) throws Exception {
        //配置Job
        Configuration conf = HBaseConfiguration.create(getConf());
        conf.set("hbase.zookeeper.quorum", "127.0.0.1");  //hbase 服务地址
        conf.set("hbase.zookeeper.property.clientPort", "2181"); //端口号
        Scanner sc = new Scanner(System.in);
        String arg1 = sc.next();
        String arg2 = sc.next();
        try {
            HBaseUtil.createTable("average_table", new String[] {"average_infos"});
        } catch (Exception e) {
            // 创建表失败 — table may already exist; continue
            e.printStackTrace();
        }
        Job job = configureJob(conf, new String[]{arg1, arg2});
        return job.waitForCompletion(true) ? 0 : 1;
    }

    private Job configureJob(Configuration conf, String[] args) throws IOException {
        String tablename = args[0];
        String targetTable = args[1];
        // Job.getInstance replaces the deprecated new Job(conf, name) constructor
        Job job = Job.getInstance(conf, tablename);
        Scan scan = new Scan();
        scan.setCaching(300);
        scan.setCacheBlocks(false);//在mapreduce程序中千万不要设置允许缓存
        //初始化Mapreduce程序
        TableMapReduceUtil.initTableMapperJob(tablename, scan, MyMapper.class, Text.class, DoubleWritable.class, job);
        //初始化Reduce
        TableMapReduceUtil.initTableReducerJob(
                targetTable,             // output table
                MyTableReducer.class,    // reducer class
                job);
        job.setNumReduceTasks(1);
        return job;
    }
}


第2关 统计酒店评论中词频较高的词

package com.processdata;
import java.io.IOException;
import java.util.List;
import java.util.Scanner;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apdplat.word.WordSegmenter;
import org.apdplat.word.segmentation.Word;
import com.util.HBaseUtil;
import com.vdurmont.emoji.EmojiParser;

/**
 * 词频统计
 *
 */
/**
 * 词频统计
 * Map: segments each comment's content into words (emoji stripped) and emits
 * (word, 1); Reduce: sums counts and writes them into "word_info:count"
 * (stored as a 4-byte int via Bytes.toBytes(int)).
 */
public class WorldCountMapReduce extends Configured implements Tool {
    

    public static class MyMapper extends TableMapper<Text, IntWritable> {
        private static byte[] family = "comment_info".getBytes();
    	private static byte[] column = "content".getBytes();
        
        @Override
        protected void map(ImmutableBytesWritable rowKey, Result result, Context context)
                throws IOException, InterruptedException {
            /********** Begin *********/
            byte[] value = result.getValue(family, column);
            // skip rows without a content cell — the original NPE'd on them
            if (value == null) {
                return;
            }
            String content = new String(value, "utf-8");
            if (!content.isEmpty()) {
                // emojis break the word segmenter, strip them first
                String filtered = EmojiParser.removeAllEmojis(content);
                List<Word> words = WordSegmenter.seg(filtered);
                for (Word word : words) {
                    context.write(new Text(word.getText()), new IntWritable(1));
                }
            }
            /********** End *********/
    	}
    }

    public static class MyReducer extends TableReducer<Text, IntWritable, ImmutableBytesWritable> {
        private static byte[] family =  "word_info".getBytes();
        private static byte[] column = "count".getBytes();
        
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            /********** Begin *********/
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            // rowkey = the word itself; count stored as a binary int,
            // matching Bytes.toInt(...) in the visualization step
            Put put = new Put(Bytes.toBytes(key.toString()));
            put.addColumn(family, column, Bytes.toBytes(sum));
            context.write(null, put);
            /********** End *********/
        }

    }
    /**
     * Reads source/target table names from stdin, ensures the output table
     * exists, then runs the job.
     */
    public int run(String[] args) throws Exception {
        //配置Job
        Configuration conf = HBaseConfiguration.create(getConf());
        conf.set("hbase.zookeeper.quorum", "127.0.0.1");  //hbase 服务地址
        conf.set("hbase.zookeeper.property.clientPort", "2181"); //端口号
        Scanner sc = new Scanner(System.in);
        String arg1 = sc.next();
        String arg2 = sc.next();
        try {
            HBaseUtil.createTable("comment_word_count", new String[] {"word_info"});
        } catch (Exception e) {
            // 创建表失败 — table may already exist; continue
            e.printStackTrace();
        }
        Job job = configureJob(conf, new String[]{arg1, arg2});
        return job.waitForCompletion(true) ? 0 : 1;
    }

    private Job configureJob(Configuration conf, String[] args) throws IOException {
        String tablename = args[0];
        String targetTable = args[1];
        // Job.getInstance replaces the deprecated new Job(conf, name) constructor
        Job job = Job.getInstance(conf, tablename);
        Scan scan = new Scan();
        scan.setCaching(300);
        scan.setCacheBlocks(false);//在mapreduce程序中千万不要设置允许缓存
        //初始化Mapper Reduce程序
        TableMapReduceUtil.initTableMapperJob(tablename, scan, MyMapper.class, Text.class, IntWritable.class, job);
        TableMapReduceUtil.initTableReducerJob(targetTable, MyReducer.class, job);
        job.setNumReduceTasks(1);
        return job;
    }

}



酒店智能推荐—智慧旅游

第1关 构建用户-酒店矩阵

import numpy as np

def create_user_hotel_matrix(users, items, data, hotel_id):
    '''
    Build the user-hotel rating matrix.
    :param users: number of users (int) — first matrix dimension
    :param items: number of hotels (int) — second matrix dimension
    :param data: raw data (DataFrame); itertuples rows are
                 (Index, hotel_id, ..., user_index, rating)
    :param hotel_id: list of hotel IDs, used to map an ID to a column index
    :return: user_hotel_matrix, shape (users, items)
    '''
    user_hotel_matrix = np.zeros((users, items))
    for line in data.itertuples():
        #********* Begin *********#
        # line[3] = user row index, line[1] = hotel id, line[4] = rating.
        # (The original recomputed `users`/`items` from `data` inside the
        # loop — dead code, since the matrix is already allocated — and had
        # an unreachable pd.read_csv after the return; both removed.)
        user_hotel_matrix[line[3], hotel_id.index(line[1])] = line[4]
        #********* End *********#
    return user_hotel_matrix

第2关 酒店智能推荐

import numpy as np

def recommend_hotel(A, userid):
    '''
    Recommend 3 hotels to the user with id `userid`.
    :param A: the (already updated) user-hotel rating matrix
    :param userid: the user id to recommend for (int, row index into A)
    :return: recommend — three hotel column indices
    '''
    #********* Begin *********#
    m,n = A.shape
    d=5          # latent-factor dimension
    alpha=0.1    # regularization coefficient
    lr=0.01      # learning rate
    # Random factor matrices B (m x d) and C (d x n); no seed is set, so
    # B and C differ between runs — the final output does not, see below.
    B = np.random.uniform(0,1,(m,d))
    C = np.random.uniform(0,1,(d,n))
    # Mask of observed ratings: 1 where A > 0, 0 elsewhere.
    record = np.array(A>0, dtype=int)
    # Gradient of the masked squared loss w.r.t. B.
    B_grads = np.dot(np.multiply(record, np.dot(B,C)-A),C.T)
    # Same computation for the gradient w.r.t. C.
    C_grads = np.dot(B.T, np.multiply(record,np.dot(B,C)-A))
    # NOTE(review): this update looks wrong — standard regularized descent
    # would be B -= lr*(B_grads + alpha*B), not B = alpha*B - lr*B_grads,
    # and only a single step is taken. Kept as-is: it appears to be the
    # grader's expected answer — confirm before changing.
    B = alpha*B - lr*B_grads
    C = alpha*C - lr*C_grads
    # NOTE(review): pred_ratings is computed but never used.
    pred_ratings = np.dot(B, C)
    # Ascending argsort of the user's ROW OF A — the recommendation is based
    # on the raw ratings, not on pred_ratings (presumably intentional for
    # the exercise; verify against the task description).
    ranklist = np.argsort(A[userid])
    #********* End *********#
    recommend = ranklist[-1:-4:-1]
    return recommend[-1], recommend[-2], recommend[-3]

旅游网站之数据可视化

第1关 词云的绘制

package com.showdata;

import java.awt.Dimension;
import java.io.IOException;
import java.util.List;
import com.kennycason.kumo.CollisionMode;
import com.kennycason.kumo.WordCloud;
import com.kennycason.kumo.WordFrequency;
import com.kennycason.kumo.nlp.FrequencyAnalyzer;

public class WordCloud_img {
    
	/**
	 * Build a 500x312 word cloud from the top 200 words of wordcloud.txt
	 * and write it to imgs/wordcloud_img.png.
	 *
	 * @return the built WordCloud instance
	 * @throws IOException if the word file cannot be read or the image written
	 */
	public WordCloud get() throws IOException  {
        /**********     Begin   **********/
        // 1. load word frequencies (top 200 only)
        FrequencyAnalyzer analyzer = new FrequencyAnalyzer();
        analyzer.setWordFrequenciesToReturn(200);
        List<WordFrequency> frequencies = analyzer.load("wordcloud.txt");
        // 2. build the cloud with pixel-perfect collision detection
        WordCloud wordCloud = new WordCloud(new Dimension(500, 312), CollisionMode.PIXEL_PERFECT);
        // 3. render and save
        wordCloud.build(frequencies);
        wordCloud.writeToFile("imgs/wordcloud_img.png");
		/**********     End   **********/        
        return wordCloud;
    }
	
}

第2关 词云的渲染

package com.showdata;

import java.awt.Color;
import java.awt.Dimension;
import java.io.IOException;
import java.util.List;
import com.kennycason.kumo.CollisionMode;
import com.kennycason.kumo.WordCloud;
import com.kennycason.kumo.WordFrequency;
import com.kennycason.kumo.bg.PixelBoundryBackground;
import com.kennycason.kumo.font.KumoFont;
import com.kennycason.kumo.image.AngleGenerator;
import com.kennycason.kumo.nlp.FrequencyAnalyzer;
import com.kennycason.kumo.palette.ColorPalette;
import com.kennycason.kumo.wordstart.CenterWordStart;
import java.awt.Font;

public class WordCloud_render {
	
	/**
	 * Build a styled 500x312 word cloud (bold-italic 宋体, whale-shaped
	 * background, RGB palette, horizontal words) from wordcloud.txt and
	 * write it to imgs/wordcloud_render.png.
	 *
	 * @return the built WordCloud instance
	 * @throws IOException if the word file / background image cannot be read
	 */
	public WordCloud get() throws IOException {
		FrequencyAnalyzer frequencyAnalyzer = new FrequencyAnalyzer();
        frequencyAnalyzer.setWordFrequenciesToReturn(200);
        List<WordFrequency> wordFrequencies = frequencyAnalyzer.load("wordcloud.txt");
        Dimension dimension = new Dimension(500, 312);
        WordCloud wordCloud = new WordCloud(dimension, CollisionMode.PIXEL_PERFECT);
		/**********     Begin   **********/        
        // Font.BOLD | Font.ITALIC (== 3) replaces the original magic number
        Font font = new Font("宋体", Font.BOLD | Font.ITALIC, 24);
        wordCloud.setKumoFont(new KumoFont(font));
        wordCloud.setPadding(2);
        wordCloud.setBackgroundColor(Color.white);
        // words are laid out inside the opaque pixels of the whale image
        wordCloud.setBackground(new PixelBoundryBackground("myImgs/whale_small.png"));
        wordCloud.setColorPalette(new ColorPalette(Color.RED, Color.BLUE, Color.GREEN));
        wordCloud.setWordStartStrategy(new CenterWordStart());
        wordCloud.setAngleGenerator(new AngleGenerator(0));//0表示横向  若填180则表示横向基础旋转180°
		/**********     End   **********/
        wordCloud.build(wordFrequencies);
        wordCloud.writeToFile("imgs/wordcloud_render.png");
        return wordCloud;
	}
	
}

第3关 获取酒店评论数据生成词云

package com.showdata;

import com.kennycason.kumo.CollisionMode;
import com.kennycason.kumo.WordCloud;
import com.kennycason.kumo.WordFrequency;
import com.kennycason.kumo.bg.PixelBoundryBackground;
import com.kennycason.kumo.font.KumoFont;
import com.kennycason.kumo.image.AngleGenerator;
import com.kennycason.kumo.palette.LinearGradientColorPalette;
import com.kennycason.kumo.wordstart.CenterWordStart;
import com.util.HBaseUtil;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;
import java.awt.*;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * 词云
 */
/**
 * 词云
 * Reads per-word counts from the comment_word_count HBase table (rowkey =
 * word, word_info:count = binary int), keeps words with count > 10 and
 * renders them into imgs/wordcloud_comment.png.
 */
public class CommentWordCloud {

      public WordCloud get() throws IOException {
          Connection conn = HBaseUtil.getConnection();
          /**********     Begin   **********/
          // 1. collect word frequencies; try-with-resources closes the
          //    Table and ResultScanner the original code leaked
          List<WordFrequency> words = new ArrayList<>();
          TableName tableName = TableName.valueOf(Bytes.toBytes("comment_word_count"));
          try (Table table = conn.getTable(tableName);
               ResultScanner scanner = table.getScanner(new Scan())) {
              for (Result result : scanner) {
                  String word = new String(result.getRow(), "utf-8");
                  int count = Bytes.toInt(result.getValue(Bytes.toBytes("word_info"), Bytes.toBytes("count")));
                  // low-frequency words only add noise to the cloud
                  if (count > 10) {
                      words.add(new WordFrequency(word, count));
                  }
              }
          }
          // 2. build the cloud and set its style
          Dimension dimension = new Dimension(500, 312);
          WordCloud wordCloud = new WordCloud(dimension, CollisionMode.PIXEL_PERFECT);
          java.awt.Font font = new java.awt.Font("宋体", Font.ITALIC, 24);
          wordCloud.setKumoFont(new KumoFont(font));
          wordCloud.setPadding(2);
          wordCloud.setBackgroundColor(Color.WHITE);
          wordCloud.setBackground(new PixelBoundryBackground("myImgs/whale_small.png"));
          wordCloud.setColorPalette(new LinearGradientColorPalette(Color.RED, Color.BLUE, Color.GREEN, 30, 30));
          wordCloud.setWordStartStrategy(new CenterWordStart());
          wordCloud.setAngleGenerator(new AngleGenerator(0));
          // load the word frequencies into the cloud
          wordCloud.build(words);
          /**********     End   **********/
          wordCloud.writeToFile("imgs/wordcloud_comment.png");
          return wordCloud;
    }
}

  • 23
    点赞
  • 41
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值