头歌educator-旅游大数据分析实训答案
- 旅游网站大数据分析 - 数据清洗
- 第1关清洗HTML文档中无意义数据
- 第2关获取携程网北京市的所有酒店信息
- 旅游网站大数据分析 - 数据抓取
- 第1关利用Jsoup抓取携程旅游网的数据
- 第2关解析并提取HTML 元素(一)
- 第3关 解析并提取HTML 元素(二)
- 第4关 使用Jsoup抓取携程旅游网全国城市信息
- 旅游网站大数据分析 - 数据存储
- 第1关 保存酒店和城市数据
- 第2关 保存酒店评论信息
- 旅游网站之数据分析
- 第1关 统计每个城市的宾馆平均价格
- 第2关 统计酒店评论中词频较高的词
- 酒店智能推荐---智慧旅游
- 第1关 构建用户-酒店矩阵
- 第2关 酒店智能推荐
- 旅游网站之数据可视化
- 第1关 词云的绘制
- 第2关 词云的渲染
- 第3关 获取酒店评论数据生成词云
这学期深受这个头歌平台的折磨,找代码的过程也十分繁琐。为此在结课之际当一次挖井人,把全部代码写出。
旅游网站大数据分析 - 数据清洗
第1关清洗HTML文档中无意义数据
package step1;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.safety.Whitelist;
public class Task {
    /**
     * Parses the HTML file at the given path into a jsoup Document.
     *
     * @param filePath path of the HTML file to load
     * @return the parsed Document (UTF-8, base URI http://www.educoder.net/)
     * @throws IOException if the file cannot be read
     */
    public Document getDoc(String filePath) throws IOException{
        /********** Begin **********/
        return Jsoup.parse(new File(filePath), "UTF-8", "http://www.educoder.net/");
        /********** End **********/
    }
    /**
     * Cleans the document twice — once with the basic whitelist and once with
     * the simple-text whitelist — and returns both cleaned strings.
     *
     * @param doc document to clean
     * @return list of [basic-cleaned HTML, simple-text-cleaned HTML]
     */
    public List<String> cleanHTML(Document doc){
        /********** Begin **********/
        String html = doc.toString();
        List<String> cleaned = new ArrayList<>();
        cleaned.add(Jsoup.clean(html, Whitelist.basic()));
        cleaned.add(Jsoup.clean(html, Whitelist.simpleText()));
        return cleaned;
        /********** End **********/
    }
}
第2关获取携程网北京市的所有酒店信息
package step2;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import java.io.*;
public class Task {
    /**
     * 使用fastjson解析数据 — parses the hotel JSON payload into Hotel objects
     * and merges in the per-hotel prices.
     *
     * @param hotelResult JSON string containing "hotelPositionJSON" (hotel
     *        array) and "htllist" (price array; assumed to be index-aligned
     *        with the hotels — ids are verified before a price is applied)
     * @return hotels, with price set where the ids matched
     */
    public List<Hotel> getHotle(String hotelResult){
        /********** Begin **********/
        List<Hotel> hotels = new ArrayList<Hotel>();
        // 解析酒店数据 (parse the hotel list)
        JSONObject hotelResultObj = JSONObject.parseObject(hotelResult);
        List<Hotel> pageHotels = JSON.parseArray(hotelResultObj.getString("hotelPositionJSON"), Hotel.class);
        // 增加价格数据 (attach prices; only when the id at the same index matches)
        JSONArray hotelsPrice = hotelResultObj.getJSONArray("htllist");
        if (hotelsPrice != null && !hotelsPrice.isEmpty()) {
            for (int j = 0; j < pageHotels.size(); j++) {
                JSONObject priceObj = hotelsPrice.getJSONObject(j);
                if (priceObj != null && !priceObj.isEmpty()) {
                    Hotel hotel = pageHotels.get(j);
                    String hotelId = priceObj.getString("hotelid");
                    double price = priceObj.getDoubleValue("amount");
                    if (hotel.getId().equals(hotelId)) {
                        hotel.setPrice(price);
                    }
                }
            }
        }
        hotels.addAll(pageHotels);
        return hotels;
        /********** End **********/
    }
    /**
     * 由于携程网站经常更新,为了不影响测试,我们直接读取本地文件。
     * Reads the snapshot file as UTF-8 text.
     *
     * @param cityId ignored (kept for the expected signature)
     * @param url ignored (kept for the expected signature)
     * @return file contents, or the empty string on I/O failure
     */
    public String getHotelListString(String cityId,String url){
        StringBuilder hotelResult = new StringBuilder();
        // try-with-resources closes the stream (the original leaked it), and an
        // InputStreamReader decodes UTF-8 across buffer boundaries instead of
        // converting each raw byte chunk independently — the original could
        // split a multi-byte character at a 1024-byte boundary and corrupt it.
        try (Reader reader = new InputStreamReader(
                new FileInputStream(new File("src/step2/hotelResult.txt")), "UTF-8")) {
            char[] buf = new char[1024];
            int len;
            while ((len = reader.read(buf)) != -1) {
                hotelResult.append(buf, 0, len);
            }
        } catch (IOException e) {
            // matches the original best-effort behavior: log and return what we have
            e.printStackTrace();
        }
        return hotelResult.toString();
    }
}
旅游网站大数据分析 - 数据抓取
第1关利用Jsoup抓取携程旅游网的数据
package step1;
import java.io.File;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
public class Task {
    /**
     * Loads the locally backed-up www.ctrip.com homepage as a Document.
     * The live site changes frequently, so the snapshot file is read instead.
     *
     * @param url ignored; kept for the expected signature
     * @return the parsed snapshot document
     * @throws IOException if the backup file cannot be read
     */
    public Document getHtml1(String url) throws IOException{
        File backup = new File("./backups/www.ctrip.com.txt");
        return Jsoup.parse(backup, "utf-8");
    }
    /**
     * Loads the backed-up domestic-city-hotel page
     * (http://hotels.ctrip.com/domestic-city-hotel.html) as a Document.
     *
     * @param url ignored; kept for the expected signature
     * @return the parsed snapshot document
     * @throws IOException if the backup file cannot be read
     */
    public Document getHtml2(String url) throws IOException{
        File backup = new File("./backups/hotels.ctrip.com_domestic-city-hotel.txt");
        return Jsoup.parse(backup, "utf-8");
    }
}
第2关解析并提取HTML 元素(一)
package step2;
import java.io.File;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Task {
    // Document for the backed-up www.ctrip.com homepage.
    public Document getDoc1(String url) throws IOException{
        File backup = new File("./backups/www.ctrip.com.txt");
        return Jsoup.parse(backup, "UTF-8", "http://www.ctrip.com/");
    }
    // Document for the backed-up "http://you.ctrip.com/" page.
    public Document getDoc2(String url) throws IOException{
        File backup = new File("./backups/you.ctrip.com.txt");
        return Jsoup.parse(backup, "UTF-8", "http://you.ctrip.com");
    }
    // All <link> elements that carry an href attribute.
    public Elements getLinks(Document doc){
        return doc.select("link[href]");
    }
    // The first div with class "pop_attention" (null if none).
    public Element getDiv(Document doc){
        return doc.select("div.pop_attention").first();
    }
    // All <i> elements that are direct children of an <li>.
    public Elements getI(Document doc){
        return doc.select("li>i");
    }
}
第3关 解析并提取HTML 元素(二)
package step3;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Task {
    // Loads the backed-up hotel.ctrip.com snapshot; filePath is ignored
    // because the snapshot location is fixed.
    public Document getDoc(String filePath) throws IOException{
        /********** Begin **********/
        return Jsoup.parse(new File("./backups/hotel.ctrip.com.txt"), "UTF-8", "http://hotels.ctrip.com/");
        /********** End **********/
    }
    // Formats every <a href> as "a$absoluteUrl(linkText)".
    public List<String> getLinks(Document doc){
        /********** Begin **********/
        List<String> links = new ArrayList<>();
        for (Element a : doc.select("a[href]")) {
            links.add(a.tagName() + "$" + a.attr("abs:href") + "(" + a.text() + ")");
        }
        return links;
        /********** End **********/
    }
    // Formats every <img src> as "img$absoluteSrc".
    public List<String> getMedia(Document doc){
        /********** Begin **********/
        List<String> images = new ArrayList<>();
        for (Element el : doc.select("[src]")) {
            if ("img".equals(el.tagName())) {
                images.add(el.tagName() + "$" + el.attr("abs:src"));
            }
        }
        return images;
        /********** End **********/
    }
    // Formats every <link href> as "link$absoluteHref(relValue)".
    public List<String> getImports(Document doc){
        /********** Begin **********/
        List<String> imports = new ArrayList<>();
        for (Element link : doc.select("link[href]")) {
            imports.add(link.tagName() + "$" + link.attr("abs:href") + "(" + link.attr("rel") + ")");
        }
        return imports;
        /********** End **********/
    }
}
第4关 使用Jsoup抓取携程旅游网全国城市信息
package step4;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Task {
    // Loads the backed-up domestic-city-hotel snapshot; url is ignored.
    public Document getDoc(String url) throws IOException{
        File snapshot = new File("backups/hotels.ctrip.com_domestic-city-hotel.txt");
        return Jsoup.parse(snapshot, "UTF-8", "http://hotels.ctrip.com/");
    }
    /**
     * 获取所有城市返回城市信息集合 — walks the pinyin filter block
     * (dt = pinyin group heading, dd = city links of that group) and builds a
     * HotelCity per link.
     *
     * @param doc parsed city-hotel page
     * @return all cities with id, name, group heading, and pinyin extracted
     */
    public List<HotelCity> getAllCitys(Document doc){
        List<HotelCity> cities = new ArrayList<HotelCity>();
        Element filter = doc.getElementsByClass("pinyin_filter_detail layoutfix").first();
        Elements cityGroups = filter.getElementsByTag("dd");
        Elements groupHeads = filter.getElementsByTag("dt");
        // NOTE(review): assumes dt and dd lists are the same length — TODO confirm
        for (int i = 0; i < cityGroups.size(); i++) {
            Element head = groupHeads.get(i);
            for (Element link : cityGroups.get(i).children()) {
                String href = link.attr("href");
                String cityId = StringUtil.getNumbers(href);            // digits of the href = city id
                // drop the "/hotel/" prefix and the id digits — what remains is the pinyin
                String pinyin = href.replace("/hotel/", "").replace(cityId, "");
                HotelCity city = new HotelCity();
                city.setCityId(cityId);
                city.setCityName(link.text());
                city.setHeadPinyin(head.text());
                city.setPinyin(pinyin);
                cities.add(city);
            }
        }
        return cities;
    }
}
旅游网站大数据分析 - 数据存储
第1关 保存酒店和城市数据
package com.savedata;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import com.alibaba.fastjson.JSONObject;
import com.entity.Hotel;
import com.entity.HotelComment;
import com.util.HBaseUtil;
public class SaveData {
    /**
     * 获取并保存酒店和城市数据.
     * Reads the bundled aomen.txt / hongkong.txt hotel JSON from the
     * classpath, turns each hotel into one Put (row key "cityId_hotelId",
     * families cityInfo + hotel_info) and bulk-writes them to the HBase table
     * t_city_hotels_info.
     */
    public static void saveCityAndHotelInfo() {
        /********** Begin **********/
        try {
            HBaseUtil.createTable("t_city_hotels_info", new String[] { "cityInfo", "hotel_info" });
        } catch (Exception e) {
            // 创建表失败 — the table may already exist; log and keep going
            e.printStackTrace();
        }
        List<Put> puts = new ArrayList<>();
        // 添加数据 — try-with-resources closes both classpath streams
        // (the original never closed them: a resource leak)
        try (InputStream aomenStream = SaveData.class.getClassLoader().getResourceAsStream("aomen.txt");
                InputStream hongkongStream = SaveData.class.getClassLoader().getResourceAsStream("hongkong.txt")) {
            String readFileToString = IOUtils.toString(aomenStream, "UTF-8");
            List<Hotel> parseArray = JSONObject.parseArray(readFileToString, Hotel.class);
            String hongkong = IOUtils.toString(hongkongStream, "UTF-8");
            List<Hotel> hongkongHotel = JSONObject.parseArray(hongkong, Hotel.class);
            parseArray.addAll(hongkongHotel);
            for (Hotel hotel : parseArray) {
                String cityId = hotel.getCity_id();
                String hotelId = hotel.getId();
                // row key: cityId_hotelId
                Put put = new Put(Bytes.toBytes(cityId + "_" + hotelId));
                // 添加city数据 (cityInfo family)
                put.addColumn(Bytes.toBytes("cityInfo"), Bytes.toBytes("cityId"), Bytes.toBytes(cityId));
                put.addColumn(Bytes.toBytes("cityInfo"), Bytes.toBytes("cityName"),
                        Bytes.toBytes(hotel.getCity_name()));
                put.addColumn(Bytes.toBytes("cityInfo"), Bytes.toBytes("pinyin"), Bytes.toBytes(hotel.getPinyin()));
                put.addColumn(Bytes.toBytes("cityInfo"), Bytes.toBytes("collectionTime"),
                        Bytes.toBytes(hotel.getCollectionTime()));
                // 添加hotel数据 (hotel_info family; numeric fields stored as strings)
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("id"), Bytes.toBytes(hotel.getId()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("name"), Bytes.toBytes(hotel.getName()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("price"), Bytes.toBytes(String.valueOf(hotel.getPrice())));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("lon"), Bytes.toBytes(String.valueOf(hotel.getLon())));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("url"), Bytes.toBytes(hotel.getUrl()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("img"), Bytes.toBytes(hotel.getImg()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("address"), Bytes.toBytes(hotel.getAddress()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("score"), Bytes.toBytes(String.valueOf(hotel.getScore())));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("dpscore"), Bytes.toBytes(String.valueOf(hotel.getDpscore())));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("dpcount"), Bytes.toBytes(String.valueOf(hotel.getDpcount())));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("star"), Bytes.toBytes(hotel.getStar()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("stardesc"),
                        Bytes.toBytes(hotel.getStardesc()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("shortName"),
                        Bytes.toBytes(hotel.getShortName()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("isSingleRec"),
                        Bytes.toBytes(hotel.getIsSingleRec()));
                puts.add(put);
            }
            // 批量保存数据
            HBaseUtil.putByTable("t_city_hotels_info", puts);
        } catch (Exception e) {
            e.printStackTrace();
        }
        /********** End **********/
    }
    /**
     * 获取和保存酒店的评论数据 — intentionally empty in this task; implemented
     * in the next task.
     */
    public static void saveCommentInfo() {
        /********** Begin **********/
        /********** End **********/
    }
}
第2关 保存酒店评论信息
package com.savedata;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import com.alibaba.fastjson.JSONObject;
import com.entity.Hotel;
import com.entity.HotelComment;
import com.util.HBaseUtil;
public class SaveData {
    /**
     * 获取并保存酒店和城市数据.
     * Reads the bundled aomen.txt / hongkong.txt hotel JSON from the
     * classpath, turns each hotel into one Put (row key "cityId_hotelId",
     * families cityInfo + hotel_info) and bulk-writes them to the HBase table
     * t_city_hotels_info.
     */
    public static void saveCityAndHotelInfo() {
        /********** Begin **********/
        try {
            HBaseUtil.createTable("t_city_hotels_info", new String[] { "cityInfo", "hotel_info" });
        } catch (Exception e) {
            // 创建表失败 — the table may already exist; log and keep going
            e.printStackTrace();
        }
        List<Put> puts = new ArrayList<>();
        // 添加数据 — try-with-resources closes both classpath streams
        // (the original never closed them: a resource leak)
        try (InputStream aomenStream = SaveData.class.getClassLoader().getResourceAsStream("aomen.txt");
                InputStream hongkongStream = SaveData.class.getClassLoader().getResourceAsStream("hongkong.txt")) {
            String readFileToString = IOUtils.toString(aomenStream, "UTF-8");
            List<Hotel> parseArray = JSONObject.parseArray(readFileToString, Hotel.class);
            String hongkong = IOUtils.toString(hongkongStream, "UTF-8");
            List<Hotel> hongkongHotel = JSONObject.parseArray(hongkong, Hotel.class);
            parseArray.addAll(hongkongHotel);
            for (Hotel hotel : parseArray) {
                String cityId = hotel.getCity_id();
                String hotelId = hotel.getId();
                // row key: cityId_hotelId
                Put put = new Put(Bytes.toBytes(cityId + "_" + hotelId));
                // 添加city数据 (cityInfo family)
                put.addColumn(Bytes.toBytes("cityInfo"), Bytes.toBytes("cityId"), Bytes.toBytes(cityId));
                put.addColumn(Bytes.toBytes("cityInfo"), Bytes.toBytes("cityName"),
                        Bytes.toBytes(hotel.getCity_name()));
                put.addColumn(Bytes.toBytes("cityInfo"), Bytes.toBytes("pinyin"), Bytes.toBytes(hotel.getPinyin()));
                put.addColumn(Bytes.toBytes("cityInfo"), Bytes.toBytes("collectionTime"),
                        Bytes.toBytes(hotel.getCollectionTime()));
                // 添加hotel数据 (hotel_info family; numeric fields stored as strings)
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("id"), Bytes.toBytes(hotel.getId()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("name"), Bytes.toBytes(hotel.getName()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("price"), Bytes.toBytes(String.valueOf(hotel.getPrice())));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("lon"), Bytes.toBytes(String.valueOf(hotel.getLon())));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("url"), Bytes.toBytes(hotel.getUrl()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("img"), Bytes.toBytes(hotel.getImg()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("address"), Bytes.toBytes(hotel.getAddress()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("score"), Bytes.toBytes(String.valueOf(hotel.getScore())));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("dpscore"), Bytes.toBytes(String.valueOf(hotel.getDpscore())));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("dpcount"), Bytes.toBytes(String.valueOf(hotel.getDpcount())));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("star"), Bytes.toBytes(hotel.getStar()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("stardesc"),
                        Bytes.toBytes(hotel.getStardesc()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("shortName"),
                        Bytes.toBytes(hotel.getShortName()));
                put.addColumn(Bytes.toBytes("hotel_info"), Bytes.toBytes("isSingleRec"),
                        Bytes.toBytes(hotel.getIsSingleRec()));
                puts.add(put);
            }
            // 批量保存数据
            HBaseUtil.putByTable("t_city_hotels_info", puts);
        } catch (Exception e) {
            e.printStackTrace();
        }
        /********** End **********/
    }
    /**
     * 获取和保存酒店的评论数据.
     * Reads comment.txt from the classpath and writes each comment to the
     * HBase table t_hotel_comment (row key "hotelId_commentId", families
     * hotel_info + comment_info).
     */
    public static void saveCommentInfo() {
        /********** Begin **********/
        // 创建评论表
        try {
            HBaseUtil.createTable("t_hotel_comment", new String[] { "hotel_info", "comment_info" });
        } catch (Exception e) {
            // 创建表失败 — the table may already exist; log and keep going
            e.printStackTrace();
        }
        // try-with-resources closes the classpath stream (the original leaked it)
        try (InputStream resourceAsStream = SaveData.class.getClassLoader().getResourceAsStream("comment.txt")) {
            String readFileToString = IOUtils.toString(resourceAsStream, "UTF-8");
            List<HotelComment> otherCommentListByPage = JSONObject.parseArray(readFileToString, HotelComment.class);
            // 获取数据
            List<Put> puts = new ArrayList<>();
            // 定义Put对象
            for (HotelComment comment : otherCommentListByPage) {
                Put put = new Put((comment.getHotel_id() + "_" + comment.getId()).getBytes());
                put.addColumn("hotel_info".getBytes(), "hotel_name".getBytes(),
                        comment.getHotel_name().getBytes());
                put.addColumn("hotel_info".getBytes(), "hotel_id".getBytes(), comment.getHotel_id().getBytes());
                // 数据量很大在这里只保存用作分析的数据 (only the fields needed for analysis)
                put.addColumn("comment_info".getBytes(), "id".getBytes(), Bytes.toBytes(String.valueOf(comment.getId())));
                put.addColumn("comment_info".getBytes(), "baseRoomId".getBytes(), Bytes.toBytes(String.valueOf(comment.getBaseRoomId())));
                // -1 appears to mean "no room"; skip the name column in that case
                if (comment.getBaseRoomId() != -1 && comment.getBaseRoomName() != null) {
                    put.addColumn("comment_info".getBytes(), "baseRoomName".getBytes(),
                            Bytes.toBytes(comment.getBaseRoomName()));
                }
                put.addColumn("comment_info".getBytes(), "checkInDate".getBytes(), Bytes.toBytes(comment.getCheckInDate()));
                put.addColumn("comment_info".getBytes(), "postDate".getBytes(), Bytes.toBytes(comment.getPostDate()));
                put.addColumn("comment_info".getBytes(), "content".getBytes(), Bytes.toBytes(comment.getContent()));
                put.addColumn("comment_info".getBytes(), "highlightPosition".getBytes(),
                        Bytes.toBytes(comment.getHighlightPosition()));
                put.addColumn("comment_info".getBytes(), "hasHotelFeedback".getBytes(),
                        Bytes.toBytes(String.valueOf(comment.getHasHotelFeedback())));
                put.addColumn("comment_info".getBytes(), "userNickName".getBytes(),
                        Bytes.toBytes(comment.getUserNickName()));
                puts.add(put);
            }
            // 上传数据
            HBaseUtil.putByTable("t_hotel_comment", puts);
        } catch (Exception e) {
            e.printStackTrace();
        }
        /********** End **********/
    }
}
旅游网站之数据分析
第1关 统计每个城市的宾馆平均价格
package com.processdata;
import java.io.IOException;
import java.util.Scanner;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import com.util.HBaseUtil;
/**
* 使用MapReduce程序处理HBase中的数据并将最终结果存入到另一张表 1中
*/
public class HBaseMapReduce extends Configured implements Tool {
public static class MyMapper extends TableMapper<Text, DoubleWritable> {
public static final byte[] column = "price".getBytes();
public static final byte[] family = "hotel_info".getBytes();
@Override
protected void map(ImmutableBytesWritable rowKey, Result result, Context context)
throws IOException, InterruptedException {
/********** Begin *********/
String cityId = Bytes.toString(result.getValue("cityInfo".getBytes(), "cityId".getBytes()));
byte[] value = result.getValue(family, column);
Double value1 = Double.parseDouble(Bytes.toString(value));
DoubleWritable i = new DoubleWritable(value1);
String priceKey = cityId;
context.write(new Text(priceKey),i);
/********** End *********/
}
}
public static class MyTableReducer extends TableReducer<Text, DoubleWritable, ImmutableBytesWritable> {
@Override
public void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException {
/********** Begin *********/
double sum = 0;
int len = 0;
for (DoubleWritable price : values)
{
len ++;
sum += price.get();
}
Put put = new Put(Bytes.toBytes(key.toString()));
put.addColumn("average_infos".getBytes(),"price".getBytes(),Bytes.toBytes(String.valueOf(sum / len)));
context.write(null, put);
/********** End *********/
}
}
public int run(String[] args) throws Exception {
//配置Job
Configuration conf = HBaseConfiguration.create(getConf());
conf.set("hbase.zookeeper.quorum", "127.0.0.1"); //hbase 服务地址
conf.set("hbase.zookeeper.property.clientPort", "2181"); //端口号
Scanner sc = new Scanner(System.in);
String arg1 = sc.next();
String arg2 = sc.next();
//String arg1 = "t_city_hotels_info";
//String arg2 = "average_table";
try {
HBaseUtil.createTable("average_table", new String[] {"average_infos"});
} catch (Exception e) {
// 创建表失败
e.printStackTrace();
}
Job job = configureJob(conf,new String[]{arg1,arg2});
return job.waitForCompletion(true) ? 0 : 1;
}
private Job configureJob(Configuration conf, String[] args) throws IOException {
String tablename = args[0];
String targetTable = args[1];
Job job = new Job(conf,tablename);
Scan scan = new Scan();
scan.setCaching(300);
scan.setCacheBlocks(false);//在mapreduce程序中千万不要设置允许缓存
//初始化Mapreduce程序
TableMapReduceUtil.initTableMapperJob(tablename,scan,MyMapper.class, Text.class, DoubleWritable.class,job);
//初始化Reduce
TableMapReduceUtil.initTableReducerJob(
targetTable, // output table
MyTableReducer.class, // reducer class
job);
job.setNumReduceTasks(1);
return job;
}
}
第2关 统计酒店评论中词频较高的词
package com.processdata;
import java.io.IOException;
import java.util.List;
import java.util.Scanner;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apdplat.word.WordSegmenter;
import org.apdplat.word.segmentation.Word;
import com.util.HBaseUtil;
import com.vdurmont.emoji.EmojiParser;
/**
* 词频统计
*
*/
public class WorldCountMapReduce extends Configured implements Tool {
    /** Mapper: segments each comment's content and emits (word, 1). */
    public static class MyMapper extends TableMapper<Text, IntWritable> {
        private static byte[] family = "comment_info".getBytes();
        private static byte[] column = "content".getBytes();
        @Override
        protected void map(ImmutableBytesWritable rowKey, Result result, Context context)
                throws IOException, InterruptedException {
            /********** Begin *********/
            byte[] value = result.getValue(family, column);
            // Rows with no comment_info:content cell would NPE in new String — skip.
            if (value == null) {
                return;
            }
            String word = new String(value, "utf-8");
            if (!word.isEmpty()) {
                // strip emoji first: the segmenter cannot handle them
                String filter = EmojiParser.removeAllEmojis(word);
                List<Word> segs = WordSegmenter.seg(filter);
                for (Word cont : segs) {
                    context.write(new Text(cont.getText()), new IntWritable(1));
                }
            }
            /********** End *********/
        }
    }
    /** Reducer: sums the counts for a word into word_info:count (stored as a raw int). */
    public static class MyReducer extends TableReducer<Text, IntWritable, ImmutableBytesWritable> {
        private static byte[] family = "word_info".getBytes();
        private static byte[] column = "count".getBytes();
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            /********** Begin *********/
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            Put put = new Put(Bytes.toBytes(key.toString()));
            // stored via Bytes.toBytes(int); readers must use Bytes.toInt
            put.addColumn(family, column, Bytes.toBytes(sum));
            context.write(null, put);
            /********** End *********/
        }
    }
    /**
     * Configures HBase/ZooKeeper, reads the source and target table names from
     * stdin, creates the target table, and runs the job.
     */
    public int run(String[] args) throws Exception {
        //配置Job
        Configuration conf = HBaseConfiguration.create(getConf());
        conf.set("hbase.zookeeper.quorum", "127.0.0.1");         //hbase 服务地址
        conf.set("hbase.zookeeper.property.clientPort", "2181"); //端口号
        // Not closed deliberately: closing a Scanner over System.in closes
        // System.in for the whole JVM.
        Scanner sc = new Scanner(System.in);
        String arg1 = sc.next();
        String arg2 = sc.next();
        try {
            HBaseUtil.createTable("comment_word_count", new String[] {"word_info"});
        } catch (Exception e) {
            // 创建表失败 — table may already exist; continue
            e.printStackTrace();
        }
        Job job = configureJob(conf, new String[]{arg1, arg2});
        return job.waitForCompletion(true) ? 0 : 1;
    }
    /** Builds the table-to-table MapReduce job (source args[0] -> target args[1]). */
    private Job configureJob(Configuration conf, String[] args) throws IOException {
        String tablename = args[0];
        String targetTable = args[1];
        // Job.getInstance replaces the deprecated new Job(conf, name) constructor
        Job job = Job.getInstance(conf, tablename);
        Scan scan = new Scan();
        scan.setCaching(300);
        scan.setCacheBlocks(false); //在mapreduce程序中千万不要设置允许缓存
        //初始化Mapper Reduce程序
        TableMapReduceUtil.initTableMapperJob(tablename, scan, MyMapper.class, Text.class, IntWritable.class, job);
        TableMapReduceUtil.initTableReducerJob(targetTable, MyReducer.class, job);
        job.setNumReduceTasks(1);
        return job;
    }
}
酒店智能推荐—智慧旅游
第1关 构建用户-酒店矩阵
import numpy as np
def create_user_hotel_matrix(users, items, data, hotel_id):
    '''
    构建用户-酒店矩阵 (build the user-hotel rating matrix)

    :param users: 用户数量,类型为整数 — number of matrix rows
    :param items: 酒店数量,类型为整数 — number of matrix columns
    :param data: 原始数据,类型为DataFrame; itertuples rows are read as
        (index, hotel_id, _, user_index, rating) — i.e. data column 1 holds
        the hotel id, column 3 the user row index, column 4 the rating
        (assumed column order — TODO confirm against hotel_data.csv)
    :param hotel_id: 酒店ID的列表,类型为列表; a hotel's position in this
        list is its matrix column
    :return: user_hotel_matrix, shape (users, items)
    '''
    user_hotel_matrix = np.zeros((users, items))
    for line in data.itertuples():
        #********* Begin *********#
        # row = user index (3rd data column), column = position of the hotel
        # id (1st data column) in hotel_id, value = rating (4th data column).
        # The original recomputed data.user_id.unique() / data.id.unique()
        # on every iteration, shadowing the parameters; those values were
        # never used, so the dead per-row work is removed.
        user_hotel_matrix[line[3], hotel_id.index(line[1])] = line[4]
        #********* End *********#
    return user_hotel_matrix
# NOTE(review): requires `import pandas as pd` earlier in the script — only numpy is imported above; confirm the grader environment provides it.
data = pd.read_csv('./step1/hotel_data.csv', encoding='utf8')
第2关 酒店智能推荐
import numpy as np
def recommend_hotel(A, userid):
    '''
    向用户id为userid的用户推荐3家酒店 (recommend 3 hotels to the given user)

    :param A: 已经更新好了的矩阵A — user-hotel rating matrix
    :param userid: 待推荐的userid,类型为整数 — row of A to rank
    :return: the user's top-3 hotel column indices, ordered 3rd-, 2nd-, 1st-best
    '''
    #********* Begin *********#
    rows, cols = A.shape
    latent_dim = 5
    alpha = 0.1
    lr = 0.01
    # One step of the exercise-prescribed matrix-factorisation update.
    # NOTE(review): its result never feeds the ranking below (which reads A
    # directly); the computation is kept to match the expected answer.
    B = np.random.uniform(0, 1, (rows, latent_dim))
    C = np.random.uniform(0, 1, (latent_dim, cols))
    observed = np.array(A > 0, dtype=int)
    # loss gradients w.r.t. B and C over the observed entries only
    residual = np.multiply(observed, np.dot(B, C) - A)
    B_grads = np.dot(residual, C.T)
    C_grads = np.dot(B.T, residual)
    # update step as given by the exercise formula
    B = alpha * B - lr * B_grads
    C = alpha * C - lr * C_grads
    pred_ratings = np.dot(B, C)
    # ascending sort of the user's row; the slice below extracts the top three
    ranklist = np.argsort(A[userid])
    #********* End *********#
    recommend = ranklist[-1:-4:-1]
    return recommend[-1], recommend[-2], recommend[-3]
旅游网站之数据可视化
第1关 词云的绘制
package com.showdata;
import java.awt.Dimension;
import java.io.IOException;
import java.util.List;
import com.kennycason.kumo.CollisionMode;
import com.kennycason.kumo.WordCloud;
import com.kennycason.kumo.WordFrequency;
import com.kennycason.kumo.nlp.FrequencyAnalyzer;
public class WordCloud_img {
    /**
     * Builds a 500x312 word cloud from the 200 most frequent words of
     * wordcloud.txt and writes it to imgs/wordcloud_img.png.
     *
     * @return the built WordCloud
     * @throws IOException if the word file cannot be read or the image written
     */
    public WordCloud get() throws IOException {
        /********** Begin **********/
        // 1. load the top-200 word frequencies
        FrequencyAnalyzer analyzer = new FrequencyAnalyzer();
        analyzer.setWordFrequenciesToReturn(200);
        List<WordFrequency> frequencies = analyzer.load("wordcloud.txt");
        // 2. pixel-perfect collision on a 500x312 canvas
        WordCloud wordCloud = new WordCloud(new Dimension(500, 312), CollisionMode.PIXEL_PERFECT);
        // 3. lay out the words and write the image
        wordCloud.build(frequencies);
        wordCloud.writeToFile("imgs/wordcloud_img.png");
        /********** End **********/
        return wordCloud;
    }
}
第2关 词云的渲染
package com.showdata;
import java.awt.Color;
import java.awt.Dimension;
import java.io.IOException;
import java.util.List;
import com.kennycason.kumo.CollisionMode;
import com.kennycason.kumo.WordCloud;
import com.kennycason.kumo.WordFrequency;
import com.kennycason.kumo.bg.PixelBoundryBackground;
import com.kennycason.kumo.font.KumoFont;
import com.kennycason.kumo.image.AngleGenerator;
import com.kennycason.kumo.nlp.FrequencyAnalyzer;
import com.kennycason.kumo.palette.ColorPalette;
import com.kennycason.kumo.wordstart.CenterWordStart;
import java.awt.Font;
public class WordCloud_render {
    /**
     * Renders the wordcloud.txt word cloud with a custom font, palette, shape
     * mask, start strategy and rotation, writing imgs/wordcloud_render.png.
     *
     * @return the built WordCloud
     * @throws IOException if the word file or mask cannot be read or the image written
     */
    public WordCloud get() throws IOException {
        FrequencyAnalyzer frequencyAnalyzer = new FrequencyAnalyzer();
        frequencyAnalyzer.setWordFrequenciesToReturn(200);
        List<WordFrequency> wordFrequencies = frequencyAnalyzer.load("wordcloud.txt");
        Dimension dimension = new Dimension(500, 312);
        WordCloud wordCloud = new WordCloud(dimension, CollisionMode.PIXEL_PERFECT);
        /********** Begin **********/
        // 24pt SongTi, bold-italic (Font.BOLD | Font.ITALIC == 3, the original magic number)
        Font font = new Font("宋体", Font.BOLD | Font.ITALIC, 24);
        wordCloud.setKumoFont(new KumoFont(font));
        wordCloud.setPadding(2);
        wordCloud.setBackgroundColor(Color.white);
        // the whale silhouette constrains where words may be placed
        wordCloud.setBackground(new PixelBoundryBackground("myImgs/whale_small.png"));
        wordCloud.setColorPalette(new ColorPalette(Color.RED, Color.BLUE, Color.GREEN));
        wordCloud.setWordStartStrategy(new CenterWordStart());
        // 0 keeps every word horizontal; 180 would add words rotated 180°
        wordCloud.setAngleGenerator(new AngleGenerator(0));
        /********** End **********/
        wordCloud.build(wordFrequencies);
        wordCloud.writeToFile("imgs/wordcloud_render.png");
        return wordCloud;
    }
}
第3关 获取酒店评论数据生成词云
package com.showdata;
import com.kennycason.kumo.CollisionMode;
import com.kennycason.kumo.WordCloud;
import com.kennycason.kumo.WordFrequency;
import com.kennycason.kumo.bg.PixelBoundryBackground;
import com.kennycason.kumo.font.KumoFont;
import com.kennycason.kumo.image.AngleGenerator;
import com.kennycason.kumo.palette.LinearGradientColorPalette;
import com.kennycason.kumo.wordstart.CenterWordStart;
import com.util.HBaseUtil;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.util.Bytes;
import java.awt.*;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* 词云
*/
public class CommentWordCloud {
    /**
     * Reads per-word counts from the HBase table comment_word_count and
     * renders a word cloud of every word seen more than 10 times, writing it
     * to imgs/wordcloud_comment.png.
     *
     * @return the built WordCloud
     * @throws IOException on HBase or image I/O failure
     */
    public WordCloud get() throws IOException {
        Connection conn=HBaseUtil.getConnection();
        /********** Begin **********/
        List<WordFrequency> words = new ArrayList<>();
        TableName tableName = TableName.valueOf(Bytes.toBytes("comment_word_count"));
        // try-with-resources: the original leaked both the Table and the scanner
        try (Table table = conn.getTable(tableName);
                ResultScanner scanner = table.getScanner(new Scan())) {
            for (Result result : scanner) {
                // row key = word; word_info:count was written via Bytes.toBytes(int)
                String word = new String(result.getRow(), "utf-8");
                int count = Bytes.toInt(result.getValue(Bytes.toBytes("word_info"), Bytes.toBytes("count")));
                // keep only reasonably frequent words so the cloud stays readable
                if (count > 10) {
                    words.add(new WordFrequency(word, count));
                }
            }
        }
        // 2.生成词云并设置样式 (build and style the cloud)
        Dimension dimension = new Dimension(500, 312);
        WordCloud wordCloud = new WordCloud(dimension, CollisionMode.PIXEL_PERFECT);
        java.awt.Font font = new java.awt.Font("宋体", Font.ITALIC, 24);
        wordCloud.setKumoFont(new KumoFont(font));
        wordCloud.setPadding(2);
        wordCloud.setBackgroundColor(Color.WHITE);
        wordCloud.setBackground(new PixelBoundryBackground("myImgs/whale_small.png"));
        wordCloud.setColorPalette(new LinearGradientColorPalette(Color.RED, Color.BLUE, Color.GREEN, 30, 30));
        wordCloud.setWordStartStrategy(new CenterWordStart());
        wordCloud.setAngleGenerator(new AngleGenerator(0));
        // 将词频数据加载到词云中
        wordCloud.build(words);
        /********** End **********/
        wordCloud.writeToFile("imgs/wordcloud_comment.png");
        return wordCloud;
    }
}