Data Download
Link: https://pan.baidu.com/s/1JfJ5EK55-XQbgGODIpEWSw  Extraction code: gmkr
Introduction
Given user information and event information, predict which events a user will be interested in.
Dataset Description
There are six files: train.csv, test.csv, users.csv, user_friends.csv, events.csv and event_attendees.csv.
train.csv contains six columns:
user: user id
event: event id
invited: whether the user was invited to the event
timestamp: timestamp of the record
interested: whether the user marked the event as interesting
not_interested: whether the user marked the event as not interesting
test.csv contains four columns (the same attributes as train.csv, but without interested and not_interested).
users.csv contains seven columns:
user_id: the user's id
locale: the user's locale
birthyear: the user's year of birth
gender: gender
joinedAt: when the user first used the app
location: the user's location
timezone: UTC offset
user_friends.csv contains the user's social data, in two columns: user and friends.
user: the user's id
friends: the ids of the user's friends (space-separated)
events.csv contains data about the events and has 110 columns. The first nine are event_id, user_id, start_time, city, state, zip, country, lat and lng:
event_id: event id
user_id: the id of the user who created the event
start_time: start time
city, state, zip, country: details of the event venue
lat and lng: latitude and longitude
count_1, count_2, …, count_100: frequency counts for the 100 most common word stems appearing in event names and descriptions (words are stemmed, so tense and voice variants are merged; the stems are ranked by overall frequency, and each column gives how often that stem appears for the event)
count_other: the count of all remaining words
event_attendees.csv contains information about which users attended which events, with the columns event_id, yes, maybe, invited and no:
event_id: event id
yes: users who will attend
maybe: users who may attend
invited: users who were invited
no: users who will not attend
So, overall the data falls into three categories:
- user profile information
- user social information
- event information
Technical Framework
Writing the data to Kafka with Flume
First, upload the data files to the Linux machine.
Then, in a working directory, write the conf files Flume needs.
event_attendees is used as the example here.
1. Create the Flume configuration file eventattend-flume-kafka.conf
eventattend.sources=eventattendSource
eventattend.channels=eventattendChannel
eventattend.sinks=eventattendSink
eventattend.sources.eventattendSource.type=spooldir
eventattend.sources.eventattendSource.spoolDir=/opt/flume/conf/jobkb09/dataSourceFile/eventattend
eventattend.sources.eventattendSource.includePattern=eventattends_[0-9]{4}-[0-9]{2}-[0-9]{2}.csv
eventattend.sources.eventattendSource.deserializer=LINE
eventattend.sources.eventattendSource.deserializer.maxLineLength=320000
eventattend.sources.eventattendSource.interceptors=head_filter
eventattend.sources.eventattendSource.interceptors.head_filter.type=regex_filter
eventattend.sources.eventattendSource.interceptors.head_filter.regex=^event*
eventattend.sources.eventattendSource.interceptors.head_filter.excludeEvents=true
eventattend.channels.eventattendChannel.type=file
eventattend.channels.eventattendChannel.checkpointDir=/opt/flume/conf/jobkb09/checkPointFile/eventattend
eventattend.channels.eventattendChannel.dataDirs=/opt/flume/conf/jobkb09/dataChannelFile/eventattend
eventattend.sinks.eventattendSink.type=org.apache.flume.sink.kafka.KafkaSink
eventattend.sinks.eventattendSink.batchSize=640
eventattend.sinks.eventattendSink.brokerList=192.168.83.100:9092
eventattend.sinks.eventattendSink.topic=event_attendees_raw
eventattend.sources.eventattendSource.channels=eventattendChannel
eventattend.sinks.eventattendSink.channel=eventattendChannel
2. Create the three directories Flume needs (the spoolDir, checkpointDir and dataDirs configured above).
3. Start Kafka and create the event_attendees_raw topic (the other topics used later are created here as well):
kafka-topics.sh --zookeeper hadoop100:2181 --create --topic users --partitions 1 --replication-factor 1
kafka-topics.sh --zookeeper hadoop100:2181 --create --topic user_friends_raw --partitions 1 --replication-factor 1
kafka-topics.sh --zookeeper hadoop100:2181 --create --topic user_friends --partitions 1 --replication-factor 1
kafka-topics.sh --zookeeper hadoop100:2181 --create --topic events --partitions 1 --replication-factor 1
kafka-topics.sh --zookeeper hadoop100:2181 --create --topic event_attendees_raw --partitions 1 --replication-factor 1
kafka-topics.sh --zookeeper hadoop100:2181 --create --topic event_attendees --partitions 1 --replication-factor 1
kafka-topics.sh --zookeeper hadoop100:2181 --create --topic train --partitions 1 --replication-factor 1
4. Start Flume:
[root@hadoop100 flume]# ./bin/flume-ng agent -n eventattend -c ./conf/ -f ./conf/jobkb09/eventattend-flume-kafka.conf -Dflume.root.logger=INFO,console
5. Copy the previously uploaded event_attendees.csv into the eventattend folder under dataSourceFile and rename it to eventattends_2021-01-06.csv so that it matches the includePattern.
When the file is renamed with the .COMPLETED suffix, it has been successfully written to Kafka.
Repeat steps 1-5 for the remaining four files.
Link: https://pan.baidu.com/s/1UNv9DaG43u6-15y5BqcbsQ  Extraction code: gdpk
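Before wiring up HBase, it helps to spot-check that the records actually reached Kafka. A throwaway consumer is enough; the sketch below is not part of the original project, but the broker address and topic name match the configuration above.
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.serialization.StringDeserializer;
import java.time.Duration;
import java.util.Collections;
import java.util.Properties;
/**
 * Throwaway consumer used only to verify that Flume delivered data to Kafka.
 */
public class TopicCheck {
    public static void main(String[] args) {
        Properties prop = new Properties();
        prop.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.83.100:9092");
        prop.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        prop.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        prop.put(ConsumerConfig.GROUP_ID_CONFIG, "topic_check");
        prop.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
        try (KafkaConsumer<String, String> consumer = new KafkaConsumer<>(prop)) {
            consumer.subscribe(Collections.singleton("event_attendees_raw"));
            // one poll is enough for a sanity check
            ConsumerRecords<String, String> records = consumer.poll(Duration.ofSeconds(5));
            System.out.println("fetched " + records.count() + " records");
            for (ConsumerRecord<String, String> record : records) {
                System.out.println(record.value());
                break; // print only the first line
            }
        }
    }
}
If the count is 0, check the Flume console output and whether the file name really matches the includePattern.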
Consuming the Kafka Data into HBase
First create the tables in HBase:
create_namespace 'events_db'
create 'events_db:users','profile','region','registration'
create 'events_db:user_friend','uf'
create 'events_db:events','schedule','location','creator','remark'
create 'events_db:event_attendee','euat'
create 'events_db:event_train','eu'
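The same namespace and tables can also be created from Java through the HBase Admin API. A sketch (using the hbase-client dependency added below and the ZooKeeper address used elsewhere in this post; only the users table is shown, with the same column families as the shell commands above):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.NamespaceDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
/**
 * Optional: create the namespace and the users table programmatically
 * instead of through the HBase shell.
 */
public class CreateEventTables {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "192.168.83.100");
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Admin admin = connection.getAdmin()) {
            admin.createNamespace(NamespaceDescriptor.create("events_db").build());
            HTableDescriptor users = new HTableDescriptor(TableName.valueOf("events_db:users"));
            users.addFamily(new HColumnDescriptor("profile"));
            users.addFamily(new HColumnDescriptor("region"));
            users.addFamily(new HColumnDescriptor("registration"));
            admin.createTable(users);
            // the other tables (user_friend, events, event_attendee, event_train) follow the same pattern
        }
    }
}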
Add the dependencies to the project's pom.xml in IDEA:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>cn.kgc</groupId>
<artifactId>streamstu</artifactId>
<version>1.0-SNAPSHOT</version>
<name>streamstu</name>
<!-- FIXME change it to the project's website -->
<url>http://www.example.com</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
</properties>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka_2.12</artifactId>
<version>2.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-streams</artifactId>
<version>2.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.4.5</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.4.5</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>2.4.5</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.4.5</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.6.6</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>1.2.0</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>1.2.0</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.6.0</version>
</dependency>
</dependencies>
<build>
<pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
<plugins>
<!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
<plugin>
<artifactId>maven-clean-plugin</artifactId>
<version>3.1.0</version>
</plugin>
<!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
<plugin>
<artifactId>maven-resources-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.0</version>
</plugin>
<plugin>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.22.1</version>
</plugin>
<plugin>
<artifactId>maven-jar-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<plugin>
<artifactId>maven-install-plugin</artifactId>
<version>2.5.2</version>
</plugin>
<plugin>
<artifactId>maven-deploy-plugin</artifactId>
<version>2.8.2</version>
</plugin>
<!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
<plugin>
<artifactId>maven-site-plugin</artifactId>
<version>3.7.1</version>
</plugin>
<plugin>
<artifactId>maven-project-info-reports-plugin</artifactId>
<version>3.0.0</version>
</plugin>
</plugins>
</pluginManagement>
</build>
</project>
Interface-oriented programming is used here: the functionality is split into interfaces to keep the code simple, a common practice in production systems.
IWriter: the interface for writing data
import org.apache.kafka.clients.consumer.ConsumerRecords;
import java.io.IOException;

public interface IWriter {
    int write(ConsumerRecords<String, String> records, String tableName) throws IOException;
}
HbaseWriter: the concrete class that implements IWriter
package cn.kgc.kafkaToHbase2;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import java.io.IOException;
import java.util.List;
/**
* @Author lichangxin
* @date 2021-01-08
* @Des
*/
public class HbaseWriter implements IWriter {
private Connection connection;
private IParseRecord iParseRecord;
public IParseRecord getiParseRecord() {
return iParseRecord;
}
public void setiParseRecord(IParseRecord iParseRecord) {
this.iParseRecord = iParseRecord;
}
public HbaseWriter(IParseRecord iParseRecord) {
this.iParseRecord=iParseRecord;
//configure HBase connection settings and connect to HBase
Configuration conf = HBaseConfiguration.create();
conf.set("hbase.rootdir","hdfs://192.168.83.100:9000/hbase");
conf.set("hbase.zookeeper.quorum","192.168.83.100");
conf.set("hbase.zookeeper.property.clientPort","2181");
try {
connection= ConnectionFactory.createConnection(conf);
} catch (IOException e) {
e.printStackTrace();
}
}
@Override
public int write(ConsumerRecords<String, String> records, String tableName) throws IOException {
Table eventAttendTable = connection.getTable(TableName.valueOf(tableName));
List<Put> datas = iParseRecord.parse(records);
eventAttendTable.put(datas);
eventAttendTable.close(); // release the table handle; the shared connection stays open
return datas.size();
}
}
IWorker: the worker interface
public interface IWorker {
public void fillData();
}
ParentWorker: an abstract class implementing IWorker that holds the shared Kafka consumer configuration
package cn.kgc.kafkaToHbase2;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.common.serialization.StringDeserializer;
import java.util.Properties;
/**
* @Author lichangxin
* @date 2021-01-11
* @Des
*/
public abstract class ParentWorker implements IWorker {
protected Properties prop;
public ParentWorker(String groupName) {
prop=new Properties();
prop.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG,"192.168.83.100:9092");
prop.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
prop.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,StringDeserializer.class);
prop.put(ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG,30000);
prop.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG,false);
prop.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG,1000);
prop.put(ConsumerConfig.GROUP_ID_CONFIG,groupName);
prop.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG,"earliest");
}
}
IParseRecord: the assembler
It transforms the records consumed from Kafka into a List<Put> object for writing to HBase.
package cn.kgc.kafkaToHbase2;
import org.apache.hadoop.hbase.client.Put;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import java.util.List;
/* Assembler:
transforms the records consumed from Kafka into a List<Put> object for writing to HBase.
*/
public interface IParseRecord {
public List<Put> parse(ConsumerRecords<String, String> records);
}
UsersHandler: implements IParseRecord for the users topic
package cn.kgc.kafkaToHbase2;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import java.util.ArrayList;
import java.util.List;
/**
* @Author lichangxin
* @date 2021-01-12
* @Des
*/
public class UsersHandler implements IParseRecord {
@Override
public List<Put> parse(ConsumerRecords<String, String> records) {
List<Put> datas=new ArrayList<>();
for (ConsumerRecord<String, String> record : records) {
String[] split = record.value().split(",");
if(split[0].trim().length()==0){
continue;
}
System.out.println(record);
Put put = new Put(Bytes.toBytes(split[0]));
put.addColumn("profile".getBytes(),"birthyear".getBytes(),split[2].getBytes());
put.addColumn("profile".getBytes(),"gender".getBytes(),split[3].getBytes());
put.addColumn("region".getBytes(),"locale".getBytes(),split[1].getBytes());
if(split.length>5){
put.addColumn("region".getBytes(),"location".getBytes(),split[5].getBytes());
}
if (split.length>6){
put.addColumn("region".getBytes(),"timezone".getBytes(),split[6].getBytes());
}
if (split.length>4){
put.addColumn("registration".getBytes(),"joinedAt".getBytes(),split[4].getBytes());
}
datas.add(put);
}
return datas;
}
}
Driver: the entry point used to run a job
public class Driver {
public static void main(String[] args) {
//users
new HBaseWorker("users2",
"users",
"events_db:users",
new HbaseWriter(new UsersHandler())
).fillData();
}
}
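The Driver constructs an HBaseWorker, which is not listed in this post. A minimal sketch of what that class could look like, assuming it extends ParentWorker (so the group name feeds the shared consumer config), subscribes to one topic, and hands every polled batch to the injected IWriter:
package cn.kgc.kafkaToHbase2;

import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import java.io.IOException;
import java.time.Duration;
import java.util.Collections;

/**
 * Sketch of the worker used by Driver: consume one Kafka topic and
 * write each batch into one HBase table via an IWriter.
 */
public class HBaseWorker extends ParentWorker {
    private final String topic;
    private final String tableName;
    private final IWriter writer;

    public HBaseWorker(String groupName, String topic, String tableName, IWriter writer) {
        super(groupName); // fills prop with the shared consumer settings
        this.topic = topic;
        this.tableName = tableName;
        this.writer = writer;
    }

    @Override
    public void fillData() {
        try (KafkaConsumer<String, String> consumer = new KafkaConsumer<>(prop)) {
            consumer.subscribe(Collections.singleton(topic));
            while (true) {
                ConsumerRecords<String, String> records = consumer.poll(Duration.ofMillis(100));
                if (!records.isEmpty()) {
                    int count = writer.write(records, tableName);
                    System.out.println("wrote " + count + " rows to " + tableName);
                    consumer.commitSync(); // auto-commit is disabled in ParentWorker
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
In this sketch fillData() polls indefinitely and offsets are committed only after a batch has been written to HBase, so a restart re-reads at most the last unwritten batch; the job is stopped manually once the topic is drained.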
For the other topics, simply create the corresponding Handler (shown below) and run it from the Driver in the same way.
EventsHandler
public class EventsHandler implements IParseRecord {
@Override
public List<Put> parse(ConsumerRecords<String, String> records) {
List<Put> datas = new ArrayList<>();
for (ConsumerRecord<String, String> record : records) {
String[] split = record.value().split(",");
Put put = new Put(Bytes.toBytes(split[0]));
put.addColumn("schedule".getBytes(),"start_time".getBytes(),split[2].getBytes());
put.addColumn("location".getBytes(),"city".getBytes(),split[3].getBytes());
put.addColumn("location".getBytes(),"state".getBytes(),split[4].getBytes());
put.addColumn("location".getBytes(),"zip".getBytes(),split[5].getBytes());
put.addColumn("location".getBytes(),"country".getBytes(),split[6].getBytes());
put.addColumn("location".getBytes(),"lat".getBytes(),split[7].getBytes());
put.addColumn("location".getBytes(),"lng".getBytes(),split[8].getBytes());
put.addColumn("creator".getBytes(),"user_id".getBytes(),split[1].getBytes());
put.addColumn("remark".getBytes(),"common_words".getBytes(),split[9].getBytes());
datas.add(put);
}
return datas;
}
}
TrainHandler
public class TrainHandler implements IParseRecord {
@Override
public List<Put> parse(ConsumerRecords<String, String> records) {
List<Put> datas = new ArrayList<>();
for (ConsumerRecord<String, String> record : records) {
String[] split = record.value().split(",");
Put put = new Put(Bytes.toBytes((split[0]+split[1]).hashCode()));
put.addColumn("eu".getBytes(),"user_id".getBytes(),split[0].getBytes());
put.addColumn("eu".getBytes(),"event_id".getBytes(),split[1].getBytes());
put.addColumn("eu".getBytes(),"invited".getBytes(),split[2].getBytes());
put.addColumn("eu".getBytes(),"timestamp".getBytes(),split[3].getBytes());
put.addColumn("eu".getBytes(),"interested".getBytes(),split[4].getBytes());
put.addColumn("eu".getBytes(),"not_interested".getBytes(),split[5].getBytes());
datas.add(put);
}
return datas;
}
}
UserFriendsHandler
public class UserFriendsHandler implements IParseRecord {
@Override
public List<Put> parse(ConsumerRecords<String, String> records) {
List<Put> datas = new ArrayList<>();
for (ConsumerRecord<String, String> p : records) {
//System.out.println(p.value());
String[] split = p.value().split(",");
Put put = new Put(Bytes.toBytes((split[0] + split[1]).hashCode()));
put.addColumn("uf".getBytes(), "userid".getBytes(), split[0].getBytes());
put.addColumn("uf".getBytes(), "friendid".getBytes(), split[1].getBytes());
datas.add(put);
}
return datas;
}
}
EventAttendeesHandler
public class EventAttendeesHandler implements IParseRecord {
@Override
public List<Put> parse(ConsumerRecords<String, String> records) {
List<Put> datas = new ArrayList<>();
for (ConsumerRecord<String, String> record : records) {
String[] split = record.value().split(",");
Put put = new Put(Bytes.toBytes(split[0].hashCode()));
put.addColumn("euat".getBytes(),"event".getBytes(),split[0].getBytes());
put.addColumn("euat".getBytes(),"yes".getBytes(),split[1].getBytes());
put.addColumn("euat".getBytes(),"maybe".getBytes(),split[2].getBytes());
put.addColumn("euat".getBytes(),"invited".getBytes(),split[3].getBytes());
put.addColumn("euat".getBytes(),"no".getBytes(),split[4].getBytes());
datas.add(put);
}
return datas;
}
}
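Note that Flume writes user_friends and event_attendees into the *_raw topics, while UserFriendsHandler above expects already flattened records with one "user,friend" pair per message (the user_friends and event_attendees topics were created for exactly this purpose). The raw-to-flat step is not listed in this post; the following is only a sketch of how it could be done with Kafka Streams (the dependency is already in the pom), shown for user_friends:
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.StreamsBuilder;
import org.apache.kafka.streams.StreamsConfig;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
/**
 * Sketch: flatten "user,f1 f2 f3 ..." lines from user_friends_raw into
 * one "user,friend" message per friend on the user_friends topic.
 */
public class UserFriendsFlatten {
    public static void main(String[] args) {
        Properties prop = new Properties();
        prop.put(StreamsConfig.APPLICATION_ID_CONFIG, "user_friends_flatten");
        prop.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.83.100:9092");
        prop.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        prop.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        StreamsBuilder builder = new StreamsBuilder();
        builder.<String, String>stream("user_friends_raw")
                .flatMapValues(value -> {
                    List<String> pairs = new ArrayList<>();
                    String[] split = value.split(",");
                    if (split.length == 2) {
                        for (String friend : split[1].split(" ")) {
                            pairs.add(split[0] + "," + friend);
                        }
                    }
                    return pairs;
                })
                .to("user_friends");
        new KafkaStreams(builder.build(), prop).start();
    }
}
An analogous Streams job could flatten event_attendees_raw into one (event, user, answer) record per message, which is the row shape the events_db:event_attendee mapping in the Hive section expects.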
Mapping the HBase Tables to Hive
Create the database:
create database events;
use events;
Enable the relevant configuration settings:
hive (default)> set hive.exec.dynamic.partition=true;
hive (default)> set hive.exec.dynamic.partition.mode=nonstrict;
hive (default)> set hive.auto.convert.join=false;
Map the HBase tables into Hive:
user_friend
create external table events.hb_user_friend(row_key String,
user_id String,
friend_id String)
stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties ('hbase.columns.mapping'=':key,uf:userid,uf:friendid')
tblproperties('hbase.table.name'='events_db:user_friend');
create table user_friend
stored as ORC as select * from hb_user_friend;
event_attendee
create external table hb_event_attendee(row_key String,
event_id String,
user_id String,
attend_type String
)
stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties ('hbase.columns.mapping'=':key,euat:eventid,euat:userid,euat:state')
tblproperties('hbase.table.name'='events_db:event_attendee');
create table event_attendee
stored as ORC as select * from hb_event_attendee;
events
create external table hb_events(event_id String,
start_time String,
city String,
state String,
zip String,
country String,
lat String,
lng String,
user_id String,
common_words String
)
stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties ('hbase.columns.mapping'=':key,
schedule:start_time,
location:city,
location:state,
location:zip,
location:country,
location:lat,
location:lng,
creator:user_id,
remark:common_words')
tblproperties('hbase.table.name'='events_db:events');
create table events
stored as ORC as select * from hb_events;
train
create external table hb_train(row_key String,
user_id String,
event_id String,
invited String,
`timestamp` String,
interested String,
not_interested String
)
stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties ('hbase.columns.mapping'=':key,
eu:user_id,
eu:event_id,
eu:invited,
eu:timestamp,
eu:interested,
eu:not_interested')
tblproperties('hbase.table.name'='events_db:event_train');
create table train
stored as ORC as select * from hb_train;
users
create external table hb_users(user_id String,
birth_year String,
gender String,
locale String,
location String,
timezone String,
joinedAt String
)
stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties ('hbase.columns.mapping'=':key,
profile:birthyear,
profile:gender,
region:locale,
region:location,
region:timezone,
registration:joinedAt'
)
tblproperties('hbase.table.name'='events_db:users');
create table users
stored as ORC as select * from hb_users;