Project architecture:
Create the Spring Boot project (referred to below as sb) in IDEA:
At this point the sb project is ready; locate the application entry class, run it, and verify that it starts successfully.
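For reference, a minimal sketch of the entry class that Spring Initializr generates; the class name LogServerApplication is an assumption based on the jar name logserver-0.0.1-SNAPSHOT.jar used later:
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

@SpringBootApplication
public class LogServerApplication {
    public static void main(String[] args) {
        // boots the embedded web server and scans for the controller shown below
        SpringApplication.run(LogServerApplication.class, args);
    }
}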
Set up the nginx environment:
1. Install gcc and the libraries nginx depends on
cd /etc/yum.repos.d
rm -rf *
Upload the CentOS6-Base-163.repo file to /etc/yum.repos.d
yum clean all
yum makecache
yum -y install gcc pcre-devel openssl openssl-devel   (required)
2. Extract nginx-1.11.11.tar.gz
tar -zxvf nginx-1.11.11.tar.gz -C ~/app/
3. cd into the extracted nginx directory and run the following command to check the build environment and set the install directory
./configure --prefix=/home/hadoop/app/nginx
4. Still in the extracted nginx directory, run
make && make install
5. Edit the nginx.conf configuration file and add the following below the gzip line:
#gzip on;
# bigdata.hwz.com is the hostname used to access nginx
upstream bigdata.hwz.com {
    # sb service address; the port must match the one in the yml file; with multiple instances, add one server line per host
    server hadoop001:16666;
}
server {
    # nginx listen port, default is 80
    listen 6789;
    .... many other settings .....
}
The start command is: xxx/nginx/sbin/nginx -c xxx/nginx/conf/nginx.conf
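Note that, besides listen, the server block normally also needs a server_name and a location that forwards requests to the upstream defined above, otherwise requests never reach the sb service. A minimal sketch (the directives below are assumptions consistent with the names used above, not taken from the original config):
server_name bigdata.hwz.com;
location / {
    # forward to the upstream block named bigdata.hwz.com defined above
    proxy_pass http://bigdata.hwz.com;
}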
Now we simulate client requests:
//This class simulates an HTTP client; it is used to send data to the sb service
public class HttpClientDemo {
public static void sendData(String log,String hostname) {
try{
//the request path = the context path configured in the yml file + the mapping path in the Controller
URL url = new URL(hostname+"/log/uploadData");
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
conn.setRequestMethod("POST");
conn.setRequestProperty("clientTime",System.currentTimeMillis()+"");
conn.setDoOutput(true);
conn.setRequestProperty("Content-Type", "application/json;charset=utf8");
//calling connect() is optional; conn.getOutputStream() opens the connection implicitly
conn.connect();
OutputStream os = conn.getOutputStream();
os.write(log.getBytes("UTF-8")); //encode explicitly to match the declared Content-Type charset
os.flush();
os.close();
//check whether the request succeeded
if(conn.getResponseCode()==200){
System.out.println("状态:"+200);
}else{
System.out.println("状态!="+200);
}
conn.disconnect();
}catch (Exception e){
e.printStackTrace();
}
}
}
//This object randomly generates test data and sends it to the sb service
object GenereateData {
def randomDate(): Date = {
val sdf = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss") // HH = 24-hour clock, matching the random hour range 0-23
val hour = new Random().nextInt(24)
val minute = new Random().nextInt(60)
val second = new Random().nextInt(60)
val sb = new StringBuffer()
sb.append("2019").append("/").append("09").append("/").append("04")
.append(" ").append(""+hour).append(":").append(""+minute).append(":").append(""+second)
sdf.parse(sb.toString)
}
def main(args: Array[String]): Unit = {
val platforms = Array("Android", "iOS","windows phone","Symbian","Harmony")
for(i <- 1 to 1000){
val sb = new StringBuffer()
//pick a random platform (use the array length so every entry, including Harmony, can be chosen)
val platform=platforms(new Random().nextInt(platforms.length))
//random user
var user="hwz"+new Random().nextInt(100)
//random app version
var version="10.1."+new Random().nextInt(3)
//random ip
val ip = new Random().nextInt(100)+"."+new Random().nextInt(100)+"."+new Random().nextInt(100)+"."+new Random().nextInt(100)
//random access time
val formatDate = new SimpleDateFormat("dd/MM/yyyy HH:mm:ss ZZZ")
val time = formatDate.format(randomDate())
//random traffic
val traffic = new Random().nextInt(8000)
//random page-stay duration
val duration = new Random().nextInt(5000)
val appId="app001"
if(i%3==0 && i%5!=0){
val map = Map("user"->user,"appId"->appId,"ip"->ip,"time"->time,"traffic"->"error","duration"->duration,"platform"->platform,"version"->version)
val jo = new JSONObject(map)
HttpClientDemo.sendData(jo.toString(JSONFormat.defaultFormatter),"http://bigdata.hwz.com:6789")
}else{
val map = Map("user"->user,"appId"->appId,"ip"->ip,"time"->time,"traffic"->traffic,"duration"->duration,"platform"->platform,"version"->version)
val jo = new JSONObject(map)
HttpClientDemo.sendData(jo.toString(JSONFormat.defaultFormatter),"http://bigdata.hwz.com:6789")
}
}
}
}
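Each generated record is a single-line JSON object; an illustrative example (values are random, formatting approximate):
{"user":"hwz42","appId":"app001","ip":"10.23.45.67","time":"04/09/2019 13:05:22 +0800","traffic":3521,"duration":812,"platform":"Android","version":"10.1.2"}
Roughly a quarter of the records (i divisible by 3 but not by 5) carry the literal string "error" in the traffic field; these are exactly the bad records that the MapReduce job filters out later.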
public class AccessLog {
private String user; //user account, may be null
private String platform; //operating platform
private String version; //app version
private String ip; //from the ip we can derive latitude/longitude and province/city/ISP, which can then feed location-based analysis
private String traffic; //traffic as a string
private String time; //access time
private String duration;//page-stay time
private String appId;//one company may run several apps
public void setUser(String user) {
this.user = user;
}
public void setPlatform(String platform) {
this.platform = platform;
}
public void setVersion(String version) {
this.version = version;
}
public void setIp(String ip) {
this.ip = ip;
}
public void setTraffic(String traffic) {
this.traffic = traffic;
}
public void setTime(String time) {
this.time = time;
}
public void setDuration(String duration) {
this.duration = duration;
}
public void setAppId(String appId) {
this.appId = appId;
}
public String getUser() {
return user;
}
public String getPlatform() {
return platform;
}
public String getVersion() {
return version;
}
public String getIp() {
return ip;
}
public String getTraffic() {
return traffic;
}
public String getTime() {
return time;
}
public String getDuration() {
return duration;
}
public String getAppId() {
return appId;
}
@Override
public String toString() {
return "AccessLog{" +
"user='" + user + '\'' +
", platform='" + platform + '\'' +
", version='" + version + '\'' +
", ip='" + ip + '\'' +
", traffic='" + traffic + '\'' +
", time='" + time + '\'' +
", duration='" + duration + '\'' +
", appId='" + appId + '\'' +
'}';
}
}
//The controller: incoming requests are dispatched to its methods according to the request path
@Controller
public class DataController {
private final static Logger logger=Logger.getLogger(DataController.class);
//Request mapping: a POST request to /uploadData is handled by the method below
@RequestMapping(value = "/uploadData",method = RequestMethod.POST)
@ResponseBody
//@RequestBody binds the JSON string in the request body to the method parameter; this requires a POST request (a GET request has no body)
public AccessLog doPost(@RequestBody AccessLog accessLog){
//persist the record via log4j
logger.info(JSON.toJSONString(accessLog));
System.out.println(JSON.toJSONString(accessLog));
return accessLog;
}
}
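The client builds the URL as hostname + "/log/uploadData", while the controller only maps /uploadData, so the /log prefix presumably comes from the context path in application.yml. A hedged sketch of the assumed settings (the property name depends on the Spring Boot version):
server:
  port: 16666              # must match the server line in the nginx upstream
  servlet:
    context-path: /log     # in Spring Boot 1.x this is server.context-path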
log4j.properties configuration file:
#the data is written to access.log in the logs directory one level above where the program runs
#access.log is rolled once per day; rolled files get a .yyyy-MM-dd suffix
log4j.appender.hwz.File=org.apache.log4j.DailyRollingFileAppender
log4j.appender.hwz.File.file=../logs/access.log
log4j.appender.hwz.File.DatePattern='.'yyyy-MM-dd
log4j.appender.hwz.File.layout=org.apache.log4j.PatternLayout
log4j.appender.hwz.File.layout.ConversionPattern=%m%n
log4j.rootLogger=info,hwz.File
[hadoop@hadoop001 flumeC]$ cat tailDir_memory_hdfs.conf
# Flume agent configuration: TAILDIR source -> memory channel -> HDFS sink
a1.sources = r1
a1.channels = c1
a1.sinks = k1
##source definition: TAILDIR
a1.sources.r1.type = TAILDIR
a1.sources.r1.positionFile = /home/hadoop/data/flumeData/flume_position/taildir_position.json
a1.sources.r1.filegroups = f1
a1.sources.r1.filegroups.f1 = /home/hadoop/hadoop-project/logs/access.log
##channel definition: memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 10000
a1.channels.c1.transactionCapacity = 10000
##sink definition: HDFS
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path =hdfs://hadoop001:9000/hadoop/project/access-logs/%Y%m%d
a1.sinks.k1.hdfs.filePrefix = hwz
a1.sinks.k1.hdfs.fileSuffix= .log
a1.sinks.k1.hdfs.fileType=DataStream
a1.sinks.k1.hdfs.rollInterval=60
a1.sinks.k1.hdfs.rollSize=0
a1.sinks.k1.hdfs.rollCount=0
a1.sinks.k1.hdfs.batchSize = 10000
a1.sinks.k1.hdfs.useLocalTimeStamp = true
##wire the three components together
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
Putting it all together:
1. Start nginx: /home/hadoop/app/nginx/sbin/nginx -c /home/hadoop/app/nginx/conf/nginx.conf
2. Start the sb service: nohup java -jar logserver-0.0.1-SNAPSHOT.jar &
After it starts, logs/access.log is created one directory level above the current directory
3. Start flume to monitor the generated access.log file in real time
Start command:
flume-ng agent
-n a1
-c ${FLUME_HOME}/conf
-f /home/hadoop/hadoop-project/flumeC/tailDir_memory_hdfs.conf
-Dflume.root.logger=INFO,console
4. Run GenereateData in IDEA
The data has now reached HDFS; next, clean it with MapReduce.
Key points:
1. IP resolution: look each IP up in an IP rule database, loaded via the distributed cache, to obtain the province/city/ISP fields
2. Filter out bad log records (the traffic field must be numeric; records where it is a string are dropped)
3. Parse the date to obtain the year/month/day fields
4. Because the data is JSON, each line can be parsed into an object with JSONObject.parseObject, and the format of the cleaned output is controlled by overriding the object's toString method (see the sketch after this list)
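The mapper below calls setYear/setMonth/setDay/setSize and setProvince/setCity/setIsp on AccessLog, so the AccessLog used on the ETL side presumably extends the bean shown earlier with these fields and overrides toString. A minimal sketch; the field order is an assumption, chosen here to line up with the column order of the Hive table defined at the end of the article:
// hypothetical extra fields on the ETL-side AccessLog (not shown in the original code)
private String year;
private String month;
private String day;
private long size;        // numeric traffic
private String province;
private String city;
private String isp;
// ...getters/setters omitted...

@Override
public String toString() {
    // tab-delimited to match "fields terminated by '\t'" in the Hive DDL
    return user + "\t" + platform + "\t" + version + "\t" + ip + "\t" + size + "\t" + time
            + "\t" + duration + "\t" + appId + "\t" + province + "\t" + city + "\t" + isp
            + "\t" + year + "\t" + month + "\t" + day;
}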
//Main class for the data-cleaning job
public class EtlLog {
public static class MyMapper extends Mapper<LongWritable,Text,Text,NullWritable>{
private List<Info> list=new ArrayList<Info>();
@Override
protected void setup(Context context) throws IOException, InterruptedException {
URI[] cacheFiles = context.getCacheFiles();
String path = cacheFiles[0].getPath();
// BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(path)));
FileSystem fileSystem = FileSystem.get(context.getConfiguration());
FSDataInputStream fis = fileSystem.open(new Path(path));
BufferedReader br = new BufferedReader(new InputStreamReader(fis));
String line;
while(StringUtils.isNotEmpty(line=br.readLine())){
String[] split = line.split("[|]");
Info info = new Info();
info.setStartIp(Long.parseLong(split[2]));
info.setEndIp(Long.parseLong(split[3]));
info.setProvince(split[6]);
info.setCity(split[7]);
info.setIsp(split[9]);
list.add(info);
}
IOUtils.closeStream(br);
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//counter: total number of input records
context.getCounter("etlLog","totalLog").increment(1);
String json = value.toString();
AccessLog accessLog = JSONObject.parseObject(json, AccessLog.class);
//parse the year/month/day
SimpleDateFormat sdf=new SimpleDateFormat("dd/MM/yyyy HH:mm:ss ZZZ");
try {
Date date = sdf.parse(accessLog.getTime());
Calendar calendar = Calendar.getInstance();
calendar.setTime(date);
int year = calendar.get(Calendar.YEAR);
int month = calendar.get(Calendar.MONTH)+1; //Calendar months start at 0
int day = calendar.get(Calendar.DATE);
accessLog.setYear(year+"");
accessLog.setMonth(month<10?"0"+month:month+"");
accessLog.setDay(day<10?"0"+day:day+"");
// filter out bad traffic values: Long.parseLong throws for non-numeric strings, the record falls into the catch block below and is never written
String traffic = accessLog.getTraffic();
long size = Long.parseLong(traffic);
accessLog.setSize(size);
//look up province/city/ISP by ip
Long ip = IPUtil.ip2Long(accessLog.getIp());
int index = IPUtil.search(ip, list);
Info info = list.get(index);
accessLog.setProvince(info.getProvince());
accessLog.setCity(info.getCity());
accessLog.setIsp(info.getIsp());
//counter: number of well-formed records
context.getCounter("etlLog","rightLog").increment(1);
context.write(new Text(accessLog.toString()),NullWritable.get());
} catch (Exception e) {
e.printStackTrace();
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
//map-side output compression (optional)
// conf.set("mapreduce.map.output.compress","true");
// conf.set("mapreduce.map.output.compress.codec","org.apache.hadoop.io.compress.DefaultCodec");
//job output compression (optional)
// conf.set("mapreduce.output.fileoutputformat.compress","true");
// conf.set("mapreduce.output.fileoutputformat.compress.codec","org.apache.hadoop.io.compress.BZip2Codec");
Job job = Job.getInstance(conf, "etlLog");
//point at the jar produced by mvn package (needed when submitting from the IDE)
// conf.set("mapred.jar","C:\\tools\\IdealUltimateEdition\\hadoop\\target\\hadoop-1.0-SNAPSHOT.jar");
// job.setJar("C:\\tools\\IdealUltimateEdition\\hadoop\\target\\hadoop-1.0-SNAPSHOT.jar");
job.setJarByClass(EtlLog.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
//when run with hadoop jar: args[0] = input path, args[1] = output path, args[2] = IP rule file
FileInputFormat.setInputPaths(job,new Path(args[0]));
FileOutputFormat.setOutputPath(job,new Path(args[1]));
//add the IP rule database to the distributed cache
job.addCacheFile(new URI(args[2]));
int status = job.waitForCompletion(true) ? 0 : -1;
// //read the counters back
// CounterGroup etlLog = job.getCounters().getGroup("etlLog");
// Iterator<Counter> ite = etlLog.iterator();
// while(ite.hasNext()){
// //each counter value could then be stored in MySQL or another database
// Counter next = ite.next();
// System.out.println("----"+next.getName()+"\t"+ next.getValue());
// }
System.exit(status);
}
}
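The job can then be submitted with hadoop jar; a sketch, where the package name and the output and IP-rule paths are assumptions (the input path matches the Flume HDFS sink path above, for one example day):
hadoop jar hadoop-1.0-SNAPSHOT.jar com.hwz.etl.EtlLog \
  hdfs://hadoop001:9000/hadoop/project/access-logs/20190904 \
  hdfs://hadoop001:9000/hadoop/project/access-wide/day=20190904 \
  hdfs://hadoop001:9000/hadoop/project/ip/ip.txt
Writing the output under access-wide/day=<date> is one convenient choice, because it lines up with the external table location and the ALTER TABLE ... ADD PARTITION statement at the end of the article.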
//Utility class for converting and looking up IPs
public class IPUtil {
public static Long ip2Long(String ipaddr){
String[] ips=ipaddr.split("\\.");
long l = Long.parseLong(ips[0]) * 256 * 256 * 256 +
Long.parseLong(ips[1]) * 256 * 256 +
Long.parseLong(ips[2]) * 256 +
Long.parseLong(ips[3]);
return l;
}
public static int search(Long ip, List<Info> infos){
int index=-1;
int min=0;
int max=infos.size()-1;
while(min<=max){
int mid=(min+max)/2;
Info info = infos.get(mid);
if(ip>=info.getStartIp()&&ip<=info.getEndIp()){
index=mid;
break;
}else if(ip>info.getEndIp()){
min=mid+1;
}else if(ip<info.getStartIp()){
max=mid-1;
}
}
return index;
}
}
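As a quick check of ip2Long: "1.2.3.4" maps to 1*256^3 + 2*256^2 + 3*256 + 4 = 16,909,060. search() then binary-searches the [startIp, endIp] ranges, which assumes the IP rule file is sorted by start IP in ascending order.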
//JavaBean holding one entry of the IP rule database
public class Info {
private Long startIp;
private Long endIp;
private String province;
private String city;
private String isp;
public void setStartIp(Long startIp) {
this.startIp = startIp;
}
public void setEndIp(Long endIp) {
this.endIp = endIp;
}
public Long getStartIp() {
return startIp;
}
public Long getEndIp() {
return endIp;
}
public String getProvince() {
return province;
}
public void setProvince(String province) {
this.province = province;
}
public void setCity(String city) {
this.city = city;
}
public void setIsp(String isp) {
this.isp = isp;
}
public String getCity() {
return city;
}
public String getIsp() {
return isp;
}
}
The data is now cleaned and written back to HDFS; next, create the Hive wide table.
Create a Hive external table. It only needs to be created once: the table is partitioned, so each day's data sits in its own partition, and the location it reads from is exactly the ETL output.
create database hadoop;
use hadoop;
create external table hadoop.access_wide(
user string,
platform string,
version string,
ip string,
size bigint,
time string,
duration string,
appId string,
province string,
city string,
isp string,
y string,
month string,
d string
)
partitioned by (day string)
row format delimited fields terminated by '\t'
location 'hdfs://hadoop001:9000/hadoop/project/access-wide/';
Then simply register the partition for the corresponding day:
ALTER TABLE hadoop.access_wide ADD IF NOT EXISTS PARTITION (day='$time');
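Once the partition is added, the wide table can be queried directly; for example, a sketch that counts requests and sums traffic per province for one day (the date value is illustrative):
select province, count(*) as pv, sum(size) as total_traffic
from hadoop.access_wide
where day='20190904'
group by province;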