package com.gamewave.dataservices.dataanalysis.weblog;
import com.gamewave.dataservices.util.HTableUtil;
import java.io.IOException;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;
import org.json.JSONException;
import org.json.JSONObject;
/*
* 解决‘WEB日志分析系统’首页实时统计信息 和 基本统计 中 ‘访问IP统计(TOP10)’与‘访问页面统计(TOP10)’统计
* 基于WeblogOne.java基础上扩展,新增 基本统计 中 ‘访问IP统计(TOP10)’与‘访问页面统计(TOP10)’统计
* 思路:
* 1.从mysql库中查询从0点截止到上一个30秒的统计信息;
* 2.放入map中;
* 3.从hbase中查询下一个30秒的数据;
* 4.放入map中;
* 5.将map数据插入到mysql库中;
* 6.删除mysql中以前的数据
*/
public class WeblogTwo {
public static String DRIVER = "";
public static String URL = "";
public static String USERNAME = "";
public static String PASSWORD = "";
private static Connection conn;
private static String ALL = "ALL";
private static String SEP = "@#@";
private static int layoutTime = 30000;
private static boolean flag = false;
public static void main(String[] args) {
try {
String tableName = "weblog";
String[] qualify = "agent,path,remote,size".split(",");
HTable table = HTableUtil.getHtable(tableName);
Scan scan = new Scan();
table.setScannerCaching(20000);
table.setAutoFlush(false);
table.setScannerCaching(20000);
scan.setCaching(20000);
for (String item : qualify)
scan.addColumn(Bytes.toBytes("info"), Bytes.toBytes(item));
long s;
long e;
while (true) {
try {
long systemTime = System.currentTimeMillis();
String StringTime = time13Convert(Long.toString(systemTime));
String queryDate = StringTime.substring(0, 10);
String date = queryDate.replaceAll("-", "");
scan.setStartRow(Bytes.toBytes(date + "-"));
scan.setStopRow(Bytes.toBytes(date + "_"));
if (systemTime % layoutTime == 0) {
s = systemTime - layoutTime*2;
e = systemTime - layoutTime;
String StartTime = time13Convert(Long.toString(s));
String StartDate = StartTime.substring(0, 10);
if (StartDate.equals(queryDate))
putWeblog(queryDate, table, scan, s, e);
else
init();
}
} catch (Exception e1) {
System.out.println(e1.getMessage());
continue;
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
/*
* truncate table realtime,realtime_ip,access_ip,access_ur
*/
public static void init() throws SQLException {
java.sql.Connection conn = java.sql.DriverManager.getConnection(URL,
USERNAME, PASSWORD);
java.sql.Statement s = conn.createStatement(
ResultSet.TYPE_SCROLL_SENSITIVE, ResultSet.CONCUR_UPDATABLE);
conn.setAutoCommit(false);
for (String name : "realtime,realtime_ip,access_ip,access_url"
.split(",")) {
StringBuilder sql = new StringBuilder("");
sql.append("truncate table " + name);
s.execute(sql.toString());
}
conn.commit();
s.close();
conn.close();
System.out
.println("truncate table realtime,realtime_ip,access_ip,access_url");
}
@SuppressWarnings("unchecked")
private static void putWeblog(String queryDate, HTable table, Scan scan,
long ss, long ee) throws IOException, JSONException, SQLException {
long start = System.currentTimeMillis();
scan.setTimeRange(ss, ee);
ResultScanner scanner = table.getScanner(scan);
conn = DriverManager.getConnection(URL, USERNAME, PASSWORD);
// 首页统计
Map<String, Statistic> map = new HashMap<String, Statistic>();
long id = 0L;
// 访问页面统计(TOP10)
Map<String, Long> pageMap = new HashMap<String, Long>();
// 访问IP统计(TOP10)
Map<String, Long> ipMap = new HashMap<String, Long>();
long start1 = System.currentTimeMillis();
// query the lastest date.
Statement stmt_id = conn.createStatement();
String sql_id = "select date from realtime order by date desc LIMIT 1";
ResultSet rs_id = stmt_id.executeQuery(sql_id);
String lastestDate = "";
while (rs_id.next()) {
lastestDate = rs_id.getString("date");
}
rs_id.close();
stmt_id.close();
long stop1 = System.currentTimeMillis();
System.out.println(" query the lastest date, time used "
+ (stop1 - start1));
// query the lastest history for access_url and put another map.
Statement stmt_access_url = conn.createStatement();
String sql_access_url = "select domain,url,count from access_url where date = '"
+ lastestDate + "'";
ResultSet rs_access_url = stmt_access_url.executeQuery(sql_access_url);
while (rs_access_url.next()) {
pageMap.put(rs_access_url.getString("domain") + SEP
+ rs_access_url.getString("url"), rs_access_url
.getLong("count"));
}
rs_access_url.close();
stmt_access_url.close();
long stop11 = System.currentTimeMillis();
System.out.println(" query the lastest access_url, time used "
+ (stop11 - stop1));
// query the lastest history for access_ip and put another map.
Statement stmt_access_ip = conn.createStatement();
String sql_access_ip = "select domain,ip,count from access_ip where date = '"
+ lastestDate + "'";
ResultSet rs_access_ip = stmt_access_ip.executeQuery(sql_access_ip);
while (rs_access_ip.next()) {
ipMap.put(rs_access_ip.getString("domain") + SEP
+ rs_access_ip.getString("ip"), rs_access_ip
.getLong("count"));
}
rs_access_ip.close();
stmt_access_ip.close();
long stop12 = System.currentTimeMillis();
System.out.println(" query the lastest access_ip, time used "
+ (stop12 - stop11));
// query the lastest history for realtime_ip and put another map.
Statement stmt_ip = conn.createStatement();
Map<String, String> historyMap = new HashMap<String, String>();
String sql_ip = "select domain,ip from realtime_ip where date = '"
+ lastestDate + "'";
ResultSet rs_ip = stmt_ip.executeQuery(sql_ip);
while (rs_ip.next()) {
historyMap.put(rs_ip.getString("domain"), rs_ip.getString("ip"));
}
rs_ip.close();
stmt_ip.close();
long stop2 = System.currentTimeMillis();
System.out.println(" query the lastest realtime_ip, time used "
+ (stop2 - stop1));
// query the lastest history for table realtime.
Statement stmt = conn.createStatement();
String sql = "select id,date,data from realtime where date like '"
+ queryDate + "%' order by date desc LIMIT 1";
ResultSet rs = stmt.executeQuery(sql);
while (rs.next()) {
if ((rs.getString("data") == null)
|| (rs.getString("data").length() <= 60))
continue;
String[] data = rs.getString("data").replaceAll("\"", "").split(
"},");
id = rs.getInt("id");
if (data.length < 15) {
WeblogTwo wD = new WeblogTwo();
Statistic sD = wD.new Statistic();
map.put("domain", sD);
} else {
WeblogTwo wD = new WeblogTwo();
for (String kv : data) {
Statistic sD = wD.new Statistic();
String[] kvs = kv.replace("{", ":").split("::");
String region = kvs[0].replace("{", "").replace(":", "");
String[] values = kvs[1].replace("{", "").split(",");
Set ipsDD = new HashSet();
if (historyMap.get(values[1].split(":")[1]) != null) {
String[] ip_s = historyMap.get(values[1].split(":")[1])
.split(",");
for (String IP : ip_s) {
ipsDD.add(IP);
}
}
sD.setIps(ipsDD);
sD.setPv(Long.parseLong(values[0].split(":")[1]));
sD.setFlow(Long.parseLong(values[2].split(":")[1]
.replaceAll("}", "")));
map.put(region, sD);
}
}
}
long stop3 = System.currentTimeMillis();
System.out.println(" query the lastest realtime, time used "
+ (stop3 - stop2));
// query the lastest 20s data from hbase.
Set<String> ipSet = new HashSet<String>();
Long flow = new Long(0L);
Long count = new Long(0L);
for (Result result : scanner) {
try {
String agent = new String(result.getValue(
Bytes.toBytes("info"), Bytes.toBytes("agent")));
String path = new String(result.getValue(Bytes.toBytes("info"),
Bytes.toBytes("path")));
String ip = new String(result.getValue(Bytes.toBytes("info"),
Bytes.toBytes("remote")));
String size = new String(result.getValue(Bytes.toBytes("info"),
Bytes.toBytes("size")));
flow = flow + Long.parseLong(size);
if (!matcher(agent)) {
count += 1L;
}
ipSet.add(ip);
String rowkey = new String(result.getRow());
String region = rowkey.split("-")[1];
if (map.get(region) == null) {
WeblogTwo w = new WeblogTwo();
Statistic s = w.new Statistic();
Set<String> ips = new HashSet<String>();
ips.add(ip);
// pv不统计爬虫的数据
if (matcher(agent) == false)
s.setPv(1);
s.setFlow(Integer.parseInt(size));
map.put(region, s);
} else {
Statistic s = (Statistic) map.get(region);
Set<String> ips = s.getIps();
ips.add(ip);
// pv不统计爬虫的数据
if (matcher(agent) == false)
s.setPv(s.getPv() + 1);
s.setFlow(s.getFlow() + Integer.parseInt(size));
map.put(region, s);
}
// 访问页面统计(TOP10)
path = region + SEP + path;
if (pageMap.get(path) == null)
pageMap.put(path, 1L);
else
pageMap.put(path, (pageMap.get(path) + 1L));
// 访问IP统计(TOP10)
ip = region + SEP + ip;
if (ipMap.get(ip) == null)
ipMap.put(ip, 1L);
else
ipMap.put(ip, (ipMap.get(ip) + 1L));
} catch (Exception localException) {
continue;
}
}
long stop4 = System.currentTimeMillis();
System.out.println(" query the lastest 30s data from hbase, time used "
+ (stop4 - stop3));
JSONObject json = new JSONObject();
// compute ALL and insert into realtime_ip .
JSONObject js = new JSONObject();
StringBuffer sbALL = new StringBuffer();
if (map.get(ALL) != null) {
Statistic allStatistic = (Statistic) map.get(ALL);
if (historyMap.get(ALL) != null) {
String[] ips = historyMap.get(ALL).split(",");
for (String ip : ips)
ipSet.add(ip);
}
js.put("IP", ipSet.size());
js.put("PV", (count + allStatistic.getPv()));
js.put("flow", (flow + allStatistic.getFlow()));
json.put(ALL, js);
for (String ip : ipSet) {
sbALL.append(ip).append(",");
}
} else {
js.put("IP", ipSet.size());
js.put("PV", count);
js.put("flow", flow);
json.put(ALL, js);
for (String ip : ipSet) {
sbALL.append(ip).append(",");
}
}
List<String> list = new ArrayList<String>();
String ALL_IP = "";
if (sbALL.toString().length() < 1)
ALL_IP = sbALL.toString();
else
ALL_IP = sbALL.toString().substring(0,
sbALL.toString().length() - 1);
String insertSql = "insert into realtime_ip(date,domain,ip) values('"
+ time13Convert(Long.toString(ee)) + "','" + ALL + "','"
+ ALL_IP + "')";
list.add(insertSql);
long stop5 = System.currentTimeMillis();
System.out.println(" put 'ALL' to realtime_ip, time used "
+ (stop5 - stop4));
// set json and insert into realtime_ip except ALL.
Iterator it = map.entrySet().iterator();
while (it.hasNext()) {
Map.Entry entry = (Map.Entry) it.next();
String region = (String) entry.getKey();
if (!ALL.equals(region)) {
Statistic value = (Statistic) entry.getValue();
JSONObject jsv = new JSONObject();
Set ips = value.getIps();
if (historyMap.get(region) != null) {
String[] dataDD = historyMap.get(region).split(",");
for (String ip : dataDD)
ips.add(ip);
}
jsv.put("IP", ips.size());
jsv.put("PV", value.getPv());
jsv.put("flow", value.getFlow());
json.put(region, jsv);
StringBuffer sb = new StringBuffer();
for (String ipValue : value.getIps()) {
sb.append(ipValue).append(",");
}
if (sb.toString().length() < 1)
insertSql = "insert into realtime_ip(date,domain,ip) values('"
+ time13Convert(Long.toString(ee))
+ "','"
+ region
+ "','" + sb.toString() + "')";
else
insertSql = "insert into realtime_ip(date,domain,ip) values('"
+ time13Convert(Long.toString(ee))
+ "','"
+ region
+ "','"
+ sb.toString().substring(0,
sb.toString().length() - 1) + "')";
list.add(insertSql);
}
}
// insert into statistic infomation utill now.
insertSql = "insert into realtime(date,data,dtype) values('"
+ time13Convert(Long.toString(ee)) + "','" + json.toString()
+ "','1')";
list.add(insertSql);
long stop6 = System.currentTimeMillis();
System.out.println(" put domain to realtime_ip, time used "
+ (stop6 - stop5));
// 访问页面统计(TOP10)
List<String> listUrl = new ArrayList<String>();
Iterator pageIt = pageMap.entrySet().iterator();
while (pageIt.hasNext()) {
Map.Entry entry = (Map.Entry) pageIt.next();
String page = (String) entry.getKey();
Long countIP = (Long) entry.getValue();
try {
String[] pages = page.split(SEP);
insertSql = " ('" + time13Convert(Long.toString(ee)) + "','"
+ pages[0] + "','" + pages[1].replace("'", "\\'")
+ "','" + countIP + "')";
listUrl.add(insertSql);
} catch (Exception e) {
continue;
}
}
// 访问IP统计(TOP10)
List<String> listIp = new ArrayList<String>();
Iterator ipIt = ipMap.entrySet().iterator();
while (ipIt.hasNext()) {
Map.Entry entry = (Map.Entry) ipIt.next();
String ip = (String) entry.getKey();
Long countIP = (Long) entry.getValue();
try {
String[] ips = ip.split(SEP);
insertSql = " ('" + time13Convert(Long.toString(ee)) + "','"
+ ips[0] + "','" + ips[1] + "','" + countIP + "')";
listIp.add(insertSql);
} catch (Exception e) {
continue;
}
}
// batch insert
patchInsert(list);
// batch insert access_url
executeInsert("access_url", "url", listUrl);
// batch insert access_ip
executeInsert("access_ip", "ip", listIp);
// // 启动两个线程插入
// WeblogTwo wt = new WeblogTwo();
// ThreadInsert t1 = wt.new ThreadInsert("access_url", "url", listUrl);
// ThreadInsert t2 = wt.new ThreadInsert("access_ip", "ip", listIp);
// Thread thread1= new Thread(t1);
// thread1.start();
// Thread thread2= new Thread(t2);
// thread2.start();
long stop7 = System.currentTimeMillis();
// delete the history records.
try {
String deleteSql = "delete from realtime where id < '" + (id - 10L)
+ "'";
Statement deleteStmt = conn.createStatement();
deleteStmt.executeUpdate(deleteSql);
deleteStmt.close();
String deleteSqlIP = "delete from realtime_ip where date <> '"
+ time13Convert(Long.toString(ee)) + "'";
Statement deleteStmtIP = conn.createStatement();
deleteStmtIP.executeUpdate(deleteSqlIP);
deleteStmtIP.close();
if (flag == true) { // 出现异常
// 访问页面统计(TOP10)
String delete_url = "delete from access_url where date = '"
+ time13Convert(Long.toString(ee)) + "'";
Statement deleteStmt_url = conn.createStatement();
deleteStmt_url.executeUpdate(delete_url);
deleteStmt_url.close();
String update_url = "update access_url set date = '"
+ time13Convert(Long.toString(ee)) + "'";
Statement update_url2 = conn.createStatement();
update_url2.executeUpdate(update_url);
update_url2.close();
flag = false;
}else{
// 访问页面统计(TOP10)
String delete_url = "delete from access_url where date <> '"
+ time13Convert(Long.toString(ee)) + "'";
Statement deleteStmt_url = conn.createStatement();
deleteStmt_url.executeUpdate(delete_url);
deleteStmt_url.close();
}
// 访问IP统计(TOP10)
String deleteSql_ip = "delete from access_ip where date <> '"
+ time13Convert(Long.toString(ee)) + "'";
Statement deleteStmt_ip = conn.createStatement();
deleteStmt_ip.executeUpdate(deleteSql_ip);
deleteStmt_ip.close();
} catch (Exception e) {
System.out.println(e.getMessage());
}
rs.close();
rs = null;
long stop8 = System.currentTimeMillis();
System.out.println(" delete table, time used " + (stop8 - stop7));
long stop = System.currentTimeMillis();
System.out.println("[" + time13Convert(Long.toString(ss)) + ","
+ time13Convert(Long.toString(ee)) + "] time used "
+ (stop - start) + "ms.");
}
public static String time13Convert(String oriTimeStr) {
if ((oriTimeStr == null) || (oriTimeStr.length() <= 0)
|| (!StringUtils.isNumeric(oriTimeStr))) {
return "";
}
String re_StrTime = null;
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long lcc_time = Long.valueOf(oriTimeStr).longValue();
re_StrTime = sdf.format(new Date(lcc_time));
return re_StrTime;
}
public static boolean matcher(String agent) {
String reg = "Mediapartners-Google|Googlebot|Slurp|spider|YodaoBot|ia_archiver|P\\.Arthur|QihooBot|Indy Library|Gigabot|heritrix|LWP\\:\\:Simple|winhttp|clshttp|HTTrack|harvest|nsauditor|dirbuster|pangolin|nmap|sqlninja|grendel-scan|hydra|perl|HTMLParser|libwww|BBBike|sqlmap|w3af|owasp|Nikto|fimap|havij|PycURL|python|zmeu|python-urllib|BabyKrokodil|Arachni|bench|httperf|webshag|webcollage|dragostea|injection|mysqloit|prog\\.customcrawler|network-services-auditor|bsqlbf|webinspect|revolt|planetwork|fuck|morpheus|dnspod|jcomers bot|mozilla/5\\.0 sf";
Pattern p = Pattern.compile(reg);
Matcher m = p.matcher(agent);
return m.find();
}
public class Statistic {
private Set<String> ips;
private long pv;
private long flow;
@SuppressWarnings("unchecked")
public Statistic() {
this.ips = new HashSet();
this.pv = 0L;
this.flow = 0L;
}
public Set<String> getIps() {
return this.ips;
}
public void setIps(Set<String> ips) {
this.ips = ips;
}
public long getPv() {
return this.pv;
}
public void setPv(long pv) {
this.pv = pv;
}
public long getFlow() {
return this.flow;
}
public void setFlow(long flow) {
this.flow = flow;
}
}
public static void patchInsert(List<String> list) throws SQLException {
java.sql.Connection conn = java.sql.DriverManager.getConnection(URL,
USERNAME, PASSWORD);
java.sql.Statement stmt = conn.createStatement(
ResultSet.TYPE_SCROLL_SENSITIVE, ResultSet.CONCUR_UPDATABLE);
conn.setAutoCommit(false);
long start = System.currentTimeMillis();
for (int i = 0; i < list.size(); i++) {
stmt.addBatch(list.get(i));
}
stmt.executeBatch();
conn.commit();
long end = System.currentTimeMillis();
System.out.println(" patchInsert realtime and realttime_ip:"
+ (end - start) + "ms, rows is " + list.size());
stmt.close();
conn.close();
stmt = null;
conn = null;
}
public static void executeInsert(String table, String col, List<String> list) {
try {
java.sql.Connection conn = java.sql.DriverManager.getConnection(
URL, USERNAME, PASSWORD);
java.sql.Statement s = conn
.createStatement(ResultSet.TYPE_SCROLL_SENSITIVE,
ResultSet.CONCUR_UPDATABLE);
conn.setAutoCommit(false);
StringBuilder sb = new StringBuilder("");
sb.append("insert into " + table + "(date,domain," + col
+ ",count) values ");
long start = System.currentTimeMillis();
int j = 0;
for (String sql : list) {
j++;
if (j % 10000 == 0) {
sb.append(sql);
s.execute(sb.toString());
conn.commit();
sb = new StringBuilder("");
sb.append("insert into " + table + "(date,domain," + col
+ ",count) values ");
continue;
} else {
if (j == list.size())
sb.append(sql);
else
sb.append(sql).append(",");
}
}
s.execute(sb.toString());
conn.commit();
s.close();
conn.close();
long end = System.currentTimeMillis();
System.out.println(" executeInsert " + table + ":" + (end - start)
+ "ms, rows is " + list.size());
} catch (Exception e) {
flag = true;
System.out.println("Exception:" + e.getMessage());
}
}
class ThreadInsert implements Runnable {
private String table;
private String col;
private List<String> list;
public ThreadInsert(String table, String col, List<String> list) {
this.table = table;
this.col = col;
this.list = list;
}
@Override
public void run() {
executeInsert(table, col, list);
}
}
}
实时统计解决方案
最新推荐文章于 2020-09-07 15:15:01 发布