对于一个中小型规模的电子商务网站来说,如何收集用户行为日志是首先要解决的问题。这里采用Nginx来完成,收集日志的流程如下:
(1) 开发人员编写自定义的js,通过插码(埋点)的方式把它嵌入到每个页面中
(2) 用户每次访问页面的时候,该js会把用户的访问行为统一发送到Nginx上,Nginx拦截这些请求并保存为日志
1、开发编写的js
<pre name="code" class="html">var stat_url="/analyzeVesopera.gif"; // Relative URL that the tracking query string is appended to when the beacon request is sent.
var sflag = 1; // Session flag sent as the `sflag` parameter: stays 1 unless no AWSSESSION_ID cookie is found, in which case the script below resets it to 0.
/**
 * Writes a cookie scoped to the whole site (path=/).
 *
 * @param {string} TRKNameOfCookie - Cookie name.
 * @param {*} TRKvalue - Cookie value; encoded with escape() so that reading
 *     it back with unescape() returns the original text.
 * @param {?number} TRKexpirehours - Lifetime in hours. When null or
 *     undefined no "expires" attribute is written, so the browser treats the
 *     cookie as a session cookie.
 */
function awstats_setCookie(TRKNameOfCookie, TRKvalue, TRKexpirehours) {
  var cookieText = TRKNameOfCookie + "=" + escape(TRKvalue) + "; path=/";
  if (TRKexpirehours != null) {
    // Only build the expiry date when one was actually requested.
    var TRKExpireDate = new Date();
    TRKExpireDate.setTime(TRKExpireDate.getTime() + TRKexpirehours * 3600 * 1000);
    // toUTCString() is the standard replacement for the deprecated
    // toGMTString() and produces the same text.
    cookieText += "; expires=" + TRKExpireDate.toUTCString();
  }
  document.cookie = cookieText;
}
/**
 * Reads a cookie by its exact name.
 *
 * @param {string} TRKNameOfCookie - Name of the cookie to look up.
 * @returns {?string} The unescape()d cookie value, or null when no cookie
 *     with exactly that name exists.
 */
function awstats_getCookie(TRKNameOfCookie) {
  var cookieText = document.cookie;
  if (!cookieText) {
    return null;
  }
  var prefix = TRKNameOfCookie + "=";
  var pairs = cookieText.split(";");
  for (var i = 0; i < pairs.length; i++) {
    // Entries are separated by "; ", so drop the leading whitespace.
    var pair = pairs[i].replace(/^\s+/, "");
    // Match at the start of the entry only: a plain substring search would
    // also hit longer names that merely end with the requested name.
    if (pair.indexOf(prefix) === 0) {
      return unescape(pair.substring(prefix.length));
    }
  }
  return null;
}
// ---------------------------------------------------------------------------
// Page-view beacon: gathers screen, visitor and page information and sends it
// as the query string of an image request to stat_url.
// All names are declared with `var` so they remain globals in a classic
// script but no longer rely on implicit global creation.
// ---------------------------------------------------------------------------
var TRKnow = new Date();
var statScreenSize = screen.width + "x" + screen.height;

// Browsers that do not report themselves as "Netscape" are read through
// colorDepth, all others through pixelDepth.
var statColorDepth;
if (navigator.appName != "Netscape") {
  statColorDepth = screen.colorDepth;
} else {
  statColorDepth = screen.pixelDepth;
}

// Reuse the visitor / session ids stored in cookies, or mint new ones.
var TRKuserid = awstats_getCookie("AWSUSER_ID");
var TRKsessionid = awstats_getCookie("AWSSESSION_ID");
var TRKrandomnumber = Math.floor(Math.random() * 100000000000000000000);
if (TRKuserid == null || TRKuserid === "") {
  TRKuserid = TRKrandomnumber;
}
if (TRKsessionid == null || TRKsessionid === "") {
  TRKsessionid = TRKrandomnumber;
  // No session cookie was present, so flag this hit as starting a session.
  sflag = 0;
}

// Refresh both cookies: 10000 hours for the visitor id, 1 hour for the session.
awstats_setCookie("AWSUSER_ID", TRKuserid, 10000);
awstats_setCookie("AWSSESSION_ID", TRKsessionid, 1);

// Read the values back so the ids that are reported are the ones the browser
// actually stored (null when the cookie could not be written).
TRKuserid = awstats_getCookie("AWSUSER_ID");
TRKsessionid = awstats_getCookie("AWSSESSION_ID");

// onloadStartTime is expected to be a Date set by the host page. The typeof
// guard keeps the script from throwing a ReferenceError on pages that never
// define it; -1 is reported in that case.
var onloadTotalTime;
if (typeof onloadStartTime !== "undefined" && onloadStartTime) {
  onloadTotalTime = new Date().getTime() - onloadStartTime.getTime();
} else {
  onloadTotalTime = -1;
}

// NOTE(review): escape() is deprecated and encodes non-ASCII characters as
// %uXXXX rather than UTF-8 percent-encoding. It is kept here because the
// consumer of the logged values is not visible; switch to encodeURIComponent()
// once the log parser is confirmed to handle it.
var statReferrer = escape(document.referrer);
var statPageTitle = escape(document.title);

var stat_param = '?screenSize=' + statScreenSize +
  '&screenColor=' + statColorDepth +
  '&pageTitle=' + statPageTitle +
  '&referrerPage=' + statReferrer +
  '&siteType=0&' + 'uid=' + TRKuserid +
  '&sid=' + TRKsessionid +
  '&sflag=' + sflag +
  '&countlog=' + (new Date().getTime()) +
  '&onloadTotalTime=' + onloadTotalTime +
  '&channel=8';

// The closing tag is written as "<\/script>" (identical string value) so that
// it cannot terminate the enclosing script element when this code is inlined
// in an HTML page.
var stat_outstr = '<script language="javascript" src="' + stat_url + stat_param + '"><\/script>';

// Fire the beacon as an image request. The handler is attached before src is
// set, and uses the real ASCII property name "onload".
var _img = new Image();
_img.onload = function () {};
_img.src = stat_url + stat_param;
2、Nginx作为负载均衡服务器
Nginx拦截对/analyzeVesopera.gif的请求,并把对应的数据保存下来:用户在浏览器上访问url时,请求经过Nginx转发,Nginx会记录访问日志,日志格式详见下面配置中的log_format部分(原文标红处)。
<pre name="code" class="html">user www www;
worker_processes 4;
# One CPU bitmask per worker process (four workers, four masks).
worker_cpu_affinity 0001 0010 0100 1000;
pid /var/run/nginx.pid;
events {
    use epoll;
    worker_connections 10240;
}
http {
    include mime.types;
    default_type application/octet-stream;
    error_page 400 403 500 502 503 504 /50x.html;
    # The terminating ';' was missing here, which made nginx read
    # "autoindex off" as additional index file names.
    index index.html index.shtml;
    autoindex off;
    #set header buffer default 1k
    server_names_hash_bucket_size 128;
    client_header_buffer_size 32k;
    large_client_header_buffers 4 32k;
    sendfile on;
    # These are good default values.
    tcp_nopush on;
    tcp_nodelay off;
    # output compression saves bandwidth
    gzip on;
    gzip_static on;
    #gzip_min_length 1k;
    gzip_http_version 1.0;
    gzip_comp_level 2;
    gzip_buffers 4 16k;
    gzip_proxied any;
    gzip_disable "MSIE [1-6]\.";
    gzip_types text/plain text/css application/x-javascript text/xml application/xml application/xml+rss text/javascript;
    #gzip_vary on;
    server_name_in_redirect off;
    access_log /data/logs/nginx/access.log combined;
    # Tracking log format. The $arg_* variables hold the query-string
    # parameters of the request (referrerPage, screenSize, screenColor,
    # pageTitle, siteType, uid, sid, sflag, onloadTotalTime); uid and sid are
    # written with a literal "30" prefix.
    # The access_log above still uses "combined"; this format only takes
    # effect where an access_log directive names it - presumably inside the
    # included nginx_81.conf (not shown here; verify).
    # Each quoted fragment is kept on a single line so that no newline ends up
    # inside a log record.
    log_format access '"$time_local" ' '"$request_method" ' '"$arg_referrerPage" ' '"$server_protocol" '
                      '"$status" ' '"$http_referer" ' '"$http_user_agent" ' '"$http_x_forwarded_for" '
                      '"$arg_screenSize" ' '"$arg_screenColor" ' '"$arg_pageTitle" ' '"$arg_siteType" ' '"30$arg_uid" '
                      '"30$arg_sid" ' '"$arg_sflag" ' '"$arg_onloadTotalTime" ';
    include /usr/local/nginx/conf/nginx_81.conf;
    # ... (remaining configuration omitted in the original article)
}
3、通过编写shell脚本合并日志
对于一个大型电商网站来说,服务器会是一个集群,通过负载均衡来解决高并发、大流量导致的网站访问性能瓶颈问题。
由于访问日志会分散到很多台服务器上,这里可以编写shell脚本每小时合并一次,然后放到日志服务器的目录/data下,文件名格式如onclick-201509070601.log,表明每小时生成一个文件。
一般到这里就完成了电商网站日志的收集工作。