同学可以自行下载原始数据集合:http://download.labs.sogou.com/resource/q.php
这里主要利用Sql Server数据库来演示基本的分析方法
1)数据导入:
可以将全部日志文件合并为一个单一文件,以方便导入,方法是在命令提示符界面中运行 copy *.filter data.dat
数据格式为Linux系统文本存储格式,便于处理可以利用linux2dos(下载)工具直接转换,方法是在命令提示符界面中运行 linux2dos data.dat data2.dat
在Sql Server查询管理器中运行:
-- Create the staging table for the Sogou query log.
-- NOTE(review): the original snippet was truncated here — the CREATE TABLE
-- header and the [userid] column were missing; restored to match the full
-- statement shown later in this document (field order: userid \t kw \t ab \t url).
CREATE TABLE [data] (
    [userid] [varchar] (100)  COLLATE Chinese_PRC_CI_AS NULL ,
    [kw]     [varchar] (1200) COLLATE Chinese_PRC_CI_AS NULL ,   -- query keyword, wrapped in [] in the raw log
    [ab]     [varchar] (1200) COLLATE Chinese_PRC_CI_AS NULL ,   -- "rank seq" pair, split later into rank/seq
    [url]    [varchar] (500)  COLLATE Chinese_PRC_CI_AS NULL     -- clicked URL
) ON [PRIMARY]
GO
-- Bulk-load the converted (DOS line-ending) log file into the staging table.
BULK INSERT data
FROM 'C:\data2.dat'
WITH (
    DATAFILETYPE    = 'char',   -- plain (non-Unicode) character data
    FIELDTERMINATOR = '\t',     -- tab-separated fields
    ROWTERMINATOR   = '\n',     -- one log record per line
    MAXERRORS       = 100000    -- tolerate malformed rows instead of aborting the load
)
2)数据处理
由于存在错误切分数据,所以需要进行必要的数据整理,错误数据量很少
-- Drop malformed rows produced by bad field splitting.
-- A well-formed [ab] value is "rank seq": two integers separated by a space.
DELETE FROM data WHERE ab LIKE '%]%'                -- stray ']' leaked from the bracketed keyword field
DELETE FROM data WHERE ASCII(ab) > 127              -- first char is multi-byte text, not a digit
DELETE FROM data
WHERE ASCII(LOWER(ab)) >= ASCII('a')
  AND ASCII(LOWER(ab)) <= ASCII('z')                -- first char is a letter, not a digit
DELETE FROM data WHERE CHARINDEX(' ', ab) <= 0      -- no space: cannot be split into rank and seq
这些删除记录都为一些由于切分错误引发的错误格式
3)在data表中增加rank和seq字段,并填充(其实是将导入进来的原本两个字段再次处理分开,并去除查询词语前后的方括号)
-- Add the rank/seq columns and populate them by splitting [ab], then strip
-- the surrounding brackets from [kw].
-- NOTE(review): the prose step above says to add rank and seq but gave no SQL;
-- without the ALTER TABLE the UPDATEs below fail on nonexistent columns.
ALTER TABLE data ADD rank int NULL
ALTER TABLE data ADD seq int NULL
GO
-- rank = the digits before the first space in [ab]
UPDATE data SET rank = CONVERT(int, SUBSTRING([ab], 1, CHARINDEX(' ', [ab]) - 1))
-- seq = the digits after the first space (10 chars is enough for any int)
UPDATE data SET seq  = CONVERT(int, SUBSTRING([ab], CHARINDEX(' ', [ab]) + 1, 10))
-- Remove the leading '[' and trailing ']' around the keyword. Guard against
-- values shorter than 2 chars: SUBSTRING raises on a negative length argument.
UPDATE data SET kw = SUBSTRING([kw], 2, LEN([kw]) - 2) WHERE LEN([kw]) >= 2
-- Discard overlong keywords — presumably to keep kw under SQL Server's
-- 900-byte index key limit for the indexing step below; TODO confirm.
DELETE FROM data WHERE LEN(kw) > 900
分别在 userid、kw、rank、seq、url 等字段上用 CREATE INDEX 建立非聚集索引,以加速后续按这些字段分组、过滤和排序的查询
4)常见查询分析:
用户访问情况分析
select top 100 userid,count(*) from data group by userid order by count(*) desc
结果中个别 userid 的查询量异常巨大,说明存在疑似爬虫用户,例如下面这个账号:
select * from data where userid='13505142608748166' order by kw,seq
查询词语情况
-- Top 100 keywords by raw query volume.
SELECT TOP 100 kw, COUNT(*) AS query_count
FROM data
GROUP BY kw
ORDER BY query_count DESC
-- Top 100 keywords by number of distinct users (less skewed by heavy users).
SELECT TOP 100 kw, COUNT(DISTINCT userid) AS user_count
FROM data
GROUP BY kw
ORDER BY user_count DESC
-- Average keyword length, in characters.
SELECT AVG(LEN(kw)) AS avg_kw_len FROM data
-- Distribution of keyword lengths.
SELECT LEN(kw) AS kw_len, COUNT(*) AS query_count
FROM data
GROUP BY LEN(kw)
ORDER BY query_count DESC
查询浏览结果的情况
select rank,count(*) from data group by rank order by count(*) desc
了解词语命中的URL情况
select url,count(*) from data where kw='张玉凤' group by url order by count(*) desc
这里主要利用Sql Server数据库来演示基本的分析方法
1)数据导入:
可以将全部日志文件合并为一个单一文件,以方便导入,方法是在命令提示符界面中运行 copy *.filter data.dat
数据格式为Linux系统文本存储格式,便于处理可以利用linux2dos(下载)工具直接转换,方法是在命令提示符界面中运行 linux2dos data.dat data2.dat
在Sql Server查询管理器中运行:
-- Staging table for the Sogou query log (field order: userid \t kw \t ab \t url).
CREATE TABLE [data] (
    [userid] [varchar] (100)  COLLATE Chinese_PRC_CI_AS NULL ,
    [kw]     [varchar] (1200) COLLATE Chinese_PRC_CI_AS NULL ,
    [ab]     [varchar] (1200) COLLATE Chinese_PRC_CI_AS NULL ,
    [url]    [varchar] (500)  COLLATE Chinese_PRC_CI_AS NULL
) ON [PRIMARY]
GO
-- Load the line-ending-converted log file into the staging table in one pass.
BULK INSERT data
FROM 'C:\data2.dat'
WITH (
    DATAFILETYPE    = 'char',   -- non-Unicode character data
    FIELDTERMINATOR = '\t',     -- fields are tab-separated
    ROWTERMINATOR   = '\n',     -- records are newline-terminated
    MAXERRORS       = 100000    -- skip bad rows rather than failing the whole import
)
2)数据处理
由于存在错误切分数据,所以需要进行必要的数据整理,错误数据量很少
-- Remove rows whose [ab] field is not a clean "rank seq" integer pair.
DELETE FROM data WHERE ab LIKE '%]%'                -- ']' from the keyword field bled into [ab]
DELETE FROM data WHERE ASCII(ab) > 127              -- starts with multi-byte text, not a digit
DELETE FROM data
WHERE ASCII(LOWER(ab)) >= ASCII('a')
  AND ASCII(LOWER(ab)) <= ASCII('z')                -- starts with a letter, not a digit
DELETE FROM data WHERE CHARINDEX(' ', ab) <= 0      -- no separator space: unsplittable
这些删除记录都为一些由于切分错误引发的错误格式
3)在data表中增加rank和seq字段,并填充(其实是将导入进来的原本两个字段再次处理分开,并去除查询词语前后的方括号)
-- Add the rank/seq columns and fill them from [ab]; then unwrap the keyword.
-- NOTE(review): the prose above describes adding rank and seq but omitted the
-- DDL; the UPDATEs below require the columns to exist.
ALTER TABLE data ADD rank int NULL
ALTER TABLE data ADD seq int NULL
GO
-- rank = the digits before the first space in [ab]
UPDATE data SET rank = CONVERT(int, SUBSTRING([ab], 1, CHARINDEX(' ', [ab]) - 1))
-- seq = the digits after the first space (10 chars covers any int value)
UPDATE data SET seq  = CONVERT(int, SUBSTRING([ab], CHARINDEX(' ', [ab]) + 1, 10))
-- Strip the surrounding '[' ']' from the keyword; skip values shorter than
-- 2 chars because SUBSTRING raises when the length argument is negative.
UPDATE data SET kw = SUBSTRING([kw], 2, LEN([kw]) - 2) WHERE LEN([kw]) >= 2
-- Discard overlong keywords — presumably to stay under SQL Server's 900-byte
-- index key limit for the indexing step below; TODO confirm.
DELETE FROM data WHERE LEN(kw) > 900
分别在 userid、kw、rank、seq、url 等字段上用 CREATE INDEX 建立非聚集索引,以加速后续按这些字段分组、过滤和排序的查询
4)常见查询分析:
用户访问情况分析
select top 100 userid,count(*) from data group by userid order by count(*) desc
结果中个别 userid 的查询量异常巨大,说明存在疑似爬虫用户,例如下面这个账号:
select * from data where userid='13505142608748166' order by kw,seq
查询词语情况
-- Hottest 100 keywords by raw query count.
SELECT TOP 100 kw, COUNT(*) AS query_count
FROM data
GROUP BY kw
ORDER BY query_count DESC
-- Hottest 100 keywords by distinct querying users (dampens crawler skew).
SELECT TOP 100 kw, COUNT(DISTINCT userid) AS user_count
FROM data
GROUP BY kw
ORDER BY user_count DESC
-- Mean keyword length in characters.
SELECT AVG(LEN(kw)) AS avg_kw_len FROM data
-- How keyword lengths are distributed across the log.
SELECT LEN(kw) AS kw_len, COUNT(*) AS query_count
FROM data
GROUP BY LEN(kw)
ORDER BY query_count DESC
查询浏览结果的情况
select rank,count(*) from data group by rank order by count(*) desc
了解词语命中的URL情况
select url,count(*) from data where kw='张玉凤' group by url order by count(*) desc