毕业设计三:数据清洗与转换
1、执行清洗工作
// CleanSQL.txt
delete from weblog where status like'4%'or status like'5%';
delete from weblog where method like'%POST%';
delete from weblog where method like'%OPTIONS%';
delete from weblog where method like'%HEAD%';
delete from weblog where req_bytes like'-';
delete from weblog where url like'%gif%';
delete from weblog where url like'%jpg%';
delete from weblog where url like'%JPG';
delete from weblog where url like'%jpeg%';
delete from weblog where url like'%png';
delete from weblog where url like'%images%';
delete from weblog where url like'%image%';
delete from weblog where url like'%swf%';
delete from weblog where url like'%css%';
delete from weblog where url like'%ccs.css%';
delete from weblog where url='/';
delete from weblog where url='%pac%';
delete from weblog where url like'/stat.php?type=%id=%';
delete from weblog where url like'%php%';
delete from weblog where url like'%js';
delete from weblog where url like'%cgi';
delete from weblog where url like'';
delete from weblog where url like'%zip';
delete from weblog where url like'%rar';
delete from weblog where url like'%mp3';
delete from weblog where url like'%wma';
delete from weblog where url like'%wmv';
delete from weblog where url like'%doc';
delete from weblog where url like'%exe';
delete from weblog where url like'%pdf';
delete from weblog where url like'%ppt';
commit;
执行:SQL> start F:\毕业设计\DataPreparation\CleanSQL.txt
清洗结果:
SQL> select count(*) from weblog;
COUNT(*)
----------
94265
2、进行格式转换与横向缩简
update weblog
set access_time=SUBSTR(access_time,2,20);
comm.it;
alter table weblog add(temptime date);
update weblog
set
temptime=to_date(access_time,'dd/mm/yyyy hh24:mi:ss','NLS_DATE_LANGUAGE=english');
commit;
alter table weblog drop column access_time;
alter table weblog add(access_time date);
update weblog
set access_time = temptime;
alter table weblog drop column temptime;
alter table weblog drop column ident;
alter table weblog drop column authorized_user;
alter table weblog drop column time_zone;
alter table weblog drop column method;
alter table weblog drop column protocol;
/**********************注释开始***********************************/
//(1)清洗数据,凡对挖掘结果没有影响的记录,全部删除。
//除HTM页面,ASP页面,类似图片,音乐,SWF,脚本,压缩包等全部删除
//状态为400到599的全部删除
//方法为POST,HEAD,OPTIONS的全部删除
//请求字节数为0的全部删除。
delete from weblog where status like'4%'or status like'5%';
delete from weblog where method like'%POST%';
delete from weblog where method like'%OPTIONS%';
delete from weblog where method like'%HEAD%';
delete from weblog where req_bytes like'-';
delete from weblog where url like'%gif%';
delete from weblog where url like'%jpg%';
delete from weblog where url like'%JPG';
delete from weblog where url like'%jpeg%';
delete from weblog where url like'%png';
delete from weblog where url like'%images%';
delete from weblog where url like'%image%';
delete from weblog where url like'%swf%';
delete from weblog where url like'%css%';
delete from weblog where url like'%ccs.css%';
delete from weblog where url='/';
delete from weblog where url='%pac%';
delete from weblog where url like'/stat.php?type=%id=%';
delete from weblog where url like'%php%';
delete from weblog where url like'%js';
delete from weblog where url like'%cgi';
delete from weblog where url like'';
delete from weblog where url like'%zip';
delete from weblog where url like'%rar';
delete from weblog where url like'%mp3';
delete from weblog where url like'%wma';
delete from weblog where url like'%wmv';
delete from weblog where url like'%doc';
delete from weblog where url like'%exe';
delete from weblog where url like'%pdf';
delete from weblog where url like'%ppt';
(2)对表进行格式转换和横向缩简
//过滤掉时间列的“[”字符
update weblog
set access_time=SUBSTR(access_time,2,20);
//将时间由VARCHAR2转换为DATE类型,便于后期处理。
//1、创建一个DATE类型的临时字段TEMPTIME。
alter table weblog add(temptime date);
//2、将旧字段中的VARCHAR2类型转换为时间格式,保存在TEMPTIME中
update weblog
set
temptime=to_date(access_time,'dd/mm/yyyy hh24:mi:ss','NLS_DATE_LANGUAGE=english');
commit;
//3、删除旧字段
alter table weblog drop column access_time;
//4、增加DATE类型的新字段
alter table weblog add(access_time date);
//5、将临时字段中的值更新到新增加的DATE类型字段中
update weblog
set access_time = temptime;
//6、删除临时字段
alter table weblog drop column temptime;
//对表进行横向缩简,删除不需要的字段
alter table weblog drop column ident;
alter table weblog drop column authorized_user;
alter table weblog drop column time_zone;
alter table weblog drop column method;
alter table weblog drop column protocol;
/
/****************************注释结束********************************/
2、登录SQL*PLUS
输入如下命令:
start F:\毕业设计\DataPreparation\CleanAndFormatSQL.txt
即可运行文本文件里边的SQL语句。
3、数据清洗的结果
SQL> select count(*) from weblog;
COUNT(*)
----------
94265
SQL> desc weblog;
名称 是否为空? 类型
----------------------------------------- -------- -----------------------
CIP VARCHAR2(20)
URL VARCHAR2(500)
STATUS VARCHAR2(10)
REQ_BYTES VARCHAR2(100)
ACCESS_TIME DATE
来自 “ ITPUB博客 ” ,链接:http://blog.itpub.net/12443821/viewspace-265033/,如需转载,请注明出处,否则将追究法律责任。
转载于:http://blog.itpub.net/12443821/viewspace-265033/