awk处理大数据

#############################程序说明#############################
#1.输出路径为当前运行目录上级目录中建立 result文件夹
#2.在result文件夹中建立url_data,url_result,app_result,app_data四个文件夹
#3.url_data 待解析URL数据存放目录
#4.url_result 已解析为噪音URL的数据存放目录
#5.app_result 已解析为APP应用的数据存放目录
#6.app_data 待处理APP数据存放目录
#7.result/log_out.log 处理时间
#8.字段对应说明如下:
#$2:IMSI $3:MDN $4:MEID $5:DestinationIP $6:DestinationPort $7:SourceIP
#$8:SourcePort $9:ProtocolID $10:ServiceType $11:StartTime $12:EndTime
#$13:Duration $14:InputOctets $15:OutputOctets $16:DestinationURL
#2013年6月22日确定后的新数据列
#2:DestinationIP 3:DestinationPort 4:SourceIP 5:SourcePort 6:ProtocolID 7:ServiceType 
#8:StartTime 9:EndTime 10:Duration 11:InputOctets 12:OutputOctets 13:DestinationURL 
#14:DomainName 15:Host 
#作者:sunf

#邮箱:6905728#qq.com

#转载说明出处

################################################################

#!/bin/sh
# Classify pipe-delimited traffic records from ./*.txt in a single awk pass.
#
# Fields used (per the column list in the file header):
#   $1  record sequence number      $6  ProtocolID
#   $2..$5 IP/port columns          $7  ServiceType
#   $13 DestinationURL              $14 DomainName
#
# Outputs (all appended, all under ../result):
#   url_data/<host>_url.txt            URLs still to be analysed
#   url_result/<host>_noice.txt        records judged to be noise (flag 1)
#   url_result/<host>_not_noice.txt    non-noise, non-URL records (flag 0)
#   app_result/<host>_app_a19.txt      mail protocols (3,4,5)
#   app_result/<host>_app_result.txt   FTP/MMS (6,8)
#   app_data/<host>_app_data.txt       RTSP (7)
#   log_out.log                        start/end timestamps
#
# NOTE(review): strftime() is a gawk/mawk extension; plain POSIX awk lacks it.

# awk aborts on ">>" into a missing directory, so create the tree up front.
mkdir -p ../result/url_data ../result/url_result \
         ../result/app_result ../result/app_data || exit 1

awk -F '|' '
# emit_url: write one line to the url_data file.
#   id        record sequence number
#   full_url  URL to report (already prefixed with http:// where needed)
#   parts     the URL split on "/"
#   n         number of elements in parts
#   base      index in parts that holds host[:port]
#   rest      path portion following "host/"
# depth = n - base is the count of path segments after the host. The last
# segment is treated as the file, so one directory is reported for depth 2,
# two for depth 3 or more, and none otherwise.
function emit_url(id, full_url, parts, n, base, rest,    hostport, depth, dir1, dir2) {
    # Drop an optional ":port" suffix from the host.
    split(parts[base], hostport, ":")
    depth = n - base
    dir1 = ""
    dir2 = ""
    if (depth >= 2)
        dir1 = parts[base + 1]
    if (depth >= 3)
        dir2 = parts[base + 1] "/" parts[base + 2]
    print id, $6, $7, full_url, hostport[1], dir1, dir2, rest >> url_out
}

BEGIN {
    # The host name gives every machine its own set of output files.
    "hostname" | getline node
    close("hostname")

    OFS = "|"
    is_null = ""
    app_flag = "A1999000001"

    # Rotate the main outputs every split_every input records
    # (the original note estimates 56000000 records at roughly 2G).
    split_every = 56000000

    # ServiceType codes that mark an application record (substring match).
    svc_re = "199|299|399|401|501|699|799|899|999|A99|B99|C99|D99|E99|F01|A01|F02|A02"

    url_out        = "../result/url_data/"   node "_url.txt"
    noise_out      = "../result/url_result/" node "_noice.txt"
    clean_out      = "../result/url_result/" node "_not_noice.txt"
    a19_out        = "../result/app_result/" node "_app_a19.txt"
    app_result_out = "../result/app_result/" node "_app_result.txt"
    app_data_out   = "../result/app_data/"   node "_app_data.txt"

    print "......解析文件开始........"  strftime("%Y-%m-%d %H:%M:%S") >> "../result/log_out.log"
}

END {
    print "......解析文件结束........DONE"  strftime("%Y-%m-%d %H:%M:%S") >> "../result/log_out.log"
}

{
    # File rotation: close the current files before switching names so the
    # number of open descriptors stays constant.
    if (NR % split_every == 0) {
        close(url_out)
        close(noise_out)
        close(clean_out)
        close(a19_out)
        url_out   = "../result/url_data/"   node "_" NR "_url.txt"
        noise_out = "../result/url_result/" node "_" NR "_noice.txt"
        clean_out = "../result/url_result/" node "_" NR "_not_noice.txt"
        a19_out   = "../result/app_result/" node "_" NR "_app_a19.txt"
    }

    id = $1

    # Noise: the URL references a static resource, or a mandatory column is
    # empty. In awk "\b" is a backspace, not a word boundary, so the
    # boundary for .js/.fla/.flv is spelled out explicitly.
    if (match(tolower($13), /\.(js|fla|flv)([^[:alnum:]_]|$)|\.(img|inf|dat|dwr|mp4|cmr|asm|cfg|amr|war|tdz|md5|jar|cmd|gif|png|jpeg|bmp|def|jpg|css|ico|cur|swf|txt|avi|xml|zip|cab|crl|mp3|tpt|fcg|lrc|action|rar|m4a|idx|exe|dll|ini|vbs|doc)/) != 0 ||
        length($2) == 0 || length($3) == 0 || length($4) == 0 ||
        length($5) == 0 || length($6) == 0) {
        print id, $13, $6, $7, 1, is_null >> noise_out
        next
    }

    # Protocol 1 or 2: HTTP or WAP.
    if ($6 == 1 || $6 == 2) {
        # A record without a URL is noise.
        if (length($13) == 0) {
            print id, $13, $6, $7, 1, is_null >> noise_out
            next
        }

        # A field can never contain the "|" separator, so the original
        # pipe-to-comma substitution on $13 was a no-op and is omitted.
        n = split($13, url, "/")
        scheme = tolower(url[1])

        if (scheme ~ /^https?:$/) {
            # Absolute URL: parts are scheme, "", host, path segments.
            # The path starts after scheme + "//" + host + "/".
            emit_url(id, $13, url, n, 3, substr($13, length(url[3]) + length(scheme) + 4))
        } else if (substr($13, 1, 1) == "/") {
            # Path only: prepend DomainName to obtain host/path.
            new_url = $14 $13
            n = split(new_url, url, "/")
            emit_url(id, "http://" $14 $13, url, n, 1, substr(new_url, length(url[1]) + 2))
        } else {
            # host/path without a scheme.
            emit_url(id, "http://" $13, url, n, 1, substr($13, length(url[1]) + 2))
        }
        next
    }

    # Remaining protocols: emit an application line when the service type
    # matches, then always record the line as not-noise.
    if ($7 ~ svc_re) {
        if ($6 == 3 || $6 == 4 || $6 == 5) {
            # 3:SMTP 4:POP3 5:IMAP4
            print id, $13, $6, $7, app_flag, is_null, is_null >> a19_out
        } else if ($6 == 6 || $6 == 8) {
            # 6:FTP 8:MMS
            print id, $13, $6, $7, -1, is_null, is_null >> app_result_out
        } else if ($6 == 7) {
            # 7:RTSP
            print id, $2, $3, $6, $7 >> app_data_out
        }
    }
    print id, $13, $6, $7, 0, is_null >> clean_out
}' *.txt

转载于:https://my.oschina.net/suncf/blog/142594

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值