搜索关键词统计

#!/bin/bash
# Search keyword statistics: analyses the word logs kept on hosts 171 and 173.

sourceDir="/export/manager/kmsearch/log/wordlog"
tmpDateFile="/tmp/search_wordlog_tmp.txt"
tmpSearchWordlog="/tmp/search_wordlog"


# Work out which log files to fetch: one "<YYYY-MM-DD>.txt" name per day in the
# inclusive range [startDate, endDate], written one per line to $tmpDateFile.
startDate="2015-05-04"
endDate="2015-12-31"

# The date arithmetic is done in UTC (-u). Stepping a local-time epoch value by
# 86400 seconds drifts across a daylight-saving change, which would repeat one
# date and drop the last one in timezones that observe DST.
startTimeStamp=$(date -u -d "$startDate" +%s) || {
  echo "invalid startDate: $startDate" >&2
  exit 1
}
endTimeStamp=$(date -u -d "$endDate" +%s) || {
  echo "invalid endDate: $endDate" >&2
  exit 1
}

# The redirection on the loop truncates the list file first, so the file holds
# nothing but file names (no leading blank line).
for ((i = startTimeStamp; i <= endTimeStamp; i += 86400)); do
  date -u -d "@$i" "+%Y-%m-%d.txt"
done > "$tmpDateFile"

# Download every listed daily log from both search hosts.
# Each host is named by the last octet of its 10.15.200.x address; that octet
# is also the name of the local sub-directory its files are copied into.
for hostId in 171 173; do
  echo "downloading from $hostId..."
  destDir="$tmpSearchWordlog/$hostId"

  # scp does not create a missing destination directory, so make sure it
  # exists before the first copy.
  mkdir -p -- "$destDir" || exit 1

  # The list is read on fd 3 so that scp/ssh keep the script's own stdin.
  while IFS= read -r tmpStr <&3; do
    [ -n "$tmpStr" ] || continue   # tolerate blank lines in the list
    # A failed copy (e.g. no log for that day) is reported by scp itself and
    # does not stop the remaining downloads.
    scp "root@10.15.200.$hostId:$sourceDir/$tmpStr" "$destDir/"
  done 3< "$tmpDateFile"
done


# Concatenate all downloaded logs into a single file, day by day, with the
# copy from host 171 ahead of the copy from host 173 for each day.
echo "combine all data... "

# The output redirection on the loop truncates alldata.txt before anything is
# written. The file is deliberately not seeded with an empty line: that line
# would flow into the statistics as a bogus empty record.
while IFS= read -r tmpStr; do
  [ -n "$tmpStr" ] || continue   # tolerate blank lines in the list
  # cat reports a missing file on stderr and still copies the other one.
  cat -- "$tmpSearchWordlog/171/$tmpStr" "$tmpSearchWordlog/173/$tmpStr"
done < "$tmpDateFile" > "$tmpSearchWordlog/alldata.txt"


#统计 - all
#cat $tmpSearchWordlog/alldata.txt | awk -F ',' '{print $2","$6}' | sort | uniq | awk -F ',' '{print $1}'| sort | uniq -c | sort -rn | head -100 | awk '{print $1"\t"$2" "$3}' > $tmpSearchWordlog/allTop.txt
#exit


# Split the combined log into chunks of 3,000,000 lines (under 200 MB each,
# per the original note) and compute a per-chunk top list in parallel.

# Abort if the working directory is unset or cannot be entered: the cleanup
# below deletes files relative to the current directory.
cd -- "${tmpSearchWordlog:?tmpSearchWordlog is not set}" || exit 1

# Remove chunks and per-chunk results left over from a previous run.
find . -name 'part.alldata.txt*' -exec rm -rf -- {} +

split -l 3000000 alldata.txt part.alldata.txt || {
  echo "cannot split alldata.txt" >&2
  exit 1
}

# The glob is expanded once, before any *_Tops.txt file exists, so the result
# files written below are never picked up as input.
for tmpStr in ./part.alldata.txt*; do
  [ -e "$tmpStr" ] || continue   # no chunk was produced
  # Per chunk: keep comma-separated fields 2 and 6, drop duplicate pairs, then
  # count how many distinct field-6 values each field-2 value has, and keep
  # the 900 highest counts as "count<TAB>word1 word2".
  # NOTE(review): field 2 is presumably the keyword and field 6 a
  # user/session id - confirm against the log format.
  awk -F ',' '{print $2","$6}' "$tmpStr" \
    | sort -u \
    | awk -F ',' '{print $1}' \
    | sort \
    | uniq -c \
    | sort -rn \
    | head -n 900 \
    | awk '{print $1"\t"$2" "$3}' > "${tmpStr}_Tops.txt" &
done

# Block until every background pipeline has finished; a fixed sleep could
# return while chunks are still being processed.
echo 'waiting for background jobs...'
wait

#######################################
# Merge the per-chunk top lists found under the current directory into one
# overall keyword ranking.
# Inputs:  ./**/part.alldata.txt*_Tops.txt, lines of "count<TAB>keyword"
#          (the keyword may contain a space and may carry trailing blanks).
# Outputs: partsAllTops.txt  - "keyword<TAB>count", lower-cased and sorted
#          statistic.result  - "total<TAB>keyword", highest total first,
#                              without keywords containing "www." or "http:"
#######################################
merge_part_tops() {
  # Gather all per-chunk results. Fields are split on TAB only, so a keyword
  # made of two words stays intact instead of being cut to its first word.
  # Trailing blanks are stripped and empty keywords are skipped.
  find . -name 'part.alldata.txt*_Tops.txt' -exec cat -- {} + \
    | tr 'A-Z' 'a-z' \
    | awk -F '\t' '{
        kw = $2
        sub(/[ \t]+$/, "", kw)
        if (kw != "") print kw "\t" $1
      }' \
    | sort > partsAllTops.txt

  # Keyword statistics: sum the counts per keyword, rank by total, and drop
  # URL-like entries. -F makes "www." a literal string rather than a regex in
  # which "." matches any character.
  awk -F '\t' '
    { total[$1] += $2 }
    END { for (kw in total) print total[kw] "\t" kw }
  ' partsAllTops.txt \
    | sort -rn \
    | grep -vF -e 'www.' -e 'http:' > statistic.result
}

merge_part_tops

 

转载于:https://www.cnblogs.com/bandbandme/p/5156947.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值