介绍
- awk是一个文本处理工具,通常用于处理数据并生成结果报告
- awk的命名由它的三位创始人 Alfred Aho、Peter Weinberger 和 Brian Kernighan 姓氏的首字母组成
语法格式
- 第一种形式:awk 'BEGIN{}pattern{commands}END{}' file_name
- 第二种形式:standard output | awk 'BEGIN{}pattern{commands}END{}'
语法格式说明
语法格式 | 解释 |
---|---|
BEGIN{} | 正式处理数据之前执行 |
pattern | 匹配模式 |
{commands} | 处理命令,可能多行 |
END{} | 处理完所有匹配数据后执行 |
内置变量对照表
内置变量 | 含义 |
---|---|
$0 | 当前行的整行内容 |
$1-$n | 当前行的第1-n个字段 |
NF (number field) | 当前行的字段个数,也就是有多少列 |
NR (number row) | 当前行的行号,从1开始计数 |
FNR (file number row) | 多文件处理时,每个文件行号单独计数,都是从1开始 |
FS (field separator) | 输入字段分隔符。不指定默认以空格或tab键分割 |
RS (row separator) | 输入行分隔符。默认回车换行 |
OFS (output field separator) | 输出字段分隔符。默认为空格 |
ORS (output row separator) | 输出行分隔符。默认为回车换行 |
FILENAME | 处理文件的文件名 |
ARGC () | 命令行参数个数 |
ARGV () | 命令行参数数组 |
awk '{print}' /etc/passwd
awk 'BEGIN{FS=":"}{print $1} ' /etc/passwd
awk '{print NF} ' /etc/passwd
awk 'BEGIN{FS=":"}{print NF} ' /etc/passwd
awk '{print NR} ' /etc/passwd
awk '{print NR} ' /etc/passwd /etc/passwd
awk '{print FNR} ' /etc/passwd /etc/passwd
awk 'BEGIN{FS=":"}{print $7} ' /etc/passwd
awk 'BEGIN{FS=":"}{print $NF} ' /etc/passwd
printf 的格式说明符
格式符 | 含义 |
---|---|
%s | 打印字符串 |
%d | 打印十进制数 |
%f | 打印一个浮点数 |
%x | 打印十六进制数 |
%o | 打印八进制数 |
%e | 打印数字的科学计数方式 |
%c | 打印单个字符;参数为数字时输出该ASCII码对应的字符 |
printf 的修饰符
修饰符 | 含义 |
---|---|
- | 左对齐 |
+ | 显示数值的正负号(不加任何修饰符时默认即为右对齐) |
# | 显示8进制时在前面加0,显示16进制时在前面加0x |
awk 'BEGIN{FS=":"}{printf "%s\n",$1} ' /etc/passwd
awk 'BEGIN{FS=":"}{printf "%20s%20s\n",$1,$7} ' /etc/passwd
awk 'BEGIN{FS=":"}{printf "%-20s%-20s\n",$1,$7} ' /etc/passwd
格式符示例
awk 'BEGIN{FS=":"}{printf "%s\n",$7}' /etc/passwd
awk 'BEGIN{FS=":"}{printf "%d\n",$3}' /etc/passwd
awk 'BEGIN{FS=":"}{printf "%f\n",$3}' /etc/passwd
awk 'BEGIN{FS=":"}{printf "%0.2f\n",$3}' /etc/passwd
awk 'BEGIN{FS=":"}{printf "%x\n",$3}' /etc/passwd
awk 'BEGIN{FS=":"}{printf "%#x\n",$3}' /etc/passwd
awk 'BEGIN{FS=":"}{printf "%o\n",$3}' /etc/passwd
awk 'BEGIN{FS=":"}{printf "%#o\n",$3}' /etc/passwd
awk 'BEGIN{FS=":"}{printf "%e\n",$3}' /etc/passwd
模式匹配的两种方法
- 第一种模式匹配 :RegExp
- 第二种模式匹配 :关系运算符匹配
1.RegExp
awk 'BEGIN{FS=":"} /root/ {print $0}' /etc/passwd
awk 'BEGIN{FS=":"} /^root/ {print $0}' /etc/passwd
2.运算符匹配
关系运算符匹配
关系运算符 | 含义 |
---|---|
< | 小于 |
> | 大于 |
<= | 小于等于 |
>= | 大于等于 |
== | 等于 |
!= | 不等于 |
~ | 匹配正则表达式 |
!~ | 不匹配正则表达式 |
awk 'BEGIN{FS=":"} $3<50 {print $0}' /etc/passwd
awk 'BEGIN{FS=":"} $3>50 {print $0}' /etc/passwd
awk 'BEGIN{FS=":"} $7=="/bin/bash" {print $0}' /etc/passwd
awk 'BEGIN{FS=":"} $7!="/bin/bash" {print $0}' /etc/passwd
awk 'BEGIN{FS=":"}$3~/[0-9]{3,}/{print $0}' /etc/passwd
布尔运算符匹配
awk 'BEGIN{FS=":"} /hdfs/||/yarn/ {print $0}' /etc/passwd
awk 'BEGIN{FS=":"} $3<50&&$4>50 {print $0}' /etc/passwd
awk动作表达式中的算术运算符
运算符 | 含义 |
---|---|
+ | 加 |
- | 减 |
* | 乘 |
/ | 除 |
% | 模 |
^ 或** | 乘方 |
++X | 在返回X变量之前,X变量加1 |
X++ | 在返回X变量之后,X变量加1 |
awk 'BEGIN{var1=20;var2="hello";print var1,var2}'
awk 'BEGIN{num1=20;num2+=num1;print num1,num2}'
awk 'BEGIN{num1=20;num2=num2+num1;print num1,num2}'
awk 'BEGIN{num1=20;num2=20;print num1,num2}'
awk 'BEGIN{num1=20;num2=30;print num1+num2}'
awk 'BEGIN{num1=20;num2=30;print num1-num2}'
awk 'BEGIN{num1=20;num2=30;print num1*num2}'
awk 'BEGIN{num1=20;num2=30;print num1/num2}'
awk 'BEGIN{num1=20;num2=30;printf "%0.2f\n",num1/num2}'
awk 'BEGIN{num1=20;num2=2;print num1^num2}'
awk 'BEGIN{num1=20;num2=2;print num1**num2}'
awk 'BEGIN{num1=20;num2=num1++;print num1,num2}'
awk 'BEGIN{num1=20;num2=++num1;print num1,num2}'
awk '/^$/{sum++}END{print sum}' /etc/services
17
Allen 80 90 96 98
Mike 93 98 92 91
Zhang 78 76 87 92
Jerry 86 89 68 92
Han 85 95 75 90
Li 78 88 98 100
awk '{total=$2+$3+$4+$5;AVG=total/4;printf "%-8s%-5d%-5d%-5d%-5d%0.2f\n",$1,$2,$3,$4,$5,AVG}' student.txt
Allen   80   90   96   98   91.00
awk 'BEGIN{printf "%-8s%-8s%-8s%-8s%-8s%s\n","姓名","语文","数学","英语","物理","平均分"}{total=$2+$3+$4+$5;AVG=total/4;printf "%-8s%-8d%-8d%-8d%-8d%0.2f\n",$1,$2,$3,$4,$5,AVG}' student.txt
awk中的条件及循环语句
条件语句:
if (条件表达式)
动作1
else if (条件表达式)
动作2
else
动作3
循环语句:
while 循环:
while (条件表达式)
动作
do while 循环:
do
动作
while (条件表达式)
for 循环:
for (初始化计数器;计数器测试;计数器变更)
动作
awk 'BEGIN{FS=":"}{if($3>50 && $3<100) print $0}' /etc/passwd
awk中的字符串函数
字符串函数对照表
函数名 | 解释 | 函数返回值 |
---|---|---|
length(str) | 计算字符串长度 | 整数长度值 |
index(str1,str2) | 在str1中查找str2的位置 | 返回值为位置索引,从1计数 |
tolower(str) | 转换为小写 | 转换后的小写字符串 |
toupper(str) | 转换为大写 | 转换后的大写字符串 |
substr(str,m,n) | 从str的第m个字符开始,截取n位 | 截取后的子串 |
split(str,arr,fs) | 按fs切割字符串,结果保存arr | 切割后的子串的个数 |
match(str,RE) | 在str中按照RE查找,返回位置 | 返回索引位置 |
sub(RE,RepStr,str) | 在str中搜索符合RE的子串,将其替换为RepStr,只替换第一个 | 替换的个数 |
gsub(RE,RepStr,str) | 在str中搜索符合RE的子串,将其替换为RepStr,替换所有 | 替换的个数 |
awk 'BEGIN{FS=":"}
{for(i=1;i<=NF;i++){
if(i!=NF){
printf "%d:",length($i)
}else{
printf "%d",length($i)
}
}
print ""
}' /etc/passwd
awk 'BEGIN{str="I have a dream";location=index(str,"ea");print location}'
awk 'BEGIN{str="Hadoop is a bigdata Framework";print tolower(str)}'
awk 'BEGIN{str="Hadoop is a bigdata Framework";print toupper(str)}'
awk 'BEGIN{str="Hadoop Kafka Spark Storm HDFS YARN Zookeeper";split(str,arr," ");print arr[1]}'
awk 'BEGIN{str="Hadoop Kafka Spark Storm HDFS YARN Zookeeper";split(str,arr," ");for(a in arr){print arr[a]}}'
awk 'BEGIN{str="Tranction 2345 Start:Select * from master";printf "%d",match(str,"[0-9]")}'
awk 'BEGIN{str="transaction start";print substr(str,4,5)}'
awk 'BEGIN{str="ranction 243 Start,Event ID:9002";print sub(/[0-9]+/,"$",str)}'
awk 'BEGIN{str="ranction 243 Start,Event ID:9002";sub(/[0-9]+/,"$",str);print str}'
awk中的常用选项
awk选项总结
选项 | 解释 |
---|---|
-v | 参数传递 (定义或引用变量) |
-f | 指定脚本文件 |
-F | 指定分隔符 |
-V | 查看awk版本号 |
num1=20
var="hello world"
awk -v num2=$num1 -v var1="$var" 'BEGIN{print num2,var1}'
awk -F ":" '{print $7}' /etc/passwd
shell中数组的用法
- array=("Allen" "Mike" "Messi" "Jerry" "Hanmeimei" "Wang")
选项 | 解释 |
---|---|
打印元素 | echo ${array[2]} |
打印元素个数 | echo ${#array[@]} |
打印元素长度 | echo ${#array[3]} |
给元素赋值 | array[3]="Chen" |
删除元素 | unset array[2];unset array |
分片访问 | echo ${array[@]:1:3} |
元素内容替换 | ${array[@]/e/E} 只替换第一个e;${array[@]//e/E} 替换所有的e |
数组的遍历:
for a in ${array[@]}
do
echo $a
done
netstat -an|grep tcp|awk '{array[$6]++}END{for(a in array) print a,array[a]}'
结果:
LISTEN 12
ESTABLISHED 1
Allen 80 90 87 91
Mike 78 86 93 96
Kobe 66 92 82 78
Jerry 98 74 66 54
Wang 87 21 100 43
awk 'BEGIN{printf "%-10s%-10s%-10s%-10s%-10s%-10s\n","Name","Yuwen","Math","English","Physical","Total"}
{total=$2+$3+$4+$5
yuwen_sum+=$2
math_sum+=$3
eng_sum+=$4
phy_sum+=$5
printf "%-10s%-10d%-10d%-10d%-10d%-10d\n",$1,$2,$3,$4,$5,total}
END{printf "%-10s%-10d%-10d%-10d%-10d\n","",yuwen_sum,math_sum,eng_sum,phy_sum}' student.txt
模拟生产环境数据脚本
db.log.20190608 数据如下:
2019-06-08 10:31:40 15459 Batches: user Jerry insert 5504 records into datebase:product table:detail, insert 5253 records successfully,failed 251 records
2019-06-08 10:31:40 15460 Batches: user Tracy insert 25114 records into datebase:product table:detail, insert 13340 records successfully,failed 11774 records
2019-06-08 10:31:40 15461 Batches: user Hanmeimei insert 13840 records into datebase:product table:detail, insert 5108 records successfully,failed 8732 records
2019-06-08 10:31:40 15462 Batches: user Lilei insert 32691 records into datebase:product table:detail, insert 5780 records successfully,failed 26911 records
2019-06-08 10:31:40 15463 Batches: user Allen insert 25902 records into datebase:product table:detail, insert 14027 records successfully,failed 11875 records
exam1.awk
BEGIN{
printf "%-20s%-20s\n","User","Total records"
}
{
USER[$6]+=$8
}
END{
for(u in USER)
printf "%-20s%-20d\n",u,USER[u]
}
awk -f exam1.awk db.log.20190608
exam2.awk
BEGIN{
printf "%-30s%-30s%-30s\n","User","Success records","Failed records"
}
{
SUCCESS[$6]+=$14
FAILED[$6]+=$17
}
END{
for(u in SUCCESS)
printf "%-30s%-30d%-30d\n",u,SUCCESS[u],FAILED[u]
}
awk -f exam2.awk db.log.20190608
exam3.awk
BEGIN{
printf "%-30s%-30s%-30s%-30s\n","Name","total records","success records","failed records"
}
{
TOTAL_RECORDS[$6]+=$8
SUCCESS[$6]+=$14
FAILED[$6]+=$17
}
END{
for(u in TOTAL_RECORDS)
printf "%-30s%-30d%-30d%-30d\n",u,TOTAL_RECORDS[u],SUCCESS[u],FAILED[u]
}
awk -f exam3.awk db.log.20190608
exam4_b.awk
BEGIN{
printf "%-30s%-30s%-30s%-30s\n","Name","total records","success records","failed records"
}
{
TOTAL_RECORDS[$6]+=$8
SUCCESS[$6]+=$14
FAILED[$6]+=$17
}
END{
for(u in TOTAL_RECORDS)
{
records_sum+=TOTAL_RECORDS[u]
success_sum+=SUCCESS[u]
failed_sum+=FAILED[u]
printf "%-30s%-30d%-30d%-30d\n",u,TOTAL_RECORDS[u],SUCCESS[u],FAILED[u]
}
printf "%-30s%-30d%-30d%-30d\n","",records_sum,success_sum,failed_sum
}
awk -f exam4_b.awk db.log.20190608
方法2:
exam4.awk
BEGIN{
printf "%-30s%-30s%-30s%-30s\n","Name","total records","success records","failed records"
}
{
RECORDS[$6]+=$8
SUCCESS[$6]+=$14
FAILED[$6]+=$17
records_sum+=$8
success_sum+=$14
failed_sum+=$17
}
END{
for(u in RECORDS)
printf "%-30s%-30d%-30d%-30d\n",u,RECORDS[u],SUCCESS[u],FAILED[u]
printf "%-30s%-30d%-30d%-30d\n","total",records_sum,success_sum,failed_sum
}
awk '{if($8!=$14+$17) print NR,$0}' db.log.20190608
写入文件的方式
exam5.awk
BEGIN{
}
{
if($8!=$14+$17)
print NR,$0
}
awk -f exam5.awk db.log.20190608