shell在变量中使用awk命令查找两个数组的差集

最新推荐文章于 2024-03-17 22:05:52 发布

牧锋

最新推荐文章于 2024-03-17 22:05:52 发布

阅读量3.4k

点赞数

分类专栏： Shell awk

本文链接：https://blog.csdn.net/weixin_40245633/article/details/88353311

版权

Shell 同时被 2 个专栏收录

4 篇文章 0 订阅

订阅专栏

awk

1 篇文章 0 订阅

订阅专栏

生产环境中需要获取源端和目标端已有文件的差异情况。因为在 docker 中运行，要求尽量避免落地文件，所以最初采用了比较暴力的比对方式：

# Brute-force set difference: append to ${str} every entry of ${source}
# for which no line of ${local} ends with that entry.
# Inputs : ${source}, ${local} - lists expanded with `echo -e`.
# Output : ${str} - space-separated entries of ${source} not found in ${local}.
# NOTE(review): ${i} is still interpreted by grep as a regular expression
# anchored at end of line (same as before); use `grep -F -x` instead if an
# exact, literal match is what is wanted.
for i in $(echo -e "${source}")
do
    # Quote the pattern so an entry is never word-split or glob-expanded by
    # the shell, and pass it with -e so a leading '-' is not read as an option.
    j=$(echo -e "${local}" | grep -e "${i}\$")
    if [[ -z "${j}" ]]
    then
       str="${str} ${i}"
    fi
done

对3000个文件的比较需要耗时72秒以上

每个文件都要启动一次 grep 进程并重新扫描整个列表，执行效率明显堪忧，所以考虑改用 awk 命令一次性查找差集来优化：

# Print every entry of ${source} that does not appear in ${local}
# (set difference source - local), one entry per line, in unspecified order.
# The two shell variables are handed to awk with -v; all work happens in a
# BEGIN block, so awk reads no input and needs no `echo |` in front of it.
awk -v source_list="${source}" -v local_list="${local}" '
BEGIN {
    # A single-space separator makes split() break on runs of whitespace.
    split(source_list, arr_source, " ")
    split(local_list, arr_local, " ")

    # Index the source entries by name so they can be removed by key.
    for (i in arr_source) {
        arr_diff[arr_source[i]] = arr_source[i]
    }

    # Drop every name that also exists locally. The "in" operator tests
    # membership without creating the element; comparing arr_diff[key]
    # against "" would insert an empty entry that is later printed as a
    # blank line.
    for (i in arr_local) {
        if (arr_local[i] in arr_diff) {
            delete arr_diff[arr_local[i]]
        }
    }

    # Whatever is left exists only in the source list.
    for (i in arr_diff) {
        print arr_diff[i]
    }
}'

1、awk中首先定义两个变量，接受shell中的变量

2、使用split函数将字符串拆分成数组

3、复制源端的列表数组，存入一个以文件名为键来存取的数组arr_diff（arr_diff["20190303abcd"]="20190303abcd"）

4、遍历本地列表数组，如果文件名在arr_diff数组中出现，则用delete函数删除这个元素

5、输出差集

附上GNU下载的grep命令源码，下载地址：http://mirrors.ustc.edu.cn/gnu/

/* Scan the input behind descriptor FD, buffer by buffer, handing each run
   of complete lines to grepbuf and accumulating its return values.

   FD    - descriptor handed to reset (); closed here only on the
           directory-recursion path.
   FILE  - name used in diagnostics and passed to grepdir; may be null
           (it is tested before the directory check).
   STATS - passed through to reset, fillbuf and grepdir; its stat.st_mode
           is inspected for the directory check.

   Returns 0 when reset () fails, when the first fillbuf () fails, or when
   the data looks binary and binary_files is WITHOUT_MATCH_BINARY_FILES;
   returns grepdir (file, stats) - 2 for a directory under
   RECURSE_DIRECTORIES; otherwise returns the accumulated grepbuf total.

   Side effects: resets the file-scope counters totalcc, lastout, totalnl,
   outleft, after_last_match and pending, and temporarily adjusts
   done_on_match and out_quiet (restored at finish_grep).
   NOTE(review): the exact meaning of the file-scope variables and of the
   helpers (reset, fillbuf, grepbuf, prpending, nlscan) is defined outside
   this view - confirm against their definitions.  */
static int
grep (int fd, char const *file, struct stats *stats)
{
  int nlines, i;
  int not_text;
  size_t residue, save;
  char oldc;
  char *beg;
  char *lim;
  char eol = eolbyte;

  if (!reset (fd, file, stats))
    return 0;

  if (file && directories == RECURSE_DIRECTORIES
      && S_ISDIR (stats->stat.st_mode))
    {
      /* Close fd now, so that we don't open a lot of file descriptors
	 when we recurse deeply.  */
      if (close (fd) != 0)
	error (0, errno, "%s", file);
      return grepdir (file, stats) - 2;
    }

  /* Start this input with fresh per-file state.  */
  totalcc = 0;
  lastout = 0;
  totalnl = 0;
  outleft = max_count;
  after_last_match = 0;
  pending = 0;

  nlines = 0;
  residue = 0;
  save = 0;

  /* First read.  A failure is reported unless is_EISDIR says otherwise.
     NOTE(review): the message uses the file-scope `filename', not the
     FILE parameter - presumably they name the same input; confirm.  */
  if (! fillbuf (save, stats))
    {
      if (! is_EISDIR (errno, file))
	suppressible_error (filename, errno);
      return 0;
    }

  /* Binary detection on the first buffer only: look for a '\0' byte (or
     '\200' when the line terminator itself is zero), and only in the
     binary_files modes where the answer changes behavior.  */
  not_text = (((binary_files == BINARY_BINARY_FILES && !out_quiet)
	       || binary_files == WITHOUT_MATCH_BINARY_FILES)
	      && memchr (bufbeg, eol ? '\0' : '\200', buflim - bufbeg));
  if (not_text && binary_files == WITHOUT_MATCH_BINARY_FILES)
    return 0;
  /* For binary data, bump both flags by one; the same amount is
     subtracted again at finish_grep.  */
  done_on_match += not_text;
  out_quiet += not_text;

  for (;;)
    {
      lastnl = bufbeg;
      if (lastout)
	lastout = bufbeg;

      /* The first SAVE bytes of the buffer are carried over from the
         previous iteration; new data starts after them.  */
      beg = bufbeg + save;

      /* no more data to scan (eof) except for maybe a residue -> break */
      if (beg == buflim)
	break;

      /* Determine new residue (the length of an incomplete line at the end of
         the buffer, 0 means there is no incomplete last line).  */
      /* Plant EOL just before the new data as a sentinel so the backward
         scan below always terminates, then restore the original byte.  */
      oldc = beg[-1];
      beg[-1] = eol;
      for (lim = buflim; lim[-1] != eol; lim--)
	continue;
      beg[-1] = oldc;
      /* No EOL in the new data: the incomplete line just grows.  */
      if (lim == beg)
	lim = beg - residue;
      beg -= residue;
      residue = buflim - lim;

      if (beg < lim)
	{
	  if (outleft)
	    nlines += grepbuf (beg, lim);
	  if (pending)
	    prpending (lim);
	  /* Stop early when outleft is zero and nothing is pending, or
	     when nlines is nonzero with done_on_match set and out_invert
	     clear.  */
	  if((!outleft && !pending) || (nlines && done_on_match && !out_invert))
	    goto finish_grep;
	}

      /* The last OUT_BEFORE lines at the end of the buffer will be needed as
	 leading context if there is a matching line at the begin of the
	 next data. Make beg point to their begin.  */
      i = 0;
      beg = lim;
      while (i < out_before && beg > bufbeg && beg != lastout)
	{
	  ++i;
	  do
	    --beg;
	  while (beg[-1] != eol);
	}

      /* detect if leading context is discontinuous from last printed line.  */
      if (beg != lastout)
	lastout = 0;

      /* Handle some details and read more data to scan.  */
      /* Keep the incomplete line plus the retained context lines.  */
      save = residue + lim - beg;
      if (out_byte)
	totalcc = add_count (totalcc, buflim - bufbeg - save);
      if (out_line)
	nlscan (beg);
      if (! fillbuf (save, stats))
	{
	  if (! is_EISDIR (errno, file))
	    suppressible_error (filename, errno);
	  goto finish_grep;
	}
    }
  /* Input ended with an unterminated line: append EOL to the buffer and
     scan that last line too.  */
  if (residue)
    {
      *buflim++ = eol;
      if (outleft)
	nlines += grepbuf (bufbeg + save - residue, buflim);
      if (pending)
        prpending (buflim);
    }

 finish_grep:
  /* Undo the binary-data adjustments made before the loop.  */
  done_on_match -= not_text;
  out_quiet -= not_text;
  /* Binary data with a nonzero nlines: print a one-line notice, unless
     out_quiet has its low bit set.  */
  if ((not_text & ~out_quiet) && nlines != 0)
    printf (_("Binary file %s matches\n"), filename);
  return nlines;
}

每一次 grep 都需要把管道中的数据全部读入并逐行查找（循环中每个文件还要重新启动一次 grep 进程），而 awk 的关联数组只需按键查找并删除匹配的元素即可。

牧锋

关注

0
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
shell在变量中使用awk命令查找两个数组的差集

生产环境中需要获取原端和目标端的已由文件的差异情况，因为在docker中运行，要求尽量避免落地文件，所以采用比较暴力的比对方式：for i in `echo -e "${source}"`do j=`echo -e "${local}"|grep ${i}$` if [[ "${j}" == "" ]] then str="${str} ${i}" ...
复制链接

扫一扫

专栏目录