python多线程批量过滤文件关键字

基于上一篇博客中的问题,我尝试用 Python 将其改成多线程运行:https://blog.csdn.net/linxi7/article/details/81317704

#!/usr/bin/env python
import datetime
import os
import socket
import sys
import threading
import time

try:
    import commands  # Python 2 only
except ImportError:
    # Python 3 removed `commands`; subprocess.getoutput is its replacement
    # and has the same contract, so it is bound to the same name.
    import subprocess as commands

# Root directory that `find` searches for the dated source files.
OSS_DIR = "/data/test"


def _dated_name(days_back):
    """Return the ``t_YYYYMMDD`` name for the local date `days_back` days before now."""
    moment = datetime.datetime.now() + datetime.timedelta(days=-days_back)
    return "t_%s" % moment.strftime("%Y%m%d")


# File-name prefix for today's files.
test_name = "t_%s" % time.strftime("%Y%m%d")
# NOTE: apart from LAST_DAY, each offset is one less than the number in the
# variable name (e.g. "3_DAY_AGO" is two days back).
test_name_LAST_DAY = _dated_name(1)
test_name_3_DAY_AGO = _dated_name(2)
test_name_7_DAY_AGO = _dated_name(6)
test_name_14_DAY_AGO = _dated_name(13)
# Today's date, compared against the mtime of existing result files.
time_of_system = time.strftime("%Y-%m-%d")

def exec_commands(cmd):
    """Run `cmd` through the shell and return its output as a string.

    Uses ``commands.getoutput`` on Python 2 and falls back to
    ``subprocess.getoutput`` on Python 3, where the ``commands`` module no
    longer exists. Both run the string in a shell and return the captured
    output with one trailing newline removed.

    Args:
        cmd: shell command line to execute.

    Returns:
        The command's output text.
    """
    try:
        import commands as shell_runner  # Python 2
    except ImportError:
        import subprocess as shell_runner  # Python 3 replacement
    return shell_runner.getoutput(cmd)

def travel_files(keyword, files, output_file):
    """Write the second '|'-separated field of every line containing `keyword`.

    Each path in `files` is read in full. Every line that contains `keyword`
    contributes one line to `output_file`: the text between the first and
    second '|', or an empty line when the input line has no '|'.

    The field is extracted in-process rather than by piping each line through
    ``echo | awk`` in a shell, so quotes, ``$`` or backticks in the data
    cannot break or inject into a shell command, and no subprocess is spawned
    per matching line.

    Args:
        keyword: substring to look for in each line.
        files: iterable of paths of the files to scan.
        output_file: writable file-like object; it is not closed here.
    """
    for single_file in files:
        # The context manager closes each input file as soon as it is read.
        with open(single_file) as handle:
            content = handle.read().rstrip()
        for everyline in content.split('\n'):
            if keyword not in everyline:
                continue
            fields = everyline.split('|')
            # Mirrors awk's `$2`: empty when the line has no second field.
            userid = fields[1] if len(fields) > 1 else ''
            output_file.write(userid + '\n')

def get_action(filename, oss_file_name_date, filter_word):
    """Start a background scan that rebuilds one result file, unless it is current.

    The result file is named ``<filename>_<hostname>.txt``. If it already
    exists and was last modified today (compared with `time_of_system`),
    nothing is done. Otherwise it is recreated and a thread is started that
    runs `travel_files` over every file under `OSS_DIR` whose name starts
    with `oss_file_name_date`, writing the matches into the result file.

    Args:
        filename: prefix of the result file name.
        oss_file_name_date: name prefix of the source files to scan.
        filter_word: keyword handed to `travel_files`.
    """
    register_file = "%s_%s.txt" % (filename, socket.gethostname())

    if os.path.exists(register_file):
        file_modify_time = time.strftime("%Y-%m-%d", time.localtime(os.stat(register_file).st_mtime))
    else:
        file_modify_time = ''

    # A result file written today is up to date; skip before running `find`.
    if file_modify_time == time_of_system:
        return

    cmd_date = "find %s -type f -name '%s*'" % (OSS_DIR, oss_file_name_date)
    # `find` prints one path per line, so split on lines to keep paths that
    # contain spaces intact. The isfile check drops any error text that
    # `find` wrote to stderr, which the command runner captures as well.
    file_date = [path for path in exec_commands(cmd_date).splitlines()
                 if os.path.isfile(path)]

    if os.path.exists(register_file):
        os.remove(register_file)
    output_file = open(register_file, 'w+')

    def _scan_and_close():
        # Runs in the worker thread. Closing here flushes the results and
        # releases the handle once the scan ends, even if it fails.
        try:
            travel_files(filter_word, file_date, output_file)
        finally:
            output_file.close()

    t = threading.Thread(target=_scan_and_close)
    try:
        t.start()
    except RuntimeError:
        # The thread never started, so nobody else will close the file.
        output_file.close()
        raise

    # Throttle: wait while more than `select_number` threads are alive.
    select_number = 600
    while threading.active_count() > select_number:
        time.sleep(1)

if __name__ == '__main__':
    # Each entry: (result-file prefix, dated source-file prefix, keyword).
    # Jobs are started one after another in this order.
    jobs = (
        # registrations on earlier days
        ("register_file_1_day_ago", test_name_LAST_DAY, "LOGID_ACNT_REGISTER"),
        ("register_file_3_day_ago", test_name_3_DAY_AGO, "LOGID_ACNT_REGISTER"),
        ("register_file_7_day_ago", test_name_7_DAY_AGO, "LOGID_ACNT_REGISTER"),
        ("register_file_14_day_ago", test_name_14_DAY_AGO, "LOGID_ACNT_REGISTER"),
        # today's logins, logouts and registrations
        ("login_file", test_name, "LOGID_ACNT_LOGIN"),
        ("logout_file", test_name, "LOGID_ACNT_LOGOUT"),
        ("today_register_file", test_name, "LOGID_ACNT_REGISTER"),
    )
    for result_prefix, dated_prefix, keyword in jobs:
        get_action(result_prefix, dated_prefix, keyword)

该脚本先查找指定目录下符合时间条件的文件,然后对这些文件进行批量关键字过滤;采用多线程的方式,相比串行执行可以缩短运行时间。

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值