python 处理200个文件的数据，存到文本内

dong95write

于 2021-02-05 16:10:44 发布

阅读量197

点赞数

分类专栏：文件分类、服务器

本文链接：https://blog.csdn.net/m0_46601663/article/details/113698438

版权

python

服务器（同时被 2 个专栏收录）

5 篇文章 0 订阅

订阅专栏

文件分类

3 篇文章 0 订阅

订阅专栏

# -*- coding:utf-8 -*-
import re
import os
import pymysql
from datetime import datetime, date, timedelta
import time
import shutil
import sys
import _thread

# 连接数据库
#def dbh_connect(databaseName):
#    global cur
#    global conn
#    conn = pymysql.connect("127.0.0.1", "root", "1234", databaseName)
#    cur = conn.cursor()

# 省份地域对应关系
#def get_region():
#    global regionRes
#    regionRes = {}
#    cur.execute("select * from table_name")
#    region = cur.fetchall()
#    
#    for key in region:
#        regionRes[key[3]] = key[5]



#拉取文件
def generate_file():
    """Pull the day's data and log the start/finish times to ``<path>/log.txt``.

    Takes no arguments and returns nothing. Reads the module-level names
    ``path``, ``modularName``, ``yesterday`` and ``modular``.

    NOTE(review): the shell commands in this function look like redacted
    placeholders. The ``else`` branch assigns nothing to ``cmd`` (a syntax
    error as written), and the second command string contains no ``%s``
    fields for the two values it is formatted with. Restore the real
    commands before re-enabling this function.
    """
    Speed_of_progress = open('%s/log.txt' % path, 'a', encoding='utf-8')
    Speed_of_progress.write(modularName+yesterday+'日期数据开始拉取:' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + '\n')
    Speed_of_progress.close()

    # Step 1: pull the data onto the cluster.
    if  modular == '4':
        cmd = '大数据表拉取'
        # Re-interprets the UTF-8 bytes of the command as latin1 characters;
        # presumably a workaround for how os.system encodes it - TODO confirm.
        cmd = cmd.encode("utf-8").decode("latin1")
    else:
        # NOTE(review): right-hand side is missing in the original.
        cmd = 
    os.system(cmd)

    # Step 2: copy the data from the cluster to the local directory.
    cmd = "存到集群" % (modularName+yesterday, path)
    os.system(cmd)

    Speed_of_progress = open('%s/log.txt' % path, 'a', encoding='utf-8')
    Speed_of_progress.write(modularName+yesterday+'日期数据拉取成功:' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + '\n')
    Speed_of_progress.close()




def combine_conditions(input_elms):
    """Return every order-preserving combination of ``input_elms`` joined by '&'.

    Multi-element combinations come first, grouped by their leading element
    in input order; the original single elements are appended at the end.
    A one-element list is returned as-is (the same list object), and an
    empty list yields an empty list.
    """
    if len(input_elms) == 1:
        return input_elms

    # Pair each element except the last with every combination that can be
    # built from the elements after it.
    joined = [
        head + '&' + tail
        for next_pos, head in enumerate(input_elms[:-1], start=1)
        for tail in combine_conditions(input_elms[next_pos:])
    ]
    return joined + input_elms


def _append_log(message):
    """Append ``message`` followed by the current local time to ``<path>/log.txt``."""
    stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    with open('%s/log.txt' % path, 'a', encoding='utf-8') as log_file:
        log_file.write(message + stamp + '\n')


def _count_unique_users(data_dir, filenames):
    """Count distinct values of the 5th ``\\x01``-separated field across files.

    The counting is delegated to a ``cat | awk | sort -u | wc -l`` shell
    pipeline run inside ``data_dir``. Returns 0 without invoking the shell
    when ``filenames`` is empty, because ``cat`` with no arguments would
    block reading stdin.
    """
    import shlex

    if not filenames:
        return 0
    # Quote every path component so unusual file names cannot break out of
    # (or inject into) the shell command.
    quoted_files = ' '.join(shlex.quote(name) for name in filenames)
    command = "cd %s; cat %s |awk -F '\x01' '{print $5}'|sort -u|wc -l" % (
        shlex.quote(data_dir),
        quoted_files,
    )
    with os.popen(command) as pipe:
        return int(pipe.read().strip())


def generate():
    """Aggregate one day's data files into a single summary line.

    Reads the module-level names ``path``, ``modularName`` and ``yesterday``
    and processes every file in ``path + modularName + yesterday`` in sorted
    name order. Sets the module-level ``dayOutput`` to the output file path
    and overwrites that file with one line containing:

    * ``query_num``        - number of non-blank lines over all files
    * ``user_num``         - distinct values of the 5th field over all files
    * ``total_audio_user`` - summed 7th field of the 9-field rows, truncated
      to an integer, divided by ``user_num`` and by 60000 (presumably
      milliseconds converted to minutes per user - confirm with the data
      owner)
    * ``user_query``       - ``query_num / user_num``

    Both ratios are rounded to 2 decimals and are reported as 0.0 when there
    are no users. Progress lines are appended to ``<path>/log.txt``.

    Raises:
        ValueError: if the 7th field of a 9-field row is not a number.
        OSError: if the data directory or one of its files cannot be read.
    """
    global dayOutput

    _append_log('生成数据:')

    data_dir = path + modularName + yesterday
    dayOutput = '%s%s%s_out.txt' % (path, modularName, yesterday)
    filenames = sorted(os.listdir(data_dir))
    total_user = _count_unique_users(data_dir, filenames)

    total_query = 0
    total_audio_ms = 0
    for filename in filenames:
        # errors='replace' keeps a stray undecodable byte from aborting the
        # read; the row is still counted.
        with open(data_dir + '/' + filename, 'r', encoding='utf-8',
                  errors='replace') as f:
            for line in f:
                if not line.strip():
                    continue
                total_query += 1
                fields = line.replace('\n', '').split('\x01')
                # Only complete 9-field rows carry an audio duration.
                if len(fields) == 9:
                    total_audio_ms += float(fields[6])
        _append_log(filename + '处理完成:')

    if total_user:
        user_query = round(total_query / total_user, 2)
        total_audio_user = round(int(total_audio_ms) / total_user / 60000, 2)
    else:
        # No users (e.g. an empty data directory): report zeros rather than
        # failing with ZeroDivisionError.
        user_query = 0.0
        total_audio_user = 0.0

    # Write the summary line.
    template = '{0} query_num:{1} user_num:{2} total_audio_user:{3} user_query:{4} \n'
    with open(dayOutput, 'w', encoding='utf-8') as f_out:
        f_out.write(template.format('D%s' % yesterday, total_query, total_user,
                                    str(total_audio_user), str(user_query)))

    _append_log('数据生成成功:')


if __name__ == '__main__':
    # Usage: <script> <modular> <date>
    #   modular - module selector; only '4' (novel) is configured here
    #   date    - day to process, e.g. 20200327
    if len(sys.argv) < 3:
        sys.exit('usage: %s <modular> <date>' % sys.argv[0])

    modular = sys.argv[1]
    yesterday = sys.argv[2]

    pathOri = '/home/public/'
    if modular == '4':
        path = pathOri + 'timeDateNovelNew/'
        modularName = 'novel'
    else:
        # Without this guard an unknown selector left modularName undefined
        # and path empty, so the run died with a NameError after trying to
        # log to '/log.txt'.
        sys.exit('unsupported modular %r: only "4" is configured' % modular)

    with open('%s/log.txt' % path, 'a', encoding='utf-8') as Speed_of_progress:
        Speed_of_progress.write(modularName+'执行开始:' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + '\n')

    try:
        # Disabled pipeline steps, kept for reference:
        #   get_region()     - load the province/region mapping
        #   queryInfo()      - look up speaker pid / informant info
        #   generate_file()  - pull the data
        generate()  # build the daily summary
    except Exception as e:
        # Top-level boundary: record the failure and still write the
        # "finished" line below. The newline keeps the two entries apart.
        with open('%s/log.txt' % path, 'a', encoding='utf-8') as fl:
            fl.write(str(e) + '\n')

    with open('%s/log.txt' % path, 'a', encoding='utf-8') as Speed_of_progress:
        Speed_of_progress.write('执行结束:' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + '\n')