Python函数执行效率的探究

1.前言

1.1 文章简介

  • 之所以会写本篇博客,是因为在用python写代码的时候发现数据分析的时间开销特别大,所以希望探究如何尽可能优化我的python代码(本篇旨在优化我自己的python代码,仅供参考)
  • 探究影响因素
    • 1.函数调用
    • 2.函数嵌套
    • 3.单变量数值(已取消)
  • 评价指标
    • 程序执行时间t

1.2 代码简介

  • 本次代码取自我毕设数据分析代码的一部分
  • 通过遍历三个层级的文件夹来依次对txt文件进行处理
  • 代码如下
import os
import re
import pandas as pd
import numpy as np
from datetime import datetime
import time

# Wall-clock reference taken before any work; the timing reports printed at
# the end of the script subtract this value.
start_time = time.time()
def calculation(data_saving):
    """
    Compute the summary indicators for the records of one file.

    :param data_saving: DataFrame holding the rows parsed from one file
    :return: list ``[gap, avgspeed, mindistance, n, speedcv]``, where ``gap``
        and ``n`` are the two values returned by ``calgap``
    """
    # calgap fills these two lists in place; only its return value is used here.
    elapsed_per_route = []
    route_keys = []
    relative_gap, route_total = calgap(elapsed_per_route, route_keys, data_saving)

    # calspeed fills these in place, one entry per row (the row count, which
    # is not the same thing as the number of alternative routes).
    row_distances = []
    row_speeds = []
    calspeed(data_saving, len(data_saving), row_distances, row_speeds)

    # Distance-weighted mean speed, then the coefficient of variation:
    # standard deviation of the speeds divided by that weighted mean.
    weighted_speed = np.average(row_speeds, weights=row_distances)
    speed_cv = np.std(row_speeds) / weighted_speed

    # The shortest recorded distance stands in for the OD distance.
    shortest = np.min(row_distances)

    return [relative_gap, weighted_speed, shortest, route_total, speed_cv]

def calspeed(data_saving,length,distance,speed):
    """
    Append every row's distance and speed to the caller's lists.

    For rows ``0 .. length-1`` the elapsed time is column 2 minus column 1
    and the distance is column 4, each converted with ``float``. The speed
    appended for a row is ``distance / elapsed * 3.6``.

    :param data_saving: DataFrame holding the rows of one file
    :param length: number of rows to process, starting at row 0
    :param distance: list that receives one distance per row (mutated in place)
    :param speed: list that receives one speed per row (mutated in place)
    :return: None; results are delivered through ``distance`` and ``speed``
    :raises ZeroDivisionError: if a row's elapsed time is zero
    """
    for i in range(length):
        elapsed = float(data_saving.iloc[i, 2]) - float(data_saving.iloc[i, 1])
        row_distance = float(data_saving.iloc[i, 4])
        distance.append(row_distance)
        # Use the value just parsed instead of distance[i]: indexing the
        # output list is only correct when the caller passed it in empty.
        # NOTE(review): the 3.6 factor looks like a m/s -> km/h conversion;
        # confirm the units of the time and distance columns.
        speed.append((row_distance / elapsed) * 3.6)


#这个计算有待优化
def calgap(time,route,data_saving):
    """
    Compute the mean relative travel-time gap over the distinct routes.

    Each distinct route contributes one travel time: the elapsed time
    (column 2 minus column 1) of its row, or the mean elapsed time when the
    same route appears on several rows. With ``tmin`` the smallest of those
    times, the gap is the mean of ``(t - tmin) / tmin``.

    :param time: list that receives one travel time per route (mutated in place)
    :param route: list that receives the matching route values (mutated in place)
    :param data_saving: DataFrame holding the rows of one file
    :return: list ``[gap, n]`` where ``n`` is the number of routes collected
    :raises ValueError: if no travel time was collected (``min`` of an empty list)
    """
    # NOTE(review): occurrences are counted by column name, but rows are
    # matched through positional column 11 — assumes both are the same column.
    count_routes = data_saving['经过道路'].value_counts()
    row_total = len(data_saving)
    if all(x == 1 for x in count_routes):
        # No route repeats: every row is its own alternative, in row order.
        for row in range(row_total):
            time.append(float(data_saving.iloc[row, 2]) - float(data_saving.iloc[row, 1]))
            route.append(data_saving.iloc[row, 11])
    else:
        # At least one route repeats. Read the route column once instead of
        # re-reading it through iloc for every distinct route.
        row_routes = [data_saving.iloc[row, 11] for row in range(row_total)]
        # Series.items() yields (route, occurrences) through the public API;
        # the private `_get_values` attribute is not available on current pandas.
        for route_key, occurrences in count_routes.items():
            matching_rows = [row for row, row_route in enumerate(row_routes)
                             if row_route == route_key]
            durations = [
                float(data_saving.iloc[row, 2]) - float(data_saving.iloc[row, 1])
                for row in matching_rows
            ]
            if occurrences == 1:
                time.extend(durations)
                route.extend(row_routes[row] for row in matching_rows)
            else:
                # Rows sharing a route are collapsed into their mean time.
                time.append(np.mean(durations))
                route.append(route_key)
    n = len(route)  # number of alternative routes
    tmin = min(time)  # smallest travel time among them
    # A zero tmin is not guarded against.
    gap_total = 0
    for j in range(n):
        gap_total = gap_total + (time[j] - tmin) / tmin
    gap = gap_total / n
    return [gap, n]

def initDataSaving(f):
    """
    Parse a space-separated text file object into a DataFrame.

    The first line supplies the column names. Every following non-blank line
    supplies one row: its first 11 tokens fill the first 11 cells and all
    remaining tokens are kept together as one list in the 12th cell.

    :param f: open text file object positioned at the header line
    :return: DataFrame with one row per data line, indexed 0..n-1
    :raises IndexError: if a data line has fewer than 11 tokens
    """
    header = f.readline().strip().split(" ")  # column names from line 1
    rows = []
    line = f.readline()
    while line:
        stripped = line.strip()
        line = f.readline()
        if not stripped:
            # A blank (e.g. trailing) line carries no record.
            continue
        tokens = stripped.split(" ")
        if len(tokens) < 11:
            raise IndexError(
                f"expected at least 11 fields, got {len(tokens)}: {stripped!r}")
        # First 11 columns are scalar cells; the tail stays grouped as a list.
        rows.append(tokens[:11] + [tokens[11:]])
    # Build the frame once: appending row by row copies the whole frame each
    # time, and DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(rows, columns=header)


def getWeek(date):
    """
    Map a ``YYYYMMDD`` date string to its weekday bucket.

    :param date: date string in ``%Y%m%d`` form, e.g. ``"20090901"``
    :return: ``"Mon_Thur"`` for Monday through Thursday, otherwise
        ``"Fri"``, ``"Sat"`` or ``"Sun"``
    """
    # datetime.weekday() numbers Monday as 0 and Sunday as 6.
    weekday_index = datetime.strptime(date, "%Y%m%d").weekday()
    weekend_labels = {4: "Fri", 5: "Sat", 6: "Sun"}
    return weekend_labels.get(weekday_index, "Mon_Thur")


# Driver: walk <path>/<day folder>/100grid_5min5newwithdis/<time-slot folder>/<txt file>,
# compute one result row per txt file, write a per-day table and an overall table,
# and report the elapsed time.
path = "F:/python3.8/testdata"
files = os.listdir(path)
columns_list = ['OD对', "时段", '星期', 'RelativeGap', "distance", 'speed', 'speedCV', 'rount_num']
# Accumulates one result row per txt file across every day.
out_all_txt = pd.DataFrame(columns=columns_list)
# Scratch row reused for every file; slot k lines up with columns_list[k].
row = list(range(8))
count_day = 0  # next row label in the current day's table
count_all = 0  # next row label in the overall table
for file_date in files:  # day level, e.g. CoorGetTrackWH20090901
    path0 = path + '/' + file_date
    if os.path.isdir(path0):
        # The weekday bucket comes from the 200909dd date embedded in the folder name.
        date = re.findall(r"200909[0-9][0-9]", file_date)[0]
        row[2] = getWeek(date)
        out_txt = pd.DataFrame(columns=columns_list)
        # NOTE(review): this name is built but never used; the per-day table
        # below is written to "test.txt" instead — confirm which is intended.
        out_txt_file_name = file_date + '.txt'
        path1 = path0 + '/' + '100grid_5min5newwithdis'
        files1 = os.listdir(path1)
        # Order entries by their leading integer rather than lexicographically.
        files1.sort(key=lambda i: int(re.match(r'(\d+)', i).group()))
        for file_hour in files1:  # time-slot level, e.g. a folder named 0_1
            path2 = path1 + '/' + file_hour
            if os.path.isdir(path2):
                row[1] = file_hour
                files2 = os.listdir(path2)
                for file_txt in files2:  # txt-file level
                    # The OD-pair label is the first nine characters of the file name.
                    ODname = re.findall(r".........", file_txt)[0]
                    row[0] = ODname
                    # Close each input file as soon as it has been parsed;
                    # leaving them open leaks one handle per file.
                    with open(path2 + '\\' + file_txt, "r") as f:
                        data_saving = initDataSaving(f)
                    res = calculation(data_saving)
                    row[3] = res[0]  # relative gap
                    row[5] = res[1]  # distance-weighted mean speed
                    row[4] = res[2]  # shortest distance
                    row[6] = res[4]  # speed coefficient of variation
                    row[7] = res[3]  # number of alternative routes
                    # .loc assignment stores the current values of the scratch row.
                    out_txt.loc[count_day] = row
                    out_all_txt.loc[count_all] = row
                    count_day += 1
                    count_all += 1
                print(file_date + "中的" + file_hour + "已完成")
        out_txt.to_csv(path0 + '/' + "test.txt", sep='\t', index=False)
        print(file_date + "已完成")
        count_day = 0
mtime = time.time()
print('没有输出总文件时所耗费的时间为%f 秒'%(mtime - start_time))
# count_all is an integer count, so print it with %d rather than %f.
print('数据量为%d个'%count_all)
out_all_txt.to_csv(path + '/' + "gap.txt", sep='\t', index=False)
end_time = time.time()
print('程序运行耗费时间为%f 秒'%(end_time - start_time))

2.操作流程

2.1 探究函数嵌套影响

2.1.1 嵌套说明

  • 这里的函数嵌套是指我们在一个函数中再次调用其他函数,原始代码就是已经带嵌套的
def calculation(data_saving):
    """
    Compute the summary indicators for the records of one file.

    :param data_saving: DataFrame holding the rows parsed from one file
    :return: list ``[gap, avgspeed, mindistance, n, speedcv]``, where ``gap``
        and ``n`` are the two values returned by ``calgap``
    """
    # calgap fills these two lists in place; only its return value is used here.
    elapsed_per_route = []
    route_keys = []
    relative_gap, route_total = calgap(elapsed_per_route, route_keys, data_saving)

    # calspeed fills these in place, one entry per row (the row count, which
    # is not the same thing as the number of alternative routes).
    row_distances = []
    row_speeds = []
    calspeed(data_saving, len(data_saving), row_distances, row_speeds)

    # Distance-weighted mean speed, then the coefficient of variation:
    # standard deviation of the speeds divided by that weighted mean.
    weighted_speed = np.average(row_speeds, weights=row_distances)
    speed_cv = np.std(row_speeds) / weighted_speed

    # The shortest recorded distance stands in for the OD distance.
    shortest = np.min(row_distances)

    return [relative_gap, weighted_speed, shortest, route_total, speed_cv]

2.1.2 小数据情况

  • 嵌套

2.1.3大数据情况

在这里插入图片描述

2.2 探究函数调用影响(无嵌套)

2.2.1 说明

  • 之前的calspeed与calgap全部放在calculation内
def calculation(data_saving):
    """
    Compute the summary indicators for one file, with every step inlined.

    This variant deliberately calls no helper functions of its own: the gap
    and speed computations are written out in the body.

    Each distinct route contributes one travel time (column 2 minus column 1,
    averaged when the route appears on several rows); the gap is the mean of
    ``(t - tmin) / tmin`` over those times. Speeds are computed per row as
    ``distance / elapsed * 3.6`` with the distance taken from column 4.

    :param data_saving: DataFrame holding the rows parsed from one file
    :return: list ``[gap, avgspeed, mindistance, n, speedcv]``
    :raises ValueError: if there are no rows (``min`` of an empty list)
    """
    route_times = []  # one travel time per distinct route
    speed = []  # one speed per row
    route = []  # route value matching each entry of route_times
    distance = []  # one distance per row
    # Row count (not the number of alternative routes).
    length = len(data_saving)

    # --- relative gap ---
    # NOTE(review): occurrences are counted by column name, but rows are
    # matched through positional column 11 — assumes both are the same column.
    count_routes = data_saving['经过道路'].value_counts()
    if all(x == 1 for x in count_routes):
        # No route repeats: every row is its own alternative, in row order.
        for i in range(length):
            route_times.append(float(data_saving.iloc[i, 2]) - float(data_saving.iloc[i, 1]))
            route.append(data_saving.iloc[i, 11])
    else:
        # At least one route repeats. Read the route column once instead of
        # re-reading it through iloc for every distinct route.
        row_routes = [data_saving.iloc[i, 11] for i in range(length)]
        # Series.items() yields (route, occurrences) through the public API;
        # the private `_get_values` attribute is not available on current pandas.
        for route_key, occurrences in count_routes.items():
            matching_rows = [i for i, row_route in enumerate(row_routes)
                             if row_route == route_key]
            durations = [
                float(data_saving.iloc[i, 2]) - float(data_saving.iloc[i, 1])
                for i in matching_rows
            ]
            if occurrences == 1:
                route_times.extend(durations)
                route.extend(row_routes[i] for i in matching_rows)
            else:
                # Rows sharing a route are collapsed into their mean time.
                route_times.append(np.mean(durations))
                route.append(route_key)
    n = len(route)  # number of alternative routes
    tmin = min(route_times)  # smallest travel time among them
    gap_total = 0
    for j in range(n):
        gap_total = gap_total + (route_times[j] - tmin) / tmin
    gap = gap_total / n

    # --- speed and its coefficient of variation ---
    for i in range(length):
        elapsed = float(data_saving.iloc[i, 2]) - float(data_saving.iloc[i, 1])
        row_distance = float(data_saving.iloc[i, 4])
        distance.append(row_distance)
        # NOTE(review): the 3.6 factor looks like a m/s -> km/h conversion;
        # confirm the units of the time and distance columns.
        speed.append((row_distance / elapsed) * 3.6)
    avgspeed = np.average(speed, weights=distance)  # distance-weighted mean
    stdspeed = np.std(speed)
    speedcv = stdspeed / avgspeed

    # The shortest recorded distance stands in for the OD distance.
    mindistance = np.min(distance)

    return [gap, avgspeed, mindistance, n, speedcv]

2.2.2 小数据结论

在这里插入图片描述

2.2.3大数据结论

在这里插入图片描述

3.结论

当一个函数过长的时候,使用函数嵌套的方法可以提高python的执行效率

  • 3
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值