Python函数执行效率的探究

最新推荐文章于 2023-02-11 12:21:11 发布

武者小路

最新推荐文章于 2023-02-11 12:21:11 发布

阅读量908

点赞数 3

分类专栏： python 文章标签： python

本文链接：https://blog.csdn.net/qq_41815357/article/details/105004402

版权

python 专栏收录该内容

7 篇文章 4 订阅

订阅专栏

1.前言

1.1 文章简介

之所以写这篇博客，是因为我在用python写代码时发现数据分析的时间开销特别大，所以希望探究如何尽可能地优化我的python代码（本篇旨在优化我自己的python代码，仅供参考）
探究影响因素
- 1.函数调用
- 2 函数嵌套
- 3 单变量数值（已取消）
评价指标
- 程序执行时间t

1.2 代码简介

本次代码取自我毕设数据分析代码的一部分
通过遍历三个层级的文件夹来依次对txt文件进行处理
代码如下

import os
import re
import pandas as pd
import numpy as np
from datetime import datetime
import time

# Wall-clock reference point for the run; taken before any function is defined.
start_time = time.time()
def calculation(data_saving):
    """
    Compute the summary indicators for the records of one file.

    :param data_saving: dataframe holding the records of one file
    :return: list [relative gap, distance-weighted mean speed,
        shortest distance, number of routes, speed coefficient of variation]
    """
    # Scratch lists that the two helpers fill in place.
    elapsed = []
    routes = []
    distances = []
    speeds = []
    # Number of records (not the number of alternative routes).
    row_count = len(data_saving)

    # Relative gap and route count.
    gap, route_count = calgap(elapsed, routes, data_saving)

    # Per-record speed and distance.
    calspeed(data_saving, row_count, distances, speeds)
    mean_speed = np.average(speeds, weights=distances)
    speed_cv = np.std(speeds) / mean_speed

    # The shortest record distance is used as the distance of the OD pair.
    shortest = np.min(distances)

    return [gap, mean_speed, shortest, route_count, speed_cv]

def calspeed(data_saving, length, distance, speed):
    """
    Collect the distance and speed of the first ``length`` records.

    For every record the travel time is column 2 minus column 1 and the
    distance is column 4; the speed is distance / time * 3.6.
    TODO (original author): this function still needs revision.

    :param data_saving: dataframe holding the records of one file
    :param length: number of records to process
    :param distance: list that receives one distance per record (in place)
    :param speed: list that receives one speed per record (in place)
    :return: None; the results are delivered through ``distance`` and ``speed``
    :raises ZeroDivisionError: if a record has a travel time of zero
    """
    for i in range(length):
        # Named ``elapsed`` so the imported ``time`` module is not shadowed.
        elapsed = float(data_saving.iloc[i, 2]) - float(data_saving.iloc[i, 1])
        dist = float(data_saving.iloc[i, 4])
        distance.append(dist)
        # Use the value just parsed instead of reading back distance[i], which
        # would pick the wrong element whenever ``distance`` was not empty on
        # entry.
        speed.append((dist / elapsed) * 3.6)


# NOTE (original author): this computation could still be optimised.
def calgap(time, route, data_saving):
    """
    Compute the relative gap of travel times over the alternative routes.

    Records are grouped by the '经过道路' column; a route that occurs several
    times contributes the mean travel time of its records.  The relative gap
    is the mean of (t - tmin) / tmin over the routes, tmin being the smallest
    route travel time.  A minimum travel time of zero is not handled.

    :param time: list that receives one travel time per route (column 2
        minus column 1); filled in place
    :param route: list that receives one route value per route (column 11,
        assumed to be the '经过道路' column); filled in place
    :param data_saving: dataframe holding the records of one file
    :return: two-element list [gap, n], n being the number of routes
    :raises ValueError: if ``time`` is still empty after collection
    """
    route_counts = data_saving['经过道路'].value_counts()
    row_total = len(data_saving)

    if all(count == 1 for count in route_counts):
        # Every record is a distinct route: keep the records in file order.
        for i in range(row_total):
            time.append(float(data_saving.iloc[i, 2]) - float(data_saving.iloc[i, 1]))
            route.append(data_saving.iloc[i, 11])
    else:
        # At least one route occurs more than once.  Walk the routes in
        # value_counts order through the public Series.items() API; the
        # private ``_get_values`` accessor is not available in current pandas.
        for route_key, count in route_counts.items():
            matches = [j for j in range(row_total)
                       if data_saving.iloc[j, 11] == route_key]
            durations = [
                float(data_saving.iloc[j, 2]) - float(data_saving.iloc[j, 1])
                for j in matches
            ]
            if int(count) == 1:
                time.extend(durations)
                route.extend(data_saving.iloc[j, 11] for j in matches)
            else:
                # Repeated route: represent it by its mean travel time.
                time.append(np.mean(durations))
                route.append(route_key)

    n = len(route)  # number of alternative routes
    tmin = min(time)  # smallest route travel time
    gap = sum((time[j] - tmin) / tmin for j in range(n)) / n
    return [gap, n]

def initDataSaving(f):
    """
    Read a space-separated text file into a dataframe.

    The first line supplies the column names.  In every following line the
    first 11 fields become individual cells and all remaining fields are
    stored together, as one list, in the 12th cell.

    :param f: open text file object positioned at the header line
    :return: dataframe (object dtype) with one row per data line
    :raises IndexError: if a data line has fewer than 11 fields
    """
    list_first = f.readline().strip().split(" ")  # header line
    rows = []
    for line in f:
        fields = line.strip().split(" ")
        # Index explicitly so that a short line fails loudly.
        record = [fields[i] for i in range(11)]
        record.append(fields[11:])
        rows.append(record)
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    return pd.DataFrame(rows, columns=list_first, dtype=object)


def getWeek(date):
    """
    Map a date string to its weekday bucket.

    :param date: date written as YYYYMMDD
    :return: "Mon_Thur" for Monday to Thursday, otherwise "Fri", "Sat" or "Sun"
    """
    weekday = datetime.strptime(date, "%Y%m%d").weekday()
    # weekday() runs from 0 (Monday) to 6 (Sunday); only the last three days
    # get a bucket of their own.
    late_week = {4: "Fri", 5: "Sat", 6: "Sun"}
    return late_week.get(weekday, "Mon_Thur")


# Driver: walk the three folder levels (day / time slot / txt file), compute
# the indicators of every txt file, write one table per day plus one table
# for all days, and print the elapsed time.

# Root folder; its sub-folders are processed one by one.
path = "F:/python3.8/testdata"
files = os.listdir(path)
columns_list = ['OD对', "时段", '星期', 'RelativeGap', "distance", 'speed', 'speedCV', 'rount_num']
# Collects the rows of all days; written to gap.txt at the end.
out_all_txt = pd.DataFrame(
    columns=columns_list)
# Output row reused for every file; its slots follow columns_list.
row = [i for i in range(8)]
count_day = 0  # rows written for the current day
count_all = 0  # rows written over all days
for file_date in files:  # day level (folders such as CoorGetTrackWH20090901)
    path0 = path + '/' + file_date
    if os.path.isdir(path0):  # only folders are processed
        # Weekday label, derived from the date embedded in the folder name.
        date = re.findall(r"200909[0-9][0-9]", file_date)[0]
        row[2] = getWeek(date)
        out_txt = pd.DataFrame(
            columns=columns_list)
        out_txt_file_name = file_date + '.txt'
        path1 = path0 + '/' + '100grid_5min5newwithdis'
        files1 = os.listdir(path1)
        # Order the entries by their leading number rather than alphabetically.
        files1.sort(key=lambda i: int(re.match(r'(\d+)', i).group()))
        for file_hour in files1:  # time-slot level (folders such as 0_1)
            path2 = path1 + '/' + file_hour
            if os.path.isdir(path2):  # only folders are processed
                # Time-slot label.
                row[1] = file_hour
                files2 = os.listdir(path2)
                for file_txt in files2:  # txt-file level
                    # The first nine characters of the file name label the OD pair.
                    ODname =re.findall(r".........",file_txt)[0]
                    row[0] = ODname
                    # Read the file inside a ``with`` block so the handle is
                    # closed once parsed; '/' matches the other path joins.
                    with open(path2 + '/' + file_txt, "r") as f:
                        data_saving = initDataSaving(f)
                    # Compute the indicators of this file.
                    res = calculation(data_saving)
                    # Relative gap.
                    row[3] = res[0]
                    # Mean speed.
                    row[5] = res[1]
                    # Distance (the shortest one).
                    row[4] = res[2]
                    # Speed coefficient of variation.
                    row[6] = res[4]
                    # Number of alternative routes.
                    row[7] = res[3]
                    # Append the row to the per-day and the overall table.
                    out_txt.loc[count_day] = row
                    out_all_txt.loc[count_all] = row
                    count_day += 1
                    count_all += 1
                print(file_date + "中的" + file_hour + "已完成")
        out_txt.to_csv(path0 + '/' + "test.txt", sep='\t', index=False)
        print(file_date + "已完成")
        count_day = 0
# Elapsed time before the overall table is written.
mtime = time.time()
print('没有输出总文件时所耗费的时间为%f 秒'%(mtime - start_time))
print('数据量为%f个'%count_all)
out_all_txt.to_csv(path + '/' + "gap.txt", sep='\t', index=False)
# Elapsed time including the overall table.
end_time = time.time()
print('程序运行耗费时间为%f 秒'%(end_time - start_time))

2.操作流程

2.1 探究函数嵌套影响

2.1.1 嵌套说明

这里的函数嵌套是指我们在一个函数中再次调用其他函数，原始代码就是已经带嵌套的

def calculation(data_saving):
    """
    Compute the summary indicators for the records of one file.

    :param data_saving: dataframe holding the records of one file
    :return: list [relative gap, distance-weighted mean speed,
        shortest distance, number of routes, speed coefficient of variation]
    """
    # Scratch lists that the two helpers fill in place.
    elapsed = []
    routes = []
    distances = []
    speeds = []
    # Number of records (not the number of alternative routes).
    row_count = len(data_saving)

    # Relative gap and route count.
    gap, route_count = calgap(elapsed, routes, data_saving)

    # Per-record speed and distance.
    calspeed(data_saving, row_count, distances, speeds)
    mean_speed = np.average(speeds, weights=distances)
    speed_cv = np.std(speeds) / mean_speed

    # The shortest record distance is used as the distance of the OD pair.
    shortest = np.min(distances)

    return [gap, mean_speed, shortest, route_count, speed_cv]

2.1.2 小数据情况

嵌套

2.1.3大数据情况

在这里插入图片描述

2.2 探究函数调用影响（无嵌套）

2.2.1 说明

将之前calspeed与calgap两个函数的函数体全部直接写入calculation内，不再单独调用这两个函数

def calculation(data_saving):
    """
    Compute the summary indicators for the records of one file.

    This variant deliberately keeps the relative-gap and the speed
    computation inline instead of calling helper functions.

    Records are grouped by the '经过道路' column; a route that occurs several
    times contributes the mean travel time of its records.  The relative gap
    is the mean of (t - tmin) / tmin over the routes, tmin being the smallest
    route travel time.

    :param data_saving: dataframe holding the records of one file; column 1
        and 2 give the travel time (2 minus 1), column 4 the distance and
        column 11 the route (assumed to be the '经过道路' column)
    :return: list [relative gap, distance-weighted mean speed,
        shortest distance, number of routes, speed coefficient of variation]
    :raises ValueError: if the dataframe has no records
    """
    travel_times = []  # one travel time per route
    speed = []  # one speed per record
    route = []  # one route value per route
    distance = []  # one distance per record
    # Number of records (not the number of alternative routes).
    length = len(data_saving)

    # --- relative gap ---
    count_routes = data_saving['经过道路'].value_counts()
    if all(x == 1 for x in count_routes):
        # Every record is a distinct route: keep the records in file order.
        for i in range(length):
            travel_times.append(
                float(data_saving.iloc[i, 2]) - float(data_saving.iloc[i, 1]))
            route.append(data_saving.iloc[i, 11])
    else:
        # At least one route occurs more than once.  Walk the routes through
        # the public Series.items() API; the private ``_get_values`` accessor
        # is not available in current pandas.
        for get_index, count in count_routes.items():
            matches = [j for j in range(length)
                       if data_saving.iloc[j, 11] == get_index]
            durations = [
                float(data_saving.iloc[j, 2]) - float(data_saving.iloc[j, 1])
                for j in matches
            ]
            if int(count) == 1:
                travel_times.extend(durations)
                route.extend(data_saving.iloc[j, 11] for j in matches)
            else:
                # Repeated route: represent it by its mean travel time.
                travel_times.append(np.mean(durations))
                route.append(get_index)
    n = len(route)  # number of alternative routes
    tmin = min(travel_times)  # smallest route travel time
    gap = sum((travel_times[j] - tmin) / tmin for j in range(n)) / n

    # --- speed and speed coefficient of variation ---
    for i in range(length):
        # A separate name keeps the per-route list above intact.
        elapsed = float(data_saving.iloc[i, 2]) - float(data_saving.iloc[i, 1])
        distance.append(float(data_saving.iloc[i, 4]))
        speed.append((distance[i] / elapsed) * 3.6)
    avgspeed = np.average(speed, weights=distance)
    stdspeed = np.std(speed)
    speedcv = stdspeed / avgspeed
    # The shortest record distance is used as the distance of the OD pair.
    mindistance = np.min(distance)

    return [gap, avgspeed, mindistance, n, speedcv]

2.2.2 小数据结论

在这里插入图片描述

2.2.3大数据结论

在这里插入图片描述

3.结论

当一个函数过长的时候，使用函数嵌套的方法可以提高python的执行效率

武者小路

关注

3
点赞
踩
0

收藏

觉得还不错? 一键收藏
1
评论
Python函数执行效率的探究

1.前言1.1 文章简介之所以会写本篇博客是因为在用python写代码的时候发现数据分析的时间开销特别大，所以希望来如何去尽可能优化我的python代码（本篇目的旨在优化我的python代码，仅供参考）探究影响因素1.函数调用2 函数嵌套3 单变量数值（已取消）评价指标程序执行时间t1.2 代码简介本次代码取自我毕设数据分析代码的一部分通过遍历三个层级的文件...
复制链接

扫一扫

专栏目录