1.前言
1.1 文章简介
- 之所以写这篇博客,是因为在用python做数据分析时发现程序的时间开销特别大,所以希望探究如何尽可能地优化我的python代码(本篇旨在优化我自己的python代码,结论仅供参考)
- 探究影响因素
- 1.函数调用
- 2 函数嵌套
- 3 单变量数值(已取消)
- 评价指标
- 程序执行时间t
1.2 代码简介
- 本次代码取自我毕设数据分析代码的一部分
- 通过遍历三个层级的文件夹来依次对txt文件进行处理
- 代码如下
import os
import re
import pandas as pd
import numpy as np
from datetime import datetime
import time
# Wall-clock start of the run; the elapsed-time reports printed at the end of
# the script subtract this value.
start_time = time.time()
def calculation(data_saving):
    """Compute the summary indicators for one table of trip records.

    This is the "nested" variant: the relative gap and the per-record speeds
    are delegated to ``calgap`` and ``calspeed``, which fill the lists passed
    to them in place.

    :param data_saving: DataFrame holding the records of one file
    :return: ``[gap, weighted mean speed, minimum distance,
        number of alternative routes, speed coefficient of variation]``
    """
    # Relative gap and number of alternative routes.
    route_times, route_ids = [], []
    gap, route_count = calgap(route_times, route_ids, data_saving)

    # Per-record distances and speeds (one entry per row, not per route).
    distances, speeds = [], []
    calspeed(data_saving, len(data_saving), distances, speeds)

    # The mean speed is distance-weighted; the standard deviation is not.
    mean_speed = np.average(speeds, weights=distances)
    speed_cv = np.std(speeds) / mean_speed

    # The shortest recorded distance is used as the OD distance.
    shortest = np.min(distances)

    return [gap, mean_speed, shortest, route_count, speed_cv]
def calspeed(data_saving, length, distance, speed):
    """Append the distance and speed of each record to the given lists.

    For each of the first ``length`` rows the travel time is column 2 minus
    column 1, the distance is column 4, and the speed is
    ``distance / travel_time * 3.6``.

    The speed is computed from the distance of the current row. The previous
    version read ``distance[i]``, which is only the current row's distance
    when ``distance`` is empty on entry.

    :param data_saving: DataFrame holding the records of one file
    :param length: number of rows to process
    :param distance: list that receives one distance per row (mutated in place)
    :param speed: list that receives one speed per row (mutated in place)
    :return: None; results are delivered through ``distance`` and ``speed``
    :raises ZeroDivisionError: if a row has a travel time of zero
    """
    # Pull each column once instead of doing three .iloc cell lookups per row.
    starts = data_saving.iloc[:length, 1]
    ends = data_saving.iloc[:length, 2]
    dists = data_saving.iloc[:length, 4]

    for start, end, dist in zip(starts, ends, dists):
        travel_time = float(end) - float(start)
        dist = float(dist)
        distance.append(dist)
        speed.append(dist / travel_time * 3.6)
#这个计算有待优化
def calgap(time, route, data_saving):
    """Compute the relative gap (RG) of travel times over the alternative routes.

    Rows that share the same value in the '经过道路' column count as one route
    whose travel time is the mean of those rows. A route that appears once
    keeps its own travel time. With ``tmin`` the smallest route time, the gap
    is the mean of ``(t - tmin) / tmin`` over all routes.

    Routes are grouped with a plain dict instead of ``value_counts()`` plus the
    private ``_get_values`` accessor, so the function does not depend on pandas
    internals and also accepts list-valued route cells.

    :param time: list that receives one travel time per distinct route
        (mutated in place)
    :param route: list that receives the distinct routes (mutated in place)
    :param data_saving: DataFrame; column 1 is the start time, column 2 the end
        time, and '经过道路' holds the route
    :return: ``[gap, n]`` where ``n`` is the number of alternative routes
    :raises ValueError: if there are no routes (``min`` of an empty list)
    """
    # NOTE(review): the original addressed the route column both by name and by
    # position 11; this assumes they are the same column -- confirm.
    durations = [
        float(end) - float(start)
        for start, end in zip(data_saving.iloc[:, 1], data_saving.iloc[:, 2])
    ]
    routes = list(data_saving['经过道路'])

    # Group row positions by route; dicts keep first-appearance order.
    rows_by_route = {}
    for pos, road in enumerate(routes):
        # Lists are unhashable, so use an equivalent tuple as the key.
        key = tuple(road) if isinstance(road, list) else road
        rows_by_route.setdefault(key, []).append(pos)

    groups = list(rows_by_route.values())
    if any(len(rows) > 1 for rows in groups):
        # When routes repeat, emit the most frequent first, as the original
        # value_counts()-driven loop did. The sort is stable, so equally
        # frequent routes stay in first-appearance order.
        groups.sort(key=len, reverse=True)

    for rows in groups:
        route.append(routes[rows[0]])
        if len(rows) == 1:
            time.append(durations[rows[0]])
        else:
            time.append(np.mean([durations[r] for r in rows]))

    n = len(route)  # number of alternative routes
    tmin = min(time)  # smallest route travel time
    # NOTE: a zero minimum travel time is not guarded against.
    gap = sum((t - tmin) / tmin for t in time[:n]) / n
    return [gap, n]
def initDataSaving(f):
    """Read a space-separated text file into a DataFrame.

    The first line supplies the column names. On every following line the
    first 11 fields become individual cells and all remaining fields are
    collected into one list stored in the 12th cell.

    All rows are gathered first and the frame is built once. The previous
    row-by-row ``DataFrame.append`` copied the whole frame for every line and
    is not available in pandas 2.0 and later. Blank lines are skipped instead
    of crashing.

    :param f: open text file object positioned at the header line
    :return: DataFrame with one row per data line
    :raises ValueError: if a data line has fewer than 11 fields, or the number
        of header names does not match the row width
    """
    header = f.readline().strip().split(" ")  # first line: column names

    records = []
    for line_no, raw_line in enumerate(f, start=2):
        fields = raw_line.strip().split(" ")
        if fields == [""]:
            continue  # blank line (e.g. trailing newline at end of file)
        if len(fields) < 11:
            raise ValueError(
                "line %d has %d fields, expected at least 11" % (line_no, len(fields))
            )
        # First 11 fields stay scalar; the rest form one list-valued cell.
        records.append(fields[:11] + [fields[11:]])

    return pd.DataFrame(records, columns=header)
def getWeek(date):
    """Map a ``YYYYMMDD`` date string to its day-type label.

    Monday to Thursday share the label ``"Mon_Thur"``; Friday, Saturday and
    Sunday each have their own label.

    :param date: date string in ``%Y%m%d`` format
    :return: one of ``"Mon_Thur"``, ``"Fri"``, ``"Sat"``, ``"Sun"``
    """
    # datetime.weekday() numbers Monday as 0 through Sunday as 6.
    special_days = {4: "Fri", 5: "Sat", 6: "Sun"}
    weekday = datetime.strptime(date, "%Y%m%d").weekday()
    return special_days.get(weekday, "Mon_Thur")
# Driver script: walk <path>/<day folder>/100grid_5min5newwithdis/<time-slot
# folder>/<file>, compute the indicators of every file, write one result file
# per day folder and one overall file, and report the elapsed time.
path = "F:/python3.8/testdata"
files = os.listdir(path)
columns_list = ['OD对', "时段", '星期', 'RelativeGap', "distance", 'speed', 'speedCV', 'rount_num']
out_all_txt = pd.DataFrame(columns=columns_list)
# Scratch record reused for every output row; slot k belongs to columns_list[k].
row = list(range(8))
count_day = 0  # rows written for the current day folder
count_all = 0  # rows written overall
for file_date in files:  # day-folder level (e.g. CoorGetTrackWH20090901)
    path0 = path + '/' + file_date
    if os.path.isdir(path0):  # only folders are processed
        # Day-type label derived from the date embedded in the folder name.
        date = re.findall(r"200909[0-9][0-9]", file_date)[0]
        row[2] = getWeek(date)
        out_txt = pd.DataFrame(columns=columns_list)
        # NOTE(review): this name is never used below; the per-day result is
        # written as "test.txt" -- confirm which file name is intended.
        out_txt_file_name = file_date + '.txt'
        path1 = path0 + '/' + '100grid_5min5newwithdis'
        files1 = os.listdir(path1)
        # Order the time-slot folders by their leading number.
        files1.sort(key=lambda i: int(re.match(r'(\d+)', i).group()))
        for file_hour in files1:  # time-slot folder level (e.g. 0_1)
            path2 = path1 + '/' + file_hour
            if os.path.isdir(path2):  # only folders are processed
                row[1] = file_hour
                files2 = os.listdir(path2)
                for file_txt in files2:  # data-file level
                    # The first nine characters of the file name identify the OD pair.
                    ODname = re.findall(r".........", file_txt)[0]
                    row[0] = ODname
                    # "with" closes the handle after reading (it was previously
                    # left open); os.path.join replaces the hard-coded '\\'.
                    with open(os.path.join(path2, file_txt), "r") as f:
                        data_saving = initDataSaving(f)
                    # res = [gap, mean speed, min distance, route count, speed CV]
                    res = calculation(data_saving)
                    row[3] = res[0]  # relative gap
                    row[5] = res[1]  # speed
                    row[4] = res[2]  # distance (shortest)
                    row[6] = res[4]  # speed coefficient of variation
                    row[7] = res[3]  # number of alternative routes
                    # Append the record to both the per-day and the overall table.
                    out_txt.loc[count_day] = row
                    out_all_txt.loc[count_all] = row
                    count_day += 1
                    count_all += 1
                print(file_date + "中的" + file_hour + "已完成")
        out_txt.to_csv(path0 + '/' + "test.txt", sep='\t', index=False)
        print(file_date + "已完成")
        count_day = 0
mtime = time.time()
print('没有输出总文件时所耗费的时间为%f 秒'%(mtime - start_time))
# count_all is an integer, so format it with %d rather than %f.
print('数据量为%d个'%count_all)
out_all_txt.to_csv(path + '/' + "gap.txt", sep='\t', index=False)
end_time = time.time()
print('程序运行耗费时间为%f 秒'%(end_time - start_time))
2.操作流程
2.1 探究函数嵌套影响
2.1.1 嵌套说明
- 这里的函数嵌套是指我们在一个函数中再次调用其他函数,原始代码就是已经带嵌套的
def calculation(data_saving):
    """Compute the summary indicators for one table of trip records.

    This is the "nested" variant: the relative gap and the per-record speeds
    are delegated to ``calgap`` and ``calspeed``, which fill the lists passed
    to them in place.

    :param data_saving: DataFrame holding the records of one file
    :return: ``[gap, weighted mean speed, minimum distance,
        number of alternative routes, speed coefficient of variation]``
    """
    # Relative gap and number of alternative routes.
    route_times, route_ids = [], []
    gap, route_count = calgap(route_times, route_ids, data_saving)

    # Per-record distances and speeds (one entry per row, not per route).
    distances, speeds = [], []
    calspeed(data_saving, len(data_saving), distances, speeds)

    # The mean speed is distance-weighted; the standard deviation is not.
    mean_speed = np.average(speeds, weights=distances)
    speed_cv = np.std(speeds) / mean_speed

    # The shortest recorded distance is used as the OD distance.
    shortest = np.min(distances)

    return [gap, mean_speed, shortest, route_count, speed_cv]
2.1.2 小数据情况
- 嵌套
2.1.3大数据情况
2.2 探究函数调用影响(无嵌套)
2.2.1 说明
- 将之前calspeed与calgap中的代码全部直接写进calculation内,calculation不再调用这两个函数
def calculation(data_saving):
    """Compute the summary indicators for one table of trip records.

    This is the "no nesting" variant: the relative-gap and speed computations
    are written inline instead of being delegated to helper functions.

    Rows that share the same value in the '经过道路' column count as one route
    whose travel time is the mean of those rows. Routes are grouped with a
    plain dict instead of ``value_counts()`` plus the private ``_get_values``
    accessor, and no local name shadows the builtin ``sum`` or is rebound from
    a list to a float.

    :param data_saving: DataFrame; column 1 is the start time, column 2 the end
        time, column 4 the distance, and '经过道路' holds the route
    :return: ``[gap, weighted mean speed, minimum distance,
        number of alternative routes, speed coefficient of variation]``
    :raises ValueError: if the table has no rows
    :raises ZeroDivisionError: if a row has a travel time of zero
    """
    # NOTE(review): the original addressed the route column both by name and by
    # position 11; this assumes they are the same column -- confirm.
    durations = [
        float(end) - float(start)
        for start, end in zip(data_saving.iloc[:, 1], data_saving.iloc[:, 2])
    ]
    distance = [float(d) for d in data_saving.iloc[:, 4]]
    routes = list(data_saving['经过道路'])

    # --- relative gap (RG) ---
    # Group row positions by route; lists are unhashable, so key on a tuple.
    rows_by_route = {}
    for pos, road in enumerate(routes):
        key = tuple(road) if isinstance(road, list) else road
        rows_by_route.setdefault(key, []).append(pos)

    # One travel time per distinct route: its own time, or the mean of repeats.
    route_times = []
    for rows in rows_by_route.values():
        if len(rows) == 1:
            route_times.append(durations[rows[0]])
        else:
            route_times.append(np.mean([durations[r] for r in rows]))

    n = len(route_times)  # number of alternative routes
    tmin = min(route_times)  # smallest route travel time
    gap = sum((t - tmin) / tmin for t in route_times) / n

    # --- speed and speed coefficient of variation (per row, not per route) ---
    speed = [dist / duration * 3.6 for dist, duration in zip(distance, durations)]
    # The mean speed is distance-weighted; the standard deviation is not.
    avgspeed = np.average(speed, weights=distance)
    speedcv = np.std(speed) / avgspeed

    # --- distance: the shortest recorded distance is used as the OD distance ---
    mindistance = np.min(distance)

    return [gap, avgspeed, mindistance, n, speedcv]
2.2.2 小数据结论
2.2.3大数据结论
3.结论
当一个函数过长的时候,使用函数嵌套的方法可以提高python的执行效率