# readCsv.py
import csv
import re # 正则表达式模块
import os
# import pandas as pd
import time
import matplotlib.pyplot as plt
# Start the wall-clock timer used to report the total running time of the script.
start_time = time.perf_counter()
# Duration pattern in the style of 'PT1H2M3S'. Each of the hour, minute and
# second components is optional, so 'PT45S' or 'PT2M' are accepted as well.
# Compiled once because the function is called for every CSV row.
_DURATION_PATTERN = re.compile(r'PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?')


def convert_to_seconds(duration):
    """Convert a duration string such as 'PT1H2M3S' into a number of seconds.

    Args:
        duration: Text containing a 'PT...' duration. Any of the H, M and S
            components may be omitted; a missing component counts as zero.

    Returns:
        The total number of seconds as an int, or 0 when the text contains
        no 'PT' duration at all.
    """
    match = _DURATION_PATTERN.search(duration)
    if match is None:
        return 0
    # Unmatched optional groups come back as None and contribute zero.
    hours, minutes, seconds = (int(part) if part else 0 for part in match.groups())
    return hours * 3600 + minutes * 60 + seconds
# Inclusive upper bounds (in seconds) of the duration buckets and the matching
# labels. The labels tuple has one extra entry: the last label collects every
# value above the last bound.
_DURATION_BOUNDS = (0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60,
                    90, 120, 150, 180)
_DURATION_LABELS = ('0', '0-5', '5-10', '10-15', '15-20', '20-25', '25-30',
                    '30-35', '35-40', '40-45', '45-50', '50-55', '55-60',
                    '60-90', '90-120', '120-150', '150-180', '180+')
# Same layout for the length (in characters) of the name field.
_NAME_BOUNDS = (0, 20, 40, 60, 80, 100, 120, 140, 160, 180, 200)
_NAME_LABELS = ('0', '0-20', '20-40', '40-60', '60-80', '80-100', '100-120',
                '120-140', '140-160', '160-180', '180-200', '200+')


def _bucket_label(value, bounds, labels):
    """Return the label of the first bucket whose inclusive upper bound is >= value.

    Values above every bound fall into the last (open-ended) label. Each value
    maps to exactly one label, so an empty name is counted only in '0'.
    """
    for upper, label in zip(bounds, labels):
        if value <= upper:
            return label
    return labels[-1]


# Sum of the durations (seconds): over all rows / over rows whose video file exists.
total_duration = 0
total_duration_valid = 0
# Sum of the name lengths: over all rows / over rows whose video file exists.
total_length_name = 0
total_length_name_valid = 0
# Number of data rows processed and number of rows whose video file exists.
count_row = 0
count_video = 0
# Rows with fewer than five columns (e.g. blank lines) that were skipped.
count_skipped = 0
# Histograms: bucket label -> number of rows, for all rows and for valid rows.
duration_seg_count = dict.fromkeys(_DURATION_LABELS, 0)
duration_seg_count_valid = dict.fromkeys(_DURATION_LABELS, 0)
name_seg_count = dict.fromkeys(_NAME_LABELS, 0)
name_seg_count_valid = dict.fromkeys(_NAME_LABELS, 0)

# newline='' lets the csv module handle quoted fields that contain newlines.
with open('results_10M_train.csv', 'r', encoding="utf-8", newline='') as csvFile:
    csvReader = csv.reader(csvFile)
    # Skip the header row; the default keeps an empty file from raising StopIteration.
    next(csvReader, None)
    for row in csvReader:
        # Expected columns: videoid, contentUrl, duration, page_dir, name.
        if len(row) < 5:
            count_skipped += 1
            continue
        count_row += 1
        videoid = row[0]
        page_dir = row[3]
        name = row[4]
        duration = convert_to_seconds(row[2])
        length = len(name)

        # Work out each bucket once; the labels are reused for the valid-only counts.
        duration_label = _bucket_label(duration, _DURATION_BOUNDS, _DURATION_LABELS)
        name_label = _bucket_label(length, _NAME_BOUNDS, _NAME_LABELS)

        duration_seg_count[duration_label] += 1
        total_duration += duration
        name_seg_count[name_label] += 1
        total_length_name += length

        # A row is "valid" when its video file is present under ./WebVid/<page_dir>/.
        ori_file_path_name = './WebVid/' + page_dir + '/' + str(videoid) + '.mp4'
        if os.path.exists(ori_file_path_name):
            count_video += 1
            duration_seg_count_valid[duration_label] += 1
            total_duration_valid += duration
            name_seg_count_valid[name_label] += 1
            total_length_name_valid += length

if count_skipped:
    print('skipped malformed rows: ' + str(count_skipped))
def _mean(total, count):
    """Return total / count, or 0.0 when count is zero (nothing to average)."""
    return total / count if count else 0.0


# Number of CSV rows and number of rows whose video file exists on disk.
total_count = count_row
total_count_valid = count_video
# Average duration in seconds. Guarded so that an empty CSV or a missing video
# directory produces 0.0 instead of a ZeroDivisionError.
average_duration = _mean(total_duration, total_count)
average_duration_valid = _mean(total_duration_valid, total_count_valid)
# Average length of the name field, guarded the same way.
average_length_name = _mean(total_length_name, total_count)
average_length_name_valid = _mean(total_length_name_valid, total_count_valid)

print('====================================statistical results====================================')
print('every lines in csv:\n')
# Duration statistics over every row.
print('count: ' + str(total_count) + '\n')
print('total_duration: ', total_duration / 3600, 'h\n')
print('duration_seg_count: ' + str(duration_seg_count) + '\n')
print('average_duration: ' + str(average_duration) + 's\n')
# Name-length statistics over every row (total reported in millions).
print('total_length_name: ', total_length_name / 1000000 ,'M', '\n')
print('name_seg_count: ' + str(name_seg_count) + '\n')
print('average_length_name: ' + str(average_length_name) + '\n')
print('-----------------------------------------')
# Same report restricted to rows whose video file exists.
print('valid video:\n')
print('count: ' + str(total_count_valid) + '\n')
print('total_duration: ', total_duration_valid / 3600, 'h\n')
print('duration_seg_count: ' + str(duration_seg_count_valid) + '\n')
print('average_duration: ' + str(average_duration_valid) + 's\n')
# NOTE(review): this total is reported in units of 10,000 ('W') while the
# all-rows section above uses millions ('M') - confirm this is intentional.
print('total_length_name: ', total_length_name_valid / 10000 ,'W', '\n')
print('name_seg_count: ' + str(name_seg_count_valid) + '\n')
print('average_length_name: ' + str(average_length_name_valid) + '\n')
# No explicit csvFile.close() here: the with-statement that opened the file
# already closes it.
print("====================================CSV file processing complete====================================")
## Visualisation
def _save_bar_chart(seg_count, xlabel, title, note, filename):
    """Draw one histogram as a bar chart and save it to `filename`.

    Args:
        seg_count: Mapping of bucket label -> count; its key order is the x-axis order.
        xlabel: Label of the x axis.
        title: Title of the chart.
        note: Text written inside the axes (used for the average value).
        filename: Path of the image file to write.
    """
    x_values = list(seg_count.keys())    # bucket labels
    y_values = list(seg_count.values())  # number of videos per bucket
    fig, ax = plt.subplots()
    plt.bar(x_values, y_values)
    plt.xlabel(xlabel)
    plt.ylabel('Number of videos')
    plt.title(title)
    plt.xticks(rotation=45)  # rotate the tick labels so they do not overlap
    plt.text(0.5, 0.99, note, transform=ax.transAxes, horizontalalignment='left', verticalalignment='top')
    plt.savefig(filename)
    # Close the figure so it is released instead of staying open until exit.
    plt.close(fig)


# Charts over every row of the CSV.
_save_bar_chart(duration_seg_count, 'Video duration range(s)',
                'Distribution of different video durations',
                'average_duration: ' + str(average_duration),
                'duration_seg_count.png')
_save_bar_chart(name_seg_count, 'Text length range(s)',
                'Distribution of different text length',
                'average_length_name: ' + str(average_length_name),
                'name_seg_count.png')
# Charts restricted to rows whose video file exists.
_save_bar_chart(duration_seg_count_valid, 'Video duration range(s)',
                'Distribution of different video durations',
                'average_duration: ' + str(average_duration_valid),
                'duration_seg_count_valid.png')
_save_bar_chart(name_seg_count_valid, 'Text length range(s)',
                'Distribution of different text length',
                'average_length_name: ' + str(average_length_name_valid),
                'name_seg_count_valid.png')
# Stop the wall-clock timer and report the elapsed time in minutes.
end_time = time.perf_counter()
elapsed_minutes = (end_time - start_time) / 60
print("program running time: ", elapsed_minutes, "min")
# Count the total number of rows in the CSV file
# results = pd.read_csv('results_10M_train.csv')
# print(len(results)) # 10727607, i.e. on the order of ten million (10B)
# Results of running the script: