读取并处理csv格式文件python代码 | webVid数据集处理 | 数据可视化

readCsv.py

import csv
import re    # 正则表达式模块
import os

# import pandas as pd
import time
import matplotlib.pyplot as plt

# Start the wall-clock timer used to report the script's total running time
start_time = time.perf_counter()

# Convert a duration string such as "PT00H00M15S" into a number of seconds.
def convert_to_seconds(duration):
    """Return the length in whole seconds described by a ``PT#H#M#S`` string.

    The hour, minute and second components are each optional, so shortened
    forms such as ``PT15S`` or ``PT2M30S`` are understood as well as the full
    ``PT00H00M15S`` form. A missing component counts as zero.

    Args:
        duration: Text containing the duration, e.g. ``"PT00H00M15S"``.

    Returns:
        The total number of seconds as an int, or 0 when no ``PT`` duration
        marker is found in the text.
    """
    # Each component is optional; a group that did not take part in the
    # match comes back as None and is treated as zero below.
    match = re.search(r'PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?', duration)
    if not match:
        return 0
    hours, minutes, seconds = (int(part) if part else 0 for part in match.groups())
    return hours * 3600 + minutes * 60 + seconds
    

# Histogram buckets as (inclusive upper bound, label) pairs in ascending order.
# A value larger than the last bound goes into the matching overflow label.
_DURATION_BUCKETS = (
    (0, '0'), (5, '0-5'), (10, '5-10'), (15, '10-15'), (20, '15-20'),
    (25, '20-25'), (30, '25-30'), (35, '30-35'), (40, '35-40'),
    (45, '40-45'), (50, '45-50'), (55, '50-55'), (60, '55-60'),
    (90, '60-90'), (120, '90-120'), (150, '120-150'), (180, '150-180'),
)
_DURATION_OVERFLOW = '180+'
_NAME_BUCKETS = (
    (0, '0'), (20, '0-20'), (40, '20-40'), (60, '40-60'), (80, '60-80'),
    (100, '80-100'), (120, '100-120'), (140, '120-140'), (160, '140-160'),
    (180, '160-180'), (200, '180-200'),
)
_NAME_OVERFLOW = '200+'


def _new_histogram(buckets, overflow):
    """Return a zeroed counter dict whose keys follow the bucket order."""
    counts = {label: 0 for _, label in buckets}
    counts[overflow] = 0
    return counts


def _bucket_label(value, buckets, overflow):
    """Return the label of the first bucket whose upper bound is >= value."""
    for upper, label in buckets:
        if value <= upper:
            return label
    return overflow


def _mean(total, count):
    """Return total / count, or 0.0 when there is nothing to average."""
    return total / count if count else 0.0


# Open the CSV file and accumulate the statistics row by row
with open('results_10M_train.csv', 'r', encoding="utf-8") as csvFile:
    csvReader = csv.reader(csvFile)

    # Running totals. The "*_valid" variants only cover rows whose video file
    # exists on disk under ./WebVid/<page_dir>/<videoid>.mp4.
    total_duration = 0
    total_duration_valid = 0
    total_length_name = 0
    total_length_name_valid = 0

    # Number of data rows read and number of rows with a video file on disk
    count_row = 0
    count_video = 0

    # Per-bucket counters for video duration (seconds) and name length
    duration_seg_count = _new_histogram(_DURATION_BUCKETS, _DURATION_OVERFLOW)
    duration_seg_count_valid = _new_histogram(_DURATION_BUCKETS, _DURATION_OVERFLOW)
    name_seg_count = _new_histogram(_NAME_BUCKETS, _NAME_OVERFLOW)
    name_seg_count_valid = _new_histogram(_NAME_BUCKETS, _NAME_OVERFLOW)

    # Skip the header row; the default keeps an empty file from raising
    # StopIteration.
    next(csvReader, None)

    for row in csvReader:
        # csv.reader yields [] for a blank line; there is nothing to index.
        if not row:
            continue
        count_row += 1

        # Columns: ['videoid', 'contentUrl', 'duration', 'page_dir', 'name']
        videoid = row[0]
        page_dir = row[3]
        name = row[4]
        duration = convert_to_seconds(row[2])
        length = len(name)

        # Each row lands in exactly one duration bucket and one name bucket.
        # The labels are computed once and reused for the "valid" counters.
        duration_key = _bucket_label(duration, _DURATION_BUCKETS, _DURATION_OVERFLOW)
        name_key = _bucket_label(length, _NAME_BUCKETS, _NAME_OVERFLOW)

        duration_seg_count[duration_key] += 1
        name_seg_count[name_key] += 1
        total_duration += duration
        total_length_name += length

        ori_file_path_name = './WebVid/' + page_dir + '/' + str(videoid) + '.mp4'
        if os.path.exists(ori_file_path_name):
            count_video += 1
            duration_seg_count_valid[duration_key] += 1
            name_seg_count_valid[name_key] += 1
            total_duration_valid += duration
            total_length_name_valid += length

# Number of videos
total_count = count_row
total_count_valid = count_video

# Average duration and average name length; 0.0 when the respective set is
# empty (e.g. no video file present locally) instead of dividing by zero.
average_duration = _mean(total_duration, total_count)
average_duration_valid = _mean(total_duration_valid, total_count_valid)
average_length_name = _mean(total_length_name, total_count)
average_length_name_valid = _mean(total_length_name_valid, total_count_valid)

print('====================================statistical results====================================')
print('every lines in csv:\n')
# Duration statistics over every row
print('count: ' + str(total_count) + '\n')
print('total_duration: ', total_duration / 3600, 'h\n')
print('duration_seg_count: ' + str(duration_seg_count) + '\n')
print('average_duration: ' + str(average_duration) + 's\n')

# Name statistics over every row
print('total_length_name: ', total_length_name / 1000000 ,'M', '\n')
print('name_seg_count: ' + str(name_seg_count) + '\n')
print('average_length_name: ' + str(average_length_name) + '\n')

print('-----------------------------------------')

# Rows whose video file exists on disk
print('valid video:\n')
# Duration statistics over the valid rows
print('count: ' + str(total_count_valid) + '\n')
print('total_duration: ', total_duration_valid / 3600, 'h\n')
print('duration_seg_count: ' + str(duration_seg_count_valid) + '\n')
print('average_duration: ' + str(average_duration_valid) + 's\n')

# Name statistics over the valid rows.
# NOTE(review): this total is reported in units of 10,000 ('W') while the
# all-rows total above uses millions ('M') - confirm this is intended.
print('total_length_name: ', total_length_name_valid / 10000 ,'W', '\n')
print('name_seg_count: ' + str(name_seg_count_valid) + '\n')
print('average_length_name: ' + str(average_length_name_valid) + '\n')

print("====================================CSV file processing complete====================================")
## Data visualisation


def _save_bar_chart(seg_count, xlabel, title, annotation, out_path):
    """Draw one bucket-count dict as a bar chart and save it as an image.

    Args:
        seg_count: Mapping of bucket label -> number of videos; the bars are
            drawn in the mapping's iteration order.
        xlabel: Label for the x axis.
        title: Chart title.
        annotation: Text written at the top of the axes (the average value).
        out_path: File name the figure is saved under.
    """
    fig, ax = plt.subplots()
    ax.bar(list(seg_count.keys()), list(seg_count.values()))
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Number of videos')
    ax.set_title(title)
    plt.xticks(rotation=45)  # rotate the bucket labels so they do not overlap
    ax.text(0.5, 0.99, annotation, transform=ax.transAxes, horizontalalignment='left', verticalalignment='top')
    fig.savefig(out_path)
    # Release the figure; otherwise every chart stays alive in pyplot's
    # figure registry until the interpreter exits.
    plt.close(fig)


# Charts over every CSV row
_save_bar_chart(
    duration_seg_count,
    'Video duration range(s)',
    'Distribution of different video durations',
    'average_duration: ' + str(average_duration),
    'duration_seg_count.png',
)
_save_bar_chart(
    name_seg_count,
    'Text length range(s)',
    'Distribution of different text length',
    'average_length_name: ' + str(average_length_name),
    'name_seg_count.png',
)

## Charts over the valid rows only (video file present on disk)
_save_bar_chart(
    duration_seg_count_valid,
    'Video duration range(s)',
    'Distribution of different video durations',
    'average_duration: ' + str(average_duration_valid),
    'duration_seg_count_valid.png',
)
_save_bar_chart(
    name_seg_count_valid,
    'Text length range(s)',
    'Distribution of different text length',
    'average_length_name: ' + str(average_length_name_valid),
    'name_seg_count_valid.png',
)


# Stop the wall-clock timer and report the elapsed time in minutes
end_time = time.perf_counter()
elapsed_minutes = (end_time - start_time) / 60
print("program running time: ", elapsed_minutes, "min")



# Count the total number of rows in the CSV file
# results = pd.read_csv('results_10M_train.csv')
# print(len(results)) # 10727607 rows, i.e. on the order of ten million (~10M)

运行结果展示: 

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值