通过pandas 处理CSV文件，增加行，增加列，转json list

最新推荐文章于 2024-04-10 10:33:30 发布

走走又停停，人生路上皆风景

最新推荐文章于 2024-04-10 10:33:30 发布

阅读量669

点赞数

分类专栏： python 大数据测试文章标签： python 大数据

本文链接：https://blog.csdn.net/qq_28207005/article/details/106721431

版权

python 同时被 2 个专栏收录

30 篇文章 2 订阅

订阅专栏

大数据测试

7 篇文章 0 订阅

订阅专栏

在网上找到了一些小技巧：给 for 循环加上进度条；添加一个记录程序运行时间的计时器等。

import ast
import csv
import functools
import json
import os
import time
from decimal import Decimal

import jsonlines
import pandas as pd
import xlrd
from tqdm import tqdm

# Absolute directory that contains this script (symlinks resolved by realpath).
cur_path = os.path.split(os.path.realpath(__file__))[0]
# Directory one level above the script directory.
parent_path = os.path.split(cur_path)[0]


def func_time(func):
    """Decorate *func* so each call prints its start time, end time and duration.

    The wrapper forwards all positional and keyword arguments to *func* and
    returns whatever *func* returns, so decorated functions stay usable as
    ordinary functions. The elapsed time is printed in seconds, rounded to
    three decimal places.

    If *func* raises, the exception propagates and no end time is printed.
    """

    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def inner(*args, **kw):
        start_time = time.time()
        print("{}:{}".format("程序开始时间",
                             time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_time))))
        result = func(*args, **kw)
        end_time = time.time()
        print("{}:{}".format("程序结束时间",
                             time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(end_time))))
        # Quantize to millisecond precision for a stable, readable report.
        cost_time = Decimal(end_time - start_time).quantize(Decimal("0.000"))
        print('程序运行时间为:', cost_time, 's')
        return result

    return inner


# Count the rows and columns of a CSV file.
@func_time
def get_rows_cols(file_name):
    """Print the number of data rows and columns in the CSV at *file_name*.

    The file is read in chunks of 1000 rows so it never has to fit in memory
    at once; a tqdm progress bar advances once per chunk.

    Args:
        file_name: Path of the CSV file to inspect.
    """
    reader = pd.read_csv(file_name, chunksize=1000, low_memory=False)
    row_count = 0
    col_count = 0
    try:
        # Iterate until the reader is exhausted instead of a fixed number of
        # chunks, so files of any length are counted completely.
        for chunk in tqdm(reader, unit="chunk"):
            row_count += chunk.shape[0]
            col_count = chunk.shape[1]
    finally:
        # Release the underlying file handle even if parsing fails midway.
        reader.close()
    print("行数:", row_count)
    print("列数:", col_count)


# Convert an Excel workbook to CSV with xlrd.
@func_time
def xlsx_to_csv(src_file, tag_file):
    """Write the first sheet of the workbook *src_file* to *tag_file* as CSV.

    After writing, the row/column counts of the generated CSV are printed via
    ``get_rows_cols``.

    Args:
        src_file: Path of the workbook to read.
        tag_file: Path of the CSV file to create (overwritten if it exists).

    NOTE(review): xlrd 2.0+ only reads legacy ``.xls`` files; reading ``.xlsx``
    with this function needs xlrd < 2.0 — confirm the installed version.
    """
    workbook = xlrd.open_workbook(src_file)
    table = workbook.sheet_by_index(0)
    # newline="" lets the csv module control line endings itself; without it
    # every row is followed by a blank line on Windows.
    with open(tag_file, "w", encoding="utf-8", newline="") as f:
        write = csv.writer(f)
        for row_num in tqdm(range(table.nrows)):
            write.writerow(table.row_values(row_num))
    print("生成csv文件，查看行数，列数")
    get_rows_cols(tag_file)


# Convert an Excel workbook to CSV with pandas.
@func_time
def pandas_xlsx_to_csv(src_file, tag_file):
    """Load *src_file* with ``pd.read_excel`` and save it to *tag_file* as CSV.

    Prints the row and column counts of the loaded sheet, writes the CSV
    without the index column, then prints the counts of the generated file
    via ``get_rows_cols``.
    """
    frame = pd.read_excel(src_file)
    n_rows, n_cols = frame.shape
    print("行数:", n_rows)
    print("列数:", n_cols)
    frame.to_csv(tag_file, index=False)
    print("生成csv文件中")
    get_rows_cols(tag_file)


# Duplicate selected columns and append the copies as new columns.
@func_time
def add_cols(src_file, tag_file, copies=None):
    """Widen a CSV by appending numbered copies of existing columns.

    For every ``column -> count`` pair in *copies*, the columns
    ``<column>0 … <column>{count-1}`` are added, each holding the same values
    as ``column``. The result is written to *tag_file* without the index.

    Args:
        src_file: Path of the CSV file to read.
        tag_file: Path of the CSV file to write.
        copies: Mapping of source column name to number of copies. Defaults
            to ``{"content": 4, "friends_count": 467}``.

    Raises:
        KeyError: If a column named in *copies* is not present in the file.
    """
    if copies is None:
        copies = {"content": 4, "friends_count": 467}

    src_rd = pd.read_csv(src_file)
    print("列数")
    print(src_rd.shape[1])
    print("行数")
    print(src_rd.shape[0])
    # Preview the values of the first column that is going to be duplicated.
    for column in list(copies)[:1]:
        print(src_rd[column].values)
    print("增加列")

    for column, count in copies.items():
        values = src_rd[column].values
        fresh = {}
        for i in tqdm(range(count)):
            col_name = str(column) + str(i)
            if col_name in src_rd.columns:
                # An existing column of that name is overwritten in place,
                # exactly like a plain ``frame[name] = values`` assignment.
                src_rd[col_name] = values
            else:
                fresh[col_name] = values
        if fresh:
            # Attach all new columns in one concat; inserting hundreds of
            # columns one by one fragments the frame and is slow.
            src_rd = pd.concat(
                [src_rd, pd.DataFrame(fresh, index=src_rd.index)], axis=1
            )
    src_rd.to_csv(tag_file, index=False)

# Grow a CSV to a requested number of rows by repeating its leading rows.
@func_time
def add_rows(src_file, tag_file, rows_number):
    """Write a CSV with *rows_number* data rows built by repeating *src_file*.

    The output is assembled in batches of at most 50000 rows (a partial batch
    first, then the full ones); each batch is filled with rows taken from the
    top of the source. When the source has fewer rows than a batch needs, the
    source is repeated until the batch is full, so the output always has
    exactly *rows_number* rows. If the source has no data rows, or
    *rows_number* is 0, only the header is written.

    Args:
        src_file: Path of the CSV file to read.
        tag_file: Path of the CSV file to write (index column omitted).
        rows_number: Number of data rows the output should contain.

    Raises:
        ValueError: If *rows_number* is negative.
    """
    if rows_number < 0:
        raise ValueError("rows_number must be non-negative")

    src_rd = pd.read_csv(src_file, low_memory=False)
    batch = 50000
    full_batches, remainder = divmod(rows_number, batch)
    row_list = ([remainder] if remainder else []) + [batch] * full_batches
    print("增加的行数:")
    print(row_list)

    src_len = len(src_rd)
    pieces = []
    for size in tqdm(row_list):
        if src_len == 0:
            # Nothing to repeat; fall through to the header-only output.
            continue
        whole, rest = divmod(size, src_len)
        pieces.extend([src_rd] * whole)
        if rest:
            pieces.append(src_rd.iloc[:rest])

    # pd.concat rejects an empty list, so fall back to an empty slice that
    # still carries the column headers.
    tag_rd = pd.concat(pieces) if pieces else src_rd.iloc[0:0]
    tag_rd.to_csv(tag_file, index=False)


# Read a JSON Lines file and print details of each record.
@func_time
def readJsonl(file):
    """Print the ``description`` field, length and type of every record.

    Args:
        file: Path of the UTF-8 encoded JSON Lines file to read.

    Raises:
        KeyError: If a record has no ``'description'`` key.
    """
    # Open read-only: nothing is written, and "r+" would needlessly fail on
    # files without write permission.
    with open(file, "r", encoding="utf-8") as f:
        for item in jsonlines.Reader(f):
            print(item['description'])
            print(len(item))
            print(type(item))


# Append one record to a JSON Lines file.
@func_time
def writeJsonl(file, data):
    """Open *file* in append mode with jsonlines and write *data* to it."""
    with jsonlines.open(file, mode='a') as writer:
        writer.write(data)


# Convert a JSON Lines file into a CSV file.
@func_time
def jsonl_to_csv(src_file=None, tag_file=None):
    """Parse one record per line from *src_file* and save them as CSV.

    Each non-blank line is parsed as JSON; lines that are not valid JSON are
    parsed as a Python literal (e.g. single-quoted dicts). The records are
    loaded into a DataFrame and written to *tag_file* without the index.

    Args:
        src_file: Path of the input file. Defaults to
            ``../Files/yiqing.json`` relative to this script.
        tag_file: Path of the CSV to write. Defaults to
            ``../Files/yiqing.csv`` relative to this script.

    Raises:
        ValueError, SyntaxError: If a line is neither valid JSON nor a valid
            Python literal.
    """
    if src_file is None:
        src_file = os.path.join(cur_path, "../Files/yiqing.json")
    if tag_file is None:
        tag_file = os.path.join(cur_path, "../Files/yiqing.csv")

    content_list = []
    with open(src_file, 'r', encoding="utf-8") as file_handler:
        for line in file_handler:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # SECURITY: this used to be eval(line), which executes any
                # code found in the file. literal_eval accepts the same
                # Python-literal records without running code.
                record = ast.literal_eval(line)
            content_list.append(record)

    df = pd.DataFrame(content_list)
    df.to_csv(tag_file, index=False)


if __name__ == "__main__":
    # Input CSV and generated output CSV, both located relative to this script.
    source_csv = os.path.join(cur_path, "../dataFiles/yy1.csv")
    target_csv = os.path.join(cur_path, "../dataFiles/yy0.csv")

    # Build a 500000-row file from the source, then report its dimensions.
    add_rows(source_csv, target_csv, 500000)
    get_rows_cols(target_csv)

    # Other usage examples, kept disabled:
    # yiqing = os.path.join(cur_path, "../Files/yy0.csv")
    # jsonl_to_csv()
    # test_file = os.path.join(cur_path, "../Files/100.jsonl")
    # readJsonl(test_file)
    # chunk(file_name, tag_name, 100000)
    # add_cols(source_csv, target_csv)
    #
    # src_xlsx = parent_path + "/dataFiles/xxx.xlsx"
    # tag_csv = parent_path + "/dataFiles/yyy.csv"
    # xlsx_to_csv(src_xlsx, tag_csv)
    # get_rows_cols(tag_csv)

走走又停停，人生路上皆风景

关注

0
点赞
踩
4

收藏

觉得还不错? 一键收藏
打赏
0
评论
通过pandas 处理CSV文件，增加行，增加列，转json list

通过pandas 处理CSV文件，增加行，增加列，转json list在网上找到了一切小技巧，让无for循环有进度条；添加一个程序运行时间记录的计时器等import csvimport osimport timefrom decimal import Decimalimport pandas as pdimport xlrdfrom tqdm import tqdmimport jsonlinescur_path = os.path.dirname(os.path.realpath(
复制链接

扫一扫