在网上找到了一些小技巧:让无for循环有进度条;添加一个程序运行时间记录的计时器等。
import ast
import csv
import functools
import json
import os
import time
from decimal import Decimal

import jsonlines
import pandas as pd
import xlrd
from tqdm import tqdm
# Directory that contains this script, with symlinks resolved.
cur_path = os.path.split(os.path.realpath(__file__))[0]
# Directory one level above the script's directory.
parent_path = os.path.split(cur_path)[0]
def func_time(func):
    """Decorator that prints the start time, end time and duration of a call.

    The wrapped callable is invoked with the caller's positional and keyword
    arguments, and its return value is handed back to the caller.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that accepts the same arguments as ``func``.
    """
    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def inner(*args, **kw):
        start_time = time.time()
        print("{}:{}".format(
            "程序开始时间",
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_time))))
        result = func(*args, **kw)
        end_time = time.time()
        print("{}:{}".format(
            "程序结束时间",
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(end_time))))
        # Show the elapsed seconds rounded to three decimal places.
        cost_time = Decimal(end_time - start_time).quantize(Decimal("0.000"))
        print('程序运行时间为:', cost_time, 's')
        # Propagate the result so decorating a function does not change
        # what its callers receive.
        return result
    return inner
@func_time
def get_rows_cols(file_name, chunk_size=1000):
    """Print the number of data rows and columns of a CSV file.

    The file is read ``chunk_size`` rows at a time, and a tqdm progress
    counter advances once per chunk.

    Args:
        file_name: Path of the CSV file to inspect.
        chunk_size: Number of rows to read per chunk.
    """
    reader = pd.read_csv(file_name, chunksize=chunk_size, low_memory=False)
    count = 0
    col_count = None
    try:
        # Iterate until the reader is exhausted rather than for a fixed
        # number of chunks, so files of any length are counted in full.
        for chunk in tqdm(reader):
            count += chunk.shape[0]
            if col_count is None:
                col_count = chunk.shape[1]
    finally:
        # Release the underlying file handle even if parsing fails.
        reader.close()
    if col_count is None:
        # No chunk was produced; take the column count from the header only.
        col_count = pd.read_csv(file_name, nrows=0).shape[1]
    print("行数:", count)
    print("列数:", col_count)
@func_time
def xlsx_to_csv(src_file, tag_file):
    """Convert the first sheet of an Excel workbook to a UTF-8 CSV file.

    The workbook is read with xlrd and written row by row with the csv
    module; afterwards the row and column counts of the generated file are
    printed via ``get_rows_cols``.

    Args:
        src_file: Path of the Excel workbook to read.
        tag_file: Path of the CSV file to create (overwritten if present).
    """
    workbook = xlrd.open_workbook(src_file)
    table = workbook.sheet_by_index(0)
    # newline="" leaves line endings to the csv writer; without it, platforms
    # that translate "\n" to "\r\n" end up with a blank line after every row.
    with open(tag_file, "w", encoding="utf-8", newline="") as f:
        write = csv.writer(f)
        for row_num in tqdm(range(table.nrows)):
            write.writerow(table.row_values(row_num))
    # The file is closed at this point, so the count sees every written row.
    print("生成csv文件,查看行数,列数")
    get_rows_cols(tag_file)
@func_time
def pandas_xlsx_to_csv(src_file, tag_file):
    """Convert an Excel file to CSV with pandas and report its dimensions.

    Prints the row and column counts of the loaded sheet, writes it to
    ``tag_file`` without the index column, then prints the counts of the
    generated CSV via ``get_rows_cols``.

    Args:
        src_file: Path of the Excel file to read.
        tag_file: Path of the CSV file to write.
    """
    frame = pd.read_excel(src_file)
    n_rows, n_cols = frame.shape
    print("行数:", n_rows)
    print("列数:", n_cols)
    frame.to_csv(tag_file, index=False)
    print("生成csv文件中")
    get_rows_cols(tag_file)
@func_time
def add_cols(src_file, tag_file, content_copies=4, friends_count_copies=467):
    """Widen a CSV file by appending copies of two of its columns.

    The source must contain the columns ``content`` and ``friends_count``.
    Each copy is named after its source column plus a running index
    (``content0``, ``content1``, ... then ``friends_count0``, ...).

    Args:
        src_file: Path of the CSV file to read.
        tag_file: Path of the CSV file to write (without the index column).
        content_copies: Number of copies of ``content`` to append.
        friends_count_copies: Number of copies of ``friends_count`` to append.

    Raises:
        KeyError: If either source column is missing.
    """
    src_rd = pd.read_csv(src_file)
    print("列数")
    src_cols = src_rd.shape[1]
    print(src_cols)
    print("行数")
    src_rows = src_rd.shape[0]
    print(src_rows)
    print(src_rd["content"].values)
    print("增加列")
    plan = (("content", content_copies),
            ("friends_count", friends_count_copies))
    for column, copies in plan:
        # Fetch the source values once per column instead of once per copy.
        values = src_rd[column].values
        for i in tqdm(range(copies)):
            src_rd[column + str(i)] = values
    src_rd.to_csv(tag_file, index=False)
@func_time
def add_rows(src_file, tag_file, rows_number, block_size=50000):
    """Build a CSV with ``rows_number`` rows by repeating leading source rows.

    The output is assembled from slices of the source that each start at
    row 0: one slice for the remainder (if any) followed by full blocks.

    Args:
        src_file: Path of the CSV file to read.
        tag_file: Path of the CSV file to write (without the index column).
        rows_number: Number of data rows wanted in the output.
        block_size: Maximum number of rows taken per slice.

    Raises:
        ValueError: If ``rows_number`` is negative or ``block_size`` is not
            positive.
    """
    if rows_number < 0:
        raise ValueError(
            "rows_number must be non-negative, got {}".format(rows_number))
    if block_size <= 0:
        raise ValueError(
            "block_size must be positive, got {}".format(block_size))
    src_rd = pd.read_csv(src_file, low_memory=False)
    # A slice can never return more rows than the source holds, so cap the
    # block at the source length; otherwise a short source would silently
    # produce fewer rows than requested.
    block = min(block_size, len(src_rd))
    if block == 0 or rows_number == 0:
        # Nothing to copy: empty source or zero rows requested.
        row_list = []
    else:
        remainder = rows_number % block
        row_list = ([remainder] if remainder else []) \
            + (rows_number // block) * [block]
    print("增加的行数:")
    print(row_list)
    pieces = [src_rd.iloc[0:row] for row in tqdm(row_list)]
    # pd.concat raises on an empty list; write a header-only file instead.
    tag_rd = pd.concat(pieces) if pieces else src_rd.iloc[0:0]
    tag_rd.to_csv(tag_file, index=False)
@func_time
def readJsonl(file):
    """Print details of every record in a JSON Lines file.

    For each record this prints its ``'description'`` entry, its length and
    its type.

    Args:
        file: Path of the UTF-8 encoded JSON Lines file to read.
    """
    # Open read-only: nothing is written here, and "r+" would needlessly
    # require write permission on the file.
    with open(file, "r", encoding="utf-8") as f:
        for item in jsonlines.Reader(f):
            print(item['description'])
            print(len(item))
            print(type(item))
@func_time
def writeJsonl(file, data):
    """Append ``data`` as one record to a JSON Lines file.

    Args:
        file: Path of the JSON Lines file, opened in append mode.
        data: The object to write as a single record.
    """
    with jsonlines.open(file, mode='a') as writer:
        writer.write(data)
def _parse_record(line):
    """Parse one text line as JSON, falling back to a Python literal."""
    try:
        return json.loads(line)
    except ValueError:
        # Not strict JSON (for example single-quoted keys); accept Python
        # literal syntax without executing arbitrary code.
        return ast.literal_eval(line)


@func_time
def jsonl_to_csv(src_file=None, tag_file=None):
    """Convert a file with one record per line into a CSV file.

    Args:
        src_file: Path of the input file; defaults to
            ``../Files/yiqing.json`` relative to this script.
        tag_file: Path of the CSV file to write; defaults to
            ``../Files/yiqing.csv`` relative to this script.

    Raises:
        ValueError, SyntaxError: If a non-blank line is neither valid JSON
            nor a valid Python literal.
    """
    if src_file is None:
        src_file = os.path.join(cur_path, "../Files/yiqing.json")
    if tag_file is None:
        tag_file = os.path.join(cur_path, "../Files/yiqing.csv")
    # NOTE(security): each line used to be passed to eval(), which would run
    # any code found in the data file; lines are now parsed as data only.
    content_list = []
    with open(src_file, 'r', encoding="utf-8") as file_handler:
        for line in file_handler:
            line = line.strip()
            if not line:
                # Skip blank lines, e.g. a trailing newline at end of file.
                continue
            content_list.append(_parse_record(line))
    df = pd.DataFrame(content_list)
    df.to_csv(tag_file, index=False)
if __name__ == '__main__':
    # Script entry point: build yy0.csv from the leading rows of yy1.csv,
    # then print the size of the generated file.
    source_csv = os.path.join(cur_path, "../dataFiles/yy1.csv")
    target_csv = os.path.join(cur_path, "../dataFiles/yy0.csv")
    add_rows(source_csv, target_csv, 500000)
    get_rows_cols(target_csv)