获取到的数据
得到的数据
导入的包
import numpy as np
import pandas as pd
import os
import glob
原始数据文件集位于 ./DataSet/20230101-20230506 目录下。
读取该目录下的所有 csv 文件,合并写入 ./DataSet/2023_china_cities.csv,然后只读取需要的列:'date'、'hour'、'type'、'太原'。
# Merge every raw CSV found under DataSet/20230101-20230506 into a single file,
# then load only the columns needed for the '太原' series.
csv_list = glob.glob('DataSet/20230101-20230506/*csv')
print('共发现%s个csv文件'% len(csv_list))

count = 0  # stays 0 when no file is found
for count, csv_path in enumerate(csv_list, start=1):
    print(count)
    # Read through a context manager so every input handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(csv_path, 'r', encoding='utf-8') as source_file:
        raw_text = source_file.read()
    # NOTE: append mode is kept from the original, so re-running this step
    # appends the same rows again to an already existing merged file.
    with open('DataSet/2023_china_cities.csv', 'a', encoding='UTF-8') as merged_file:
        merged_file.write(raw_text)

# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines='skip'` is the documented replacement (requires pandas >= 1.3)
# and keeps the same "silently drop malformed lines" behaviour.
data = pd.read_csv(
    './DataSet/2023_china_cities.csv',
    engine='python',
    on_bad_lines='skip',
    usecols=['date', 'hour', 'type', '太原'],
)
去重、去除空值,并将'太原'列的数据类型由 object 转换为 float。
# Clean the raw frame in one chained pass:
#   1. discard duplicated rows,
#   2. discard every row that contains a missing value,
#   3. cast the '太原' column to float.
df1 = (
    data
    .drop_duplicates()
    .dropna(axis=0)
    .astype({'太原': float})
)
按日期(date)和类型(type)分组求平均,并重置索引(date 保留为普通列)。后续会将日期由 object 转换为 datetime。
# The input holds hourly readings; averaging inside each (date, type) group
# yields one daily mean per measurement type.
df2 = df1.groupby(by=['date', 'type']).mean()
df2  # notebook-style display of the daily means; has no effect when run as a script

# Turn the ('date', 'type') group keys back into ordinary columns. 'date' has
# to remain a column because the per-pollutant selection further down picks it
# by name.
df3 = df2.reset_index()
# The previous code called `df3.set_index('date')` without keeping the result
# and then `sort_index`, which therefore only sorted the default RangeIndex
# and did nothing. Order the rows explicitly by their columns instead.
df3 = df3.sort_values(by=['date', 'type'], ignore_index=True)
按 type 列的取值(CO、NO2、O3、PM10、PM2.5、SO2)逐一筛选数据,以 date 为行索引,并将'太原'列改名为对应的污染物名称,最后按列合并为一张表。
def _pollutant_column(frame, pollutant, city='太原'):
    """Return the daily values of one pollutant as a single-column frame.

    Args:
        frame: table with at least the columns 'date', 'type' and *city*.
        pollutant: value of the 'type' column to keep; also used as the name
            of the resulting column.
        city: name of the column holding the measurements.

    Returns:
        A DataFrame indexed by 'date' with one column named *pollutant*,
        rounded to 3 decimal places.
    """
    selected = frame.loc[frame['type'] == pollutant, ['date', city]]
    return selected.set_index('date').rename(columns={city: pollutant}).round(3)


# One single-column frame per pollutant; the module-level names are kept so
# that any later code relying on them continues to work.
dfCO = _pollutant_column(df3, 'CO')
dfNO2 = _pollutant_column(df3, 'NO2')
dfO3 = _pollutant_column(df3, 'O3')
dfPM10 = _pollutant_column(df3, 'PM10')
dfPM25 = _pollutant_column(df3, 'PM2.5')
dfSO2 = _pollutant_column(df3, 'SO2')

# Align all pollutants side by side on the shared 'date' index.
# The first concat used to start from `df3test`, a name that is not defined
# anywhere in this script (NameError); the table is now built from the
# per-pollutant frames only.
df4 = pd.concat(
    [dfCO, dfNO2, dfO3, dfPM10, dfPM25, dfSO2],
    ignore_index=False,
    axis=1,
)
将处理结果输出到 DataSet.csv 文件中。
# Write the result table to a CSV file. The comma separator, the index column
# and the header row are all `to_csv` defaults, so they need not be spelled out.
outputpath = 'DataSet.csv'
df4.to_csv(outputpath)
后续每处理一个年份,都需要先读取 DataSet.csv 中已有的数据,再与新处理好的数据进行 concat,最终得到总数据,共 3256 条。