import pandas as pd
import os
from os import path
dir_names = [] #定义数组用于存储所有年份的文件夹名
file_names = [] #定义数组用于存储年份文件夹名下所有文件名
string = input("请输入需要筛选的数据元组:")
def select(string,file_names,file): #封装数据选择函数
# 创建DataFrame对象,将原数据传入
# \s匹配任何空白非打印字符,\S匹配任何非空白非打印字符
data = pd.read_csv("D:/python/python project/pythonProject1/file_test/data/" + str(dir_names) + "/" + str(file_names) + file, sep='\s+',
# names=['Year','Month','Day','Hour','Air Temperature','Dew Point Temperature','Sea Level Pressure','Wind Direction','Wind Speed Rate','Sky Condition Total Coverage Code','Liquid Precipitation Depth Dimension - One Hour Duration','Liquid Precipitation Depth Dimension - Six Hour Duration'],
names=['年', '月', '日', '时', '气温', '露点', '气压', '风向', '风速', '云量', '降水量_1h',
'降水量_6h'],
error_bad_lines=False, warn_bad_lines=False)
# 筛选需要的列,这里是双重方括号!
Air_Temperature = pd.DataFrame(data[['年', '月', '日', '时', string]])
# 通过sep参数设置保存文件的分隔符,将筛选的列保存为csv文件
# index参数设置为False表示不保存行索引,header设置为False表示不保存列索引
# 转换为txt文件,可以自动生成txt文件
Air_Temperature.to_csv(str(file_names) + "_" + file + string + ".txt", sep=" ", index=True, header=True)
def scaner_file_0(url):
# 遍历当前路径下所有文件
file = os.listdir(url)
for f in file:
# 字符串拼接
real_url = path.join(url, f)
dir_names.append(real_url)
print(dir_names) # 年份文件夹名
# 遍历所有年份的文件夹名
scaner_file_0("data" )
def scaner_file_1(url):
file = os.listdir(url)
for f in file:
real_url = path.join(url, f)
if path.isfile(real_url):
# 如果是文件,则以绝度路径的方式输出
print(path.abspath(real_url))
elif path.isdir(real_url):
# 如果是目录,则是递归调用自定义函数scaner_file (url)
scaner_file_1(real_url)
else:
print("其他情况")
pass
# 遍历所有年份的文件夹名以及文件夹下面的文件
# scaner_file_1("D:/python/python project/pythonProject1/file_test/data" )
def figure_data():
for name in dir_names: #逐个读取所有年份的文件夹名
real_url = path.join("data", name)
file_names = os.listdir(name) #循环逐一打开所有年份的文件夹
for file in file_names: # 读取当前年份文件夹中所有文件,保存在file
print(file)
# # 创建DataFrame对象,将原数据传入
# # \s匹配任何空白非打印字符,\S匹配任何非空白非打印字符
# data = pd.read_csv(real_url + "/" + file, sep='\s+',
# # names=['Year','Month','Day','Hour','Air Temperature','Dew Point Temperature','Sea Level Pressure','Wind Direction','Wind Speed Rate','Sky Condition Total Coverage Code','Liquid Precipitation Depth Dimension - One Hour Duration','Liquid Precipitation Depth Dimension - Six Hour Duration'],
# names=['年', '月', '日', '时', '气温', '露点', '气压', '风向', '风速', '云量','降水量_1h','降水量_6h'],
# error_bad_lines=False, warn_bad_lines=False)
# # 筛选需要的列,这里是双重方括号!
# select_data = pd.DataFrame(data[['年', '月', '日', '时', string]])
# # 通过sep参数设置保存文件的分隔符,将筛选的列保存为csv文件
# # index参数设置为False表示不保存行索引,header设置为False表示不保存列索引
# # 转换为txt文件,可以自动生成txt文件
# select_data.to_csv(str(file_names) + "_" + file + "_" + string + ".txt", sep=" ", index=False, header=True)
# print(select_data)
# select(string, file_names, file)
figure_data()
import os
import pandas as pd
import numpy as np
"""
工作原理:
输入/指定:工作路径,指定年份(时间范围),输入站点列表
过程:指定工作路径下的文件夹全部以“china_isd_lite_”+“年份”命名,
根据输入的时间范围,创建对应年份的文件夹列表,
根据输入的站点列表,创建文件名称列表,
在每个文件夹列表元素下,顺序读取文件名称列表对应文件,并完成数据写入
"""
select_station_list = [575440,575540,575580,575620,575640,575650,575660,575740,575770,575840,575850,576400,576420,576460,
576490,576550,576570,576580,576610,576620,576630,576660,576690,576710,576730,576740,576780,576790,
576800,576820,576870,576880,577400,577430,577440,577450,577520,577540,577600,577610,577610,577620,
577630,577710,577720,577730,577740,577760,577770,577780,577790,577800,577810,578450,578650,578660,
578670,578700,578710,578720,578740,578750,578760,578810,578820,578860,578870,578890,579620,579650,
579660,579690,579710,579720,579730,579740,579750,579760,579780,579810,579850,590630] # 湖南站点列表
def trans_isd2excel(station_list, work_dir=r"D:\python\python project\pythonProject1\file_test/dataSorce/china_isd_lite",
period=[2013, 2014, 2015,2016,2017,2018,2019,2010,2021,2022]): # 工作路径,指定年份(时间范围),站点列表
datadir_list = []
for i in os.listdir(work_dir):
# 判断是否是文件夹,并指定时间范围
if os.path.splitext(i)[-1] == "" and period[0] <= int(os.path.split(i)[-1][-4:]) <= period[-1]:
print(1)
# 年份文件夹 路径,如:'D:\\A_NCDC_test\\china_isd_lite_1980'
datadir_list.append(work_dir + os.sep + i)
print(datadir_list)
erro_list = []
for datadir in datadir_list:
dataname_list = []
# 文件名称形式:"D:\\A_NCDC_test\\china_isd_lite_1980\\450070-99999-1980"
for item in station_list:
dataname_list.append(datadir + os.sep + item + "-99999-" + os.path.split(datadir)[-1][-4:])
for name in dataname_list:
try:
data = pd.read_table(name, header=None)
'''
原始数据中以空格分隔的12列数据,分别为:
年、月、日、小时、温度、露点温度、气压、风向、风速、云量、1小时降雨量和6小时降雨量。
'''
# 构建空列表用于存放提取出来的各列数据
data_list = []
for line in data.values:
line_temp = [int(x) for x in line[0].split(' ') if x != '']
data_list.append(line_temp)
df = pd.DataFrame(data_list,
columns=['年', '月', '日', '小时', '温度', '露点温度', '气压', '风向', '风速', '云量',
'1小时雨量', '6小时雨量'])
# 对数据中-9999的缺失值进行NaN替换
df = df.replace(-9999, np.nan)
# 数据说明文档中表示原始数据中温度、露点温度、气压、风速、降雨量的换算系数为10,所以要对原始数据中的对应数据除以10,进行换算。
df['温度'] = df['温度'] / 10
df['露点温度'] = df['露点温度'] / 10
df['气压'] = df['气压'] / 10
df['风速'] = df['风速'] / 10
df['1小时雨量'] = df['1小时雨量'] / 10
df['6小时雨量'] = df['6小时雨量'] / 10
# 为了便于后续重采样分析数据,给数据增加一个DataFrame列
df['Date'] = pd.PeriodIndex(year=df['年'], month=df['月'], day=df['日'], hour=df['小时'], freq='H')
df = df.set_index(df['Date'])
df.drop(columns='Date', inplace=True)
# 保存为同名excel
#out_folder = f'{work_dir}_trans_isd2excel\\{os.path.split(name)[-1][-4:]}' # 输出到工作文件夹同级的 "工作文件夹名称 + _trans_isd_2excel"下
out_folder = f'{work_dir}_trans_isd2csv\\{os.path.split(name)[-1][-4:]}' # 输出到工作文件夹同级的 "工作文件夹名称 + _trans_isd_2excel"下
if not os.path.exists(out_folder):
os.makedirs(out_folder) # 如果文件夹不存在,就创建
out_file = f'{out_folder}\\{os.path.split(name)[-1]}.xlsx'
df.to_excel(out_file)
print(out_file)
except FileNotFoundError:
#print("Error: 没有找到文件或读取文件失败",name)
erro_list.append(name)
return erro_list # 返回一个列表(储存未找到的文件名称)
# def condition(path):
# for name in path:
# if name.endswith('.DS_Store'):
# os.remove(os.path.join(path, name))
# for name in path:
# if name.startswith('~$'):
# os.remove(os.path.join(path, name))
# return path
# sub_path = condition("C:/Users/34163/Desktop/dataSorce/read/isd-history.xlsx")
# stations = pd.read_excel("C:/Users/34163/Desktop/dataSorce/read/isd-history.xlsx")
stations = pd.read_csv(r"D:\python\python project\pythonProject1\file_test/dataSorce/read/isd-history.csv")
print(stations.head())
# station_list = [str(x) for x in stations["USAF"]] # 有效站点列表
station_list = [str(x) for x in select_station_list] # 有效站点列表,筛选出湖南站点
# print(station_list)
a = trans_isd2excel(station_list, work_dir=r"D:\python\python project\pythonProject1\file_test/dataSorce/china_isd_lite", period=[2013, 2014, 2015,2016,2017,2018,2019,2010,2021,2022])