这可以通过递归来完成。如果发现ZIP中的某个文件本身也是ZIP文件,则进行递归调用,以提取其中的CSV文件:
try:
from urllib.request import urlopen
except ImportError:
from urllib2 import urlopen
from io import BytesIO
import zipfile
import pandas as pd
# Dictionary holding all the dataframes from all zip/zip/csvs,
# keyed by the CSV member name as it appears inside its archive.
dfs = {}


def zip_to_dfs(data):
    """Load every CSV found in a ZIP archive into the module-level ``dfs``.

    Members whose names end in ``.csv`` (case-insensitive) are parsed with
    ``pandas.read_csv`` and stored in ``dfs`` under the member name.
    Members whose names end in ``.zip`` are read into memory and processed
    recursively, so CSVs inside nested archives are collected as well.
    All other members are ignored.

    Args:
        data: The raw bytes of a ZIP archive.

    Returns:
        None. Results accumulate in ``dfs``; a CSV whose member name is
        already present replaces the earlier entry.
    """
    # Context managers make sure the archive and each member handle are
    # closed promptly instead of being left for the garbage collector.
    with zipfile.ZipFile(BytesIO(data)) as zip_file:
        for name in zip_file.namelist():
            lowered = name.lower()
            if lowered.endswith('.csv'):
                with zip_file.open(name) as member:
                    dfs[name] = pd.read_csv(member)
            elif lowered.endswith('.zip'):
                # ZipFile.read returns the member's bytes and closes its
                # handle before we descend into the nested archive.
                zip_to_dfs(zip_file.read(name))
def get_zip_data_from_url(url):
    """Download the ZIP archive at ``url`` and load its CSVs via ``zip_to_dfs``.

    The whole response body is read into memory and then handed to
    ``zip_to_dfs``.

    Args:
        url: Address of the ZIP archive, passed unchanged to ``urlopen``.

    Returns:
        None.
    """
    req = urlopen(url)
    try:
        payload = req.read()
    finally:
        # Close explicitly: the urllib2 response object used on Python 2
        # is not a context manager, so ``with`` is not portable here.
        req.close()
    zip_to_dfs(payload)
# Archives to download; each URL is printed and then fetched in order.
final_links_list = [
    'http://www.nemweb.com.au/REPORTS/ARCHIVE/Dispatch_SCADA/PUBLIC_DISPATCHSCADA_20170523.zip',
    'http://www.nemweb.com.au/REPORTS/ARCHIVE/Dispatch_SCADA/PUBLIC_DISPATCHSCADA_20170524.zip',
]

for link in final_links_list:
    print(link)
    get_zip_data_from_url(link)

# Show the two dataframes whose names sort first. Dictionary keys are
# unique, so ordering by key gives the same order as ordering the items.
for name in sorted(dfs)[:2]:
    df = dfs[name]
    print('\n', name, '\n')
    print(df)
这将显示以下内容:
http://www.nemweb.com.au/REPORTS/ARCHIVE/Dispatch_SCADA/PUBLIC_DISPATCHSCADA_20170524.zip
PUBLIC_DISPATCHSCADA_201705240010_0000000283857084.CSV
C NEMP.WORLD DISPATCHSCADA AEMO PUBLIC 2017/05/24 \
0 I DISPATCH UNIT_SCADA 1.0 SETTLEMENTDATE DUID
1 D DISPATCH UNIT_SCADA 1.0 2017/05/24 00:10:00 BARCSF1
2 D DISPATCH UNIT_SCADA 1.0 2017/05/24 00:10:00 BUTLERSG
.. .. ... ... ... ... ...
263 D DISPATCH UNIT_SCADA 1.0 2017/05/24 00:10:00 YWPS3
264 D DISPATCH UNIT_SCADA 1.0 2017/05/24 00:10:00 YWPS4
265 C END OF REPORT 267 NaN NaN NaN
00:05:08 0000000283857084 DISPATCHSCADA.1 0000000283857078
0 SCADAVALUE NaN NaN NaN
1 0 NaN NaN NaN
2 8.299998 NaN NaN NaN
.. ... ... ... ...
263 388.745570 NaN NaN NaN
264 391.568360 NaN NaN NaN
265 NaN NaN NaN NaN
[266 rows x 10 columns]
PUBLIC_DISPATCHSCADA_201705240015_0000000283857169.CSV
C NEMP.WORLD DISPATCHSCADA AEMO PUBLIC 2017/05/24 \
0 I DISPATCH UNIT_SCADA 1.0 SETTLEMENTDATE DUID
1 D DISPATCH UNIT_SCADA 1.0 2017/05/24 00:15:00 BARCSF1
2 D DISPATCH UNIT_SCADA 1.0 2017/05/24 00:15:00 BUTLERSG
.. .. ... ... ... ... ...
263 D DISPATCH UNIT_SCADA 1.0 2017/05/24 00:15:00 YWPS3
264 D DISPATCH UNIT_SCADA 1.0 2017/05/24 00:15:00 YWPS4
265 C END OF REPORT 267 NaN NaN NaN
00:10:08 0000000283857169 DISPATCHSCADA.1 0000000283857163
0 SCADAVALUE NaN NaN NaN
1 0 NaN NaN NaN
2 8.299998 NaN NaN NaN
.. ... ... ... ...
263 386.205080 NaN NaN NaN
264 389.592410 NaN NaN NaN
265 NaN NaN NaN NaN
[266 rows x 10 columns]