python数据分析 第二章
CSV文件
#!//Users/chenbryant/anaconda3/bin/python
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

# Open the input for reading and the output for writing; both close automatically.
with open(input_file, 'r', newline='') as filereader:
    with open(output_file, 'w', newline='') as filewriter:
        # The first line is the header row: strip the newline, split on commas.
        header = filereader.readline().strip()
        header_list = header.split(',')
        print(header_list)
        filewriter.write(','.join(map(str, header_list)) + '\n')
        # Every remaining line is a data row; copy it through the same way.
        for row in filereader:
            row_list = row.strip().split(',')
            print(row_list)
            filewriter.write(','.join(map(str, row_list)) + '\n')
#pdf52页
# pandas
#!//Users/chenbryant/anaconda3/bin/python
"""Copy a CSV file with pandas: read it into a DataFrame, then write it back out."""
import sys
import pandas as pd

input_file = sys.argv[1]
# FIX: variable was misspelled 'ouput_file' in the original.
output_file = sys.argv[2]

data_frame = pd.read_csv(input_file)
print(data_frame)
# index=False keeps pandas from emitting the row index as an extra column.
data_frame.to_csv(output_file, index=False)
使用csv模块
#!//Users/chenbryant/anaconda3/bin/python
import csv
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

with open(input_file, 'r', newline='') as csv_in_file:
    with open(output_file, 'w', newline='') as csv_out_file:
        # delimiter=',' is already the default; shown explicitly for clarity.
        reader = csv.reader(csv_in_file, delimiter=',')
        writer = csv.writer(csv_out_file, delimiter=',')
        # Copy every row (header included) straight through.
        for record in reader:
            print(record)
            writer.writerow(record)
筛选特定的行
行中的值满足某个条件
# 基础python
#!/users/chenbryant/anaconda3/bin/python
"""Keep only rows where the supplier is 'Supplier Z' or the cost exceeds $600."""
import csv
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

with open(input_file, 'r', newline='') as csv_in_file:
    # BUG FIX: newline must be '' (empty string), not ' ', per the csv module docs.
    with open(output_file, 'w', newline='') as csv_out_file:
        filereader = csv.reader(csv_in_file)
        filewriter = csv.writer(csv_out_file)
        header = next(filereader)  # read the header row from the input
        filewriter.writerow(header)  # write the header to the output
        for row_list in filereader:
            supplier = str(row_list[0]).strip()
            # Strip the leading '$' and remove thousands separators.
            # BUG FIX: commas must be replaced with '' (not ' '),
            # otherwise float() raises ValueError on values like '1,234.56'.
            cost = str(row_list[3]).strip('$').replace(',', '')
            if supplier == 'Supplier Z' or float(cost) > 600.0:
                filewriter.writerow(row_list)
# pandas
#!/Users/chenbryant/anaconda3/bin/python
"""pandas version: keep rows where Supplier Name contains 'Z' or Cost > 600.0."""
import sys
import pandas as pd

input_file = sys.argv[1]
output_file = sys.argv[2]

data_frame = pd.read_csv(input_file)
# FIX: also remove thousands separators before converting to float,
# mirroring the comma removal in the plain-Python version; otherwise
# astype(float) fails on values like '$1,234.56'.
data_frame['Cost'] = data_frame['Cost'].str.strip('$').str.replace(',', '').astype(float)
# Condition: Supplier Name contains 'Z' OR Cost > 600.0; keep all columns.
data_frame_value_meets_condition = data_frame.loc[
    (data_frame['Supplier Name'].str.contains('Z')) | (data_frame['Cost'] > 600.0), :]
data_frame_value_meets_condition.to_csv(output_file, index=False)
行中的值属于某个集合
# 基础python
#!//Users/chenbryant/anaconda3/bin/python
import csv
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

# Only rows whose purchase date (column 4) is in this list are kept.
important_dates = ['1/20/14', '1/30/14']

with open(input_file, 'r', newline='') as csv_in_file:
    with open(output_file, 'w', newline='') as csv_out_file:
        reader = csv.reader(csv_in_file)
        writer = csv.writer(csv_out_file)
        writer.writerow(next(reader))  # pass the header row straight through
        for record in reader:
            if record[4] in important_dates:
                writer.writerow(record)
# pandas
#!//Users/chenbryant/anaconda3/bin/python
"""pandas version: keep rows whose Purchase Date is one of the important dates."""
import sys
import pandas as pd

input_file = sys.argv[1]
output_file = sys.argv[2]

data_frame = pd.read_csv(input_file)
# BUG FIX: the original list was missing a comma ('1/20/14' '1/30/14'),
# which Python silently concatenates into the single string '1/20/141/30/14',
# so no row would ever match.
important_dates = ['1/20/14', '1/30/14']
data_frame_value_in_set = data_frame.loc[data_frame['Purchase Date'].isin(important_dates), :]
data_frame_value_in_set.to_csv(output_file, index=False)
行中的值匹配于某个模式/正则表达式
# 基础python
#!//Users/chenbryant/anaconda3/bin/python
import csv
import re
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

# '^' anchors the match at the start of the string; re.I makes it case-insensitive.
pattern = re.compile(r'(?P<my_pattern_group>^001-.*)', re.I)

with open(input_file, 'r', newline='') as csv_in_file:
    with open(output_file, 'w', newline='') as csv_out_file:
        reader = csv.reader(csv_in_file)
        writer = csv.writer(csv_out_file)
        writer.writerow(next(reader))  # copy the header row through unchanged
        for record in reader:
            # Column 1 holds the invoice number; keep rows matching '001-...'.
            if pattern.search(record[1]):
                writer.writerow(record)
# pandas
#!//Users/chenbryant/anaconda3/bin/python
import sys
import pandas as pd

input_file = sys.argv[1]
output_file = sys.argv[2]

frame = pd.read_csv(input_file)
# Keep all columns of every row whose Invoice Number begins with "001-".
starts_with_prefix = frame['Invoice Number'].str.startswith("001-")
frame.loc[starts_with_prefix, :].to_csv(output_file, index=False)
选取连续的行
# 基础python
#!/Users/chenbryant/anaconda3/bin/python
import csv
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

row_counter = 0
with open(input_file, 'r', newline='') as csv_in_file:
    with open(output_file, 'w', newline='') as csv_out_file:
        reader = csv.reader(csv_in_file)
        writer = csv.writer(csv_out_file)
        for record in reader:
            # Copy only rows 3 through 15 (0-based) to the output file.
            if 3 <= row_counter <= 15:
                writer.writerow([field.strip() for field in record])
            row_counter += 1
            print(row_counter, [field.strip() for field in record])
# pandas
#!//Users/chenbryant/anaconda3/bin/python
import sys
import pandas as pd

input_file = sys.argv[1]
output_file = sys.argv[2]

frame = pd.read_csv(input_file, header=None)
# Discard the first three and the last three rows of the raw file.
frame = frame.drop([0, 1, 2, 16, 17, 18])
# The first remaining row holds the real column names; promote it to header.
frame.columns = frame.iloc[0]
# reindex rebuilds the index without label 3 (the row just promoted to header).
frame = frame.reindex(frame.index.drop(3))
frame.to_csv(output_file, index=False)
iloc[]函数的作用
iloc函数属于pandas库,全称为integer location,即基于整数位置对数据进行索引,从而在数据表中提取出相应的数据。
添加标题行
# 基础python
#!//Users/chenbryant/anaconda3/bin/python
"""Prepend a header row to a headerless CSV file, copying all data rows."""
import csv
import sys

input_file = sys.argv[1]
output_file = sys.argv[2]

with open(input_file, 'r', newline='') as csv_in_file:
    with open(output_file, 'w', newline='') as csv_out_file:
        filereader = csv.reader(csv_in_file)
        filewriter = csv.writer(csv_out_file)
        # TYPO FIX: 'Spart Number' -> 'Part Number'
        # (now consistent with the pandas version of this script).
        header_list = ['Supplier Name', 'Invoice Number', 'Part Number', 'Cost', 'Purchase Date']
        filewriter.writerow(header_list)
        for row in filereader:
            filewriter.writerow(row)
# pandas
#!//Users/chenbryant/anaconda3/bin/python
import sys
import pandas as pd

input_file = sys.argv[1]
output_file = sys.argv[2]

# Column names to attach to the headerless input file.
header_list = ['Supplier Name', 'Invoice Number', 'Part Number', 'Cost', 'Purchase Date']
frame = pd.read_csv(input_file, header=None, names=header_list)
frame.to_csv(output_file, index=False)
读取多个csv文件
文件计数与文件中的行列计数
导入Python内置的glob和os模块,glob模块可以定位匹配于某个特定模式的所有路径。模式中可以包含Unixshell风格的通配符,比如*。在本例中,要搜索的模式是‘sales_*’。这个模式表示要搜索所有文件名以sales_开头并且下划线后可以是任何字符的文件。
以后可能会用到找出一个文件夹下所有的CSV文件,而不是仅以sales_开头的文件,那么可以把脚本中的‘sales_*’改成‘*.csv’。
os模块包含了用于解析路径名的函数。例如,os.path.basename(path)返回path的基本文件名。
#!//Users/chenbryant/anaconda3/bin/python
"""For every sales_* file in a folder, report its row and column counts."""
import csv
import glob
import os
import sys

input_path = sys.argv[1]

file_counter = 0
for input_file in glob.glob(os.path.join(input_path, 'sales_*')):
    # Start at 1 so the header line is included in the row count.
    row_counter = 1
    with open(input_file, 'r', newline='') as csv_in_file:
        filereader = csv.reader(csv_in_file)
        header = next(filereader, None)
        for row in filereader:
            row_counter += 1
    # ROBUSTNESS FIX: a completely empty file yields header=None; report
    # 0 columns instead of crashing with TypeError on len(None).
    column_count = len(header) if header is not None else 0
    print('{0!s}: \t{1:d} rows \t{2:d} columns'.format(
        os.path.basename(input_file), row_counter, column_count))
    file_counter += 1
print('Number of files: {0:d}'.format(file_counter))
从多个文件中连接数据
基础python
将以下代码保存为9csv_reader_concat_rows_from_multiple_files.py:
#!/Users/chenbryant/anaconda3/bin/python
import csv
import glob
import os
import sys

input_path = sys.argv[1]
output_file = sys.argv[2]

first_file = True
for input_file in glob.glob(os.path.join(input_path, 'sales_*')):
    print(os.path.basename(input_file))
    with open(input_file, 'r', newline='') as csv_in_file:
        # 'a' appends, so rows from every input file accumulate in the output.
        with open(output_file, 'a', newline='') as csv_out_file:
            reader = csv.reader(csv_in_file)
            writer = csv.writer(csv_out_file)
            if first_file:
                # First file: copy everything, including the header row.
                for record in reader:
                    writer.writerow(record)
                first_file = False
            else:
                # Later files: consume the header, copy only the data rows.
                next(reader, None)
                for record in reader:
                    writer.writerow(record)
‘a’:以追加的方式打开输出文件,如果用’w’则从一个输入文件中输出的数据会覆盖掉前一个输入文件输出的数据,最后的输出文件会只包含最后处理的那个输入文件的数据。
16行的if语句根据第9行创建的first_file变量来区分当前文件是第一个输入文件,还是其后的输入文件。next方法可以将每个文件中的标题行赋给一个变量(从而跳过后面处理的文件中的标题行)。
chmod后运行如下脚本:
./9csv_reader_concat_rows_from_multiple_files.py "/Users/chenbryant/Desktop" 9output.csv
pandas
将下列代码保存为pandas_concat_rows_from_multiple_files.py
#!/Users/chenbryant/anaconda3/bin/python
import glob
import os
import sys
import pandas as pd

input_path = sys.argv[1]
output_file = sys.argv[2]

all_files = glob.glob(os.path.join(input_path, 'sales_*'))

# Read each file into a DataFrame, collect them all, then concatenate once.
frames = []
for file in all_files:
    frames.append(pd.read_csv(file, index_col=None))
# axis=0 stacks the frames vertically (one on top of another);
# axis=1 would place them side by side instead.
combined = pd.concat(frames, axis=0, ignore_index=True)
combined.to_csv(output_file, index=False)
除了以上方法,python中另一个内置模块NumPy也提供了若干函数来垂直或平行连接数据,通常是将numpy导入为np,然后要垂直连接数据,使用np.concatenate([array1, array2], axis=0)、np.vstack((array1, array2))或np.r_[array1, array2]。同样,要平行连接数据,你可以使用np.concatenate([array1, array2], axis=1)、np.hstack((array1, array2))或np.c_[array1, array2]
要运行这个脚本,chmod后在命令行中输入:
./pandas_concat_rows_from_multiple_files.py "/Users/chenbryant/Desktop" output_files/pandas_output.csv
2.8 计算每个文件中值的总和与均值
1.基础python
将下列代码保存为:10csv_reader_sum_average_from_multiple_files.py
#!/Users/chenbryant/anaconda3/bin/python
"""Write one summary row (file name, total sales, average sales) per sales_* file."""
import csv
import glob
import os
import sys

input_path = sys.argv[1]
output_file = sys.argv[2]

# Header for the summary output file.
output_header_list = ['file_name', 'total_sales', 'average_sales']
# FIX: manage the output file with a context manager so it is closed even
# if an exception occurs mid-run ('a' keeps the original append semantics).
with open(output_file, 'a', newline='') as csv_out_file:
    filewriter = csv.writer(csv_out_file)
    filewriter.writerow(output_header_list)
    for input_file in glob.glob(os.path.join(input_path, 'sales_*')):
        with open(input_file, 'r', newline='') as csv_in_file:
            filereader = csv.reader(csv_in_file)
            output_list = [os.path.basename(input_file)]
            header = next(filereader)  # skip the input file's header row
            total_sales = 0.0
            number_of_sales = 0.0
            for row in filereader:
                # Column 3 holds the sale amount, e.g. '$1,234.56'.
                sale_amount = row[3]
                total_sales += float(str(sale_amount).strip('$').replace(',', ''))
                number_of_sales += 1
            # ROBUSTNESS FIX: avoid ZeroDivisionError for a file with no data rows.
            if number_of_sales:
                average_sales = '{0: .2f}'.format(total_sales / number_of_sales)
            else:
                average_sales = '{0: .2f}'.format(0.0)
            output_list.append(total_sales)
            output_list.append(average_sales)
            filewriter.writerow(output_list)
要运行这个脚本,在命令行输入以下命令:
./10csv_reader_sum_average_from_multiple_files.py "/Users/chenbryant/Desktop" output_files/10output.csv
2.pandas
将下列代码保存为pandas_sum_average_from_multiple_files.py:
#!/Users/chenbryant/anaconda3/bin/python
import glob
import os
import sys
import pandas as pd

input_path = sys.argv[1]
output_file = sys.argv[2]

all_files = glob.glob(os.path.join(input_path, 'sales_*'))

summaries = []
for input_file in all_files:
    frame = pd.read_csv(input_file, index_col=None)
    # Convert '$1,234.56'-style strings to floats before aggregating.
    amounts = [float(str(value).strip('$').replace(',', ''))
               for value in frame.loc[:, 'Sale Amount']]
    total_cost = pd.DataFrame(amounts).sum()      # sum of all sale amounts
    average_cost = pd.DataFrame(amounts).mean()   # mean of all sale amounts
    data = {'file_name': os.path.basename(input_file),
            'total_sales': total_cost,
            'average_sales': average_cost}
    summaries.append(pd.DataFrame(data, columns=['file_name', 'total_sales', 'average_sales']))
combined = pd.concat(summaries, axis=0, ignore_index=True)
combined.to_csv(output_file, index=False)
要运行这个脚本,在命令行输入:
./pandas_sum_average_from_multiple_files.py "/Users/chenbryant/Desktop" output_files/pandas_output.csv