Python数据分析第五章
5.1 在一个大文件集合中查找一组项目
- 在桌面新建一个名为file_archive的文件夹,
打开Excel输入以下数据:
supplies_2012.csv
Item | Description | Supplier | Cost | Date |
---|---|---|---|---|
1234 | Widget 1 | Supplier A | $1,100.00 | 6/2/2012 |
Widget 1 Service | Supplier A | $600.00 | 6/3/2012 | |
2345 | Widget 2 | Supplier A | $2,300.00 | 6/17/2012 |
Widget 2 Maintenance | Supplier A | $1,000.00 | 6/30/2012 | |
3456 | Widget 3 | Supplier B | $950.00 | 7/3/2012 |
4567 | Widget 4 | Supplier B | $1,300.00 | 7/4/2012 |
5678 | Widget 5 | Supplier B | $1,050.00 | 7/11/2012 |
Widget 5 Service | Supplier B | $550.00 | 7/15/2012 | |
6789 | Widget 6 | Supplier C | $1,175.00 | 7/23/2012 |
7890 | Widget 7 | Supplier C | $1,200.00 | 7/27/2012 |
- 将文件保存在file_archive文件夹中
- 在工作表supplies_2012中,将Date列中的日期由2012改成2013.
将工作表的名称改为supplies_2013。为使中国文件成为一个具有多个工作表的工作簿,下面再添加一个新的工作表。将新的工作表命名为supplies_2014。 - 将Excel文件保存在file_archive文件夹中,命名为supplies.xls。
- 将下列数据保存为item_numbers_to_find.csv.
A | B | C | D | E | F | G |
---|---|---|---|---|---|---|
1234 | ||||||
2345 | ||||||
4567 | ||||||
6789 | ||||||
7890 |
在文本编辑器中输入下列代码,然后将文件保存为1search_for_items_write_found.py:
#!/Users/chenbryant/anaconda3/bin/python
import csv
import glob
import os
import sys
from datetime import date
from xlrd import open_workbook,xldate_as_tuple
item_numbers_file = sys.argv[1]
path_to_folder = sys.argv[2]
output_file = sys.argv[3]
item_numbers_to_find = []
with open(item_numbers_file, 'r', newline='') as item_numbers_csv_file:
filereader = csv.reader(item_numbers_csv_file)
for row in filereader:
item_numbers_to_find.append(row[0])
filewriter = csv.writer(open(output_file, 'a', newline=''))
file_counter = 0
line_counter = 0
count_of_item_numbers = 0
for input_file in glob.glob(os.path.join(path_to_folder, '*.*')):
file_counter += 1
if input_file.split('.')[1] == 'csv':
with open(input_file, 'r', newline='') as csv_in_file:
filereader = csv.reader(csv_in_file)
header = next(filereader)
for row in filereader:
row_of_output = [ ]
for column in range(len(header)):
if column == 3:
cell_value = str(row[column]).lstrip('$').replace(',','').strip()
row_of_output.append(cell_value)
else:
cell_value = str(row[column]).strip()
row_of_output.append(cell_value)
row_of_output.append(os.path.basename(input_file))
if row[0] in item_numbers_to_find:
filewriter.writerow(row_of_output)
count_of_item_numbers += 1
line_counter += 1
elif input_file.split('.')[1] == 'xls' or input_file.split('.')[1] =='xlsx':
workbook = open_workbook(input_file)
for worksheet in workbook.sheets():
try:
header = worksheet.row_values(0)
except IndexError:
pass
for row in range(1, worksheet.nrows):
row_of_output = []
for column in range(len(header)):
if worksheet.cell_type(row, column) == 3:
cell_value = xldate_as_tuple(worksheet.cell(row, column).value,workbook.datemode)
cell_value = str(date(*cell_value[0:3])).strip()
row_of_output.append(cell_value)
else:
cell_value = str(worksheet.cell_value(row,column)).strip()
row_of_output.append(cell_value)
row_of_output.append(os.path.basename(input_file))
row_of_output.append(worksheet.name)
if str(worksheet.cell(row,0).value).split('.')[0].strip() in item_numbers_to_find:
filewriter.writerow(row_of_output)
count_of_item_numbers += 1
line_counter += 1
print('Nunber of files:', file_counter)
print('Nunber of lines:', line_counter)
print('Nunber of item numbers:', count_of_item_numbers)
第8、9、10行代码读取我们在命令行中提供的三个参数,分别是包含要搜索的数值项目的csv文件的路径名、包含要搜索的文件的文件夹路径、输出文件的路径名。
os.path.join():将文件夹路径与文件夹中所有文件的文件名连接起来
glob.glob():找到路径下的特定文件
输入以下命令:
./1search_for_items_write_found.py item_numbers_to_find.csv file_archive output_files/1app_output.csv
5.2 为CSV文件中数据的任意数目分类计算统计量
在下载的数据集中找到名为customer_category_history.csv的文件,拖到桌面。打开数据集,会发现这个数据集包括4列数据和6个客户。同时包括3个服务包分类:铜牌、银牌和金牌。数据是以先按照客户姓名,再按照日期的形式升序排列的。
在文本编辑器中输入下列代码,然后保存为2calculate_statistic_by_category.py:
#!/Users/chenbryant/anaconda3/bin/python
import csv
import sys
from datetime import date, datetime
def date_diff(date1,date2):
try:
diff = str(datetime.strptime(date1, '%m/%d/%Y') - datetime.strptime(date2, '%m/%d/%Y')).split()[0]
except:
diff = 0
if diff == '0:00:00':
diff = 0
return diff
input_file = sys.argv[1]
output_file = sys.argv[2]
packages = { }
previous_name = 'N/A'
previous_package = 'N/A'
previous_package_date = 'N/A'
first_row = True
today = date.today().strftime('%m/%d/%Y')
with open(input_file, 'r', newline='') as input_csv_file:
filereader = csv.reader(input_csv_file)
header = next(filereader)
for row in filereader:
current_name = row[0]
current_package = row[1]
current_package_date = row[3]
if current_name not in packages:
packages[current_name] = { }
if current_package not in packages[current_name]:
packages[current_name][current_package] = 0
if current_name != previous_name:
if first_row:
first_row = False
else:
diff = date_diff(today, previous_package_date)
if previous_package not in packages[previous_name]:
packages[previous_name][previous_package] = int(diff)
else:
packages[previous_name][previous_package] += int(diff)
else:
diff = date_diff(current_package_date, previous_package_date)
packages[previous_name][previous_package] += int(diff)
previous_name = current_name
previous_package = current_package
previous_package_date = current_package_date
header = ['Customer Name', 'Category', 'Total Time (in Days)']
with open(output_file, 'w', newline='') as output_csv_file:
filewriter = csv.writer(output_csv_file)
filewriter.writerow(header)
for customer_name, customer_name_value in packages.items():
for package_category, package_category_value in packages[customer_name].items():
row_of_output = [ ]
print(customer_name, package_category, package_category_value)
row_of_output.append(customer_name)
row_of_output.append(package_category)
row_of_output.append(package_category_value)
filewriter.writerow(row_of_output)
命令行输入:
./2calculate_statistic_by_category.py customer_category_history.csv output_files/2app.output.csv