因本人刚开始写博客,学识经验有限,如有不正之处望读者指正,不胜感激;也望借此平台留下学习笔记以温故而知新。这个系列主要是Python数据分析基础的学习笔记。
从历史记录集合中提取特定记录的方法,包括csv文件和Excel文件,以下给出原始文件:
item_numbers_to_find.csv
1234
2345
4567
6789
7890
file_archive 文件夹
csv 文件
Item Number,Description,Supplier,Cost,Date
1234,Widget 1,Supplier A,"$1,100.00 ",6/2/2012
,Widget 1 Service,Supplier A,$600.00 ,6/3/2012
2345,Widget 2,Supplier A,"$2,300.00 ",6/17/2012
,Widget 2 Maintenance,Supplier A,"$1,000.00 ",6/30/2012
3456,Widget 3,Supplier B,$950.00 ,7/3/2012
4567,Widget 4,Supplier B,"$1,300.00 ",7/4/2012
5678,Widget 5,Supplier B,"$1,050.00 ",7/11/2012
,Widget 5 Service,Supplier B,$550.00 ,7/15/2012
6789,Widget 6,Supplier C,"$1,175.00 ",7/23/2012
7890,Widget 7,Supplier C,"$1,200.00 ",7/27/2012
Excel 文件
suppliers_2013
Item Number Description Supplier Cost Date
1234 Widget 1 Supplier A $1,100.00 2013/6/2
Widget 1 Supplier A $600.00 2013/6/3
2345 Widget 2 Supplier A $2,300.00 2013/6/17
Widget 2 Supplier A $1,000.00 2013/6/30
3456 Widget 3 Supplier B $950.00 2013/7/3
4567 Widget 4 Supplier B $1,300.00 2013/7/4
5678 Widget 5 Supplier B $1,050.00 2013/7/11
Widget 5 Supplier B $550.00 2013/7/15
6789 Widget 6 Supplier C $1,175.00 2013/7/23
7890 Widget 7 Supplier C $1,200.00 2013/7/27
suppliers_2014
Item Number Description Supplier Cost Date
1234 Widget 1 Supplier A $1,100.00 2014/6/2
Widget 1 Supplier A $600.00 2014/6/3
2345 Widget 2 Supplier A $2,300.00 2014/6/17
Widget 2 Supplier A $1,000.00 2014/6/30
3456 Widget 3 Supplier B $950.00 2014/7/3
4567 Widget 4 Supplier B $1,300.00 2014/7/4
5678 Widget 5 Supplier B $1,050.00 2014/7/11
Widget 5 Supplier B $550.00 2014/7/15
6789 Widget 6 Supplier C $1,175.00 2014/7/23
7890 Widget 7 Supplier C $1,200.00 2014/7/27
Python代码:
# 从历史记录集合中提取特定记录的方法
# !/usr/bin/env python3
import csv
import glob
import os
import sys
from datetime import date
from xlrd import open_workbook,xldate_as_tuple
item_numbers_file = sys.argv[1]
path_to_folder = sys.argv[2]
output_file = sys.argv[3]
item_numbers_to_find = []
with open(item_numbers_file,'r',newline='') as item_numbers_csv_file:
filereader = csv.reader(item_numbers_csv_file)
for row in filereader:
item_numbers_to_find.append(row[0])
print(item_numbers_to_find)
filewriter = csv.writer(open(output_file,'a',newline=''))
file_counter = 0
line_counter = 0
count_of_item_numbers = 0
for input_file in glob.glob(os.path.join(path_to_folder,'*.*')):
file_counter += 1
if input_file.split('.')[1] == 'csv':
with open(input_file,'r',newline='') as csv_in_file:
filereader = csv.reader(csv_in_file)
header = next(filereader)
for row in filereader:
row_of_output = []
for column in range(len(header)):
if column == 3:
cell_value = str(row[column]).lstrip('$').replace\
(',','').strip()
row_of_output.append(cell_value)
else:
cell_value = str(row[column]).strip()
row_of_output.append(cell_value)
row_of_output.append(os.path.basename(input_file))
if row[0] in item_numbers_to_find:
filewriter.writerow(row_of_output)
count_of_item_numbers += 1
line_counter += 1
elif input_file.split('.')[1] == 'xls' or \
input_file.split('.')[1] == 'xlsx':
workbook = open_workbook(input_file)
for worksheet in workbook.sheets():
try:
header = worksheet.row_values(0)
except IndexError:
pass
for row in range(1,worksheet.nrows):
row_of_output = []
for column in range(len(header)):
if column < 3:
cell_value = str(worksheet.cell_value(row,column)).strip()
row_of_output.append(cell_value)
elif column == 3:
cell_value = str(worksheet.cell_value(row,column)).\
lstrip('$').replace(',','').strip()
row_of_output.append(cell_value)
else:
cell_value = xldate_as_tuple(worksheet.cell\
(row,column).value,workbook.datemode)
cell_value = str(date(*cell_value[0:3])).strip()
row_of_output.append(cell_value)
row_of_output.append(os.path.basename(input_file))
row_of_output.append(worksheet.name)
if str(worksheet.cell_value(row,0)).split('.')[0].strip() \
in item_numbers_to_find:
filewriter.writerow(row_of_output)
count_of_item_numbers += 1
line_counter += 1
print('Number of files:',file_counter)
print('Number of lines:',line_counter)
print('Number of item numbers:',count_of_item_numbers)
程序结果:
C:\Users\yuxi\Desktop>python input.py item_numbers_to_find.csv file_archive output1.csv
['1234', '2345', '4567', '6789', '7890']
Number of files: 3
Number of lines: 50
Number of item numbers: 25
1234.0,Widget 1,Supplier A,1100.0,2013-06-02,suppliers.xls,suppliers_2013
2345.0,Widget 2,Supplier A,2300.0,2013-06-17,suppliers.xls,suppliers_2013
4567.0,Widget 4,Supplier B,1300.0,2013-07-04,suppliers.xls,suppliers_2013
6789.0,Widget 6,Supplier C,1175.0,2013-07-23,suppliers.xls,suppliers_2013
7890.0,Widget 7,Supplier C,1200.0,2013-07-27,suppliers.xls,suppliers_2013
1234.0,Widget 1,Supplier A,1100.0,2014-06-02,suppliers.xls,suppliers_2014
2345.0,Widget 2,Supplier A,2300.0,2014-06-17,suppliers.xls,suppliers_2014
4567.0,Widget 4,Supplier B,1300.0,2014-07-04,suppliers.xls,suppliers_2014
6789.0,Widget 6,Supplier C,1175.0,2014-07-23,suppliers.xls,suppliers_2014
7890.0,Widget 7,Supplier C,1200.0,2014-07-27,suppliers.xls,suppliers_2014
1234.0,Widget 1,Supplier A,1100.0,2013-06-02,suppliers.xlsx,suppliers_2013
2345.0,Widget 2,Supplier A,2300.0,2013-06-17,suppliers.xlsx,suppliers_2013
4567.0,Widget 4,Supplier B,1300.0,2013-07-04,suppliers.xlsx,suppliers_2013
6789.0,Widget 6,Supplier C,1175.0,2013-07-23,suppliers.xlsx,suppliers_2013
7890.0,Widget 7,Supplier C,1200.0,2013-07-27,suppliers.xlsx,suppliers_2013
1234.0,Widget 1,Supplier A,1100.0,2014-06-02,suppliers.xlsx,suppliers_2014
2345.0,Widget 2,Supplier A,2300.0,2014-06-17,suppliers.xlsx,suppliers_2014
4567.0,Widget 4,Supplier B,1300.0,2014-07-04,suppliers.xlsx,suppliers_2014
6789.0,Widget 6,Supplier C,1175.0,2014-07-23,suppliers.xlsx,suppliers_2014
7890.0,Widget 7,Supplier C,1200.0,2014-07-27,suppliers.xlsx,suppliers_2014
1234,Widget 1,Supplier A,1100.00,6/2/2012,suppliers_2012.csv
2345,Widget 2,Supplier A,2300.00,6/17/2012,suppliers_2012.csv
4567,Widget 4,Supplier B,1300.00,7/4/2012,suppliers_2012.csv
6789,Widget 6,Supplier C,1175.00,7/23/2012,suppliers_2012.csv
7890,Widget 7,Supplier C,1200.00,7/27/2012,suppliers_2012.csv
为CSV文件中数据的任意数目分类计算统计量,以下给出原始数据:
customer_category_history.csv
Customer Name,Category,Price,Date
John Smith,Bronze,$20.00 ,1/22/2014
John Smith,Bronze,$25.00 ,3/15/2014
John Smith,Silver,$30.00 ,4/2/2014
John Smith,Gold,$40.00 ,5/11/2014
John Smith,Gold,$45.00 ,7/13/2014
Mary Yu,Silver,$30.00 ,2/3/2014
Mary Yu,Gold,$40.00 ,4/16/2014
Mary Yu,Gold,$45.00 ,6/23/2014
Wayne Thompson,Bronze,$20.00 ,1/13/2014
Wayne Thompson,Bronze,$25.00 ,3/24/2014
Wayne Thompson,Bronze,$30.00 ,5/21/2014
Wayne Thompson,Silver,$30.00 ,6/29/2014
Bruce Johnson,Bronze,$20.00 ,2/9/2014
Bruce Johnson,Bronze,$25.00 ,3/22/2014
Bruce Johnson,Silver,$30.00 ,4/27/2014
Bruce Johnson,Silver,$35.00 ,5/8/2014
Bruce Johnson,Gold,$40.00 ,6/26/2014
Bruce Johnson,Gold,$45.00 ,7/21/2014
Annie Lee,Bronze,$20.00 ,3/16/2014
Annie Lee,Silver,$30.00 ,4/11/2014
Annie Lee,Gold,$40.00 ,5/25/2014
Annie Lee,Gold,$45.00 ,7/14/2014
Annie Lee,Gold,$50.00 ,7/21/2014
Priya Patel,Silver,$30.00 ,1/19/2014
Priya Patel,Silver,$35.00 ,2/28/2014
Priya Patel,Silver,$40.00 ,3/26/2014
Priya Patel,Gold,$40.00 ,4/28/2014
Priya Patel,Gold,$45.00 ,5/12/2014
Priya Patel,Gold,$50.00 ,6/21/2014
Python代码:
# 为CSV文件中数据的任意数目分类计算统计量
import csv
import sys
from datetime import date,datetime
def date_diff(date1,date2):
try:
diff = str(datetime.strptime(date1,'%m/%d/%Y')-datetime.strptime\
(date2,'%m/%d/%Y')).split()[0]
except:
diff = 0
if diff == '0:00:00':
diff = 0
return diff
input_file = sys.argv[1]
output_file = sys.argv[2]
# 外部字典-键为客户名称,与这个键相对的值是另一个字典
packages = { }
previous_name = 'N/A'
previous_package = 'N/A'
previous_package_date = 'N/A'
first_row = True
today = date.today().strftime('%m/%d/%Y')
with open(input_file,'r',newline='') as input_csv_file:
filereader = csv.reader(input_csv_file)
header = next(filereader)
for row in filereader:
current_name = row[0]
current_package = row[1]
current_package_date = row[3]
if current_name not in packages:
packages[current_name] = { }
if current_package not in packages[current_name]:
packages[current_name][current_package] = 0
if current_name != previous_name:
if first_row:
first_row = False
else:
diff = date_diff(today,previous_package_date)
if previous_package not in packages[previous_name]:
packages[previous_name][previous_package] = int(diff)
else:
packages[previous_name][previous_package] += int(diff)
else:
diff = date_diff(current_package_date,previous_package_date)
packages[previous_name][previous_package] += int(diff)
previous_name = current_name
previous_package = current_package
previous_package_date = current_package_date
header = ['Customer Name','Category','Total Time(in Days)']
with open(output_file,'w',newline='') as output_csv_file:
filewriter = csv.writer(output_csv_file)
filewriter.writerow(header)
for customer_name,customer_name_value in packages.items():
for package_category,package_category_value in packages\
[customer_name].items():
row_of_output = []
print(customer_name,package_category,package_category_value)
row_of_output.append(customer_name)
row_of_output.append(package_category)
row_of_output.append(package_category_value)
filewriter.writerow(row_of_output)
程序结果:
C:\Users\yuxi\Desktop>python input.py customer_category_history.csv output2.csv
John Smith Bronze 70
John Smith Silver 39
John Smith Gold 1671
Mary Yu Silver 72
Mary Yu Gold 1696
Wayne Thompson Bronze 167
Wayne Thompson Silver 1622
Bruce Johnson Bronze 77
Bruce Johnson Silver 60
Bruce Johnson Gold 1625
Annie Lee Bronze 26
Annie Lee Silver 44
Annie Lee Gold 1657
Priya Patel Silver 99
Priya Patel Gold 54
从文本文件中提取数据并根据这些数据计算统计量,以下给出原始数据:
mysql_server_error_log.txt
246824 10:40:55 mysqld_safe Starting mysqld daemon with databases from /usr/local/mysql/data
2014-02-03 10:40:55 98765 [Note] InnoDB: Compressed tables use zlib 1.2.3
2014-02-03 10:40:55 98765 [Note] InnoDB: Using atomics to ref count buffer pool pages
2014-02-03 10:40:55 98765 [Note] InnoDB: 5.6.16 started; log sequence number 1234567
2014-02-03 10:47:18 64208 [Note] InnoDB: Using atomics to ref count buffer pool pages
2014-02-03 10:47:18 64208 [Note] InnoDB: Compressed tables use zlib 1.2.3
2014-02-03 10:55:55 64208 [Note] /usr/local/mysql/bin/mysqld: Shutdown complete
135791 15:59:29 mysqld_safe Starting mysqld daemon with databases from /usr/local/mysql/data
2014-03-07 10:40:55 98765 [Note] InnoDB: Compressed tables use zlib 1.2.3
2014-03-07 10:40:55 98765 [Note] InnoDB: Compressed tables use zlib 1.2.3
2014-03-07 10:40:55 98765 [Note] InnoDB: 5.6.16 started; log sequence number 1234567
2014-03-07 10:47:18 64208 [Note] InnoDB: Using atomics to ref count buffer pool pages
2014-03-07 10:47:18 64208 [Note] InnoDB: Compressed tables use zlib 1.2.3
2014-03-07 10:55:55 64208 [Note] /usr/local/mysql/bin/mysqld: Shutdown complete
124578 15:59:29 mysqld_safe Starting mysqld daemon with databases from /usr/local/mysql/data
2014-10-27 10:40:55 98765 [Note] InnoDB: Completed initialization of buffer pool
2014-10-27 10:40:55 98765 [Note] InnoDB: IPv6 is available.
2014-10-27 10:40:55 98765 [Note] InnoDB: 5.6.16 started; log sequence number 1234567
2014-10-27 10:47:18 64208 [Note] InnoDB: Completed initialization of buffer pool
2014-10-27 10:47:18 64208 [Note] InnoDB: IPv6 is available.
2014-10-27 10:55:55 64208 [Note] /usr/local/mysql/bin/mysqld: Shutdown complete
Python代码:
# 从文本文件中提取数据并根据这些数据计算统计量
import sys
input_file = sys.argv[1]
output_file = sys.argv[2]
message = { }
notes = []
with open(input_file,'r',newline='') as text_file:
for row in text_file:
if '[Note]' in row:
row_list = row.split(' ',4) # 表示仅分割成5个字符串
day = row_list[0].strip()
note = row_list[4].strip('\n').strip() # 错误消息
if note not in notes:
notes.append(note)
if day not in message:
message[day] = {}
if note not in message[day]:
message[day][note] = 1
else:
message[day][note] += 1
filewriter = open(output_file,'w',newline = '')
header = ['Date']
header.extend(notes)
header = ','.join(map(str,header))+'\n'
print(header)
filewriter.write(header)
for day,day_value in message.items():
row_of_output = []
row_of_output.append(day)
for index in range(len(notes)):
if notes[index] in day_value.keys():
row_of_output.append(day_value[notes[index]])
else:
row_of_output.append(0)
output = ','.join(map(str,row_of_output))+'\n'
print(output)
filewriter.write(output)
filewriter.close()
程序结果:
C:\Users\yuxi\Desktop>python input.py mysql_server_error_log.txt output3.csv
Date InnoDB: Compressed tables use zlib 1.2.3 InnoDB: Using atomics to ref count buffer pool pages InnoDB: 5.6.16 started; log sequence number 1234567 /usr/local/mysql/bin/mysqld: Shutdown complete InnoDB: Completed initialization of buffer pool InnoDB: IPv6 is available.
2014/2/3 2 2 1 1 0 0
2014/3/7 3 1 1 1 0 0
2014/10/27 0 0 1 1 2 2