fuzzywuzzy
运用了fuzzyeuzzy中的fuzz和process
复习了 apply和groupby
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process #模糊匹配就是要导入这两个东西 暂时不细究
def enum_row(row):
print(row['state'])
def find_state_code(row):
if row['state'] != 0:
print(process.extractOne(row['state'], states, score_cutoff=80))
def capital(str):
return str.capitalize() #capitalize将字符串的第一个字母变成大写,其余字母变为小写
def correct_state(row):
if row['state'] != 0:
state = process.extractOne(row['state'], states, score_cutoff=80) #score_cutof分数阈值
# state是一个这种形式的东西('NORTH CAROLINA', 100)
if state:
state_name = state[0]
print('state:',state_name,state_name.split(' '))
return ' '.join(map(capital, state_name.split(' ')))
#map中是将后面state_name.split(' ')的数据放到capital中,然后通过join用‘ ’将数据连接,所以输出的东西都有空格
return row['state']
def fill_state_code(row):
if row['state'] != 0:
state = process.extractOne(row['state'], states, score_cutoff=80)
#state是一个这种形式的东西('NORTH CAROLINA', 100)
if state:
state_name = state[0]
return state_to_code[state_name]
return ''
if __name__ == "__main__":
pd.set_option('display.width', 200)
# pd.set_option('display.max_columns',None)
data = pd.read_excel('sales.xlsx', sheet_name='sheet1', header=0)
#第一个是文件名,sheet_name='sheet是表头名,这里就是主表的意思',header为确定某一行为列名一般都取第0行
print('data.head() = \n', data.head())
print('data.tail() = \n', data.tail())
print('data.dtypes = \n', data.dtypes)
print('data.columns = \n', data.columns)
for c in data.columns:
print(c)
data['total'] = data['Jan'] + data['Feb'] + data['Mar']
print(data.head())
print(data['Jan'].sum())
print(data['Jan'].min())
print(data['Jan'].max())
print(data['Jan'].mean())
print('=============')
# 添加一行
s1 = data[['Jan', 'Feb', 'Mar', 'total']].sum()
print(s1)
s2 = pd.DataFrame(data=s1)
print(s2)
print(s2.T)
print(s2.T.reindex(columns=data.columns)) #reindex是修改索引,重新定义列索引
# 即:
s = pd.DataFrame(data=data[['Jan', 'Feb', 'Mar', 'total']].sum()).T
s = s.reindex(columns=data.columns, fill_value=0)
print(s)
data = data.append(s, ignore_index=True) #把s加到data中 ignore_index=True不要使用索引标签
data = data.rename(index={15:'Total'}) #将刚加入的第十五行变成Total
# apply的使用
print('==============apply的使用==========')
data.apply(enum_row, axis=1)
pd.set_option('display.max_columns',None)
print('加入state的data',data)
state_to_code = {"VERMONT": "VT", "GEORGIA": "GA", "IOWA": "IA", "Armed Forces Pacific": "AP", "GUAM": "GU",
"KANSAS": "KS", "FLORIDA": "FL", "AMERICAN SAMOA": "AS", "NORTH CAROLINA": "NC", "HAWAII": "HI",
"NEW YORK": "NY", "CALIFORNIA": "CA", "ALABAMA": "AL", "IDAHO": "ID",
"FEDERATED STATES OF MICRONESIA": "FM",
"Armed Forces Americas": "AA", "DELAWARE": "DE", "ALASKA": "AK", "ILLINOIS": "IL",
"Armed Forces Africa": "AE", "SOUTH DAKOTA": "SD", "CONNECTICUT": "CT", "MONTANA": "MT",
"MASSACHUSETTS": "MA",
"PUERTO RICO": "PR", "Armed Forces Canada": "AE", "NEW HAMPSHIRE": "NH", "MARYLAND": "MD",
"NEW MEXICO": "NM",
"MISSISSIPPI": "MS", "TENNESSEE": "TN", "PALAU": "PW", "COLORADO": "CO",
"Armed Forces Middle East": "AE",
"NEW JERSEY": "NJ", "UTAH": "UT", "MICHIGAN": "MI", "WEST VIRGINIA": "WV", "WASHINGTON": "WA",
"MINNESOTA": "MN", "OREGON": "OR", "VIRGINIA": "VA", "VIRGIN ISLANDS": "VI",
"MARSHALL ISLANDS": "MH",
"WYOMING": "WY", "OHIO": "OH", "SOUTH CAROLINA": "SC", "INDIANA": "IN", "NEVADA": "NV",
"LOUISIANA": "LA",
"NORTHERN MARIANA ISLANDS": "MP", "NEBRASKA": "NE", "ARIZONA": "AZ", "WISCONSIN": "WI",
"NORTH DAKOTA": "ND",
"Armed Forces Europe": "AE", "PENNSYLVANIA": "PA", "OKLAHOMA": "OK", "KENTUCKY": "KY",
"RHODE ISLAND": "RI",
"DISTRICT OF COLUMBIA": "DC", "ARKANSAS": "AR", "MISSOURI": "MO", "TEXAS": "TX", "MAINE": "ME"}
states = state_to_code.keys()
print(fuzz.ratio('Python Package', 'PythonPackage')) #全匹配,输出相似度
print('-'*100)
print(process.extract('Mississippi', states)) #不输入limit的话就是默认为5
#process就是进程函数的意思,是fuzzyeuzzy带的
print('-'*100)
print(process.extract('Mississipi', states, limit=1))
print(process.extract('Mississipi', states, limit=3))
#limit:返回的元素数的可选最大值。默认为到五点。
print('-'*100)
print(process.extractOne('Mississipi', states)) #etractOne 就选出1个
print('-'*100)
data.apply(find_state_code, axis=1)
print('Before Correct State:\n', data['state'])
data['state'] = data.apply(correct_state, axis=1)
print('After Correct State:\n', data['state'])
data.insert(5, 'State Code', np.nan) #5的意思是插到第五个位置
print(data)
# print('state:',data['State Code'])
# print(data)
data['State Code'] = data.apply(fill_state_code, axis=1)
print(data)
# group by
print('==============group by================')
print(data.groupby('State Code'))
print('All Columns:\n')
print(data.groupby('State Code').sum())
print('Short Columns:\n')
print(data[['State Code', 'Jan', 'Feb', 'Mar', 'total']].groupby('State Code').sum())
# 写入文件
data.to_excel('sales_result.xls', sheet_name='Sheet1', index=False)