数据清洗:
赔率、公路堵车模型的概念及应用
主成分分析PCA:新的特征组合
车辆数据描述:one-hot编码会使特征值大量增加(维度变高视情况而定)
Logistic回归:AUC:曲线下的面积
求取素数以及赔率的代码:
import operator
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from time import time
import math
def is_prime(x):
    """Trial-division primality test.

    Checks every candidate divisor from 2 up to sqrt(x) and returns True
    when none divides x.  Values below 2 are rejected explicitly: the bare
    list-comprehension version (``0 not in [...]``) wrongly reports 0 and 1
    as prime because the divisor range is empty for them.
    """
    if x < 2:
        return False
    return all(x % i != 0 for i in range(2, int(math.sqrt(x)) + 1))
def is_prime3(x):
    """Primality test that memoizes every prime it accepts.

    Trial-divides x by the primes collected so far in the module-level list
    ``p_list2`` (only those up to sqrt(x)).  When x survives, it is itself
    appended to ``p_list2``.  Correct only when called on consecutive
    integers starting from 2, so the cache already holds all smaller primes.
    """
    limit = math.sqrt(x)
    for known in p_list2:
        if known > limit:
            break
        if x % known == 0:
            return False
    p_list2.append(x)
    return True
if __name__ == "__main__":
a = 2
b = 1000000
# # 方法1:直接计算
# t = time()
# p = [p for p in range(a, b) if 0 not in [p % d for d in range(2, int(math.sqrt(p)) + 1)]]
# print (time() - t)
# print (p)
# # 方法2:利用filter
# t = time()
# p = filter(is_prime, range(a, b))
# print time() - t
# print p
#
# # 方法3:利用filter和lambda
# t = time()
# is_prime2 = (lambda x: 0 not in [x % i for i in range(2, int(math.sqrt(x)) + 1)])
# p = filter(is_prime2, range(a, b))
# print time() - t
# print p
#
# 方法4:定义
t = time()
p_list = []
for i in range(2, b):
flag = True
for p in p_list:
if p > math.sqrt(i):
break
if i % p == 0:
flag = False
break
if flag:
p_list.append(i)
print(time() - t)
print(p_list)
# 方法5:定义和filter
p_list2 = []
t = time()
filter(is_prime3, range(2, b))
print(time() - t)
print(p_list2)
print('---------------------')
a = 1180
b = 1230
a = 1600
b = 1700
p_list2 = []
p = np.array(filter(is_prime3, range(2, b+1)))
p = p[p >= a]
print(p)
p_rate = float(len(p)) / float(b-a+1)
print('素数的概率:', p_rate, '\t')
print('公正赔率:', 1/p_rate)
print('合数的概率:', 1-p_rate, '\t')
print('公正赔率:', 1 / (1-p_rate))
Nagel-Schreckenberg公路堵车模型代码:
#!/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
def clip(x, path):
    """Wrap positions that ran past the end of the ring road (in place).

    Any element of ``x`` that is >= ``path`` is replaced by its remainder
    modulo ``path``; smaller elements are left untouched.
    """
    for idx, pos in enumerate(x):
        if pos >= path:
            x[idx] = pos % path
if __name__ == "__main__":
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False
path = 5000 # 环形公路的长度
n = 100 # 公路中的车辆数目
v0 = 50 # 车辆的初始速度
p = 0.15 # 随机减速概率
Times = 3000
np.random.seed(0)
x = np.random.rand(n) * path
x.sort()
v = np.tile([v0], n).astype(np.float)
plt.figure(figsize=(10, 8), facecolor='w')
for t in range(Times):
plt.scatter(x, [t]*n, s=1, c='k', alpha=0.05)
for i in range(n):
if x[(i+1)%n] > x[i]:
d = x[(i+1) % n] - x[i] # 距离前车的距离
else:
d = path - x[i] + x[(i+1) % n]
if v[i] < d:
if np.random.rand() > p:
v[i] += 1
else:
v[i] -= 1
else:
v[i] = d - 1
v = v.clip(0, 150)
x += v
clip(x, path)
plt.xlim(0, path)
plt.ylim(0, Times)
plt.xlabel(u'车辆位置', fontsize=16)
plt.ylabel(u'模拟时间', fontsize=16)
plt.title(u'环形公路车辆堵车模拟', fontsize=20)
plt.tight_layout(pad=2)
plt.show()
模糊查询与替换:
#!/usr/bin/python
# -*- encoding: utf-8
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
def enum_row(row):
    """Side-effect helper for DataFrame.apply(axis=1): echo the row's 'state' value."""
    state_value = row['state']
    print(state_value)
def find_state_code(row):
    """Print the best fuzzy match (score >= 80) for the row's state name.

    Uses the module-level ``states`` candidate list; rows whose state field
    is 0 (missing) are skipped.
    """
    raw = row['state']
    if raw == 0:
        return
    print(process.extractOne(raw, states, score_cutoff=80))
def capital(word):
    """Return *word* with its first character upper-cased and the rest lowered.

    The parameter was renamed from ``str`` so it no longer shadows the
    builtin; callers invoke it positionally (via map), so this is safe.
    """
    return word.capitalize()
def correct_state(row):
    """Return a canonical 'Title Case' state name for the row.

    Fuzzy-matches the raw value against the module-level ``states`` list
    (score cutoff 80); on a hit, every word of the matched name is
    capitalized via ``capital``.  Missing (0) or unmatched values are
    returned unchanged.
    """
    raw = row['state']
    if raw != 0:
        match = process.extractOne(raw, states, score_cutoff=80)
        if match:
            words = match[0].split(' ')
            return ' '.join(capital(w) for w in words)
    return raw
def fill_state_code(row):
    """Return the two-letter code for the row's state, or '' when unknown.

    Fuzzy-matches against the module-level ``states`` list (score cutoff 80)
    and looks the matched full name up in ``state_to_code``.
    """
    raw = row['state']
    if raw != 0:
        match = process.extractOne(raw, states, score_cutoff=80)
        if match:
            return state_to_code[match[0]]
    return ''
if __name__ == "__main__":
pd.set_option('display.width', 200)
data = pd.read_excel('sales.xlsx', sheetname='sheet1', header=0)
print('data.head() = \n', data.head())
print('data.tail() = \n', data.tail())
print('data.dtypes = \n', data.dtypes)
print('data.columns = \n', data.columns)
for c in data.columns:
print(c)
print
data['total'] = data['Jan'] + data['Feb'] + data['Mar']
print(data.head())
print(data['Jan'].sum())
print(data['Jan'].min())
print(data['Jan'].max())
print(data['Jan'].mean())
print('=============')
# 添加一行
s1 = data[['Jan', 'Feb', 'Mar', 'total']].sum()
print(s1)
s2 = pd.DataFrame(data=s1)
print(s2)
print(s2.T)
print(s2.T.reindex(columns=data.columns))
# 即:
s = pd.DataFrame(data=data[['Jan', 'Feb', 'Mar', 'total']].sum()).T
s = s.reindex(columns=data.columns, fill_value=0)
print(s)
data = data.append(s, ignore_index=True)
data = data.rename(index={15:'Total'})
print(data.tail())
# apply的使用
print('==============apply的使用==========')
data.apply(enum_row, axis=1)
state_to_code = {"VERMONT": "VT", "GEORGIA": "GA", "IOWA": "IA", "Armed Forces Pacific": "AP", "GUAM": "GU",
"KANSAS": "KS", "FLORIDA": "FL", "AMERICAN SAMOA": "AS", "NORTH CAROLINA": "NC", "HAWAII": "HI",
"NEW YORK": "NY", "CALIFORNIA": "CA", "ALABAMA": "AL", "IDAHO": "ID",
"FEDERATED STATES OF MICRONESIA": "FM",
"Armed Forces Americas": "AA", "DELAWARE": "DE", "ALASKA": "AK", "ILLINOIS": "IL",
"Armed Forces Africa": "AE", "SOUTH DAKOTA": "SD", "CONNECTICUT": "CT", "MONTANA": "MT",
"MASSACHUSETTS": "MA",
"PUERTO RICO": "PR", "Armed Forces Canada": "AE", "NEW HAMPSHIRE": "NH", "MARYLAND": "MD",
"NEW MEXICO": "NM",
"MISSISSIPPI": "MS", "TENNESSEE": "TN", "PALAU": "PW", "COLORADO": "CO",
"Armed Forces Middle East": "AE",
"NEW JERSEY": "NJ", "UTAH": "UT", "MICHIGAN": "MI", "WEST VIRGINIA": "WV", "WASHINGTON": "WA",
"MINNESOTA": "MN", "OREGON": "OR", "VIRGINIA": "VA", "VIRGIN ISLANDS": "VI",
"MARSHALL ISLANDS": "MH",
"WYOMING": "WY", "OHIO": "OH", "SOUTH CAROLINA": "SC", "INDIANA": "IN", "NEVADA": "NV",
"LOUISIANA": "LA",
"NORTHERN MARIANA ISLANDS": "MP", "NEBRASKA": "NE", "ARIZONA": "AZ", "WISCONSIN": "WI",
"NORTH DAKOTA": "ND",
"Armed Forces Europe": "AE", "PENNSYLVANIA": "PA", "OKLAHOMA": "OK", "KENTUCKY": "KY",
"RHODE ISLAND": "RI",
"DISTRICT OF COLUMBIA": "DC", "ARKANSAS": "AR", "MISSOURI": "MO", "TEXAS": "TX", "MAINE": "ME"}
states = state_to_code.keys()
print(fuzz.ratio('Python Package', 'PythonPackage'))
print(process.extract('Mississippi', states))
print(process.extract('Mississipi', states, limit=1))
print(process.extractOne('Mississipi', states))
data.apply(find_state_code, axis=1)
print('Before Correct State:\n', data['state'])
data['state'] = data.apply(correct_state, axis=1)
print('After Correct State:\n', data['state'])
data.insert(5, 'State Code', np.nan)
data['State Code'] = data.apply(fill_state_code, axis=1)
print(data)
# group by
print('==============group by================')
print(data.groupby('State Code'))
print('All Columns:\n')
print(data.groupby('State Code').sum())
print('Short Columns:\n')
print(data[['State Code', 'Jan', 'Feb', 'Mar', 'total']].groupby('State Code').sum())
# 写入文件
data.to_excel('sales_result.xls', sheet_name='Sheet1', index=False)
回归:
线性回归:
1、高斯分布
2、最大似然估计MLE
3、最小二乘法的本质
Logistic回归:
分类问题的首选算法
多分类:Softmax回归
目标函数
技术点:
1、梯度下降算法
2、最大似然估计
3、特征选择