Data Science from Scratch - Study Notes

https://github.com/joelgrus/data-science-from-scratch/blob/master/code/natural_language_processing.py
# import packages (the __future__ import must come before any other code in Python 2)
from __future__ import division
import requests, json, sys, re, csv, math, random
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from dateutil.parser import parse
from collections import Counter, defaultdict, deque
from functools import partial
from pandas import Series, DataFrame
-------------------------------------------- CP1 Introduction
--------- 1.3.1 find the key connectors
users=[
{'id':0,'name':'zieox'},
{'id':1,'name':'jack'},
{'id':2,'name':'rose'},
{'id':3,'name':'nike'},
{'id':4,'name':'dick'},
{'id':5,'name':'lucco'},
{'id':6,'name':'zpoo'},
{'id':7,'name':'sqqpo'},
{'id':8,'name':'dv'},
{'id':9,'name':'sad'},
{'id':10,'name':'kobe'}]

friends=[(0,1),(0,2),(1,2),(1,3),(2,3),(3,4),(4,5),(5,6),(5,7),(6,8),(7,8),(8,9)]

# add an empty "friend" list to each user dict
for user in users:
    user["friend"] = []
# fill in the friend lists from the friendship pairs
for i, j in friends:
    users[i]["friend"].append(users[j]['name'])
    users[j]["friend"].append(users[i]['name'])
# helper that returns how many friends a user has
def num_of_friend(user): return len(user["friend"])
total_conn = sum(num_of_friend(user) for user in users)

num_users = len(users)
# average number of friends per user
avg_conn = total_conn / num_users
# build (user_id, num_friends) pairs
n_f_nyid = [(user['id'], num_of_friend(user)) for user in users]
#sorted(n_f_nyid,key=lambda ab:ab[1],reverse=True)
#n_f_nyid=[(avg_conn,num_of_friend(user)) for user in users]
#a=pd.DataFrame(n_f_nyid,index=[user['name'] for user in users]).plot()

--- 1.3.2 data scientists you may know
this approach is too abstract, so for each user we build the list of their friends-of-friends ids
# note: this assumes the "friend" list holds user dicts (as in the book), not just names
def f_of_fid(user):
    return [foaf['id']
            for friend in user['friend']
            for foaf in friend['friend']]
##print([friend['id'] for friend in users[0]['friend']])##


#u=[{user['id']:user['fr_id']} for user in users]   # (as written this needs a 'fr_id' key that was never added)
# people can meet through friends of friends,
# so we count mutual friends, excluding the user themselves and people who are already friends:

'''from collections import Counter
def not_the_same(user, other_user):
    return user['id'] != other_user['id']
def not_fr(user, other_user):
    return all(not_the_same(friend, other_user) for friend in user['friend'])
def f_of_fid(user):
    return Counter(foaf['id']
                   for friend in user['friend']
                   for foaf in friend['friend']
                   if not_the_same(user, foaf)
                   and not_fr(user, foaf))
print(f_of_fid(users[3]))'''

interest=[(0,"hive"),(0,"spark"),(0,"python"),(0,"java"),
(0,"scala"),(0,"tableau"),(1,"python"),(1,"java"),(1,"scala"),
(2,"hadoop"),(2,"python"),(3,"sql"),(3,"impala"),(3,"go"),(4,"perl"),
(4,"java"),(4,"hive"),(5,"python"),(5,"mapreduce"),(5,"java"),(6,"C++"),
(6,"java"),(6,"vba"),(7,"tableau"),(7,"ppt"),(8,"python"),(8,"java"),
(9,"python"),(9,"hadoop"),(9,"scala")]

# attach each user's interests to the users list
for i in users:
    i['interest'] = []
for i, j in interest:
    users[i]['interest'].append(j)
--- # find the ids of users who share a given interest
def data_s_who_like(target_interest):
    return [user_id for user_id, user_interest in interest if user_interest == target_interest]

from collections import defaultdict
# keys are interests, values are lists of user_ids with that interest
user_id_by_interest = defaultdict(list)
# keys are user_ids, values are that user's list of interests
interest_by_userid = defaultdict(list)
for user_id, user_interest in interest:
    user_id_by_interest[user_interest].append(user_id)
    interest_by_userid[user_id].append(user_interest)
# find who shares the most interests with a given user
def most_common_interest_with(user):
    return Counter(interested_user_id
                   for user_interest in interest_by_userid[user['id']]
                   for interested_user_id in user_id_by_interest[user_interest]
                   if interested_user_id != user['id'])

--------- 1.3.3 salaries and years of experience
sal_year=[(83000,8.7),(88000,8.1),(48000,0.7),(76000,6),
(69000,6.5),(76000,7.5),(60000,2.5),(83000,10),(48000,1.9),(63000,4.2)]
# build a dict keyed by years of experience
sal_byear = defaultdict(list)
for sal, year in sal_year:
    sal_byear[year].append(sal)
avg_sal_byear = {year: sum(sal)/len(sal)
                 for year, sal in sal_byear.items()}

# bucket the years of experience (like qcut)
def y_bucket(year):
    if year < 2:
        return "less than 2"
    elif year < 5:
        return "between 2 and 5"
    else:
        return "more than 5"

sal_byear_bucket = defaultdict(list)
for sal, year in sal_year:
    bucket = y_bucket(year)
    sal_byear_bucket[bucket].append(sal)
avg_sal_bucket = {bucket: sum(sal)/len(sal)
                  for bucket, sal in sal_byear_bucket.items()}
# count the words that appear in the interests
words_count = Counter(word for user_id, user_interest in interest
                      for word in user_interest.lower().split())

for word, count in words_count.most_common():
    if count > 1:
        print(word, count)

## counting words: plain dict vs. setdefault vs. defaultdict
http://kodango.com/understand-defaultdict-in-python
strings = ('puppy', 'kitten', 'puppy', 'puppy',
           'weasel', 'puppy', 'kitten', 'puppy')
counts = {}
for kw in strings:
    if kw not in counts:
        counts[kw] = 1
    else:
        counts[kw] += 1
        
# same count using dict.setdefault (reset counts first)
counts = {}
for kw in strings:
    counts.setdefault(kw, 0)
    counts[kw] += 1

-------------------------------------------- CP2 Python basics
for i in [1, 2, 3, 4, 5]:
    print(i)
    for j in [1, 2, 3, 4, 5]:
        print(j)
        print(j + i)
    print(i)
print('done looping')
lst = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]     # avoid shadowing the built-in list
# functions are first-class: add(f) calls f with the argument 1
def add(f):
    return f(1)
y = add(lambda x: x + 4)                    ---> 5
# list + value
lst.extend([11, 12, 13])
lst + [15]
lst.append(15)
# unpacking assignments
_, y = [1, 2]
x, y = 1, 2
x, y = y, x

------dict
grades={'zieox':90,'jack':88}
grades['jack']                        --->88
grades['kobe']=99                          
grades.get('zieox',0)                      
grades.keys()                         --->dict_keys(['zieox', 'jack'])
grades.values()                       --->dict_values([90, 88])
grades.items()                        --->dict_items([('zieox',90), ('jack', 88)])

# counting word frequencies several ways
strings = ('puppy', 'kitten', 'puppy', 'puppy',
           'weasel', 'puppy', 'kitten', 'puppy', 'ppp')
counts = {}
#method-1: plain dict
for word in strings:
    if word not in counts:
        counts[word] = 1
    else:
        counts[word] += 1
#method-2: try/except
for word in strings:
    try:
        counts[word] += 1
    except KeyError:
        counts[word] = 1
#method-3: dict.get with a default
for word in strings:
    p_count = counts.get(word, 0)
    counts[word] = p_count + 1
# the above are clumsy; defaultdict is simpler
from collections import defaultdict
counts = defaultdict(int)
for word in strings:
    counts[word] += 1
# simplest of all: the Counter from collections
from collections import Counter
---
c = Counter(strings)
for i, j in c.most_common(10):
    print(i, j)
--- this is the easiest method

------ set
two reasons to use set:
# 1. "in" is very fast on a set
s = set()
for i in range(30):
    s.add(i)
sl = ['a', 'an', 'at'] + ['yet', 'you']
'zip' in sl                           # this has to scan every element
sl_set = set(sl)
'zip' in sl_set                       # set membership tests are fast
# 2. an easy way to find the distinct items in a collection
l = [1, 2, 4, 5, 6, 7, 8, 9, 9, 9, 8]
numl = len(l)
num_distinct_set = len(set(l))

------ control flow
# conditional expression
p = 'even' if x % 2 == 0 else 'odd'
# while loop
x = 0
while x < 10:
    print(x, 'less than 10')
    x += 1
# for with continue/break
for x in range(10):
    if x == 3:
        continue
    if x == 5:
        break
    print(x)

------ sorting (sort)
l=[1,2,4,5,6,7,8,9,9,9,8]
l2=sorted(l)
l.sort()
x=sorted([-1,4,99,-8],key=abs,reverse=True)
w=sorted(counts.items(),key=lambda ab:ab[1],reverse=True)

------ list comprehensions
even = [x for x in range(5) if x % 2 == 0]
square = [x*x for x in range(5)]
even_s = [x*x for x in range(5) if x % 2 == 0]
# they also work for dicts and sets
square = {x: x*x for x in range(5)}
pair = [(x, y) for x in range(10) for y in range(10)]
increase_pair = [(x, y) for x in range(10) for y in range(x+1, 10)]

------ generators and iterators
def lazy_r(n):
    i = 0
    while i < n:
        yield i
        i += 1
def natural_num():
    n = 1
    while True:
        yield n
        n += 1
# way 2: a generator expression
laz_below20 = (random.random() for _ in range(4))

------ randomness
import random
f4 = [random.random() for _ in range(4)]
# set the random seed to 10
random.seed(10)
random.random()
# setting the seed to 10 again makes random.random() reproduce the same value
# pick randomly from a range
random.randrange(10)           ---> one value from 0-9
random.randrange(3, 6)         ---> one value from [3,4,5]
# random.shuffle randomly reorders a list in place
t1 = [i for i in range(10)]
random.shuffle(t1)
print(t1)

fr = ['zieox', 'jack', 'rose', 'nike', 'dick']
# random.choice picks one element from a list
random.choice(fr)
# random.sample picks a sample without replacement
num = [i for i in range(60)]
win = random.sample(num, 6)                          ---> 6 samples, no repeats
ff = [random.choice(range(10)) for _ in range(4)]    ---> 4 samples with replacement

------ object-oriented programming
# write our own Set class
class Set:
    def __init__(self, values=None):
        self.dict = {}
        if values is not None:
            for value in values:
                self.add(value)
    def __repr__(self):
        return "Set: " + str(self.dict.keys())
    def add(self, value):
        self.dict[value] = True
    def contains(self, value):
        return value in self.dict
    def remove(self, value):
        del self.dict[value]

s1 = Set()
s2 = Set([1, 2, 2, 3])

------ functional tools: functools
def exp(base, power): return base**power     # n**m
def two_to_the(power): return exp(2, power)  # 2**m
from functools import partial
two_to_the = partial(exp, 2)              ---> a function with one argument already filled in
print(two_to_the(3))                      ---> 2**3
square = partial(exp, power=2)            ---> an n**2 function

def double(x):
    return 2*x
xs = [1, 2, 3, 4]
twice_xs = [double(x) for x in xs]
# equivalent to: twice_xs = map(double, xs)
list_double = partial(map, double)       ---> a function that doubles a list
twice_xs = list_double(xs)               ---> same as above
--- map <mapping>
def x2(x, y): return x*y
pro = map(x2, [1, 2], [3, 4])            ---> [1*3, 2*4] = [3, 8]


--- filter <filtering>
def is_even(x):
    return x % 2 == 0
x_evens = [x for x in xs if is_even(x)]
# equivalent to: x_evens = filter(is_even, xs)
list_evener = partial(filter, is_even)
x_evens = list_evener(xs)

--- reduce (in Python 3: from functools import reduce)
x_product = reduce(x2, xs)               ---> 1*2*3*4 = 24
list_product = partial(reduce, x2)       ---> a function that reduces a list
x_product = list_product(xs)             ---> 24 again

------ enumerate <==> (index, element)
fr = ['zieox', 'jack', 'rose', 'nike', 'dick']
for i, j in enumerate(fr):
    print(i, j)

------ zip and argument unpacking (zip & unzip)
list1=[1,2,3,4]
list2=['a','b','c','d']
zip(list2,list1)                         --->[('a',1),('b',2),('c',3),('d',4)]
pairs=[('a',1),('b',2),('c',3),('d',4)]
a,b=zip(*pairs)

zip(('a',1),('b',2),('c',3),('d',4))     --->(('a','b','c','d'),(1,2,3,4))
def add(a,b):return a+b
add(1,2)                                 --->3
add(*[1,2])                              --->3

------ args & kwargs
def doubler(f):
    def g(x):
        return 2*f(x)
    return g
def f1(x):
    return x + 1
g = doubler(f1)

# this approach breaks for functions that take more than one argument,
# so we use argument unpacking and "magic"
def magic(*args, **kwargs):
    print('unnamed args:', args)
    print('keyword args:', kwargs)
magic(1, 2, key='word', key2='word2')
def other_way_magic(x, y, z):
    return x + y + z
x_y_list = [1, 2]
z_dict = {'z': 3}
print(other_way_magic(*x_y_list, **z_dict))

p33
def f2(x, y):
    return x + y
def doubler_correct(f):
    def g(*args, **kwargs):
        return 2*f(*args, **kwargs)
    return g
g = doubler_correct(f2)
print(g(1, 2))                         ---> 6


-------------------------------------------- CP3 Visualization
#example-matplotlib
from matplotlib import pyplot as pl
year=[1950,1960,1970,1980,1990,2000,2010]
gdp=[300.2,543.3,1075.9,2862.5,5979.6,10289.7,14958.3]
pl.plot(year,gdp,color='green',marker='o',linestyle='solid')
pl.title('name-GDP')
pl.ylabel('billion$')
pl.show()

------ bar charts
#coding:utf-8
movies=['old boy','revenages','iron-man','starwar','HelloKitty']
num_oscars=[11,65,53,26,11]
# in Python 3.6 / current matplotlib this puts each bar slightly left of center
xs=[i+0.1 for i ,_ in enumerate(movies)]
pl.bar(xs,num_oscars)
pl.ylabel('num of get_oscar')
pl.title('my favourite movie')
pl.xticks([i+0.5 for i ,_ in enumerate(movies)],movies)
pl.show()

#example3
from collections import Counter
grades=[86,45,46,79,13,49,79,65,35,46]
# // is integer division: decile(x) = (x//10)*10
decile = lambda grade: grade // 10 * 10
# count grades per decile with a Counter
histgram = Counter(decile(grade) for grade in grades)
# bar chart of the histogram keys vs. counts, with bar width 8
pl.bar([x for x in histgram.keys()], histgram.values(), 8)
# set the x and y axis ranges
pl.axis([-5, 105, 0, 5])
# set the x-axis tick marks
pl.xticks([10*i for i in range(11)])
pl.xlabel('+scorelike')
pl.ylabel('num of students')
pl.title('score-pic')
pl.show()

#example4
mentions=[500,505]
years=[2013,2014]
pl.bar(years,mentions,0.8)
pl.xticks(years)
pl.ylabel('mentioned ds')
pl.ticklabel_format(useOffset=False)
pl.axis([2012.5,2014.5,499,506])
pl.title('look this is a big change')
pl.show()
#now we set x,y -value_range
pl.axis([2013,2015,0,550])
pl.show()

------ line charts
'''#easy way
variance=[2**(i-1) for i in range(1,10)]
bias_squared=sorted(variance,reverse=True)'''
variance = [1, 2, 4, 8, 16, 32, 64, 128, 256]
bias_squared = [256, 128, 64, 32, 16, 8, 4, 2, 1]
total_error = [x + y for x, y in zip(variance, bias_squared)]
xs = [i for i, _ in enumerate(variance)]
pl.plot(xs, variance, 'g-', label='variance')
pl.plot(xs, bias_squared, 'r-.', label='bias^2')
pl.plot(xs, total_error, 'b:', label='total_error')
# loc=9 puts the legend at the top center
pl.legend(loc=9)
pl.xlabel('model complexity')
pl.title('bias-variance tradeoff')
pl.show()

------ scatterplots
import random
'''friends=[random.randint(1,100) for _ in range(10)]
minutes=[random.randint(100,200) for _ in range(10)]'''
# use fixed values instead of random numbers here
friends = [61, 73, 80, 93, 13, 26, 57, 59, 88, 84]
minutes = [157, 184, 101, 198, 196, 158, 178, 150, 113, 154]
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
pl.scatter(friends, minutes)

for label, friend_count, minute_count in zip(labels, friends, minutes):
    pl.annotate(label,
                xy=(friend_count, minute_count),
                xytext=(-5, 5),
                textcoords='offset points')
pl.title('num_minutes&friends')
pl.xlabel('num of friend')
pl.ylabel('minutes of spending')
pl.show()


'''# experiment: label the bars with annotate; xytext is the offset, axis sets the x/y ranges
pl.bar([i for i in range(10)], minutes)
pl.axis([-1, 10, 0, 200])
for label, i, j in zip(labels, [i for i in range(10)], minutes):
    pl.annotate(label,
                xy=(i, j),
                xytext=(-5, 5),
                textcoords='offset points')'''

--------------------------------------------CP4 linear algebra
------ 4.1 vectors
height_weight_age=[70,170,40]
grades=[95,80,75,62]
def vector_add(v,w):
 return[v_i+w_j for v_i,w_j in zip(v,w)]
def vector_subtract(v,w):
  return[v_i-w_j for v_i,w_j in zip(v,w)]

'''def vector_sum(vectors):
 result=vectors[0]
 for vector in vectors[1:]:
  result=vector_add(result,vector)
 return result'''

from functools import reduce      # needed in Python 3
def vector_sum(vectors):
 return reduce(vector_add,vectors)
 #vector_sum=partial(reduce,vector_add)
def scalar_multiply(c,v):
 return [c*v_i for v_i in v]
def vector_mean(vectors):
 n=len(vectors)
 return scalar_multiply(1/n,vector_sum(vectors))
#dot product
def dot(v,w):
 return sum(v_i*w_i for v_i,w_i in zip(v,w))
#sum of squares of a vector
def sum_of_squares(v):
 return dot(v,v)
#magnitude (length) of a vector
import math
def magnitude(v):
 return math.sqrt(sum_of_squares(v))
def squared_distance(v,w):
 return sum_of_squares(vector_subtract(v,w))
def distance(v,w):
 return magnitude(vector_subtract(v,w))

------ 4.2 matrices
a=[[1,2,3],[4,5,6]]
b=[[1,2],[3,4],[5,6]]

def shape(A):
 num_rows=len(A)
 num_cols=len(A[0]) if A else 0
 return num_rows,num_cols
#get a row
def get_row(A,i):
 return A[i]
#get a column
def get_column(A,j):
 return [A_i[j] for A_i in A]
#build an identity matrix: 1s on the diagonal, 0s elsewhere
def make_matrix(num_rows,num_cols,entry_fn):
 return [[entry_fn(i,j) for j in range(num_cols)] for i in range(num_rows)]
def is_diagonal(i,j):
 return 1 if i==j else 0
identity_matrix=make_matrix(5,5,is_diagonal)

data=[[70,170,40],[65,120,26],[77,250,19]]

--------------------------------------------- CP5 Basic statistics
------ basic items
#histogram of friend counts
num_friends=[11,65,53,26,11]*3
daily_minutes=[10,88,76,20,10]
from collections import Counter
import matplotlib.pyplot as pl
friends=Counter(num_friends)
xs=range(101)
ys=[friends[x] for x in xs]
pl.bar(xs,ys)
pl.axis([0,101,0,25])
pl.title('friend_num')
pl.xlabel('num of friends')
pl.ylabel('num_peo')
pl.show()

num_point=len(num_friends)
largest_value=max(num_friends)
smallest_value=min(num_friends)
sorted_values=sorted(num_friends)
smallest_value=sorted_values[0]        # same as min(num_friends)
largest_value=sorted_values[-1]        # same as max(num_friends)
print('count:',num_point,'max:',largest_value,'min:',smallest_value)

------ 5.1.1 central tendency
#mean
from __future__ import division
def mean(x):
 return sum(x)/len(x)
mean(num_friends)

#median
def median(v):
 n=len(v)
 sorted_v=sorted(v)
 midpoint=n//2
 if n%2==1:
  return sorted_v[midpoint]
 else:
  lo=midpoint-1
  hi=midpoint
  return(sorted_v[lo]+sorted_v[hi])/2
median(num_friends)

#quantile
def quantile(x,p):
 p_index=int(p*len(x))
 return sorted(x)[p_index]
quantile(num_friends,0.1)

#mode
def mod(x):
 counts=Counter(x)
 max_count=max(counts.values())
 return [x_i for x_i,count in counts.items() if count==max_count]
mod(num_friends)

------ 5.1.2 dispersion
#range
def data_range(x):
 return max(x)-min(x)
data_range(num_friends)

#variance: sum((x_i - x_bar)**2)/(n-1)
def de_mean(x):
 x_bar=mean(x)
 return [x_i - x_bar for x_i in x]
def sum_of_squares(x):
 return sum([i**2 for i in x])
def variance(x):
 n=len(x)
 deviations=de_mean(x)
 return sum_of_squares(deviations)/(n-1)
variance(num_friends)

#standard deviation
def standard_deviation(x):
 return math.sqrt(variance(x))
standard_deviation(num_friends)

#interquartile range: 75th percentile minus 25th percentile
def interquantile_range(x):
 return quantile(x,0.75)-quantile(x,0.25)
interquantile_range(num_friends)

------ 5.2 correlation
#covariance: dot product of the deviations (element-wise multiply then sum), divided by n-1
from numpy import dot
def covariance(x,y):
 n=len(x)
 return dot(de_mean(x),de_mean(y))/(n-1)
covariance(num_friends,daily_minutes)
#correlation coefficient, always in [-1,1]
def correlation(x,y):
 stdev_x=standard_deviation(x)
 stdev_y=standard_deviation(y)
 if stdev_x>0 and stdev_y>0:
  return covariance(x,y)/stdev_x/stdev_y
 else:
  return 0

'''
outlier=num_friends.index(100)
num_friends_good=[x for i,x in enumerate(num_friends) if i!=outlier]
num_minutes_good=[x for i,x in enumerate(daily_minutes) if i!=outlier]


correlation(num_friends_good,num_minutes_good)
'''
------------------------------------------------ CP6 Probability
------ 6.1 independence
if E and F are independent then P(E,F)=P(E)P(F)

------ 6.2 conditional probability
if E and F are independent then P(E,F)=P(E)P(F)
in general P(E|F)=P(E,F)/P(F), i.e. P(E,F)=P(E|F)P(F); if E and F are independent, P(E|F)=P(E)
--- example: probabilities of boys and girls
assumptions:
1. each child is equally likely to be a boy or a girl
2. the sex of the second child is independent of the sex of the first
probability that both children are girls given the older is a girl:
P(B|G)=P(B,G)/P(G)=P(B)/P(G)=(1/4)/(1/2)=0.5
import random
def random_kid():
    return random.choice(['boy','girl'])
random.seed(0)
both_girl=0
either_girl=0
older_girl=0
for _ in range(1000):
    younger=random_kid()
    older=random_kid()
    if older=='girl':
        older_girl+=1
    if older=='girl' and younger=='girl':
        both_girl+=1
    if older=='girl' or younger=='girl':
        either_girl+=1
print('P(both|older):',both_girl/older_girl)
print('P(both|either):',both_girl/either_girl)

------ 6.3 Bayes' theorem
P(E|F)=P(E,F)/P(F)=P(F|E)P(E)/P(F)
P(F)=P(F,E)+P(F,-E)     (-E means E did not happen)
==> P(E|F)=P(F|E)P(E)/(P(F,E)+P(F,-E))=P(F|E)P(E)/(P(F|E)P(E)+P(F|-E)P(-E))
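A quick numeric check of Bayes' theorem, using the classic rare-disease illustration (the numbers below are only for illustration):
# 1 in 10,000 people has the disease; the test is 99% accurate in both directions
p_d, p_t_given_d, p_t_given_not_d = 0.0001, 0.99, 0.01
p_d_given_t = (p_t_given_d*p_d) / (p_t_given_d*p_d + p_t_given_not_d*(1-p_d))
print(p_d_given_t)      # ---> about 0.0098: a positive test still means less than a 1% chance of disease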

------ 6.4 random variables
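A minimal sketch of the expected value of a discrete random variable (illustration only, using the usual definition E[X] = sum of value*probability):
def expected_value(outcomes_and_probs):
    # E[X] = sum over outcomes of value * probability
    return sum(x*p for x, p in outcomes_and_probs)
die = [(i, 1/6) for i in range(1, 7)]
print(expected_value(die))      # ---> 3.5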

------ 6.5 continuous distributions
#probability density function of the uniform distribution
def uniform_pdf(x):
    return 1 if x>=0 and x<1 else 0
#cumulative distribution function
def uniform_cdf(x):
    if x<0:return 0
    elif x<1:return x
    else: return 1

------ 6.6 the normal distribution
f(x|mu,sigma)=(1/(sqrt(2*pi)*sigma))*exp(-(x-mu)^2/(2*sigma^2))
with mu=0 and sigma=1 this is the standard normal distribution
import math
import matplotlib
import matplotlib.pyplot as pl
def normal_pdf(x,mu=0,sigma=1):
    sqrt_two_pi=math.sqrt(2*math.pi)
    return (math.exp(-(x-mu)**2/2/sigma**2)/(sqrt_two_pi*sigma))
xs=[x/10 for x in range(-50,50)]

pl.plot(xs,[normal_pdf(x,sigma=1) for x in xs],'-',label='mu=0,sigma=1')
pl.plot(xs,[normal_pdf(x,sigma=2) for x in xs],'--',label='mu=0,sigma=2')
pl.plot(xs,[normal_pdf(x,sigma=0.5) for x in xs],':',label='mu=0,sigma=0.5')
pl.plot(xs,[normal_pdf(x,mu=-1) for x in xs],'-.',label='mu=-1,sigma=1')
pl.legend()
pl.title('several normal probability density functions')
pl.show()
#cumulative distribution function of the standard normal
def normal_cdf(x,mu=0,sigma=1):
    return(1+math.erf((x-mu)/math.sqrt(2)/sigma))/2
#draw some normal cumulative distribution functions
pl.plot(xs,[normal_cdf(x,sigma=1) for x in xs],'-',label='mu=0,sigma=1')
pl.plot(xs,[normal_cdf(x,sigma=2) for x in xs],'--',label='mu=0,sigma=2')
pl.plot(xs,[normal_cdf(x,sigma=0.5) for x in xs],':',label='mu=0,sigma=0.5')
pl.plot(xs,[normal_cdf(x,mu=-1) for x in xs],'-.',label='mu=-1,sigma=1')
pl.legend(loc=4)
pl.title('several normal cumulative distribution functions')
pl.show()
#invert normal_cdf to find the value for a given probability, using binary search
def inverse_normal_cdf(p,mu=0,sigma=1,tolerance=0.00001):
    if mu!=0 or sigma!=1:
        return mu+sigma*inverse_normal_cdf(p,tolerance=tolerance)
    low_z,low_p=-10.0,0
    hi_z,hi_p=10.0,1
    while hi_z-low_z>tolerance:
        mid_z=(low_z+hi_z)/2
        mid_p=normal_cdf(mid_z)
        if mid_p<p:
            low_z,low_p=mid_z,mid_p
        elif mid_p>p:
            hi_z,hi_p=mid_z,mid_p
        else:
            break
    return mid_z

------ 6.7 the central limit theorem
a random variable defined as the average of a large number of independent, identically distributed random variables is itself approximately normally distributed
def bernoulli_trial(p):
    return 1 if random.random() < p else 0
def binomial(n,p):
    return sum(bernoulli_trial(p) for _ in range(n))
#plot the binomial distribution against its normal approximation
def make_hist(p,n,num_points):
    data=[binomial(n,p) for _ in range(num_points)]
    histogram=Counter(data)
    pl.bar([x-0.4 for x in histogram.keys()],[v/num_points for v in histogram.values()],0.8,color='0.75')
    mu=p*n
    sigma=math.sqrt(n*p*(1-p))
    xs=range(min(data),max(data)+1)
    ys=[normal_cdf(i+0.5,mu,sigma)-normal_cdf(i-0.5,mu,sigma) for i in xs]
    pl.plot(xs,ys)
    pl.title('binomial distribution vs. normal approximation')
    pl.show()
make_hist(0.75,100,10000)



------------------------------------------------ CP7 Hypothesis testing
------------ coin-flip example
import math
#normal approximation to the binomial
def normal_appro_to_binomial(n,p):
 mu=p*n
 sigma=math.sqrt(p*(1-p)*n)
 return mu,sigma
#the normal CDF is the probability that the variable falls below a threshold
normal_probability_below=normal_cdf
def normal_probability_above(lo,mu=0,sigma=1):return 1-normal_cdf(lo,mu,sigma)
def normal_probability_between(lo,hi,mu=0,sigma=1):return normal_cdf(hi,mu,sigma)-normal_cdf(lo,mu,sigma)
def normal_probability_outside(lo,hi,mu=0,sigma=1):return 1-normal_probability_between(lo,hi,mu,sigma)

def normal_upper_bound(probability,mu=0,sigma=1):return inverse_normal_cdf(probability,mu,sigma)
def normal_lower_bound(probability,mu=0,sigma=1):return inverse_normal_cdf(1-probability,mu,sigma)
def normal_two_sided_bounds(probability,mu=0,sigma=1):
 tail_probability=(1-probability)/2
 upper_bound=normal_lower_bound(tail_probability,mu,sigma)
 lower_bound=normal_upper_bound(tail_probability,mu,sigma)
 return lower_bound,upper_bound

mu0,sigma0=normal_appro_to_binomial(1000,0.5)
normal_two_sided_bounds(0.95,mu0,sigma0)

#power of the test
lo,hi=normal_two_sided_bounds(0.95,mu0,sigma0)
mu1,sigma1=normal_appro_to_binomial(1000,0.55)
type_2_probability=normal_probability_between(lo,hi,mu1,sigma1)
power=1-type_2_probability

hi=normal_upper_bound(0.95,mu0,sigma0)
type_2_probability=normal_probability_below(hi,mu1,sigma1)
power=1-type_2_probability

#two-sided test of whether the coin is fair
def two_sided_p_value(x,mu=0,sigma=1):
 if x >= mu:
  return 2*normal_probability_above(x,mu,sigma)
 else:
  return 2*normal_probability_below(x,mu,sigma)
two_sided_p_value(529.5,mu0,sigma0)
#simulation check
extreme_value_count=0
for _ in range(100000):
 num_heads=sum(1 if random.random()<0.5 else 0 for _ in range(1000))
 if num_heads>= 530 or num_heads<=470:
  extreme_value_count+=1
print(extreme_value_count/100000)
# so we get
upper_p_value=normal_probability_above
lower_p_value=normal_probability_below

#one-sided test: with 525 heads compute the first value, with 527 heads the second
upper_p_value(524.5,mu0,sigma0)
upper_p_value(526.5,mu0,sigma0)

------------ confidence intervals
math.sqrt(p*(1-p)/1000)
#estimate p_hat from the observed number of heads
p_hat=525/1000
mu=p_hat
sigma=math.sqrt(mu*(1-mu)/1000)
#compute the confidence interval
normal_two_sided_bounds(0.95,mu,sigma)

------------P-hacking
def run_exp():
 return [random.random()<0.5 for _ in range(1000)]
def reject(exp):
 num_heads=len([flip for flip in exp if flip])
 return num_heads<469 or num_heads>531
random.seed(0)
exps=[run_exp() for _ in range(1000)]
num_rejections=len([exp for exp in exps if reject(exp)])
print(num_rejections)

------------ A/B test example
def estimated_parameters(N,n):
 p=n/N
 sigma=math.sqrt(p*(1-p)/N)
 return p,sigma

def a_b_test_statistic(N_A,n_A,N_B,n_B):
 P_A,sigma_A=estimated_parameters(N_A,n_A)
 P_B,sigma_B=estimated_parameters(N_B,n_B)
 return (P_B-P_A)/math.sqrt(sigma_A**2+sigma_B**2)

z=a_b_test_statistic(1000,200,1000,180)
two_sided_p_value(z)
z=a_b_test_statistic(1000,200,1000,150)
two_sided_p_value(z)

------------ Bayesian inference
def b(alpha,beta):
 return math.gamma(alpha)*math.gamma(beta)/math.gamma(alpha+beta)
def beta_pdf(x,alpha,beta):
 if x<0 or x>1:
  return 0
 return x**(alpha-1)*(1-x)**(beta-1)/b(alpha,beta)
alpha/(alpha+beta)      # mean of the Beta(alpha,beta) distribution
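A small sketch of the Beta/binomial update behind this: with a Beta(alpha, beta) prior and h observed heads and t tails, the posterior is Beta(alpha+h, beta+t), so its mean is:
def posterior_mean(alpha, beta, h, t):
    # mean of the Beta(alpha+h, beta+t) posterior
    return (alpha+h)/(alpha+beta+h+t)
print(posterior_mean(1, 1, 7, 3))      # uniform Beta(1,1) prior, 7 heads and 3 tails ---> 0.666...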



------------------------------------------------ CP8 Gradient descent
#find the minimum of y=2*x**2-2*x+1 by gradient descent
x = 6
step = 0.1
for i in range(1000):
    x -= step * (4 * x-2)
def y(x):
    return 2 * (x ** 2)-2*x + 1
print ('x:',x,'y:',y(x))
#sum of squares function
def sum_of_squares(v):
    return sum(v_i**2 for v_i in v)

------ 8.2 estimating the gradient
from functools import partial
#difference quotient
def difference_quotient(f,x,h):return(f(x+h)-f(x))/h
#x^2
def square(x):return x*x
#the derivative of x^2 is 2x
def derivative(x):return 2*x

derivative_estimate=partial(difference_quotient,square,h=0.00001)
x=range(-10,10)
pl.title('actual derivatives vs. estimates')
pl.plot(x,list(map(derivative,x)),'rx',label='Actual')
pl.plot(x,list(map(derivative_estimate,x)),'b+',label='Estimate')
pl.legend(loc=9)
pl.show()
#treat f as a function of its i-th variable, holding the others fixed, to get the i-th partial derivative
def partial_difference_quotient(f,v,i,h):
    w=[v_j+(h if j==i else 0) for j,v_j in enumerate(v)]
    return (f(w)-f(v))/h

------ 8.3 using the gradient (p90)
def step(v,direction,step_size):
    return [v_i+step_size*direction_i for v_i,direction_i in zip(v,direction)]
def sum_of_squares_gradient(v):
    return [2*v_i for v_i in v]
v=[random.randint(-10,10) for i in range(3)]
tolerance=0.0000001
while True:
    gradient=sum_of_squares_gradient(v)
    next_v=step(v,gradient,-0.01)
    if distance(next_v,v)<tolerance:
        break
    v=next_v

------ 8.4 choosing the right step size
step_size=[100,10,1,0.1,0.01,0.001,0.0001,0.00001]
def safe(f):
    def safe_f(*args,**kwargs):
        try:
            return f(*args,**kwargs)
        except:
            return float('inf')
    return safe_f

------ 8.5 putting it all together
def minimize_batch(target_fn,gradient_fn,theta_0,tolerance=0.000001):
    step_sizes=[100,10,1,0.1,0.01,0.001,0.0001,0.00001]
    theta=theta_0
    target_fn=safe(target_fn)
    value=target_fn(theta)
    while True:
        gradient=gradient_fn(theta)
        next_thetas=[step(theta,gradient,-step_size) for step_size in step_sizes]
        next_theta=min(next_thetas,key=target_fn)
        next_value=target_fn(next_theta)
        if abs(value-next_value) < tolerance:
            return theta
        else:
            theta,value=next_theta,next_value

def negate(f):
    return lambda *args,**kwargs:-f(*args,**kwargs)
def negate_all(f):
    return lambda *args,**kwargs:[-y for y in f(*args,**kwargs)]
def maximize_batch(target_fn,gradient_fn,theta_0,tolerance=0.000001):
    return minimize_batch(negate(target_fn),negate_all(gradient_fn),theta_0,tolerance)

------ 8.6 stochastic gradient descent
def in_random_order(data):
    indexes=[i for i,_ in enumerate(data)]
    random.shuffle(indexes)
    for i in indexes:
        yield data[i]
def minimize_stochastic(target_fn,gradient_fn,x,y,theta_0,alpha_0=0.01):
    data=list(zip(x,y))
    theta=theta_0
    alpha=alpha_0
    min_theta,min_value=None,float('inf')
    iterations_with_no_improvement=0
    while iterations_with_no_improvement<100:
        value=sum(target_fn(x_i,y_i,theta) for x_i,y_i in data)
        if value<min_value:
            min_theta,min_value=theta,value
            iterations_with_no_improvement=0
            alpha=alpha_0
        else:
            iterations_with_no_improvement+=1
            alpha*=0.9
        for x_i,y_i in in_random_order(data):
            gradient_i=gradient_fn(x_i,y_i,theta)
            theta=vector_subtract(theta,scalar_multiply(alpha,gradient_i))
    return min_theta
def maximize_stochastic(target_fn,gradient_fn,x,y,theta_0,alpha_0=0.01):
    return minimize_stochastic(negate(target_fn),negate_all(gradient_fn),x,y,theta_0,alpha_0)

------------------------------------------------ CP9 Getting data
------9.1 stdin&stdout
#egrep.py
import sys,re
regex=sys.argv[1]
for line in sys.stdin:
 if re.search(regex,line):
  sys.stdout.write(line)
#line_count.py
import sys
count=0
for line in sys.stdin:
 count+=1
print(count)    # goes to sys.stdout
#execute from the command line:
type somefile.txt | python egrep.py "[0-9]" | python line_count.py
#use the Script calculate mostcommon_value
#most_common_words.py
import sys
from collections import Counter
try:
 num_words=int(sys.argv[1])
except:
 print('usage:most_common_words.py num_words')
 sys.exit(1)
counter=Counter(word.lower() for line in sys.stdin for word in line.strip().split() if word)
for word,count in counter.most_common(num_words):
 sys.stdout.write(str(count))
 sys.stdout.write("\t")
 sys.stdout.write(word)
 sys.stdout.write("\n")

#if you want to EXECUTE this Script to Bible
C:\DataScience>type the_bible.txt|python most_common_words.py 10

------9.2 read file 
---9.2.1 text_file basic
#reading
file_for_reading=open('reading_file.txt','r')
#writing
file_for_writing=open('reading_file.txt','w')
#appending
file_for_appending=open('reading_file.txt','a')
file_for_writing.close()

with open(filename,'r') as f:
 data=function_that_gets_data_from(f)
 process(data)

starts_with_hash=0
#count lines that start with '#'
with open('input.txt','r') as f:
 for line in f:
  if re.match('^#',line):
   starts_with_hash+=1

def get_domain(email_address):
 return email_address.lower().split('@')[-1]
with open('email_address.txt','r') as f:
 domain_counts=Counter(get_domain(line.strip()) for line in f if '@' in line)

a=open('C:/Users/zhangjiajun858/Desktop/te.txt','r')
with open('C:/Users/zhangjiajun858/Desktop/te.txt','r') as f:
 for line in f:
  print(line)

------ 9.2.2 delimited files
import csv
with open('C:/Users/zhangjiajun858/Desktop/te.csv','r') as f:    # use 'rb' in Python 2
    reader=csv.reader(f,delimiter='\t')
    for row in reader:
        date=row[0]
        symbol=row[1]
        closing_price=float(row[2])
        process(date,symbol,closing_price)

------ 9.3 scraping the web (p99)
--- 9.3.2 example: O'Reilly books
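A minimal scraping sketch with requests + BeautifulSoup (the URL and the html5lib parser choice are placeholders/assumptions, not the book's exact example code):
from bs4 import BeautifulSoup
import requests
html = requests.get("http://www.example.com").text      # placeholder URL
soup = BeautifulSoup(html, 'html5lib')
first_paragraph = soup.find('p')                         # first <p> tag, or None
first_paragraph_text = first_paragraph.text if first_paragraph else ''
paragraph_ids = [p.get('id') for p in soup('p') if p.get('id')]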

------ 9.4 using APIs
---9.4.1 JSON(XML)
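A minimal JSON (de)serialization sketch (the string below is made up for illustration):
import json
serialized = '{"title":"Data Science Book","author":"Joel Grus","topics":["data","science"]}'
deserialized = json.loads(serialized)      # parse the JSON string into a Python dict
if "data" in deserialized["topics"]:
    print(deserialized)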

--- 9.4.2 using an unauthenticated API
import requests,json
endpoint="https://api.github.com/users/joelgrus/repos"
repos=json.loads(requests.get(endpoint).text)

from dateutil.parser import parse
dates=[parse(repo['created_at']) for repo in repos]
month_counts=Counter(date.month for date in dates)
weekday_counts=Counter(date.weekday() for date in dates)

---9.4.3

------ 9.5 example: using the Twitter API
from twython import Twython
twitter=Twython(CONSUMER_KEY,CONSUMER_SECRET)
for status in twitter.search(q='"data science"')['statuses']:
 user=status['user']['screen_name'].encode('utf-8')
 text=status['text'].encode('utf-8')
 print(user,':',text)
 print()

------------------------------------------------ CP10 Working with data
------ 10.1 exploring one-dimensional data
import matplotlib.pyplot as pl
import math
from collections import Counter
def bucketize(point,bucket_size):return bucket_size*math.floor(point/bucket_size)
def make_histogram(points,bucket_size):return Counter(bucketize(point,bucket_size) for point in points)
def plot_histogram(points,bucket_size,title=''):
 histogram=make_histogram(points,bucket_size)
 pl.bar(histogram.keys(),histogram.values(),width=bucket_size)
 pl.title(title)
 pl.show()
random.seed(0)
uniform=[200*random.random()-100 for _ in range(1000)]
normal=[57*inverse_normal_cdf(random.random()) for _ in range(1000)]
plot_histogram(uniform,10,'uniform histogram')
plot_histogram(normal,10,'normal histogram')

------ 10.1.2 two-dimensional data
def random_normal():return inverse_normal_cdf(random.random())
xs=[random_normal() for _ in range(1000)]
ys1=[x+random_normal()/2 for x in xs]
ys2=[-x+random_normal()/2 for x in xs]
pl.scatter(xs,ys1,marker='.',color='black',label='ys1')
pl.scatter(xs,ys2,marker='.',color='gray',label='ys2')
pl.xlabel('xs')
pl.ylabel('ys')
pl.legend(loc=9)
pl.title('very different joint distributions')
pl.show()

print(correlation(xs,ys1))
print(correlation(xs,ys2))

------ 10.1.3 many dimensions
def correlation_matrix(data):
 _,num_columns=shape(data)
 def matrix_entry(i,j):
  return correlation(get_column(data,i),get_column(data,j))
 return make_matrix(num_columns,num_columns,matrix_entry)

import matplotlib.pyplot as pl
_,num_columns=shape(data)
fig,ax=pl.subplots(num_columns,num_columns)
for i in range(num_columns):
 for j in range(num_columns):
  if i !=j :ax[i][j].scatter(get_column(data,j),get_column(data,i))
  else:ax[i][j].annotate('series'+str(i),(0.5,0.5),xycoords='axes fraction',ha='center',va='center')
  if i <num_columns-1:ax[i][j].xaxis.set_visible(False)
  if j>0:ax[i][j].yaxis.set_visible(False)
ax[-1][-1].set_xlim(ax[0][-1].get_xlim())
ax[0][0].set_ylim(ax[0][1].get_ylim())
pl.show()

------ 10.2 cleaning and munging
def parse_row(input_row,parsers):
 return [parser(value) if parser is not None else value for value,parser in zip(input_row,parsers)]
def parse_rows_with(reader,parsers):
 for row in reader:
  yield parse_row(row,parsers)
def try_or_none(f):
 def f_or_none(x):
  try:return f(x)
  except: return None
 return f_or_none
#rewrite parse_row to use try_or_none
def parse_row(input_row,parsers):
 return [try_or_none(parser)(value) if parser is not None else value for value,parser in zip(input_row,parsers)]

import dateutil.parser
data=[]
with open('stock_prices.csv','r') as f:    # filename not given in the notes; placeholder
 reader=csv.reader(f)
 for line in parse_rows_with(reader,[dateutil.parser.parse,None,float]):
  data.append(line)
for row in data:
 if any(x is None for x in row):
  print(row)
def try_parse_field(field_name,value,parser_dict):
 parser=parser_dict.get(field_name)
 if parser is not None:
  return try_or_none(parser)(value)
 else:
  return value
def parse_dict(input_dict,parser_dict):
 return{field_name:try_parse_field(field_name,value,parser_dict)
        for field_name,value in input_dict.items()}


------ 10.3 manipulating data (P121)
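A small munging sketch in the spirit of this section (it assumes rows are dicts with 'symbol' and 'closing_price' keys; the variable rows and that format are my assumption, not the parsed lists built above):
from collections import defaultdict
# highest closing price seen for each stock symbol
max_price_by_symbol = defaultdict(float)
for row in rows:                      # rows: list of dicts like {'symbol':..., 'closing_price':...}
    symbol, price = row['symbol'], row['closing_price']
    if price > max_price_by_symbol[symbol]:
        max_price_by_symbol[symbol] = price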
------ 10.4 rescaling
import math
def sum_of_squares(v):
    return sum(v_i**2 for v_i in v)
def magnitude(v):
    return math.sqrt(sum_of_squares(v))
def vector_subtract(v,w):
    return[v_i-w_j for v_i,w_j in zip(v,w)]
def distance(v,w):
    return magnitude(vector_subtract(v,w))
ab=distance([63,150],[67,160])
ac=distance([63,150],[70,171])
bc=distance([67,160],[70,171])
#calculate the mean and standard deviation of each column
def scale(data_matrix):
    num_rows,num_cols=shape(data_matrix)
    means=[mean(get_column(data_matrix,j)) for j in range(num_cols)]
    stdevs=[standard_deviation(get_column(data_matrix,j)) for j in range(num_cols)]
    return means,stdevs
#then build a rescaled matrix
def rescale(data_matrix):
    means,stdevs=scale(data_matrix)
    def rescaled(i,j):
        if stdevs[j]>0:
            return (data_matrix[i][j]-means[j])/stdevs[j]
        else:
            return data_matrix[i][j]
    num_rows,num_cols=shape(data_matrix)
    return make_matrix(num_rows,num_cols,rescaled)

------ 10.5 dimensionality reduction
#shift the data so every dimension has mean zero
def fr_mean_matrix(a):
    nr,nc=shape(a)
    column_means,_ = scale(a)
    return make_matrix(nr,nc,lambda i,j:a[i][j]-column_means[j])
#direction of a vector
def direction(w):
    mag=magnitude(w)
    return [w_i/mag for w_i in w]
#variance of the data in direction w
def directional_variance_i(x_i,w):
    return dot(x_i,direction(w))**2
def directional_variance(X,w):
    return sum(directional_variance_i(x_i,w) for x_i in X)
#use gradient descent to find the direction with the largest variance
def directional_variance_gradient_i(x_i,w):
    projection_length=dot(x_i,direction(w))
    return [2*projection_length*x_ij for x_ij in x_i]
def directional_variance_gradient(X,w):
    return vector_sum([directional_variance_gradient_i(x_i,w) for x_i in X])
#first principal component via stochastic gradient descent
def first_principal_component_sgd(X):
    guess=[1 for _ in X[0]]
    unscaled_maximizer=maximize_stochastic(
        lambda x,_, w:directional_variance_i(x,w),
        lambda x,_, w:directional_variance_gradient_i(x,w),
        X,
        [None for _ in X],guess)
    return direction(unscaled_maximizer)
#once we have the first principal component direction, project the data onto it to get that component's values
def project(v,w):
    projection_length=dot(v,w)
    return scalar_multiply(projection_length,w)
#if we want further components, first remove the projection (p127)
def remove_projection_from_vector(v,w):
    return vector_subtract(v,project(v,w))
def remove_projection(X,w):
    return [remove_projection_from_vector(x_i,w) for x_i in X]
#on a higher-dimensional dataset we can iterate to find as many principal components as we want
def principal_component_analysis(X,num_components):
    components=[]
    for _ in range(num_components):
        component=first_principal_component_sgd(X)
        components.append(component)
        X=remove_projection(X,component)
    return components
#then transform the data into the lower-dimensional space spanned by the components
def transform_vector(v,components):
    return [dot(v,w) for w in components]
def transform(X,components):
    return [transform_vector(x_i,components) for x_i in X ]
--- p128
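A hedged usage sketch of the helpers above (the tiny matrix is made up; this only shows the intended call order):
X = [[1.0, 2.0, 3.0], [2.0, 4.0, 6.1], [3.0, 6.1, 9.0], [4.0, 8.0, 12.2]]
X = fr_mean_matrix(X)                              # de-mean every column first
components = principal_component_analysis(X, 2)    # first two principal components
X_2d = transform(X, components)                    # each row is now a 2-dimensional point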



------------------------------------------------ CP11 Machine learning
#split a dataset into train/test
def split_data(data,prob):
    results=[],[]
    for row in data:
        results[0 if random.random()<prob else 1].append(row)
    return results
def train_test_split(x,y,test_pct):
    data=list(zip(x,y))
    train,test=split_data(data,1-test_pct)
    x_train,y_train=zip(*train)
    x_test,y_test=zip(*test)
    return x_train,x_test,y_train,y_test
#=>
model=SomeKindOfModel()
x_train,x_test,y_train,y_test=train_test_split(xs,ys,0.33)
model.train(x_train,y_train)
performance=model.test(x_test,y_test)
#accuracy
def accuracy(tp,fp,fn,tn):
    correct=tp+tn
    total=tp+fp+fn+tn
    return correct/total
print(accuracy(70,4930,13930,981070))
#precision and recall
def precision(tp,fp,fn,tn):
    return tp/(tp+fp)
print(precision(70,4930,13930,981070))
def recall(tp,fp,fn,tn):
    return tp/(tp+fn)
print(recall(70,4930,13930,981070))
#F1 score (harmonic mean of precision and recall)
def f1_score(tp,fp,fn,tn):
    p=precision(tp,fp,fn,tn)
    r=recall(tp,fp,fn,tn)
    return 2*p*r/(p+r)
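For the same confusion-matrix counts used in the prints above (precision 0.014, recall 0.005), the F1 score works out to roughly 0.0074:
print(f1_score(70,4930,13930,981070))      # ---> about 0.00736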




import random
import numpy as np


# scratch: collect numbers below 1,000,000 divisible by 3 or 4, then sum triples of them
a=[]
b=[]
c=[]
for i in range(1000000):
    if i%3==0 or i%4==0:
        a.append(i)
a.remove(0)
for x in a:
    for y in a:
        for z in a:
            n=x+y+z
            b.append(n)
            for i in b:
                if i%x==0 and i%y==0 and i%z==0:
                    print(i)
print(c[:2])


------------------------------------------------ CP12 k-Nearest Neighbors
#vote-counting function
from collections import Counter
def raw_majority_vote(labels):
 votes=Counter(labels)
 winner,_=votes.most_common(1)[0]
 return winner

#ways to break ties: pick one of the winners at random,
#weight the votes by distance and pick the weighted winner,
#or reduce k until there is a unique winner (used below)
def majority_vote(labels):
 vote_counts=Counter(labels)
 winner,winner_count=vote_counts.most_common(1)[0]
 num_winners=len([count for count in vote_counts.values() if count==winner_count])
 if num_winners==1:
  return winner
 else:
  return majority_vote(labels[:-1])

#the classifier
def knn_classify(k,labeled_points,new_point):
 #each labeled point is a pair (point, label)
 by_distance=sorted(labeled_points,key=lambda point_label: distance(point_label[0],new_point))
 k_nearest_labels=[label for _,label in by_distance[:k]]
 return majority_vote(k_nearest_labels)

#12.2 example: favorite programming languages
cities = [(-86.75,33.5666666666667,'Python'),(-88.25,30.6833333333333,'Python'),(-112.016666666667,33.4333333333333,'Java'),(-110.933333333333,32.1166666666667,'Java'),(-92.2333333333333,34.7333333333333,'R'),(-121.95,37.7,'R'),(-118.15,33.8166666666667,'Python'),(-118.233333333333,34.05,'Java'),(-122.316666666667,37.8166666666667,'R'),(-117.6,34.05,'Python'),(-116.533333333333,33.8166666666667,'Python'),(-121.5,38.5166666666667,'R'),(-117.166666666667,32.7333333333333,'R'),(-122.383333333333,37.6166666666667,'R'),(-121.933333333333,37.3666666666667,'R'),(-122.016666666667,36.9833333333333,'Python'),(-104.716666666667,38.8166666666667,'Python'),(-104.866666666667,39.75,'Python'),(-72.65,41.7333333333333,'R'),(-75.6,39.6666666666667,'Python'),(-77.0333333333333,38.85,'Python'),(-80.2666666666667,25.8,'Java'),(-81.3833333333333,28.55,'Java'),(-82.5333333333333,27.9666666666667,'Java'),(-84.4333333333333,33.65,'Python'),(-116.216666666667,43.5666666666667,'Python'),(-87.75,41.7833333333333,'Java'),(-86.2833333333333,39.7333333333333,'Java'),(-93.65,41.5333333333333,'Java'),(-97.4166666666667,37.65,'Java'),(-85.7333333333333,38.1833333333333,'Python'),(-90.25,29.9833333333333,'Java'),(-70.3166666666667,43.65,'R'),(-76.6666666666667,39.1833333333333,'R'),(-71.0333333333333,42.3666666666667,'R'),(-72.5333333333333,42.2,'R'),(-83.0166666666667,42.4166666666667,'Python'),(-84.6,42.7833333333333,'Python'),(-93.2166666666667,44.8833333333333,'Python'),(-90.0833333333333,32.3166666666667,'Java'),(-94.5833333333333,39.1166666666667,'Java'),(-90.3833333333333,38.75,'Python'),(-108.533333333333,45.8,'Python'),(-95.9,41.3,'Python'),(-115.166666666667,36.0833333333333,'Java'),(-71.4333333333333,42.9333333333333,'R'),(-74.1666666666667,40.7,'R'),(-106.616666666667,35.05,'Python'),(-78.7333333333333,42.9333333333333,'R'),(-73.9666666666667,40.7833333333333,'R'),(-80.9333333333333,35.2166666666667,'Python'),(-78.7833333333333,35.8666666666667,'Python'),(-100.75,46.7666666666667,'Java'),(-84.5166666666667,39.15,'Java'),(-81.85,41.4,'Java'),(-82.8833333333333,40,'Java'),(-97.6,35.4,'Python'),(-122.666666666667,45.5333333333333,'Python'),(-75.25,39.8833333333333,'Python'),(-80.2166666666667,40.5,'Python'),(-71.4333333333333,41.7333333333333,'R'),(-81.1166666666667,33.95,'R'),(-96.7333333333333,43.5666666666667,'Python'),(-90,35.05,'R'),(-86.6833333333333,36.1166666666667,'R'),(-97.7,30.3,'Python'),(-96.85,32.85,'Java'),(-95.35,29.9666666666667,'Java'),(-98.4666666666667,29.5333333333333,'Java'),(-111.966666666667,40.7666666666667,'Python'),(-73.15,44.4666666666667,'R'),(-77.3333333333333,37.5,'Python'),(-122.3,47.5333333333333,'Python'),(-89.3333333333333,43.1333333333333,'R'),(-104.816666666667,41.15,'Java')]
#processing
cities = [([longitude, latitude], language) for longitude, latitude, language in cities]
# key is language, value is pair (longitudes, latitudes)
plots = { "Java" : ([], []), "Python" : ([], []), "R" : ([], []) }    
# we want each language to have a different marker and color
markers = { "Java" : "o", "Python" : "s", "R" : "^" }
colors = { "Java" : "r", "Python" : "b", "R" : "g" }
for (longitude, latitude), language in cities:
 plots[language][0].append(longitude)
 plots[language][1].append(latitude)
# create a scatter series for each language
for language, (x, y) in plots.items():
 plt.scatter(x, y, color=colors[language], marker=markers[language],
             label=language, zorder=10)
#plot_state_borders(plt) # assume we have a function that does this
plt.legend(loc=0) # let matplotlib choose the location
plt.axis([-130,-60,20,55]) # set the axes
plt.title("Favorite Programming Languages")
plt.show()

'''for k in [1, 3, 5, 7]:
 num_correct = 0
 for city in cities:
  location, actual_language=city
  other_cities = [other_city for other_city in cities if other_city != city]
  predicted_language = knn_classify(k, other_cities, location)
  if predicted_language == actual_language:
   num_correct += 1
 print(k, "neighbor[s]:", num_correct, "correct out of", len(cities))

#TypeError: unsupported operand type(s) for -: 'list' and 'float'
'''

#12.3 the curse of dimensionality
#generate a random point with the given number of dimensions
def random_point(dim):
 return [random.random() for _ in range(dim)]

def random_distances(dim,num_pairs):
 return [distance(random_point(dim),random_point(dim)) for _ in range(num_pairs)]

dimensions=range(1,101)
avg_distances=[]
min_distances=[]
random.seed(0)
for dim in dimensions:
 distances=random_distances(dim,10000)
 avg_distances.append(mean(distances))
 min_distances.append(min(distances))

min_avg_ratio=[min_dist/avg_dist for min_dist,avg_dist in zip(min_distances,avg_distances)]



------------------------------------------------ CP13 Naive Bayes
def tokenize(message):
 message = message.lower()                      # convert to lowercase
 all_words = re.findall("[a-z0-9']+", message)  # extract the words
 return set(all_words)                          # remove duplicates

def count_words(training_set):
 """training set consists of pairs (message, is_spam)"""
 counts = defaultdict(lambda: [0, 0])
 for message, is_spam in training_set:
  for word in tokenize(message):
   counts[word][0 if is_spam else 1] += 1
 return counts

def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
 """turn the word_counts into a list of triplets w, p(w | spam) and p(w | ~spam)"""
 return [(w,(spam + k) / (total_spams + 2 * k),(non_spam + k) / (total_non_spams + 2 * k)) for w, (spam, non_spam) in counts.items()]

def spam_probability(word_probs, message):
 message_words = tokenize(message)
 log_prob_if_spam = log_prob_if_not_spam = 0.0
 for word, prob_if_spam, prob_if_not_spam in word_probs:
# for each word in the message,
# add the log probability of seeing it
  if word in message_words:
   log_prob_if_spam += math.log(prob_if_spam)
   log_prob_if_not_spam += math.log(prob_if_not_spam)
# for each word that's not in the message
# add the log probability of _not_ seeing it
  else:
   log_prob_if_spam += math.log(1.0 - prob_if_spam)
   log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)    
 prob_if_spam = math.exp(log_prob_if_spam)
 prob_if_not_spam = math.exp(log_prob_if_not_spam)
 return prob_if_spam / (prob_if_spam + prob_if_not_spam)


class NaiveBayesClassifier:
 def __init__(self, k=0.5):
  self.k = k
  self.word_probs = []
 def train(self, training_set):
# count spam and non-spam messages
  num_spams = len([is_spam for message, is_spam in training_set if is_spam])
  num_non_spams = len(training_set) - num_spams
# run training data through our "pipeline"
  word_counts = count_words(training_set)
  self.word_probs = word_probabilities(word_counts,num_spams,num_non_spams,self.k)
 def classify(self, message):
  return spam_probability(self.word_probs, message)
    
#13.4 testing the model
import glob, re, random

data = []
# path should point at the downloaded spam/ham corpus files (a wildcarded path; not given in the notes)
# regex for stripping out the leading "Subject:" and any spaces after it
subject_regex = re.compile(r"^Subject:\s+")
# glob.glob returns every filename that matches the wildcarded path
for fn in glob.glob(path):
 is_spam = "ham" not in fn
 with open(fn,'r') as file:
  for line in file:
   if line.startswith("Subject:"):
    subject = subject_regex.sub("", line).strip()
    data.append((subject, is_spam))
    
random.seed(0) # just so you get the same answers as me
train_data, test_data = split_data(data, 0.75)
classifier = NaiveBayesClassifier()
classifier.train(train_data)
classified = [(subject, is_spam, classifier.classify(subject)) for subject, is_spam in test_data]
counts = Counter((is_spam, spam_probability > 0.5) for _, is_spam, spam_probability in classified)
classified.sort(key=lambda row: row[2])
spammiest_hams = list(filter(lambda row: not row[1], classified))[-5:]
hammiest_spams = list(filter(lambda row: row[1], classified))[:5]


def p_spam_given_word(word_prob):
 word, prob_if_spam, prob_if_not_spam = word_prob
 return prob_if_spam / (prob_if_spam + prob_if_not_spam)
words = sorted(classifier.word_probs, key=p_spam_given_word)    
spammiest_words = words[-5:]
hammiest_words = words[:5]

def drop_final_s(word):
 return re.sub("s$","",word)


------------------------------------------------ *args, **kwargs parameters
*args,**kwargs
https://www.cnblogs.com/xuyuanyuan123/p/6674645.html
http://python.jobbole.com/83476/

#*args collects any number of unnamed positional arguments into a tuple
#**kwargs collects keyword arguments into a dict
#so a function can accept an unspecified tuple of positionals plus a dict of keywords
*args and **kwargs let a function definition accept variable-length argument lists:
*args for unnamed positional arguments, **kwargs for keyword arguments.

def foo(*args,**kwargs):
 print('args=',args)
 print('kwargs=',kwargs)
 print('**********************')
if __name__=='__main__':
 foo(1,2,3)
 foo(a=1,b=2,c=3)
 foo(1,2,3,a=1,b=2,c=3)
 foo(1,'b','c',a=1,b='b',c='c')


------------------------------------------------ CP14 Simple Linear Regression
#Y_i=bX_i+a+e
num_friends_good = [49,41,40,25,21,21,19,19,18,18,16,15,15,15,15,14,14,13,13,13,13,12,12,11,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,8,8,8,8,8,8,8,8,8,8,8,8,8,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
daily_minutes_good = [68.77,51.25,52.08,38.36,44.54,57.13,51.4,41.42,31.22,34.76,54.01,38.79,47.59,49.1,27.66,41.03,36.73,48.65,28.12,46.62,35.57,32.98,35,26.07,23.77,39.73,40.57,31.65,31.21,36.32,20.45,21.93,26.02,27.34,23.49,46.94,30.5,33.8,24.23,21.4,27.94,32.24,40.57,25.07,19.42,22.39,18.42,46.96,23.72,26.41,26.97,36.76,40.32,35.02,29.47,30.2,31,38.11,38.18,36.31,21.03,30.86,36.07,28.66,29.08,37.28,15.28,24.17,22.31,30.17,25.53,19.85,35.37,44.6,17.23,13.47,26.33,35.02,32.09,24.81,19.33,28.77,24.26,31.98,25.73,24.86,16.28,34.51,15.23,39.72,40.8,26.06,35.76,34.76,16.13,44.04,18.03,19.65,32.62,35.59,39.43,14.18,35.24,40.13,41.82,35.45,36.07,43.67,24.61,20.9,21.9,18.79,27.61,27.21,26.61,29.77,20.59,27.53,13.82,33.2,25,33.1,36.65,18.63,14.87,22.2,36.81,25.53,24.62,26.25,18.21,28.08,19.42,29.79,32.8,35.99,28.32,27.79,35.88,29.06,36.28,14.1,36.63,37.49,26.9,18.58,38.48,24.48,18.95,33.55,14.24,29.04,32.51,25.63,22.22,19,32.73,15.16,13.9,27.2,32.01,29.27,33,13.74,20.42,27.32,18.23,35.35,28.48,9.08,24.62,20.12,35.26,19.92,31.02,16.49,12.16,30.7,31.22,34.65,13.13,27.51,33.2,31.57,14.1,33.42,17.44,10.12,24.42,9.82,23.39,30.93,15.03,21.67,31.09,33.29,22.61,26.89,23.48,8.38,27.81,32.35,23.84]
    

def predict(alpha, beta, x_i):
    return beta * x_i + alpha
#error of a prediction
def error(alpha,beta,x_i,y_i):
    return y_i-predict(alpha,beta,x_i)
#sum of squared errors
def sum_of_squared_errors(alpha,beta,x,y):
    return sum(error(alpha,beta,x_i,y_i)**2 for x_i,y_i in zip(x,y))
#least squares: the alpha and beta that minimize the squared errors
def least_squares_fit(x,y):
    beta=correlation(x,y)*standard_deviation(y)/standard_deviation(x)
    alpha=mean(y)-beta*mean(x)
    return alpha,beta

alpha,beta=least_squares_fit(num_friends_good,daily_minutes_good)

#R^2 (coefficient of determination): how well the model fits the data
def total_sum_of_squares(y):
    return sum(v**2 for v in de_mean(y))
def r_squared(alpha,beta,x,y):
    return 1.0-(sum_of_squared_errors(alpha,beta,x,y)/total_sum_of_squares(y))
r_squared(alpha,beta,num_friends_good,daily_minutes_good)                          ---> 0.329

#solve with gradient descent instead
def squared_error(x_i,y_i,theta):
    alpha,beta=theta
    return error(alpha,beta,x_i,y_i)**2
def squared_error_gradient(x_i,y_i,theta):
    alpha,beta=theta
    return [-2*error(alpha,beta,x_i,y_i),-2*error(alpha,beta,x_i,y_i)*x_i]

random.seed(0)
theta=[random.random(),random.random()]
alpha,beta=minimize_stochastic(squared_error,squared_error_gradient,num_friends_good,daily_minutes_good,theta,0.0001 )
print(alpha,beta)



------------------------------------------------ CP15 Multiple Regression

x = [[1,49,4,0],[1,41,9,0],[1,40,8,0],[1,25,6,0],[1,21,1,0],[1,21,0,0],[1,19,3,0],[1,19,0,0],[1,18,9,0],[1,18,8,0],[1,16,4,0],[1,15,3,0],[1,15,0,0],[1,15,2,0],[1,15,7,0],[1,14,0,0],[1,14,1,0],[1,13,1,0],[1,13,7,0],[1,13,4,0],[1,13,2,0],[1,12,5,0],[1,12,0,0],[1,11,9,0],[1,10,9,0],[1,10,1,0],[1,10,1,0],[1,10,7,0],[1,10,9,0],[1,10,1,0],[1,10,6,0],[1,10,6,0],[1,10,8,0],[1,10,10,0],[1,10,6,0],[1,10,0,0],[1,10,5,0],[1,10,3,0],[1,10,4,0],[1,9,9,0],[1,9,9,0],[1,9,0,0],[1,9,0,0],[1,9,6,0],[1,9,10,0],[1,9,8,0],[1,9,5,0],[1,9,2,0],[1,9,9,0],[1,9,10,0],[1,9,7,0],[1,9,2,0],[1,9,0,0],[1,9,4,0],[1,9,6,0],[1,9,4,0],[1,9,7,0],[1,8,3,0],[1,8,2,0],[1,8,4,0],[1,8,9,0],[1,8,2,0],[1,8,3,0],[1,8,5,0],[1,8,8,0],[1,8,0,0],[1,8,9,0],[1,8,10,0],[1,8,5,0],[1,8,5,0],[1,7,5,0],[1,7,5,0],[1,7,0,0],[1,7,2,0],[1,7,8,0],[1,7,10,0],[1,7,5,0],[1,7,3,0],[1,7,3,0],[1,7,6,0],[1,7,7,0],[1,7,7,0],[1,7,9,0],[1,7,3,0],[1,7,8,0],[1,6,4,0],[1,6,6,0],[1,6,4,0],[1,6,9,0],[1,6,0,0],[1,6,1,0],[1,6,4,0],[1,6,1,0],[1,6,0,0],[1,6,7,0],[1,6,0,0],[1,6,8,0],[1,6,4,0],[1,6,2,1],[1,6,1,1],[1,6,3,1],[1,6,6,1],[1,6,4,1],[1,6,4,1],[1,6,1,1],[1,6,3,1],[1,6,4,1],[1,5,1,1],[1,5,9,1],[1,5,4,1],[1,5,6,1],[1,5,4,1],[1,5,4,1],[1,5,10,1],[1,5,5,1],[1,5,2,1],[1,5,4,1],[1,5,4,1],[1,5,9,1],[1,5,3,1],[1,5,10,1],[1,5,2,1],[1,5,2,1],[1,5,9,1],[1,4,8,1],[1,4,6,1],[1,4,0,1],[1,4,10,1],[1,4,5,1],[1,4,10,1],[1,4,9,1],[1,4,1,1],[1,4,4,1],[1,4,4,1],[1,4,0,1],[1,4,3,1],[1,4,1,1],[1,4,3,1],[1,4,2,1],[1,4,4,1],[1,4,4,1],[1,4,8,1],[1,4,2,1],[1,4,4,1],[1,3,2,1],[1,3,6,1],[1,3,4,1],[1,3,7,1],[1,3,4,1],[1,3,1,1],[1,3,10,1],[1,3,3,1],[1,3,4,1],[1,3,7,1],[1,3,5,1],[1,3,6,1],[1,3,1,1],[1,3,6,1],[1,3,10,1],[1,3,2,1],[1,3,4,1],[1,3,2,1],[1,3,1,1],[1,3,5,1],[1,2,4,1],[1,2,2,1],[1,2,8,1],[1,2,3,1],[1,2,1,1],[1,2,9,1],[1,2,10,1],[1,2,9,1],[1,2,4,1],[1,2,5,1],[1,2,0,1],[1,2,9,1],[1,2,9,1],[1,2,0,1],[1,2,1,1],[1,2,1,1],[1,2,4,1],[1,1,0,1],[1,1,2,1],[1,1,2,1],[1,1,5,1],[1,1,3,1],[1,1,10,1],[1,1,6,1],[1,1,0,1],[1,1,8,1],[1,1,6,1],[1,1,4,1],[1,1,9,1],[1,1,9,1],[1,1,4,1],[1,1,2,1],[1,1,9,1],[1,1,0,1],[1,1,8,1],[1,1,6,1],[1,1,1,1],[1,1,1,1],[1,1,5,1]]
daily_minutes_good =[68.77,51.25,52.08,38.36,44.54,57.13,51.4,41.42,31.22,34.76,54.01,38.79,47.59,49.1,27.66,41.03,36.73,48.65,28.12,46.62,35.57,32.98,35,26.07,23.77,39.73,40.57,31.65,31.21,36.32,20.45,21.93,26.02,27.34,23.49,46.94,30.5,33.8,24.23,21.4,27.94,32.24,40.57,25.07,19.42,22.39,18.42,46.96,23.72,26.41,26.97,36.76,40.32,35.02,29.47,30.2,31,38.11,38.18,36.31,21.03,30.86,36.07,28.66,29.08,37.28,15.28,24.17,22.31,30.17,25.53,19.85,35.37,44.6,17.23,13.47,26.33,35.02,32.09,24.81,19.33,28.77,24.26,31.98,25.73,24.86,16.28,34.51,15.23,39.72,40.8,26.06,35.76,34.76,16.13,44.04,18.03,19.65,32.62,35.59,39.43,14.18,35.24,40.13,41.82,35.45,36.07,43.67,24.61,20.9,21.9,18.79,27.61,27.21,26.61,29.77,20.59,27.53,13.82,33.2,25,33.1,36.65,18.63,14.87,22.2,36.81,25.53,24.62,26.25,18.21,28.08,19.42,29.79,32.8,35.99,28.32,27.79,35.88,29.06,36.28,14.1,36.63,37.49,26.9,18.58,38.48,24.48,18.95,33.55,14.24,29.04,32.51,25.63,22.22,19,32.73,15.16,13.9,27.2,32.01,29.27,33,13.74,20.42,27.32,18.23,35.35,28.48,9.08,24.62,20.12,35.26,19.92,31.02,16.49,12.16,30.7,31.22,34.65,13.13,27.51,33.2,31.57,14.1,33.42,17.44,10.12,24.42,9.82,23.39,30.93,15.03,21.67,31.09,33.29,22.61,26.89,23.48,8.38,27.81,32.35,23.84]

#15.1 minutes = alpha + beta_1*friends + beta_2*work_hours + beta_3*phd + error
#multiple regression is simple linear regression with extra input columns (one column per feature)
from __future__ import division
from collections import Counter
from functools import partial

def predict(x_i,beta):
    return dot(x_i,beta)

#15.2 further assumptions of the least squares model (P165)
#15.3 fitting the model
#error and squared error
def error(x_i, y_i, beta):
    return y_i - predict(x_i, beta)
def squared_error(x_i, y_i, beta):
    return error(x_i, y_i, beta) ** 2
#gradient of the squared error
def squared_error_gradient(x_i, y_i, beta):
    """the gradient corresponding to the ith squared error term"""
    return [-2 * x_ij * error(x_i, y_i, beta) for x_ij in x_i]

def estimate_beta(x, y):
    beta_initial = [random.random() for x_i in x[0]]
    return minimize_stochastic(squared_error,squared_error_gradient,x, y,beta_initial,0.001) 
random.seed(0)
beta=estimate_beta(x,daily_minutes_good)

#15.5 goodness of fit
#R^2
def multiple_r_squared(x, y, beta):
    sum_of_squared_errors = sum(error(x_i, y_i, beta) ** 2 for x_i, y_i in zip(x, y))
    return 1.0 - sum_of_squared_errors / total_sum_of_squares(y)

#15.6 the bootstrap
#data = get_sample(num_points=n)   # conceptual: get_sample and n stand in for some sampling process
def bootstrap_sample(data):
    """randomly samples len(data) elements with replacement"""
    return [random.choice(data) for _ in data]
    
def bootstrap_statistic(data, stats_fn, num_samples):
    """evaluates stats_fn on num_samples bootstrap samples from data"""
    return [stats_fn(bootstrap_sample(data))
    for _ in range(num_samples)]

close_to_100=[99.5+random.random() for _ in range(101)]
far_from_100=([99.5+random.random()]+[random.random() for _ in range(50)]+[200+random.random() for _ in range(50)])

bootstrap_statistic(close_to_100,median,100)
bootstrap_statistic(far_from_100,median,100)

#15.7 standard errors of regression coefficients
def estimate_sample_beta(sample):
    x_sample, y_sample = zip(*sample) # magic unzipping trick
    return estimate_beta(x_sample, y_sample)
random.seed(0)
bootstrap_betas=bootstrap_statistic(list(zip(x,daily_minutes_good)),estimate_sample_beta,100)
bootstrap_standard_errors=[standard_deviation([beta[i] for beta in bootstrap_betas]) for i in range(4)]    

#p-values
def p_value(beta_hat_j, sigma_hat_j):
    if beta_hat_j > 0:
        return 2 * (1 - normal_cdf(beta_hat_j / sigma_hat_j))
    else:
        return 2 * normal_cdf(beta_hat_j / sigma_hat_j)

#15.8 regularization
def ridge_penalty(beta, alpha):
    return alpha * dot(beta[1:], beta[1:])
    
def squared_error_ridge(x_i, y_i, beta, alpha):
    """estimate error plus ridge penalty on beta"""
    return error(x_i, y_i, beta) ** 2 + ridge_penalty(beta, alpha)
    
def ridge_penalty_gradient(beta, alpha):
    """gradient of just the ridge penalty"""
    return [0] + [2 * alpha * beta_j for beta_j in beta[1:]]
    
def squared_error_ridge_gradient(x_i, y_i, beta, alpha):
    """the gradient corresponding to the ith squared error term
    including the ridge penalty"""
    return vector_add(squared_error_gradient(x_i, y_i, beta),ridge_penalty_gradient(beta, alpha))
    
def estimate_beta_ridge(x, y, alpha):
    """use gradient descent to fit a ridge regression
    with penalty alpha"""
    beta_initial = [random.random() for x_i in x[0]]
    return minimize_stochastic(partial(squared_error_ridge, alpha=alpha),partial(squared_error_ridge_gradient,alpha=alpha),x, y,beta_initial,0.001)
    
random.seed(0)
beta = estimate_beta_ridge(x, daily_minutes_good, alpha=0.0)

def lasso_penalty(beta, alpha):
    return alpha * sum(abs(beta_i) for beta_i in beta[1:]) 


------------------------------------------------ CP16 Logistic Regression
#raw data: (experience, salary, paid_account)
data = [(0.7,48000,1),(1.9,48000,0),(2.5,60000,1),(4.2,63000,0),(6,76000,0),(6.5,69000,0),(7.5,76000,0),(8.1,88000,0),(8.7,83000,1),(10,83000,1),(0.8,43000,0),(1.8,60000,0),(10,79000,1),(6.1,76000,0),(1.4,50000,0),(9.1,92000,0),(5.8,75000,0),(5.2,69000,0),(1,56000,0),(6,67000,0),(4.9,74000,0),(6.4,63000,1),(6.2,82000,0),(3.3,58000,0),(9.3,90000,1),(5.5,57000,1),(9.1,102000,0),(2.4,54000,0),(8.2,65000,1),(5.3,82000,0),(9.8,107000,0),(1.8,64000,0),(0.6,46000,1),(0.8,48000,0),(8.6,84000,1),(0.6,45000,0),(0.5,30000,1),(7.3,89000,0),(2.5,48000,1),(5.6,76000,0),(7.4,77000,0),(2.7,56000,0),(0.7,48000,0),(1.2,42000,0),(0.2,32000,1),(4.7,56000,1),(2.8,44000,1),(7.6,78000,0),(1.1,63000,0),(8,79000,1),(2.7,56000,0),(6,52000,1),(4.6,56000,0),(2.5,51000,0),(5.7,71000,0),(2.9,65000,0),(1.1,33000,1),(3,62000,0),(4,71000,0),(2.4,61000,0),(7.5,75000,0),(9.7,81000,1),(3.2,62000,0),(7.9,88000,0),(4.7,44000,1),(2.5,55000,0),(1.6,41000,0),(6.7,64000,1),(6.9,66000,1),(7.9,78000,1),(8.1,102000,0),(5.3,48000,1),(8.5,66000,1),(0.2,56000,0),(6,69000,0),(7.5,77000,0),(8,86000,0),(4.4,68000,0),(4.9,75000,0),(1.5,60000,0),(2.2,50000,0),(3.4,49000,1),(4.2,70000,0),(7.7,98000,0),(8.2,85000,0),(5.4,88000,0),(0.1,46000,0),(1.5,37000,0),(6.3,86000,0),(3.7,57000,0),(8.4,85000,0),(2,42000,0),(5.8,69000,1),(2.7,64000,0),(3.1,63000,0),(1.9,48000,0),(10,72000,1),(0.2,45000,0),(8.6,95000,0),(1.5,64000,0),(9.8,95000,0),(5.3,65000,0),(7.5,80000,0),(9.9,91000,0),(9.7,50000,1),(2.8,68000,0),(3.6,58000,0),(3.9,74000,0),(4.4,76000,0),(2.5,49000,0),(7.2,81000,0),(5.2,60000,1),(2.4,62000,0),(8.9,94000,0),(2.4,63000,0),(6.8,69000,1),(6.5,77000,0),(7,86000,0),(9.4,94000,0),(7.8,72000,1),(0.2,53000,0),(10,97000,0),(5.5,65000,0),(7.7,71000,1),(8.1,66000,1),(9.8,91000,0),(8,84000,0),(2.7,55000,0),(2.8,62000,0),(9.4,79000,0),(2.5,57000,0),(7.4,70000,1),(2.1,47000,0),(5.3,62000,1),(6.3,79000,0),(6.8,58000,1),(5.7,80000,0),(2.2,61000,0),(4.8,62000,0),(3.7,64000,0),(4.1,85000,0),(2.3,51000,0),(3.5,58000,0),(0.9,43000,0),(0.9,54000,0),(4.5,74000,0),(6.5,55000,1),(4.1,41000,1),(7.1,73000,0),(1.1,66000,0),(9.1,81000,1),(8,69000,1),(7.3,72000,1),(3.3,50000,0),(3.9,58000,0),(2.6,49000,0),(1.6,78000,0),(0.7,56000,0),(2.1,36000,1),(7.5,90000,0),(4.8,59000,1),(8.9,95000,0),(6.2,72000,0),(6.3,63000,0),(9.1,100000,0),(7.3,61000,1),(5.6,74000,0),(0.5,66000,0),(1.1,59000,0),(5.1,61000,0),(6.2,70000,0),(6.6,56000,1),(6.3,76000,0),(6.5,78000,0),(5.1,59000,0),(9.5,74000,1),(4.5,64000,0),(2,54000,0),(1,52000,0),(4,69000,0),(6.5,76000,0),(3,60000,0),(4.5,63000,0),(7.8,70000,0),(3.9,60000,1),(0.8,51000,0),(4.2,78000,0),(1.1,54000,0),(6.2,60000,0),(2.9,59000,0),(2.1,52000,0),(8.2,87000,0),(4.8,73000,0),(2.2,42000,1),(9.1,98000,0),(6.5,84000,0),(6.9,73000,0),(5.1,72000,0),(9.1,69000,1),(9.8,79000,1)]
#取出experience,salary
x=[[1]+list(row[:2]) for row in data]
#取出是否是付费用户(1 or 0)
y=[row[2] for row in data]
#利用线性回归找出最佳模型 paid_account=b_0+b_1*experience+b_2*salary+e
rescaled_x=rescale(x)
beta=estimate_beta(rescaled_x,y)
predictions=[predict(x_i,beta) for x_i in rescaled_x]
plt.scatter(predictions,y)
plt.xlabel('predicted')
plt.ylabel('actual')
plt.show()
#logistic函数
def logistic(x):
    return 1.0 / (1 + math.exp(-x))
def logistic_prime(x):
    return logistic(x) * (1 - logistic(x))

def logistic_log_likelihood_i(x_i, y_i, beta):
    if y_i == 1:
        return math.log(logistic(dot(x_i, beta)))
    else:
        return math.log(1 - logistic(dot(x_i, beta)))

def logistic_log_partial_ij(x_i, y_i, beta, j):
    """here i is the index of the data point,
    j the index of the derivative"""

    return (y_i - logistic(dot(x_i, beta))) * x_i[j]
    
def logistic_log_gradient_i(x_i, y_i, beta):
    """the gradient of the log likelihood 
    corresponding to the i-th data point"""

    return [logistic_log_partial_ij(x_i, y_i, beta, j) for j, _ in enumerate(beta)]
            
from functools import reduce   # reduce is not a builtin in Python 3

def logistic_log_gradient(x, y, beta):
    return reduce(vector_add, [logistic_log_gradient_i(x_i, y_i, beta) for x_i, y_i in zip(x, y)])
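#logistic_log_likelihood is used below but never defined in these notes; consistent with
#logistic_log_likelihood_i it is simply the sum over all data points
def logistic_log_likelihood(x, y, beta):
    return sum(logistic_log_likelihood_i(x_i, y_i, beta) for x_i, y_i in zip(x, y))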

data = list(map(list, data))  # convert each (experience, salary, paid) tuple to a list

#16.3应用模型 P178
random.seed(0)
x_train, x_test, y_train, y_test = train_test_split(rescaled_x, y, 0.33)
fn = partial(logistic_log_likelihood, x_train, y_train)
gradient_fn = partial(logistic_log_gradient, x_train, y_train)
# pick a random starting point
beta_0 = [1, 1, 1]
# and maximize using gradient descent
beta_hat = maximize_batch(fn, gradient_fn, beta_0)
beta_hat = maximize_stochastic(logistic_log_likelihood_i,logistic_log_gradient_i,x_train, y_train, beta_0)


#16.4拟合优度
true_positives = false_positives = true_negatives = false_negatives = 0

for x_i, y_i in zip(x_test, y_test):
        predict = logistic(dot(beta_hat, x_i))

        if y_i == 1 and predict >= 0.5:  # TP: paid and we predict paid
            true_positives += 1
        elif y_i == 1:                   # FN: paid and we predict unpaid
            false_negatives += 1
        elif predict >= 0.5:             # FP: unpaid and we predict paid
            false_positives += 1
        else:                            # TN: unpaid and we predict unpaid
            true_negatives += 1

precision = true_positives / (true_positives + false_positives)
recall = true_positives / (true_positives + false_negatives)
predictions=[logistic(dot(beta_hat,x_i)) for x_i in x_test]
plt.scatter(predictions,y_test)
plt.xlabel('predicted probability')
plt.ylabel('actual outcome')
plt.title('logistic regression predicted vs. actual')
plt.show()


------------------------------------------------CP17 Decision Trees(决策树)
#data
inputs = [
        ({'level':'Senior','lang':'Java','tweets':'no','phd':'no'},   False),
        ({'level':'Senior','lang':'Java','tweets':'no','phd':'yes'},  False),
        ({'level':'Mid','lang':'Python','tweets':'no','phd':'no'},     True),
        ({'level':'Junior','lang':'Python','tweets':'no','phd':'no'},  True),
        ({'level':'Junior','lang':'R','tweets':'yes','phd':'no'},      True),
        ({'level':'Junior','lang':'R','tweets':'yes','phd':'yes'},    False),
        ({'level':'Mid','lang':'R','tweets':'yes','phd':'yes'},        True),
        ({'level':'Senior','lang':'Python','tweets':'no','phd':'no'}, False),
        ({'level':'Senior','lang':'R','tweets':'yes','phd':'no'},      True),
        ({'level':'Junior','lang':'Python','tweets':'yes','phd':'no'}, True),
        ({'level':'Senior','lang':'Python','tweets':'yes','phd':'yes'},True),
        ({'level':'Mid','lang':'Python','tweets':'no','phd':'yes'},    True),
        ({'level':'Mid','lang':'Java','tweets':'yes','phd':'no'},      True),
        ({'level':'Junior','lang':'Python','tweets':'no','phd':'yes'},False)]
#17.2熵
"""given a list of class probabilities, compute the entropy"""
def entropy(class_probabilities):
    return sum(-p * math.log(p, 2) for p in class_probabilities if p)
#计算类别概率
def class_probabilities(labels):
    total_count = len(labels)
    return [count / total_count for count in Counter(labels).values()]
def data_entropy(labeled_data):        
    labels = [label for _, label in labeled_data]
    probabilities = class_probabilities(labels)
    return entropy(probabilities)

#17.3分割之熵
#在数学上我们把数据集S划分成若干子集s1,s2,s3...,各个子集数量上所占比为q1,q2,q3... 我们通过加权形式计算分割熵
def partition_entropy(subsets):
    """find the entropy from this partition of data into subsets"""
    total_count = sum(len(subset) for subset in subsets)
    return sum( data_entropy(subset) * len(subset) / total_count for subset in subsets )

def group_by(items, key_fn):
    """returns a defaultdict(list), where each input item is in the list whose key is key_fn(item)"""
    groups = defaultdict(list)
    for item in items:
        key = key_fn(item)

        groups[key].append(item)
    return groups
#17.4创建决策树
def partition_by(inputs, attribute):
    """returns a dict of inputs partitioned by the attribute each input is a pair (attribute_dict, label)"""
    return group_by(inputs, lambda x: x[0][attribute]) 
#分割熵
def partition_entropy_by(inputs,attribute):
    """computes the entropy corresponding to the given partition"""        
    partitions = partition_by(inputs, attribute)
    return partition_entropy(partitions.values()) 

#找出最小分割的熵
senior_inputs = [(input, label) for input, label in inputs if input["level"] == "Senior"]
for key in ['level','lang','tweets','phd']:
    print(key,partition_entropy_by(senior_inputs,key))

#17.5综合应用(招聘决策树)
def classify(tree,input):
    if tree in [True,False]:
        return tree
    attribute,subtree_dict=tree
    subtree_key=input.get(attribute)
    if subtree_key not in subtree_dict:
        subtree_key=None
    subtree=subtree_dict[subtree_key]
    return classify(subtree,input)
def build_tree_id3(inputs,split_candidates=None):
    if split_candidates is None:
        split_candidates=inputs[0][0].keys()
    num_inputs=len(inputs)
    num_trues=len([label for item,label in inputs if label])
    num_falses=num_inputs-num_trues
    if num_trues==0:return False
    if num_falses==0:return True
    if not split_candidates:
        return num_trues>=num_falses
    best_attribute=min(split_candidates,key=partial(partition_entropy_by,inputs))
    partitions=partition_by(inputs,best_attribute)
    new_candidates=[a for a in split_candidates if a!=best_attribute]
    subtrees={attribute_value:build_tree_id3(subset,new_candidates) for attribute_value,subset in partitions.items()}
    subtrees[None]=num_trues>num_falses
    return (best_attribute,subtrees)
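#quick check of the finished tree (the book reports True for the first candidate, False for the second)
tree = build_tree_id3(inputs)
print(classify(tree, {"level": "Junior", "lang": "Java", "tweets": "yes", "phd": "no"}))
print(classify(tree, {"level": "Junior", "lang": "Java", "tweets": "yes", "phd": "yes"}))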
#随机森林
def forest_classify(trees,input):
    votes=[classify(tree,input) for tree in trees]
    vote_counts=Counter(votes)
    return vote_counts.most_common(1)[0][0]
#the snippet below is meant to replace the best_attribute step inside the tree builder:
#at each split it only considers a random sample of num_split_candidates attributes
#(self here refers to that forest/tree-builder object)
if len(split_candidates)<=self.num_split_candidates:
    sampled_split_candidates=split_candidates
else:
    sampled_split_candidates=random.sample(split_candidates,self.num_split_candidates)
best_attribute=min(sampled_split_candidates,key=partial(partition_entropy_by,inputs))
partitions=partition_by(inputs,best_attribute)
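#a hedged sketch of growing a whole forest on bootstrap re-samples of the training data, to be
#classified with forest_classify above; bootstrap_sample/build_forest are illustrative helpers,
#not the book's code listing
def bootstrap_sample(data):
    """sample len(data) items with replacement"""
    return [random.choice(data) for _ in data]
def build_forest(inputs, num_trees=5):
    return [build_tree_id3(bootstrap_sample(inputs)) for _ in range(num_trees)]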

------------------------------------------------CP18 Neural Networks
#18.1 perceptron 感知器
def step_function(x):
    return 1 if x >= 0 else 0

def perceptron_output(weights, bias, x):
    """returns 1 if the perceptron 'fires', 0 if not"""
    return step_function(dot(weights, x) + bias)

def sigmoid(t):
    return 1 / (1 + math.exp(-t))
    
def neuron_output(weights, inputs):
    return sigmoid(dot(weights, inputs))

#18.2 前馈神经网络
def feed_forward(neural_network, input_vector):
    """takes in a neural network (represented as a list of lists of lists of weights)
    and returns the output from forward-propagating the input"""

    outputs = []

    for layer in neural_network:

        input_with_bias = input_vector + [1]             # add a bias input
        output = [neuron_output(neuron, input_with_bias) # compute the output
                  for neuron in layer]                   # for this layer
        outputs.append(output)                           # and remember it

        # the input to the next layer is the output of this one
        input_vector = output

    return outputs
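#as a usage example, the book runs a tiny hand-built XOR network through feed_forward
xor_network = [# hidden layer
               [[20, 20, -30],   # 'and' neuron
                [20, 20, -10]],  # 'or'  neuron
               # output layer
               [[-60, 60, -30]]] # '2nd input but not 1st input' neuron
for x in [0, 1]:
    for y in [0, 1]:
        # feed_forward returns every layer's outputs; [-1] is the output layer
        print(x, y, feed_forward(xor_network, [x, y])[-1])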
#18.3反向传播
def backpropagate(network, input_vector, target):

    hidden_outputs, outputs = feed_forward(network, input_vector)
    
    # the output * (1 - output) is from the derivative of sigmoid
    output_deltas = [output * (1 - output) * (output - target[i]) for i, output in enumerate(outputs)]
                     
    # adjust weights for output layer (network[-1])
    for i, output_neuron in enumerate(network[-1]):
        for j, hidden_output in enumerate(hidden_outputs + [1]):
            output_neuron[j] -= output_deltas[i] * hidden_output

    # back-propagate errors to hidden layer
    hidden_deltas = [hidden_output * (1 - hidden_output) * dot(output_deltas, [n[i] for n in network[-1]]) for i, hidden_output in enumerate(hidden_outputs)]

    # adjust weights for hidden layer (network[0])
    for i, hidden_neuron in enumerate(network[0]):
        for j, input in enumerate(input_vector + [1]):
            hidden_neuron[j] -= hidden_deltas[i] * input

def patch(x, y, hatch, color):
    """return a matplotlib 'patch' object with the specified
    location, crosshatch pattern, and color"""
    return matplotlib.patches.Rectangle((x - 0.5, y - 0.5), 1, 1,
                                        hatch=hatch, fill=False, color=color)

#
def show_weights(neuron_idx):
    weights = network[0][neuron_idx]
    abs_weights = [abs(weight) for weight in weights]
    # turn the weights into a 5x5 grid
    # [weights[0:5], ..., weights[20:25]]
    grid = [abs_weights[row:(row+5)] for row in range(0,25,5)]
    ax = plt.gca() # to use hatching, we'll need the axis
    ax.imshow(grid, # here same as plt.imshow
              cmap=matplotlib.cm.binary, # use white-black color scale
              interpolation='none') # plot blocks as blocks

# cross-hatch the negative weights
    for i in range(5): # row
        for j in range(5): # column
            if weights[5*i + j] < 0: # row i, column j = weights[5*i + j]
                # add black and white hatches, so visible whether dark or light
                ax.add_patch(patch(j, i, '/', "white"))
                ax.add_patch(patch(j, i, '\\', "black"))
    plt.show()

def make_digit(raw_digit):
        return [1 if c == '1' else 0 for row in raw_digit.split("\n") for c in row.strip()]
              
# raw_digits (the ten 5x5 digit bitmaps written as strings in the book) is assumed to be defined above
inputs = list(map(make_digit, raw_digits))
targets = [[1 if i == j else 0 for i in range(10)] for j in range(10)]
random.seed(0)   # to get repeatable results
input_size = 25  # each input is a vector of length 25
num_hidden = 5   # we'll have 5 neurons in the hidden layer
output_size = 10 # we need 10 outputs for each input
# each hidden neuron has one weight per input, plus a bias weight
hidden_layer = [[random.random() for __ in range(input_size + 1)] for __ in range(num_hidden)]
# each output neuron has one weight per hidden neuron, plus a bias weight
output_layer = [[random.random() for __ in range(num_hidden + 1)] for __ in range(output_size)]
# the network starts out with random weights
network = [hidden_layer, output_layer]
# 10,000 iterations seems enough to converge
for __ in range(10000):
    for input_vector, target_vector in zip(inputs, targets):
            backpropagate(network, input_vector, target_vector)
def predict(input):
    return feed_forward(network, input)[-1]

for i, input in enumerate(inputs):
    outputs = predict(input)
    print(i, [round(p, 2) for p in outputs])

------------------------------------------------CP19 clustering 聚类分析
inputs = [[-14,-5],[13,13],[20,23],[-19,-11],[-9,-16],[21,27],[-49,15],[26,13],[-46,5],[-34,-1],[11,15],[-49,0],[-22,-16],[19,28],[-12,-8],[-13,-19],[-41,8],[-11,-6],[-25,-9],[-18,-3]]

#19.2模型
class KMeans:
    """performs k-means clustering"""
    def __init__(self, k):
        self.k = k          # number of clusters
        self.means = None   # means of clusters
    def classify(self, input):
        """return the index of the cluster closest to the input"""
        return min(range(self.k),key=lambda i: squared_distance(input, self.means[i]))
    def train(self, inputs):
        self.means = random.sample(inputs, self.k)
        assignments = None
        while True:
            # Find new assignments
            new_assignments = list(map(self.classify, inputs))  # a list, so the comparison below works
            # If no assignments have changed, we're done.
            if assignments == new_assignments:                
                return
            # Otherwise keep the new assignments,
            assignments = new_assignments    
            for i in range(self.k):
                i_points = [p for p, a in zip(inputs, assignments) if a == i]
                # avoid divide-by-zero if i_points is empty
                if i_points:
                    self.means[i] = vector_mean(i_points)
random.seed(0)
clusterer=KMeans(3)
clusterer.train(inputs)
print(clusterer.means)

#19.4选择聚类数目K
def squared_clustering_errors(inputs, k):
    """finds the total squared error from k-means clustering the inputs"""
    clusterer = KMeans(k)
    clusterer.train(inputs)
    means = clusterer.means
    assignments = map(clusterer.classify, inputs)
    return sum(squared_distance(input,means[cluster]) for input, cluster in zip(inputs, assignments))

def plot_squared_clustering_errors(plt):
    ks = range(1, len(inputs) + 1)
    errors = [squared_clustering_errors(inputs, k) for k in ks]
    plt.plot(ks, errors)
    plt.xticks(ks)
    plt.xlabel("k")
    plt.ylabel("total squared error")
    plt.show()

import matplotlib.image as mpimg
path_to_png_file="/home/lufax/桌面/py3-into/img.jpg"
img=mpimg.imread(path_to_png_file)
top_row=img[0]
top_left_pixel=top_row[0]
red,green,blue=top_left_pixel
pixels=[pixel for row in img for pixel in row]
clusterer=KMeans(5)

#19.5 示例:对色彩聚类
def recolor_image(input_file, k=5):
    img = mpimg.imread(input_file)
    pixels = [pixel for row in img for pixel in row]
    clusterer = KMeans(k)
    clusterer.train(pixels) # this might take a while    
    def recolor(pixel):
        cluster = clusterer.classify(pixel) # index of the closest cluster
        return clusterer.means[cluster]     # mean of the closest cluster
    new_img = [[recolor(pixel) for pixel in row] for row in img]
    plt.imshow(new_img)
    plt.axis('off')
    plt.show()

#19.6 bottom-up hierarchical clustering(自下而上的分层聚类)
def is_leaf(cluster):
    """a cluster is a leaf if it has length 1"""
    return len(cluster) == 1
def get_children(cluster):
    """returns the two children of this cluster if it's a merged cluster;raises an exception if this is a leaf cluster"""
    if is_leaf(cluster):
        raise TypeError("a leaf cluster has no children")
    else:
        return cluster[1]
def get_values(cluster):
    """returns the value in this cluster (if it's a leaf cluster)
    or all the values in the leaf clusters below it (if it's not)"""
    if is_leaf(cluster):
        return cluster # is already a 1-tuple containing value
    else:
        return [value
                for child in get_children(cluster)
                for value in get_values(child)]
def cluster_distance(cluster1, cluster2, distance_agg=min):
    """finds the aggregate distance between elements of cluster1
    and elements of cluster2"""
    return distance_agg([distance(input1, input2)
                        for input1 in get_values(cluster1) for input2 in get_values(cluster2)])
def get_merge_order(cluster):
    if is_leaf(cluster):
        return float('inf')
    else:
        return cluster[0] # merge_order is first element of 2-tuple
#创建聚类算法
def bottom_up_cluster(inputs, distance_agg=min):
    # start with every input a leaf cluster / 1-tuple
    clusters = [(input,) for input in inputs]
    
    # as long as we have more than one cluster left...
    while len(clusters) > 1:
        # find the two closest clusters
        c1, c2 = min([(cluster1, cluster2)
                     for i, cluster1 in enumerate(clusters)
                     for cluster2 in clusters[:i]],
                     key=lambda pair: cluster_distance(pair[0], pair[1], distance_agg))

        # remove them from the list of clusters
        clusters = [c for c in clusters if c != c1 and c != c2]

        # merge them, using merge_order = # of clusters left
        merged_cluster = (len(clusters), [c1, c2])

        # and add their merge
        clusters.append(merged_cluster)

    # when there's only one cluster left, return it
    return clusters[0]

def generate_clusters(base_cluster, num_clusters):
    # start with a list with just the base cluster
    clusters = [base_cluster]
    
    # as long as we don't have enough clusters yet...
    while len(clusters) < num_clusters:
        # choose the last-merged of our clusters
        next_cluster = min(clusters, key=get_merge_order)
        # remove it from the list
        clusters = [c for c in clusters if c != next_cluster]
        # and add its children to the list (i.e., unmerge it)
        clusters.extend(get_children(next_cluster))

    # once we have enough clusters...
    return clusters
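#usage, as in the book: build the full merge tree, then un-merge it back into three flat clusters
#(assumes distance from the book's linear-algebra chapter is in scope, since cluster_distance uses it)
base_cluster = bottom_up_cluster(inputs)
three_clusters = [get_values(cluster) for cluster in generate_clusters(base_cluster, 3)]
print(three_clusters)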

------------------------------------------------CP20 Natural Language Processing 自然语言
#(word, job_popularity, resume_popularity)
data = [ ("big data", 100, 15), ("Hadoop", 95, 25), ("Python", 75, 50),("R", 50, 40), ("machine learning", 80, 20), ("statistics", 20, 60),("data science", 60, 70), ("analytics", 90, 3),("team player", 85, 85), ("dynamic", 2, 90), ("synergies", 70, 0),("actionable insights", 40, 30), ("think out of the box", 45, 10),("self-starter", 30, 50), ("customer focus", 65, 15),
("thought leadership", 35, 35)]
#20.1词云
"""equals 8 if total is 0, 28 if total is 200"""
def text_size(total):
   return 8 + (total/200*20)
#生成data 词云图
for word, job_popularity, resume_popularity in data:
        plt.text(job_popularity, resume_popularity, word,ha='center',va='center',size=text_size(job_popularity + resume_popularity))
plt.xlabel("Popularity on Job Postings")
plt.ylabel("Popularity on Resumes")
plt.axis([0, 100, 0, 100])
plt.show()

#20.2 n-grams 模型
def fix_unicode(text):
    return text.replace(u"\u2019", "'")

from bs4 import BeautifulSoup

def get_document():
    url = "http://radar.oreilly.com/2010/06/what-is-data-science.html"
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html5lib')
    content = soup.find("div", "article-body")        # find article-body div
    regex = r"[\w']+|[\.]"                            # matches a word or a period
    document = []
    for paragraph in content("p"):
        words = re.findall(regex, fix_unicode(paragraph.text))
        document.extend(words)
    return document

document = get_document()
bigrams = zip(document, document[1:])
transitions=defaultdict(list)
for prev,current in bigrams:
    transitions[prev].append(current)
#生成句子
def generate_using_bigrams(transitions):
    current = "."   # this means the next word will start a sentence
    result = []
    while True:
        next_word_candidates = transitions[current]    # bigrams (current, _)
        current = random.choice(next_word_candidates)  # choose one at random
        result.append(current)                         # append it to results
    if current == ".": return " ".join(result) # if "." we're done

#3元 n-gram
trigrams=zip(document,document[1:],document[2:])
trigram_transitions=defaultdict(list)
starts=[]
for prev,current,next in trigrams:
    if prev=='.':
        starts.append(current)
    trigram_transitions[(prev,current)].append(next)

def generate_using_trigrams(starts, trigram_transitions):
    current = random.choice(starts)   # choose a random starting word
    prev = "."                        # and precede it with a '.'
    result = [current]
    while True:
        next_word_candidates = trigram_transitions[(prev, current)]
        next = random.choice(next_word_candidates)
        prev, current = current, next
        result.append(current)
        if current == ".":
            return " ".join(result)

#20.3 语法
grammar = {
        "_S"  : ["_NP _VP"],
        "_NP" : ["_N",
                 "_A _NP _P _A _N"],
        "_VP" : ["_V",
                 "_V _NP"],
        "_N"  : ["data science", "Python", "regression"],
        "_A"  : ["big", "linear", "logistic"],
        "_P"  : ["about", "near"],
        "_V"  : ["learns", "trains", "tests", "is"]
}
def is_terminal(token):
    return token[0] != "_"

def expand(grammar, tokens):
    for i, token in enumerate(tokens):
        # ignore terminals
        if is_terminal(token): continue
        # choose a replacement at random
        replacement = random.choice(grammar[token])
        if is_terminal(replacement):
            tokens[i] = replacement
        else:
            tokens = tokens[:i] + replacement.split() + tokens[(i+1):]
        return expand(grammar, tokens)
    return tokens
def generate_sentence(grammar):
    return expand(grammar, ["_S"])

#20.4 Gibbs Sampling(吉布斯采样)
def roll_a_die():
    return random.choice([1,2,3,4,5,6])

def direct_sample():
    d1 = roll_a_die()
    d2 = roll_a_die()
    return d1, d1 + d2

def random_y_given_x(x):
    """equally likely to be x + 1, x + 2, ... , x + 6"""
    return x + roll_a_die()

def random_x_given_y(y):
    if y <= 7:
        # if the total is 7 or less, the first die is equally likely to be
        # 1, 2, ..., (total - 1)
        return random.randrange(1, y)
    else:
        # if the total is 7 or more, the first die is equally likely to be
        # (total - 6), (total - 5), ..., 6
        return random.randrange(y - 6, 7)

def gibbs_sample(num_iters=100):
    x, y = 1, 2 # doesn't really matter
    for _ in range(num_iters):
        x = random_x_given_y(y)
        y = random_y_given_x(x)
    return x, y

def compare_distributions(num_samples=1000):
    counts = defaultdict(lambda: [0, 0])
    for _ in range(num_samples):
        counts[gibbs_sample()][0] += 1
        counts[direct_sample()][1] += 1
    return counts

#20.5  TOPIC MODELING 主题建模
def sample_from(weights):
    total = sum(weights)
    rnd = total * random.random()       # uniform between 0 and total
    for i, w in enumerate(weights):
        rnd -= w                        # return the smallest i such that
        if rnd <= 0: return i           # sum(weights[:(i+1)]) >= rnd

documents = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

K = 4

document_topic_counts = [Counter()
                         for _ in documents]

topic_word_counts = [Counter() for _ in range(K)]

topic_counts = [0 for _ in range(K)]

document_lengths = list(map(len, documents))  # a list, since it is indexed and updated below

distinct_words = set(word for document in documents for word in document)
W = len(distinct_words)

D = len(documents)

def p_topic_given_document(topic, d, alpha=0.1):
    """the fraction of words in document _d_that are assigned to _topic_ (plus some smoothing)"""

    return ((document_topic_counts[d][topic] + alpha) /
            (document_lengths[d] + K * alpha))

def p_word_given_topic(word, topic, beta=0.1):
    """the fraction of words assigned to _topic_that equal _word_ (plus some smoothing)"""

    return ((topic_word_counts[topic][word] + beta) /
            (topic_counts[topic] + W * beta))

def topic_weight(d, word, k):
    """given a document and a word in that document,return the weight for the k-th topic"""

    return p_word_given_topic(word, k) * p_topic_given_document(k, d)

def choose_new_topic(d, word):
    return sample_from([topic_weight(d, word, k)
                        for k in range(K)])

random.seed(0)
document_topics = [[random.randrange(K) for word in document]
                   for document in documents]

for d in range(D):
    for word, topic in zip(documents[d], document_topics[d]):
        document_topic_counts[d][topic] += 1
        topic_word_counts[topic][word] += 1
        topic_counts[topic] += 1

for iter in range(1000):
    for d in range(D):
        for i, (word, topic) in enumerate(zip(documents[d],
                                              document_topics[d])):

            # remove this word / topic from the counts
            # so that it doesn't influence the weights
            document_topic_counts[d][topic] -= 1
            topic_word_counts[topic][word] -= 1
            topic_counts[topic] -= 1
            document_lengths[d] -= 1

            # choose a new topic based on the weights
            new_topic = choose_new_topic(d, word)
            document_topics[d][i] = new_topic

            # and now add it back to the counts
            document_topic_counts[d][new_topic] += 1
            topic_word_counts[new_topic][word] += 1
            topic_counts[new_topic] += 1
            document_lengths[d] += 1
 topic_names = ["Big Data and programming languages",
                   "Python and statistics",
                   "databases",
                   "machine learning"]
for k, word_counts in enumerate(topic_word_counts):
    for word, count in word_counts.most_common():
        if count > 0: print(k, word, count)

for document, topic_counts in zip(documents, document_topic_counts):
    print(document)
    for topic, count in topic_counts.most_common():
        if count > 0:
            print(topic_names[topic], count, end=' ')
    print()

------------------------------------------------CP21 Network Analysis(网络分析)
#21.1 Betweenness Centrality (中介中心度)
users = [
    { "id": 0, "name": "Hero" },
    { "id": 1, "name": "Dunn" },
    { "id": 2, "name": "Sue" },
    { "id": 3, "name": "Chi" },
    { "id": 4, "name": "Thor" },
    { "id": 5, "name": "Clive" },
    { "id": 6, "name": "Hicks" },
    { "id": 7, "name": "Devin" },
    { "id": 8, "name": "Kate" },
    { "id": 9, "name": "Klein" }]
friendships = [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4),(4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9)]
# give each user a friends list
for user in users:
    user["friends"] = []   
# and populate it
for i, j in friendships:
    # this works because users[i] is the user whose id is i
    users[i]["friends"].append(users[j]) # add i as a friend of j
    users[j]["friends"].append(users[i]) # add j as a friend of i   

# 广度优先搜索算法实现 P239
def shortest_paths_from(from_user):
    # a dictionary from "user_id" to *all* shortest paths to that user
    shortest_paths_to = { from_user["id"] : [[]] }
    # a queue of (previous user, next user) that we need to check.
    # starts out with all pairs (from_user, friend_of_from_user)
    frontier = deque((from_user, friend) for friend in from_user["friends"])
    # keep going until we empty the queue
    while frontier: 
        prev_user, user = frontier.popleft() # take from the beginning
        user_id = user["id"]
        # the fact that we're pulling from our queue means that
        # necessarily we already know a shortest path to prev_user
        paths_to_prev = shortest_paths_to[prev_user["id"]]
        paths_via_prev = [path + [user_id] for path in paths_to_prev]
        # it's possible we already know a shortest path to here as well
        old_paths_to_here = shortest_paths_to.get(user_id, [])
        # what's the shortest path to here that we've seen so far?
        if old_paths_to_here:
            min_path_length = len(old_paths_to_here[0])
        else:
            min_path_length = float('inf')       
        # any new paths to here that aren't too long
        new_paths_to_here = [path_via_prev
                             for path_via_prev in paths_via_prev
                             if len(path_via_prev) <= min_path_length
                             and path_via_prev not in old_paths_to_here]
        shortest_paths_to[user_id] = old_paths_to_here + new_paths_to_here
        # add new neighbors to the frontier
        frontier.extend((user, friend) for friend in user["friends"] if friend["id"] not in shortest_paths_to)
    return shortest_paths_to
#载入路径数值(one-all路径——以{list}形式)
for user in users:
    user['shortest_paths']=shortest_paths_from(user)

#now we calculate Betweenness Centrality (计算中介中心度BC)
#set ori_value of B_C 
for user in users:
    user['betweenness_centrality']=0
#then we get B_C 至此我们得到了中介中心度
for source in users:
    source_id = source["id"]
    for target_id, paths in source["shortest_paths"].items():
        if source_id < target_id:   # don't double count
            num_paths = len(paths)  # how many shortest paths?
            contrib = 1 / num_paths # contribution to centrality
            for path in paths:
                for id in path:
                    if id not in [source_id, target_id]:
                        users[id]["betweenness_centrality"] += contrib
#print the B_C
print([str(u['id'])+' :'+str(u['betweenness_centrality']) for u in users])
##(查看各相对中心度值相对大小)

#计算接近中心度(closeness centrality= 1/疏远度)
#计算疏远度(farness)
def farness(user):
    """the sum of the lengths of the shortest paths to each other user"""
    return sum(len(paths[0]) 
               for paths in user["shortest_paths"].values())
for user in users:
    user["closeness_centrality"] = 1 / farness(user)

#21.2 特征向量中心度
#21.2.1 矩阵乘法(matrix multiplication)
#函数:矩阵A的i行乘以矩阵B的j列
def matrix_product_entry(A, B, i, j):
    return np.dot(get_row(A, i), get_column(B, j))
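#get_row, get_column and make_matrix come from the book's linear-algebra chapter and are not
#reproduced in these notes; minimal versions so this section runs on its own
def get_row(A, i):
    return A[i]
def get_column(A, j):
    return [A_i[j] for A_i in A]
def make_matrix(num_rows, num_cols, entry_fn):
    """returns a num_rows x num_cols matrix whose (i,j)-th entry is entry_fn(i, j)"""
    return [[entry_fn(i, j) for j in range(num_cols)] for i in range(num_rows)]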
#函数:矩阵乘法
def matrix_multiply(A, B):
    n1, k1 = np.shape(A)
    n2, k2 = np.shape(B)
    if k1 != n2:
        raise ArithmeticError("incompatible shapes!")             
    return make_matrix(n1, k2, partial(matrix_product_entry, A, B))
#将1维向量转换为矩阵
def vector_as_matrix(v):
    """returns the vector v (represented as a list) as a n x 1 matrix"""
    return [[v_i] for v_i in v]
#将矩阵转为n*1向量
def vector_from_matrix(v_as_matrix):
    """returns the n x 1 matrix as a list of values"""
    return [row[0] for row in v_as_matrix]
#对向量做矩阵乘法(返回值为向量)
def matrix_operate(A, v):
    v_as_matrix = vector_as_matrix(v)
    product = matrix_multiply(A, v_as_matrix)
    return vector_from_matrix(product)

#计算向量特征值
def find_eigenvector(A, tolerance=0.00001):
    guess = [1 for __ in A]
    while True:
        result = matrix_operate(A, guess)
        length = magnitude(result)
        next_guess = scalar_multiply(1/length, result)

        if distance(guess, next_guess) < tolerance:
            return next_guess, length # eigenvector, eigenvalue
        guess = next_guess

#21.2.2 eigenvector centrality(中心度)
#相当于返回一个i行j列的矩阵,再计算eigenvector_centralities就是中心度
def entry_fn(i, j):
    return 1 if (i, j) in friendships or (j, i) in friendships else 0
n = len(users)
adjacency_matrix = make_matrix(n, n, entry_fn)
eigenvector_centralities, _ = find_eigenvector(adjacency_matrix)

#21.3 directed graphs & pagerank(有向图和PageRank) P246
endorsements = [(0, 1), (1, 0), (0, 2), (2, 0), (1, 2), (2, 1), (1, 3),
                (2, 3), (3, 4), (5, 4), (5, 6), (7, 5), (6, 8), (8, 7), (8, 9)]

for user in users:
    user["endorses"] = []       # add one list to track outgoing endorsements
    user["endorsed_by"] = []    # and another to track endorsements
    
for source_id, target_id in endorsements:
    users[source_id]["endorses"].append(users[target_id])
    users[target_id]["endorsed_by"].append(users[source_id])
endorsements_by_id = [(user["id"], len(user["endorsed_by"]))for user in users]

sorted(endorsements_by_id, key=lambda pair: pair[1], reverse=True)

def page_rank(users, damping = 0.85, num_iters = 100):
    # initially distribute PageRank evenly
    num_users = len(users)
    pr = { user["id"] : 1 / num_users for user in users }
    # this is the small fraction of PageRank
    # that each node gets each iteration
    base_pr = (1 - damping) / num_users
    for __ in range(num_iters):
        next_pr = { user["id"] : base_pr for user in users }
        for user in users:
            # distribute PageRank to outgoing links
            links_pr = pr[user["id"]] * damping
            for endorsee in user["endorses"]:
                next_pr[endorsee["id"]] += links_pr / len(user["endorses"])
        pr = next_pr     
    return pr


------------------------------------------------CP22 Recommender Systems(推荐系统)
users_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

#22.2推荐流行
popular_interests = Counter(interest
                            for user_interests in users_interests
                            for interest in user_interests).most_common()
#创建最受欢迎top函数  (推荐前5个流行的)
def most_popular_new_interests(user_interests, max_results=5):
    suggestions = [(interest, frequency) 
                   for interest, frequency in popular_interests
                   if interest not in user_interests]
    return suggestions[:max_results]

#22.3 user-based filtering (基于用户的协同过滤方法)
#创建一个指标衡量2用户之间的相似度(余弦相似度)
def cosine_similarity(v, w):
    return np.dot(v, w) / math.sqrt(np.dot(v, v) * np.dot(w, w))
#独一数组
unique_interests = sorted(list({ interest 
                                 for user_interests in users_interests
                                 for interest in user_interests }))

#创建user-interest向量
def make_user_interest_vector(user_interests):
    """given a list of interests, produce a vector whose i-th element is 1
    if unique_interests[i] is in the list, 0 otherwise"""
    return [1 if interest in user_interests else 0 for interest in unique_interests]
#创建兴趣矩阵(对应axis=1 为unique_interest,axis=0为users_interests)
user_interest_matrix = list(map(make_user_interest_vector, users_interests))
#计算用户兴趣相似度
user_similarities = [[cosine_similarity(interest_vector_i, interest_vector_j) 
for interest_vector_j in user_interest_matrix]
for interest_vector_i in user_interest_matrix]

#创建用户相似度函数(输入id返回 id和相似度)
def most_similar_users_to(user_id):
    pairs = [(other_user_id, similarity)                      # find other
             for other_user_id, similarity in                 # users with
                enumerate(user_similarities[user_id])         # nonzero 
             if user_id != other_user_id and similarity > 0]  # similarity
    return sorted(pairs,                                      # sort them
                  key=lambda pair: pair[1],                   # most similar
                  reverse=True)                               # first
#创建推荐函数
def user_based_suggestions(user_id, include_current_interests=False):
    # sum up the similarities 
    suggestions = defaultdict(float)
    for other_user_id, similarity in most_similar_users_to(user_id):
        for interest in users_interests[other_user_id]:
            suggestions[interest] += similarity
    # convert them to a sorted list
    suggestions = sorted(suggestions.items(),key=lambda weight: weight[1],reverse=True)
    # and (maybe) exclude already-interests
    if include_current_interests:
        return suggestions
    else:
        return [(suggestion, weight) 
                for suggestion, weight in suggestions
                if suggestion not in users_interests[user_id]]
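#quick check on user 0, as in the book: the most similar users, and the interests suggested from them
print(most_similar_users_to(0))
print(user_based_suggestions(0))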

#22.4 Item-Based Collaborative Filtering (基于物品的协同过滤)
#计算用户兴趣矩阵
interest_user_matrix =[[user_interest_vector[j] for user_interest_vector in user_interest_matrix] for j, _ in enumerate(unique_interests)]
#计算基于物品的兴趣相似度矩阵
interest_similarities =[[cosine_similarity(user_vector_i, user_vector_j) for user_vector_j in interest_user_matrix]for user_vector_i in interest_user_matrix]
#计算与interest_id最相似的项(并推荐兴趣)
def most_similar_interests_to(interest_id):
    similarities = interest_similarities[interest_id]
    pairs = [(unique_interests[other_interest_id], similarity)
             for other_interest_id, similarity in enumerate(similarities)
             if interest_id != other_interest_id and similarity > 0]
    return sorted(pairs,key=lambda similarity: similarity[1],reverse=True)
#基于物品的协同过滤推荐函数
def item_based_suggestions(user_id, include_current_interests=False):
    suggestions = defaultdict(float)
    user_interest_vector = user_interest_matrix[user_id]
    for interest_id, is_interested in enumerate(user_interest_vector):
        if is_interested == 1:
            similar_interests = most_similar_interests_to(interest_id)
            for interest, similarity in similar_interests:
                suggestions[interest] += similarity
    suggestions = sorted(suggestions.items(),key=lambda similarity: similarity[1],reverse=True)
    if include_current_interests:
        return suggestions
    else:
        return [(suggestion, weight) for suggestion, weight in suggestions if suggestion not in users_interests[user_id]]


------------------------------------------------CP23 Databases and SQL
#23.1 create table &insert
users =[[0, "Hero", 0],[1, "Dunn", 2],[2, "Sue", 3],[3, "Chi", 3]]
#sql
create table users(
 user_id INT NOT NULL,
 name VARCHAR(200),
 num_friends INT)
insert into users(user_id,name,num_friends) values(1,'zieox','8')

#py
class Table:
    def __init__(self, columns):
        self.columns = columns
        self.rows = []

    def __repr__(self):
        """pretty representation of the table: columns then rows"""
        return str(self.columns) + "\n" + "\n".join(map(str, self.rows))

    def insert(self, row_values):
        if len(row_values) != len(self.columns):
            raise TypeError("wrong number of elements")
        row_dict = dict(zip(self.columns, row_values))
        self.rows.append(row_dict)
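#the examples below call users.update(...), users.select(...) etc., so users needs to be a Table
#instance built from the rows listed above (in the book, update/delete/select/... are defined as
#methods inside the Table class; here they appear as top-level defs and would need to be pasted
#into the class body for those calls to work)
users = Table(["user_id", "name", "num_friends"])
for row in [[0, "Hero", 0], [1, "Dunn", 2], [2, "Sue", 3], [3, "Chi", 3]]:
    users.insert(row)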


#23.2 update

def update(self, updates, predicate):
        for row in self.rows:
            if predicate(row):
                for column, new_value in updates.items():
                    row[column] = new_value
#sql
update users set num_friends=3 where user_id =1;
#py
users.update({'num_friends':3},lambda row:row['user_id']==1)


#23.3delete 
def delete(self, predicate=lambda row: True):
        """delete all rows matching predicate or all rows if no predicate supplied"""
        self.rows = [row for row in self.rows if not(predicate(row))]

#sql 
delete from users
delete from users where user_id=1
#py
users.delete(lambda row:row['user_id']==1)
users.delete()

#23.4 select 
def select(self, keep_columns=None, additional_columns=None):
        if keep_columns is None:         # if no columns specified,
            keep_columns = self.columns  # return all columns
        if additional_columns is None:
            additional_columns = {}
        # new table for results
        result_table = Table(keep_columns + list(additional_columns.keys()))

        for row in self.rows:
            new_row = [row[column] for column in keep_columns]
            for column_name, calculation in additional_columns.items():
                new_row.append(calculation(row))
            result_table.insert(new_row)
        return result_table

def where(self, predicate=lambda row: True):
        """return only the rows that satisfy the supplied predicate"""
        where_table = Table(self.columns)
        where_table.rows = list(filter(predicate, self.rows))
        return where_table

def limit(self, num_rows=None):
        """return only the first num_rows rows"""
        limit_table = Table(self.columns)
        limit_table.rows = (self.rows[:num_rows] if num_rows is not None else self.rows)
        return limit_table
#sql-py
select * from users
users.select()

select * from users limit 2
users.limit(2)

select user_id from users
users.select(keep_columns=['user_id'])

select user_id from users where name='Dunn'
users.where(lambda row:row['name']=='Dunn')

#23.5 group by
def group_by(self, group_by_columns, aggregates, having=None):
        grouped_rows = defaultdict(list)
        # populate groups
        for row in self.rows:
            key = tuple(row[column] for column in group_by_columns)
            grouped_rows[key].append(row)
        result_table = Table(group_by_columns + list(aggregates.keys()))
        for key, rows in grouped_rows.items():
            if having is None or having(rows):
                new_row = list(key)
                for aggregate_name, aggregate_fn in aggregates.items():
                    new_row.append(aggregate_fn(rows))
                result_table.insert(new_row)
        return result_table

def order_by(self, order):
        new_table = self.select()       # make a copy
        new_table.rows.sort(key=order)
        return new_table

def join(self, other_table, left_join=False):
        join_on_columns = [c for c in self.columns if c in other_table.columns]      # columns in both tables
        additional_columns = [c for c in other_table.columns if c not in join_on_columns]   # columns only in right table
        # all columns from left table + additional_columns from right table
        join_table = Table(self.columns + additional_columns)
        for row in self.rows:
            def is_join(other_row):
                return all(other_row[c] == row[c] for c in join_on_columns)
            other_rows = other_table.where(is_join).rows
            # each other row that matches this one produces a result row
            for other_row in other_rows:
                join_table.insert([row[c] for c in self.columns]+[other_row[c] for c in additional_columns])
            # if no rows match and it's a left join, output with Nones
            if left_join and not other_rows:
                join_table.insert([row[c] for c in self.columns] + [None for c in additional_columns])
        return join_table

def min_user_id(rows):return min(row['user_id'] for row in rows)
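#name_length is used by stats_by_length below but never defined in these notes; the book's one-liner:
def name_length(row): return len(row["name"])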

stats_by_length=users.select(additional_columns={'name_length':name_length}).group_by(group_by_columns=['name_length'],aggregates={'min_user_id':min_user_id,'num_users':len})

def first_letter_of_name(row):return row["name"][0] if row["name"] else ""
def average_num_friends(rows):return sum(row["num_friends"] for row in rows) / len(rows)
def enough_friends(rows):return average_num_friends(rows) > 1
avg_friends_by_letter = users.select(additional_columns={'first_letter' : first_letter_of_name}).group_by(group_by_columns=['first_letter'],aggregates={ "avg_num_friends" : average_num_friends },having=enough_friends)

def sum_user_ids(rows): return sum(row["user_id"] for row in rows)
user_id_sum = users.where(lambda row: row["user_id"] > 1).group_by(group_by_columns=[],aggregates={ "user_id_sum" : sum_user_ids })

friendliest_letters = avg_friends_by_letter.order_by(lambda row: -row["avg_num_friends"]).limit(4)

# 23.7 JOINs
user_interests = Table(["user_id", "interest"])
user_interests.insert([0, "SQL"])
user_interests.insert([0, "NoSQL"])
user_interests.insert([2, "SQL"])
user_interests.insert([2, "MySQL"])
#sql
select users.name from users join user_interests on users.user_id=user_interests.user_id where user_interests.interest='SQL'
#py
sql_users = users.join(user_interests).where(lambda row: row["interest"] == "SQL").select(keep_columns=["name"])

def count_interests(rows):
        """counts how many rows have non-None interests"""
        return len([row for row in rows if row["interest"] is not None])

user_interest_counts = users.join(user_interests, left_join=True).group_by(group_by_columns=["user_id"],aggregates={"num_interests" : count_interests })

#23.8 SUBQUERIES
#sql
select min(user_id) as min_user_id from (select user_id from user_interests where interest='sql') sql_interest
#py
likes_sql_user_ids = user_interests.where(lambda row: row["interest"] == "SQL").select(keep_columns=['user_id'])
likes_sql_user_ids.group_by(group_by_columns=[],aggregates={ "min_user_id" : min_user_id })


------------------------------------------------CP24 mapreduce
data=pd.read_csv('/home/lufax/桌面/py3-into/test.txt')
#24.1 example:words_count
def word_count_old(documents):
    """word count not using MapReduce"""
    return Counter(word for document in documents for word in tokenize(document))

def tokenize(message):
    message = message.lower()                      # convert to lowercase
    all_words = re.findall("[a-z0-9']+", message)  # extract the words
    return set(all_words)                          # remove duplicates
#键值对转换
def wc_mapper(document):
    """for each word in the document, emit (word,1)"""        
    for word in tokenize(document):
        yield (word, 1)
def wc_reducer(word, counts):
    """sum up the counts for a word"""
    yield (word, sum(counts))

def word_count(documents):
    """count the words in the input documents using MapReduce"""
    # place to store grouped values
    collector = defaultdict(list) 
    for document in documents:
        for word, count in wc_mapper(document):
            collector[word].append(count)
    return [output for word, counts in collector.items() for output in wc_reducer(word, counts)]

#
def map_reduce(inputs, mapper, reducer):
    """runs MapReduce on the inputs using mapper and reducer"""
    collector = defaultdict(list)

    for input in inputs:
        for key, value in mapper(input):
            collector[key].append(value)

    return [output for key, values in collector.items() for output in reducer(key,values)]

def reduce_with(aggregation_fn, key, values):
    """reduces a key-values pair by applying aggregation_fn to the values"""
    yield (key, aggregation_fn(values))

def values_reducer(aggregation_fn):
    """turns a function (values -> output) into a reducer"""
    return partial(reduce_with, aggregation_fn)

sum_reducer = values_reducer(sum)
max_reducer = values_reducer(max)
min_reducer = values_reducer(min)
count_distinct_reducer = values_reducer(lambda values: len(set(values)))

# Analyzing Status Updates
def data_science_day_mapper(status_update):
    """yields (day_of_week, 1) if status_update contains "data science" """
    if "data science" in status_update["text"].lower():
        day_of_week = status_update["created_at"].weekday()
        yield (day_of_week, 1)
data_science_days = map_reduce(status_updates,data_science_day_mapper,sum_reducer)

def words_per_user_mapper(status_update):
    user = status_update["username"]
    for word in tokenize(status_update["text"]):
        yield (user, (word, 1))
            
def most_popular_word_reducer(user, words_and_counts):
    """given a sequence of (word, count) pairs, 
    return the word with the highest total count"""
    word_counts = Counter()
    for word, count in words_and_counts:
        word_counts[word] += count
    word, count = word_counts.most_common(1)[0]
    yield (user, (word, count))
user_words = map_reduce(status_updates,words_per_user_mapper,most_popular_word_reducer)

def liker_mapper(status_update):
    user = status_update["username"]
    for liker in status_update["liked_by"]:
        yield (user, liker)
                
distinct_likers_per_user = map_reduce(status_updates, liker_mapper, count_distinct_reducer)

#24.5 matrix multiplication

def matrix_multiply_mapper(m, element):
    """m is the common dimension (columns of A, rows of B)
    element is a tuple (matrix_name, i, j, value)"""
    matrix, i, j, value = element

    if matrix == "A":
        for column in range(m):
            # A_ij is the jth entry in the sum for each C_i_column
            yield((i, column), (j, value))
    else:
        for row in range(m):
            # B_ij is the ith entry in the sum for each C_row_j
            yield((row, j), (i, value))

def matrix_multiply_reducer(m, key, indexed_values):
    results_by_index = defaultdict(list)
    for index, value in indexed_values:
        results_by_index[index].append(value)

    # sum up all the products of the positions with two results
    sum_product = sum(results[0] * results[1]
                      for results in results_by_index.values()
                      if len(results) == 2)

    if sum_product != 0.0:
        yield (key, sum_product)


entries = [("A", 0, 0, 3), ("A", 0, 1,  2),("B", 0, 0, 4), ("B", 0, 1, -1), ("B", 1, 0, 10)]
mapper = partial(matrix_multiply_mapper, 3)
reducer = partial(matrix_multiply_reducer, 3)
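#running the MapReduce matrix multiply on the sparse entries above; each output is ((row, col), value),
#i.e. a nonzero entry of A * B -- here ((0, 0), 32) and ((0, 1), -3)
print(map_reduce(entries, mapper, reducer))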




#exp.1
import time
from pandas import Series,DataFrame
import numpy as np
from pandas.tseries.offsets import Hour,Minute,Day,MonthEnd

data={'re':[272, 277, 173, 262, 136, 103, 291, 44, 196, 247],
'wi':[118, 141, 162, 136, 147, 167, 177, 111, 114, 142]}
indx=pd.date_range('4/2/2018',periods = 10)

df2 = DataFrame(data,columns=['re', 'wi'],index=indx)

df2['answer']=df2.cumsum()['re']-df2.cumsum()['wi'].shift(1).fillna(0)
#移动增量
df2['improv']=df2.answer-df2.answer.shift(1).fillna(0)

#exp.2
#Python 3 environment set-up (reference):
#https://blog.csdn.net/yin__ren/article/details/78848171
#计算区域花费占比图
grouped=df['cost'].groupby(df['region'])
grouped.sum().plot(kind='pie',autopct = '%3.1f%%',)
--->
df['cost'].groupby(df['region']).sum().plot(kind='pie',autopct = '%3.1f%%',title='cost% by regions')
#计算BJ的年龄分布图
bins=[i*10 for i in range(1,11)]
pd.cut(df[df.region=='BJ'].age,bins).value_counts().plot(kind='pie',autopct = '%3.1f%%',title='age-T-OF-BJ')

for i in ['ee', 'num', 'cost']:
    for j in ['ee', 'num', 'cost']:
        print(i+' - '+j+': '+str(df[i].corr(df[j])))
#计算相关系数(并绘图)
a=['vage', 'age', 'ageg', 'ee','num', 'cost']
#这种写法出来的数据有类似AB,BA的重复项
corr=pd.Series({i+' - '+j:df[i].corr(df[j]) for i in a for j in a if i!=j})
#去除重复 写法
b={}
for i in a:
    for j in a:
        if i==j:
            break
        else:
             x=i+' - '+j
             y=df[i].corr(df[j])
             b[x]=y
corr=pd.Series(b)
corr.sort_values(ascending=True).plot(kind='barh',figsize=(10,10),title='corr_value')
corr.plot(kind='barh',figsize=(10,10),title='corr_value')
