Ch1 导论
此系列记录《数据科学入门》学习笔记
1.3.1 寻找关键联系人
1、定义users字典list
# Sample users: one dict per person with an 'id' and a 'name'.
# Each 'id' equals the user's position in this list, so users[i] is user i.
users = [
{'id': 0, 'name': 'Hero'},
{'id': 1, 'name': 'Dunn'},
{'id': 2, 'name': 'Sue'},
{'id': 3, 'name': 'Chi'},
{'id': 4, 'name': 'Thor'},
{'id': 5, 'name': 'Clive'},
{'id': 6, 'name': 'Hicks'},
{'id': 7, 'name': 'Devin'},
{'id': 8, 'name': 'Kate'},
{'id': 9, 'name': 'Klein'},
]
2、定义友邻关系list
# Friendships as (i, j) pairs of user ids; each pair is listed once.
friendships = [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4), (4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9)]
3、在users添加friends列
# Give every user an empty friend list before any friendship is recorded.
for user in users:
    user['friends'] = []

# Record each friendship in both directions. The lists hold the user dicts
# themselves (not ids), so the resulting structure is self-referential.
for i, j in friendships:
    users[i]['friends'].append(users[j])
    users[j]['friends'].append(users[i])
4、计算友邻关系的联系数
# 计算全部联系数
def number_of_friends(user):
    """Return how many friends ``user`` has.

    ``user`` is a dict whose 'friends' entry is a list; the result is the
    length of that list.
    """
    return len(user['friends'])
# Add up every user's friend count.
total_connections = sum(map(number_of_friends, users))
total_connections
# Average number of connections per user.
# `/` is true division on Python 3; only Python 2 needs
# `from __future__ import division` to get a non-truncated result here.
num_users = len(users)
avg_connections = total_connections / num_users
avg_connections
5、对users按照朋友数排序
# Pair each user's id with their friend count: (id, number_of_friends).
num_friends_by_id = [
    (person['id'], number_of_friends(person))
    for person in users
]
# Display the pairs ordered from most friends to fewest.
sorted(num_friends_by_id, key=lambda pair: pair[1], reverse=True)
1.3.2 你可能知道的科学家
1、对users内用户计算朋友的朋友
# Friends of friends, naive version.
def friends_of_friend_ids_bad(user):
    """Return the ids of every friend of each of ``user``'s friends.

    "Bad" because nothing is filtered or de-duplicated: an id reachable
    through several friends is repeated, ids of people who are already
    direct friends are included, and (when friendships are mutual)
    ``user``'s own id shows up once per friend.
    """
    # "foaf" is short for "friend of a friend".
    return [foaf['id']
            for friend in user['friends']
            for foaf in friend['friends']]
friends_of_friend_ids_bad(users[0]) # -> [0, 2, 3, 0, 1, 3]
# Users 1 and 2 are friends of user 0; users 0, 2, 3 are friends of user 1; users 0, 1, 3 are friends of user 2.
print([friend['id'] for friend in users[0]['friends']])
print([friend['id'] for friend in users[1]['friends']])
print([friend['id'] for friend in users[2]['friends']])
# 计算共同朋友
from collections import Counter
# Counter是一个简单的计数器,例如,统计字符出现的个数
def not_the_same(user, other_user):
    """Return True when the two users have different ids."""
    return user['id'] != other_user['id']
def not_friends(user, other_user):
    """Return True when ``other_user`` is not in ``user['friends']``.

    Membership is decided by id: ``other_user`` is not a friend if it is
    ``not_the_same`` as every entry in ``user['friends']``. A user with an
    empty friend list is therefore "not friends" with everyone.
    """
    return all(not_the_same(friend, other_user) for friend in user['friends'])
def friends_of_friend_ids(user):
    """Count mutual friends between ``user`` and each friend-of-a-friend.

    Returns a ``Counter`` mapping a user id to the number of ``user``'s
    friends who list that id among their own friends. ``user`` itself and
    anyone already in ``user['friends']`` are excluded.
    """
    return Counter(foaf['id']
                   for friend in user['friends']       # each direct friend
                   for foaf in friend['friends']       # each of their friends
                   if not_the_same(user, foaf)         # skip the user
                   and not_friends(user, foaf))        # skip existing friends
print(friends_of_friend_ids(users[3]))
# Counter({0: 2, 5: 1}): user 3 has two mutual friends with user 0 and one mutual friend with user 5.
2、寻找相同爱好的用户
# (user_id, interest) pairs: one tuple per user/interest association.
# NOTE(review): "satasmodels", "programing languages" and "neural networdks"
# look like typos (statsmodels / programming languages / neural networks?) —
# left untouched because they are runtime data and feed the word counts.
interests = [(0,"Hadoop"),(0,"Big Data"),(0,"HBase"),(0,"Java"),(0,"Spark"),(0,"Storm"),
(0,"Cassandra"),
(1,"NoSQL"),(1,"MongoDB"),(1,"Cassandra"),(1,"HBase"),(1,"Postgres"),
(2,"Python"),(2,"scikit-learn"),(2,"Scipy"),(2,"numpy"),(2,"satasmodels"),
(2,"pandas"),
(3,"R"),(3,"Python"),(3,"Statistics"),(3,"regression"),(3,"probability"),
(4,"machine learning"),(4,"regression"),(4,"decision trees"),(4,"libsvm"),
(5,"Python"),(5,"R"),(5,"Java"),(5,"C++"),(5,"Haskell"),
(5,"programing languages"),
(6,"statistics"),(6,"probability"),(6,"machine learning"),(6,"theory"),
(7,"machine learning"),(7,"scikit-learn"),(7,"Mahout"),(7,"neural networdks"),
(8,"neural networdks"),(8,"deep learning"),(8,"Big Data"),
(8,"artificial intelligence"),
(9,"Hadoop"),(9,"Java"),(9,"MapReduce"),(9,"Big Data")]
# Find the users who share an interest; this scans the whole interests list
# on every call.
def data_scientists_who_like(target_interest):
    """Return the ids of users who list ``target_interest``.

    Reads the module-level ``interests`` list of (user_id, interest) pairs.
    The comparison is an exact string match, so it is case-sensitive.
    """
    return [user_id
            for user_id, user_interest in interests
            if user_interest == target_interest]
print(data_scientists_who_like('regression'))
# [3, 4]: users 3 and 4 share the interest 'regression'.
from collections import defaultdict
# 使用dict时,如果引用的Key不存在,就会抛出KeyError。如果希望key不存在时,返回一个默认值,就可以用defaultdict:
# 除了在Key不存在时返回默认值,defaultdict的其他行为跟dict是完全一样的。
# Two lookup tables built from the (user_id, interest) pairs:
#   user_ids_by_interest: interest -> list of user ids that have it
#   interests_by_user_id: user id  -> list of that user's interests
user_ids_by_interest = defaultdict(list)
interests_by_user_id = defaultdict(list)
# One pass fills both tables; entries keep the order of `interests`.
for user_id, interest in interests:
    user_ids_by_interest[interest].append(user_id)
    interests_by_user_id[user_id].append(interest)
def most_common_interests_with(user):
    """Count how many interests every other user shares with ``user``.

    Returns a ``Counter`` mapping another user's id to the number of
    interests that user has in common with ``user``. Reads the module-level
    ``interests_by_user_id`` and ``user_ids_by_interest`` tables.
    """
    # .get() instead of indexing: indexing a defaultdict with a missing key
    # would silently insert an empty entry into the shared lookup tables.
    return Counter(interested_user_id
                   for interest in interests_by_user_id.get(user['id'], ())
                   for interested_user_id in user_ids_by_interest.get(interest, ())
                   if interested_user_id != user['id'])
most_common_interests_with(users[0])
# Counter({1: 2, 5: 1, 8: 1, 9: 3}): user 1 shares two interests with user 0, user 9 shares three.
1.3.3 工资与工作年限
寻找工作年限和工资的关系
# (salary, tenure) pairs, one per user.
salaries_and_tenures = [(83000, 8.7), (88000, 8.1), (48000, 0.7), (76000, 6), (69000, 6.5),
                        (76000, 7.5), (60000, 2.5), (83000, 10), (48000, 1.9), (63000, 4.2)]

# Group salaries by tenure: key is the tenure, value is the list of salaries
# recorded for that tenure.
salary_by_tenure = defaultdict(list)
for salary, tenure in salaries_and_tenures:
    salary_by_tenure[tenure].append(salary)

# Average salary per tenure: key is the tenure, value is the mean of its salaries.
average_salary_by_tenure = {
    tenure: sum(salaries) / len(salaries)
    for tenure, salaries in salary_by_tenure.items()
}
average_salary_by_tenure
# No two users above share the same tenure, so the per-tenure average is of
# little use: it just reports each user's own salary.
# Group tenures into buckets instead.
def tenure_bucket(tenure):
    """Return the label of the bucket that ``tenure`` falls into.

    Below 2 -> 'less than two years'; from 2 up to (not including) 5 ->
    'between two and five years'; 5 and above -> 'more than five years'.
    """
    if tenure < 2:
        return 'less than two years'
    elif tenure < 5:
        return 'between two and five years'
    else:
        return 'more than five years'
# Group salaries by tenure bucket: key is the bucket label, value is the list
# of salaries whose tenure falls into that bucket.
salary_by_tenure_bucket = defaultdict(list)
for salary, tenure in salaries_and_tenures:
    bucket = tenure_bucket(tenure)
    salary_by_tenure_bucket[bucket].append(salary)

# Average salary per bucket: key is the bucket label, value is the mean salary.
# The comprehension variable is deliberately not called `tenure_bucket`, so it
# cannot be confused with the function of that name.
average_salary_by_bucket = {
    bucket_label: sum(salaries) / len(salaries)
    for bucket_label, salaries in salary_by_tenure_bucket.items()
}
average_salary_by_bucket
# {'between two and five years': 61500.0,
# 'less than two years': 48000.0,
# 'more than five years': 79166.66666666667}
1.3.4 付费账户
# (years of experience, account status) pairs; status is 'paid' or 'unpaid'.
years_paid = [(0.7, 'paid'),
(1.9, 'unpaid'),
(2.5, 'paid'),
(4.2, 'unpaid'),
(6, 'unpaid'),
(6.5, 'unpaid'),
(7.5, 'unpaid'),
(8.1, 'unpaid'),
(8.7, 'paid'),
(10, 'paid'),]
# Predict whether a user has a paid account.
def predict_paid_or_unpaid(years_experience):
    """Return 'paid' or 'unpaid' from a hand-written threshold rule.

    Below 3.0 years -> 'paid'; from 3.0 up to (not including) 8.5 years ->
    'unpaid'; 8.5 years and above -> 'paid'.
    """
    if years_experience < 3.0:
        return 'paid'
    elif years_experience < 8.5:
        return 'unpaid'
    else:
        return 'paid'
predict_paid_or_unpaid(years_paid[1][0])
# 'paid': years_paid[1] is 1.9 years, which the rule predicts as paid although its actual label is unpaid.
1.3.5 兴趣主题
# Count how often each lower-cased, whitespace-separated word occurs across
# all interest strings. The user id is not needed here.
words_and_counts = Counter(word
                           for _, interest in interests
                           for word in interest.lower().split())

# Print only the words that occur more than once, most frequent first.
for word, count in words_and_counts.most_common():
    if count > 1:
        print(word, count)
learning 4
big 3
data 3
java 3
python 3
machine 3
hadoop 2
hbase 2
cassandra 2
scikit-learn 2
r 2
statistics 2
regression 2
probability 2
neural 2
networdks 2
以上是Ch1的相关内容,第一次写博客,排版不好,请见谅
2018.01.30 YR