图灵程序丛书 —《数据科学入门》— Ch2 Python 速成

Ch2 Python 速成

此系列记录《数据科学入门》学习笔记

2.1 基础内容

2.1.3 空白形式
python使用缩进的形式分隔代码块,可利用‘Tab’键快速找到键入位置。
# Indentation alone delimits blocks: the inner loop body sits one level
# deeper than the outer loop body.
for i in [1, 2, 3, 4, 5]:
    print(i)                    # first statement of the outer loop body
    for j in [1, 2, 3, 4, 5]:
        print(j)                # first statement of the inner loop body
        print(i + j)            # last statement of the inner loop body
    print(i)                    # back in the outer loop body
print('done looping')           # runs once, after both loops have finished

2.1.4 模块
anaconda里有很多包安装好了,直接通过 import 语句便可以导入。
import re
my_regex = re.compile("[0-9]+", re.I)
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from collections import defaultdict, Counter

2.1.5 算术
Python 2.7 中 `/` 对整数默认执行整除;Python 3 中 `/` 默认为精确除法,整除需使用 `//`。
10/3  # 3.3333333333333335
10//3 # 3

2.1.6 函数
函数是一种规则,可实现我们想要的功能,python中用 def 语句定义。
def double(x):
    """Return ``x`` multiplied by two."""
    doubled = x * 2
    return doubled


double(2)   # 4


def apply_to_one(f):
    """Call ``f`` with the single argument 1 and return its result."""
    argument = 1
    return f(argument)


# Functions are first-class objects: bind one to another name and pass it
# around like any other value.
my_double = double
x = apply_to_one(my_double)
x   # 2
定义默认参数:只有需要默认值以外的值时,才需要具体说明该参数。
def my_print(message='My default message'):
    """Print ``message``, falling back to a stock text when it is omitted."""
    print(message)


my_print('hello')   # 'hello'
my_print()          # 'My default message'


def subtract(a=0, b=0):
    """Return ``a - b``; both operands default to 0."""
    difference = a - b
    return difference


# Positional, positional, and keyword invocation; one result per line.
for outcome in (subtract(10, 5), subtract(0, 5), subtract(b=5)):
    print(outcome)  # 5, then -5, then -5

2.1.7 字符串
# Strings may be delimited by single or double quotes; the results are equal.
single_quoted_string = 'data science'
double_quoted_string = "data science"
single_quoted_string == double_quoted_string    # True
# '\t' is a single tab character; the r"..." prefix creates a raw string in
# which the backslash is kept literally.
tab_string = '\t'
print(tab_string, len(tab_string))          # a tab character, then 1
not_tab_string = r"\t"  # raw string: a backslash followed by the letter t
print(not_tab_string, len(not_tab_string))  # \t 2
# Triple quotes (three double-quote characters) create a multi-line string.
multi_line_string = """This is the first line.
and this is the second line
and this is the third line"""
multi_line_string

2.1.8 异常
# Catch the specific exception rather than letting the program crash.
try:
    print(0/0)      # 0/0 raises ZeroDivisionError before print is called
except ZeroDivisionError:
    print('cannot divide by zero')   # cannot divide by zero

2.1.9 列表
integer_list = [1, 2, 3]
heterogeneous_list = ['string', 0.1, True]
list_of_lists = [integer_list, heterogeneous_list, []]
print(integer_list, heterogeneous_list, list_of_lists)
# [1, 2, 3] ['string', 0.1, True] [[1, 2, 3], ['string', 0.1, True], []]

# In Python 3 range() returns a lazy range object, not a list. Wrap it in
# list() so that printing and slicing produce the lists shown below.
x = list(range(10))
zero = x[0]         # first element
one = x[1]          # second element
nine = x[-1]        # negative indexes count from the end: last element
eight = x[-2]       # next-to-last element
print(x, zero, one, nine, eight)
# [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] 0 1 9 8
first_three = x[:3]
three_to_end = x[3:]
one_to_four = x[1:5]
last_three = x[-3:]
without_first_and_last = x[1:-1]
copy_of_x = x[:]    # a full slice makes a shallow copy
print(first_three, three_to_end, one_to_four, last_three, without_first_and_last, copy_of_x)
# [0, 1, 2] [3, 4, 5, 6, 7, 8, 9] [1, 2, 3, 4] [7, 8, 9] [1, 2, 3, 4, 5, 6, 7, 8] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
x = [1, 2, 3]
x.extend([4, 5, 6])     # extend mutates x in place
# [1, 2, 3, 4, 5, 6]

x = [1, 2, 3]
y = x + [4, 5, 6]       # + builds a new list and leaves x unchanged
# [1, 2, 3, 4, 5, 6]

x = [1, 2, 3]
x.append(4)             # append adds a single item in place
# [1, 2, 3, 4]
# Unpack a two-element list into two names at once.
x, y = [1, 2]
print('x = %d' % x,'y = %d' % y)
# x = 1 y = 2

# An underscore conventionally names a value that is going to be ignored.
_, y = [1, 2]
print('_ = %d' % _,'y = %d' % y)
# _ = 1 y = 2

2.1.10 元组
元组是列表的亲哥哥,除了修改,列表的所有操作都适用于元组。
不同于列表的方括号,元组使用圆括号(或者什么都不加)。
my_list = [1, 2]
my_tuple = (1, 2)
other_tuple = 3, 4      # the parentheses are optional
my_list[1] = 3          # lists can be modified in place
print(my_list, my_tuple, other_tuple)
# [1, 3] (1, 2) (3, 4)


# A tuple is a convenient way to return several values from one function.
def sum_and_product(x, y):
    """Return the pair ``(x + y, x * y)``."""
    total = x + y
    product = x * y
    return total, product


sp = sum_and_product(5, 10)     # keep the pair as a single tuple
s, p = sum_and_product(5, 10)   # or unpack it into two names
print(sp, s, p)
# (15, 50) 15 50

# Tuples and lists both support multiple assignment.
x, y = 1, 2
print('x = %d' % x,'y = %d' % y)
x, y = y, x   # the Pythonic way to swap two variables
print('x = %d' % x,'y = %d' % y)
# x = 1 y = 2
# x = 2 y = 1

2.1.11 字典
字典的键必须不可变,因此列表不可以作为键;若需要一个多维的键,可以使用元组,或者先将键转换为字符串。
empty_dict = {}
empty_dict2 = dict()
grades = {'Joel': 80, 'Tim': 95}
grades['Joel']   # 80
# Test whether a key is present with `in`.
print('Joel' in grades)  # True
print('Kate' in grades)  # False
# Tuples cannot be modified: item assignment raises TypeError.
# (my_tuple is the tuple created in the tuple section above.)
try:
    my_tuple[1] = 3
except TypeError:
    print('cannot modify a tuple')
# cannot modify a tuple

# When the key is missing, get() returns a default value (None unless one
# is supplied) instead of raising an error.
joel_grade = grades.get('Joel',0)
kate_grade = grades.get('Kate',0)
no_one_grade = grades.get('No One')
print(joel_grade, kate_grade, no_one_grade)
# 80 0 None
# Assign through square brackets to update an existing key or add a new one.
grades['Tim'] = 99
grades['Kate'] = 100
grades
# {'Joel': 80, 'Kate': 100, 'Tim': 99}
# Dictionaries are a simple way to represent structured data.
tweet = {
    'user' : 'joelgrus',
    'text' : 'data science is awesome',
    'retweet_count' : 100,
    'hashtags' : ['#data', '#science', '#datascience', '#awesome', '#yolo']
}

tweet.keys()
# dict_keys(['user', 'text', 'retweet_count', 'hashtags'])
tweet.values()
# dict_values(['joelgrus', 'data science is awesome', 100, ['#data', '#science', '#datascience', '#awesome', '#yolo']])
tweet.items()
# dict_items([('user', 'joelgrus'), ('text', 'data science is awesome'), ('retweet_count', 100), ('hashtags', ['#data', '#science', '#datascience', '#awesome', '#yolo'])])

2.1.11.1 Defaultdict
一个defaultdict相当于一个标准的字典,但是当你查找一个没有包含在内的键时,它会先用你创建它时提供的零参数函数为这个键生成一个默认值并添加到字典中。当用字典“收集”某些键对应的结果,且不希望每次都先检查键是否存在时,defaultdict非常有用。
# Count how many times each interest occurs (`interests` comes from Ch1).
# NOTE(review): assumes interests is an iterable of two-element
# (id, interest) pairs -- confirm against Ch1.
interest_counts = {}
for i,interest in interests:
    if interest in interest_counts:
        interest_counts[interest] += 1
    else:
        interest_counts[interest] = 1
interest_counts

# The same count, treating the missing key as an exception.
interest_counts = {}
for i, interest in interests:
    try:
        interest_counts[interest] += 1
    except KeyError:
        interest_counts[interest] = 1
interest_counts

# The same count, using get() to supply 0 for a missing key.
interest_counts = {}
for i, interest in interests:
    previous_interest = interest_counts.get(interest, 0)
    interest_counts[interest] = previous_interest + 1
interest_counts

# The three approaches above are all rather clumsy, hence defaultdict.
from collections import defaultdict

interest_counts = defaultdict(int)  # int() produces 0
for i, interest in interests:
    interest_counts[interest] += 1
interest_counts
dd_int = defaultdict(int)
dd_int[2] = 1
dd_int[1]               # merely looking up a missing key inserts int() == 0
dd_int
# defaultdict(int, {1: 0, 2: 1})

dd_list = defaultdict(list)
dd_list[2].append(1)    # key 2 is first created as [] and then appended to
dd_list
# defaultdict(list, {2: [1]})

dd_dict = defaultdict(dict)
dd_dict['Jeol']['City'] = 'Seattle'     # the outer key is first created as {}
dd_dict
# defaultdict(dict, {'Jeol': {'City': 'Seattle'}})

dd_pair = defaultdict(lambda: [0, 0])   # the factory can be any zero-argument callable
dd_pair[2][0] = 3
dd_pair
# defaultdict(<function __main__.<lambda>>, {2: [3, 0]})

2.1.11.2 Counter
一个计数器(Counter)将一个序列的值转化成一个类似于 defaultdict(int) 的对象,其中每个键映射到该键出现的次数。
from collections import Counter
c = Counter([0, 1, 2, 0])
# Counter({0: 2, 1: 1, 2: 1})
# The interest counting above reduces to a single Counter call.
interest_counts = Counter(interest for _, interest in interests)
interest_counts  # display the resulting Counter

# Print the ten most common interests together with their counts.
for interest, count in interest_counts.most_common(10):
    print(interest, count)
c = Counter('abcdeabcdabcaba')
c.most_common(3)  # the three most common elements
# [('a', 5), ('b', 4), ('c', 3)]
c['a']  # 5

sorted(c.elements())
# ['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd', 'e']

d = Counter('simsalabim')       # a second counter to merge in
c.update(d)                     # adds d's counts to the counts already in c
c['a']   # 7

c = Counter('aaabbc')
c['b'] -= 2                    # counts can be lowered; the key stays, now with count 0
c.most_common()  # all elements, most common first
# [('a', 3), ('c', 1), ('b', 0)]

2.1.12 集合
集合是另一种数据结构,它表示为一组不同的元素。
使用集合的原因:集合上有一种非常快速的操作——in;对大量项目的成分测试时,集合要比列表合适
但是,set的使用频率要远远低于list和dict。
s = set()
s.add(1)
print(s)   # {1}
s.add(2)
print(s)   # {1, 2}
s.add(2)   # adding an element that is already present changes nothing
print(s)   # {1, 2}
print(len(s))
print(2 in s)   # True
print(s in s)   # False
# Membership testing on a list has to walk through the elements.
stopwords_list = ['a', 'an', 'at'] + ['hundreds_of_other_words ']+ ['yet', 'you']
'zip' in stopwords_list

# Membership testing on a set is a very fast lookup.
stopwords_set = set(stopwords_list)
'zip' in stopwords_set

2.1.13 控制流
# A while loop that counts from 0 to 9.
x = 0
while x < 10:
    print(x, 'is less than 10')
    x += 1

# The equivalent for loop over range(10).
for x in range(10):
    print(x, 'is less than 10')

# 0 is less than 10
# 1 is less than 10
# 2 is less than 10
# 3 is less than 10
# 4 is less than 10
# 5 is less than 10
# 6 is less than 10
# 7 is less than 10
# 8 is less than 10
# 9 is less than 10
# 0, 1 and 2 are printed normally; at 3 `continue` skips straight to the
# next iteration; 4 is printed; at 5 `break` leaves the loop entirely.
for x in range(10):
    if x == 3:
        continue
    if x == 5:
        break
    print(x)
# 0 1 2 4

2.1.14 真和假
# Python uses None for a value that does not exist, similar to null in
# other languages.
x = None
print(x == None)  # True, but equality is not the idiomatic test
print(x is None)  # True; the identity test is the idiomatic form

# all() is True when every element is truthy; any() when at least one is.
print(all([True, 1, {3}]))  # True
print(all([True, 1, {}]))   # False: the empty dict {} is falsy
print(any([True, 1, {}]))   # True
print(all([]))   # True: there is no falsy element
print(any([]))   # False: there is no truthy element


2.2 进阶内容

2.2.1 排序
sort 会扰乱列表,直接覆盖在原有序列上(默认从小到大排序)
sorted  不扰乱列表,会返回一个新的序列
x = [4, 1, 2, 3]
y = sorted(x)    # y receives a new sorted list; x is unchanged
print(x, y)
x.sort()         # sorts x itself, in place
print(x, y)
# [4, 1, 2, 3] [1, 2, 3, 4] [1, 2, 3, 4] [1, 2, 3, 4]
# key=abs compares by absolute value; reverse=True orders from largest to smallest.
x = sorted([-4, 1, -2, 3], key=abs, reverse=True)
# [-4, 3, -2, 1]

# Sort the (key, count) pairs from the highest count to the lowest.
ic = sorted(interest_counts.items(),
            key=lambda x: x[1], reverse=True)

2.2.2 列表解析
列表解析是一种python操作技巧。
把一个列表转化为另一个列表,例如只保留其中一些元素,或者更改其中一些元素,或者同时做这两种变动。
even_numbers = [x for x in range(5) if x % 2 == 0]   # keep only some elements
print(even_numbers)   # [0, 2, 4]
squares = [x * x  for x in range(5)]                 # transform every element
print(squares)   # [0, 1, 4, 9, 16]
even_squares = [x * x for x in even_numbers]
print(even_squares)   # [0, 4, 16]
# The same syntax inside braces builds a dict or a set.
square_dict = {x : x * x for x in range(5)}
print(square_dict)   # {0: 0, 1: 1, 2: 4, 3: 9, 4: 16}
square_set = {x * x for x in[-1, 1]}
print(square_set)   # {1}

2.2.3 生成器和迭代器
生成器是一种可以对其进行迭代的对象,但是它的值只按需延迟产生;
在 Python 2 中 range(1000000) 会创建一个100万个元素的列表(Python 3 的 range 本身已是惰性的),如果只需要处理其中前面几个值,却要先生成整个列表,就会很浪费;
延迟的缺点是,只能通过生成器迭代一次,若需要多次迭代某个对象,需要每次都重新生成一个生成器,或者使用列表。
# Build a generator with a function and the yield keyword.
# Each pass of the consuming loop takes one yielded value until none remain.
def lazy_range(n):
    """Yield 0, 1, 2, ... for as long as the counter is less than ``n``."""
    counter = 0
    while True:
        if not counter < n:
            return
        yield counter
        counter += 1


for i in lazy_range(10):
    print(i)
# Replacing the [] of a list comprehension with () creates a generator.
L = [x * x for x in range(10)]
print('L:',L)
g = (x * x for x in range(10))
print('g:',list(g))     # list() consumes the generator to show its values

# L: [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
# g: [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

2.2.4 随机性
import random
# random.random() returns a uniformly distributed float in [0.0, 1.0)
# (it is not a normal distribution).
four_uniform_randoms = [random.random() for i in range(4)]
four_uniform_randoms

# [0.21263204460300866,
#  0.604551369587164,
#  0.15973754845060006,
#  0.3221215527769148]
# random.seed() makes results reproducible: the same seed gives the same value.
random.seed(10)
print(random.random())   # 0.5714025946899135
random.seed(12)
print(random.random())   # 0.4745706786885481
random.seed(10)
print(random.random())   # 0.5714025946899135
# Pick a random number from a range:
# first from range(10), then from range(3, 6), i.e. [3, 4, 5].
print(random.randrange(10))    # 7
print(random.randrange(3,6))   # 4

# random.shuffle reorders the list randomly, in place.
up_to_ten = list(range(10))
print(up_to_ten)   # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
random.shuffle(up_to_ten)
print(up_to_ten)   # [3, 4, 5, 6, 1, 9, 2, 7, 8, 0]
# random.choice picks one element at random.
my_best_friend = random.choice(['Alice', 'Bob', 'Charlie'])
my_best_friend   # 'Alice'

# random.sample draws a subset without repeats:
# here, six draws without replacement.
lottery_numbers = list(range(60))
winning_numbers = random.sample(lottery_numbers, 6)
winning_numbers   # [38, 22, 24, 26, 18, 52]

# Calling random.choice repeatedly gives a sample that may contain repeats:
# here, four draws with replacement.
four_with_replacement = [random.choice(range(10)) for i in range(4)]
four_with_replacement   # [4, 7, 2, 4]
# The same idea with six draws (the variable name is reused).
four_with_replacement = [random.choice(range(10)) for i in range(6)]
four_with_replacement   # [5, 8, 2, 3, 6, 3]

2.2.5 正则表达式
import re
# 'cat' does not start with 'a' (re.match only matches at the beginning).
print(not re.match('a', 'cat'))   # True
# "'cat' contains no 'a'" is False, because re.search finds one.
print( not re.search('a', 'cat'))   # False
# 'dog' contains no 'a'.
print( not re.search('a', 'dog'))  # True
# Split a string on letters; [..] matches any single one of the listed letters.
print(re.split('ab', 'carbs'))
print(re.split('[ab]', 'carbs'))
print(re.split('[abc]', 'carbs'))
print(re.split('[ac]', 'carbs'))
print(re.split('[mr]', 'smart'))
print(re.split('[sm]', 'smart'))
print(re.split('[mrt]', 'smart'))
# ['carbs']
# ['c', 'r', 's']
# ['', '', 'r', 's']
# ['', '', 'rbs']
# ['s', 'a', 't']
# ['', '', 'art']
# ['s', 'a', '', '']

# Replace every digit with '-'.
print(re.sub('[0-9]', '-', 'R2D2'))
print(re.sub('[0-9]', '-', 'R25D62'))
# R-D-
# R--D--

2.2.6 面向对象的编程
python可以定义类,类可以用来封装对象和函数来对它们进行操作。
class Set:
    """A minimal set implementation backed by a dictionary.

    Members are stored as the keys of ``self.dict``. Every method takes
    ``self`` as its first parameter: the particular Set being operated on.
    """

    def __init__(self, values=None):
        """Create a Set, optionally filled from an iterable.

        Usage:
            s1 = Set()            # empty set
            s2 = Set([1, 2, 3])   # initialised with values

        Args:
            values: optional iterable of hashable items to add.
        """
        # Each instance owns its own dict, which tracks membership.
        self.dict = {}
        if values is not None:
            for value in values:
                self.add(value)

    def __repr__(self):
        """Return the text shown at the Python prompt or produced by str()."""
        return 'Set: ' + str(self.dict.keys())

    def add(self, value):
        """Add ``value`` by storing it as a key whose value is True."""
        self.dict[value] = True

    def contains(self, value):
        """Return True if ``value`` is a member, i.e. a key of the dict."""
        return value in self.dict

    def __contains__(self, value):
        """Support ``value in s`` as a synonym for ``s.contains(value)``."""
        return self.contains(value)

    def remove(self, value):
        """Remove ``value`` from the set.

        Raises:
            KeyError: if ``value`` is not a member.
        """
        del self.dict[value]

# Exercise the Set class: build, print, add, test membership, remove.
s = Set([1, 2, 3])
print(s)
s.add(4)
print(s.contains(4))
print(s)
s.remove(3)
print(s.contains(3))
print(s)
# Set: dict_keys([1, 2, 3])
# True
# Set: dict_keys([1, 2, 3, 4])
# False
# Set: dict_keys([1, 2, 4])

2.2.7 函数式工具
partial(偏函数):函数有多个参数,但一个参数已知,可以通过partial固定该参数,重新绑定一个新的函数,最后得到一个函数;
map:将函数作用到后面序列的每一个元素上;Python 3 中返回迭代器,可用 list() 转为列表;
filter:作用相当于列表解析中的 if 判断;Python 3 中返回迭代器,可用 list() 转为列表;
reduce:将函数参数作用于后面参数前两个元素,再作用于第一个结果和第三个元素,以此类推,最后得到一个结果。
# Exponentiation helper.
def exp(base, power):
    """Return ``base`` raised to ``power``."""
    return pow(base, power)


exp(2, 4) == 2**4   # True


# Powers of two, written by hand as a wrapper around exp.
def two_to_the(power):
    """Return 2 raised to ``power``."""
    base = 2
    return exp(base, power)


two_to_the(4)   # 16

# The same function built by pre-filling exp's first argument with partial.
from functools import partial
two_to_the = partial(exp, 2)
print(two_to_the(4))   # 16

# partial can also pre-fill a later argument when it is given by name.
square_of = partial(exp, power=2)
print(square_of(3))   # 9
def double(x):
    """Return twice ``x``."""
    twice = 2 * x
    return twice


xs = [1, 2, 3, 4]
twice_xs = [double(x) for x in xs]
print(twice_xs)   # [2, 4, 6, 8]

# map applies the function to every element.
twice_xs = map(double, xs)
print(list(twice_xs))   # [2, 4, 6, 8]

# partial(map, double) is a reusable "double every element" function.
list_doubler = partial(map, double)
twice_xs = list_doubler(xs)
print(list(twice_xs))   # [2, 4, 6, 8]


def multiply(x, y):
    """Return the product of ``x`` and ``y``."""
    return x * y


# With several iterables, map passes one element from each per call.
products = map(multiply, [1, 2], [4, 5])
list(products)  # [4, 10]


# filter does the job of the `if` inside a list comprehension.
def is_even(x):
    """Return True when ``x`` is even, False when it is odd."""
    remainder = x % 2
    return remainder == 0


x_evens = [x for x in xs if is_even(x)]
print(x_evens)   # [2, 4]
x_evens = filter(is_even, xs)
print(list(x_evens))   # [2, 4]
list_evener = partial(filter, is_even)
x_evens = list_evener(xs)
print(list(x_evens))   # [2, 4]

# reduce combines the first two elements, then that result with the third,
# and so on, down to a single value.
from functools import reduce
x_product = reduce(multiply, xs)
print(x_product)   # 24

list_product = partial(reduce, multiply)
x_product = list_product(xs)
print(x_product)   # 24

2.2.8 枚举(enumerate)
# enumerate yields (index, element) tuples.
# NOTE(review): `interests` is defined in Ch1; the sample output below shows
# its elements to be (id, interest) pairs.
for i, interest in enumerate(interests[:10]):
    print(i, interest)
# 0 (0, 'Hadoop')
# 1 (0, 'Big Data')
# 2 (0, 'HBase')
# 3 (0, 'Java')
# 4 (0, 'Spark')
# 5 (0, 'Storm')
# 6 (0, 'Cassandra')
# 7 (1, 'NoSQL')
# 8 (1, 'MongoDB')
# 9 (1, 'Cassandra')
# Print only the index.
for i, interest in enumerate(interests[:10]):
    print(i)
# 0
# 1
# 2
# 3
# 4
# 5
# 6
# 7
# 8
# 9
# Without enumerate, each element is itself a pair that unpacks into i and
# interest. The slice can be iterated directly; no extra list() copy is needed.
for i, interest in interests[:10]:
    print(i, interest)
# 0 Hadoop
# 0 Big Data
# 0 HBase
# 0 Java
# 0 Spark
# 0 Storm
# 0 Cassandra
# 1 NoSQL
# 1 MongoDB
# 1 Cassandra

2.2.9 压缩和参数拆分(ZIP *)
# zip combines two or more lists into tuples of corresponding elements.
list1 = ['a', 'b', 'c']
list2 = [1, 2, 3]
list(zip(list1, list2))
# [('a', 1), ('b', 2), ('c', 3)]

# With lists of different lengths, zip stops as soon as the shortest one ends.
list1 = ['a', 'b', 'c', 'd']
list2 = [1, 2, 3]
list(zip(list1, list2))
# [('a', 1), ('b', 2), ('c', 3)]

# * performs argument unpacking: zip(*pairs) is the same call as
# zip(('a', 1), ('b', 2), ('c', 3)).
pairs = [('a', 1), ('b', 2), ('c', 3)]
letters, numbers = zip(*pairs)
print(letters, numbers)
list(zip(('a', 1), ('b', 2), ('c', 3)))
# ('a', 'b', 'c') (1, 2, 3)
# [('a', 'b', 'c'), (1, 2, 3)]
# Argument unpacking with * works with any function.
def add(a, b):
    """Return ``a + b``."""
    return a + b


print(add(1,2))   # 3
try:
    add([1,2])      # the list is one argument, so b is missing: TypeError
except TypeError:
    print('add expects two inputs')   # add expects two inputs
print(add(*[1,2]))   # 3

2.2.10 args 和 kwargs
# doubler takes a function as its argument.
def doubler(f):
    """Return a one-argument function that computes ``2 * f(x)``."""
    def g(x):
        inner_result = f(x)
        return 2 * inner_result
    return g


def f1(x):
    """Return ``x + 1``."""
    incremented = x + 1
    return incremented


g = doubler(f1)
print(g(3))   # 8
print(g(-1))   # 0
# args is a tuple of the unnamed arguments; kwargs is a dict of the named ones.
def magic(*args, **kwargs):
    """Print the positional arguments and then the keyword arguments."""
    for label, received in (('unnamed args:', args), ('keyword args:', kwargs)):
        print(label, received)


magic(1, 2, key='word', key2='word2')
# unnamed args: (1, 2)
# keyword args: {'key': 'word', 'key2': 'word2'}
# Sequences (lists and tuples alike) are unpacked with one *; dicts with two **.
def other_way_magic(x, y, z):
    """Return ``x + y + z``."""
    return x + y + z


x_y_list = [1, 2]
z_dict = {'z': 3}
z_list = [5]
z_tuple = (5,)  # the trailing comma makes a tuple; ( 5 ) is just the int 5
print(other_way_magic(*x_y_list, **z_dict))   # 6
print(other_way_magic(*x_y_list, *z_list))   # 8
print(other_way_magic(*x_y_list, *z_tuple))   # 8
def doubler_correct(f):
    """Return a wrapper that doubles ``f``'s result, whatever arguments ``f`` takes."""
    def g(*args, **kwargs):
        """Pass every positional and keyword argument straight through to ``f``."""
        return 2 * f(*args, **kwargs)
    return g


def f2(x, y):
    """Return ``x + y``; a two-argument function used to exercise the wrapper."""
    return x + y


g = doubler_correct(f2)
print(g(1, 2))   # 6


以上是Ch2的相关内容
2018.01.31   YR

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值