《利用Python进行数据分析》笔记+整理+案例_基于python实验数据分析整理-CSDN博客

本文链接：https://blog.csdn.net/skywuuu/article/details/109100621

《利用Python进行数据分析》笔记+整理+案例

第一部分：数据结构，函数，文件

1. Tuple

tup = 4, 5, 6

tup

(4, 5, 6)

nested_tup = (4,5),(6,7)
nested_tup

((4, 5), (6, 7))

(1) 将list转换成tuple

tuple([4,5,6])

(4, 5, 6)

(2) 将string转换成tuple

tup = tuple('string')

tup

('s', 't', 'r', 'i', 'n', 'g')

tup[0]

's'

tup = tuple(['foo', [1, 2], True])
tup[2] = False

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-8-11b694945ab9> in <module>
      1 tup = tuple(['foo', [1, 2], True])
----> 2 tup[2] = False


TypeError: 'tuple' object does not support item assignment

(3) 对嵌套中的list进行分析

tup[1].append(3)
tup

('foo', [1, 2, 3], True)

(4) 使用加号连接

(4, None, 'foo')+(1,2,3)+(True, False)

(4, None, 'foo', 1, 2, 3, True, False)

(5) 拆分元组

tup = (4,5,6)
a,b = tup

---------------------------------------------------------------------------

ValueError                                Traceback (most recent call last)

<ipython-input-11-71c3f6b411a3> in <module>
      1 tup = (4,5,6)
----> 2 a,b = tup


ValueError: too many values to unpack (expected 2)

a, b, c = tup
print("a = ",a, ", b = ", b, ", c = ", c)

a =  4 , b =  5 , c =  6

tup = 4, 5, (6, 7)
a, b, c = tup
aa, bb, (cc, dd) = tup
print("a = ",a, ", b = ", b, ", c = ", c)
print("aa = ",aa, ", bb = ", bb, ", cc = ", cc, ", dd = ", dd)

a =  4 , b =  5 , c =  (6, 7)
aa =  4 , bb =  5 , cc =  6 , dd =  7

a,*rest = tup # 一般来说rest是要被舍弃的部分，所以可能会用下划线“_”命名 
print("a = ",a)
print("rest = ", rest)

a =  4
rest =  [5, (6, 7)]

(6) 交换值的方法

print("交换前：a = ", a, ", b = ",b)
a, b = b, a
print("交换后：a = ", a, ", b = ", b)

交换前：a =  4 , b =  5
交换后：a =  5 , b =  4

2. 列表

list1=[2,3,8,None]

tup = ('a','b','c')
list2 = list(tup)

gen = range(2,20,2)
list3 = list(gen)

print(list1)
print(list2)
print(list3)

[2, 3, 8, None]
['a', 'b', 'c']
[2, 4, 6, 8, 10, 12, 14, 16, 18]

(1) 添加和删除元素

list1 = [2, 3, 8, None]
list1.append([1,11,22])
print(list1)

list1.insert(4,'Sky')
print(list1)

[2, 3, 8, None, [1, 11, 22]]
[2, 3, 8, None, 'Sky', [1, 11, 22]]

list1.pop(2)
print(list1)

list1.remove(None)
print(list1)

[2, 3, None, 'Sky', [1, 11, 22]]
[2, 3, 'Sky', [1, 11, 22]]

(2) 串联和组合列表

# 用加号
[1,2,3]+[4,5,6]

[1, 2, 3, 4, 5, 6]

# 用extend，比用加号开销小
list1 = [1,2,3]
list2 = [range(10)]
for x in list2:
    list1.extend(x)
print(list1)

[1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

(3) 排序

a = [1, 8, 0, 9, 2, 3, 8]
a.sort()
a

[0, 1, 2, 3, 8, 8, 9]

b = ['saw','sky','john','appreciate','forest']
b.sort(key=len)
b

['saw', 'sky', 'john', 'forest', 'appreciate']

(4) bisect （二分查找）

import bisect
a = [1, 8, 0, 9, 2, 3, 8]
a.sort()
print(a)
bisect.bisect(a, 9) #返回9应该被插入的位置（在已有的9之后），并不是9的下标

[0, 1, 2, 3, 8, 8, 9]





7

bisect.insort(a, 4)
a

[0, 1, 2, 3, 4, 8, 8, 9]

(5) 切片

start:end

start:end:step

负数代表从后往前

seq = [1, 8, 0, 9, 2, 3, 8]
seq[2:5]

[0, 9, 2]

seq[-2:-1] = [4,9] #把3换成了4和9
seq

[1, 8, 0, 9, 2, 4, 9, 8]

seq[-2:] = [0,1]
seq

[1, 8, 0, 9, 2, 4, 0, 1]

seq[::2]

[1, 0, 2, 0]

*序列函数

(1) enumerate

跟踪当前项的序号，不需要手动命名一个count来计数了
使用字典更方便了，可以直接把键（key）和值（value）一一对应

import numpy as np
collection = np.random.randint(10,size=10)
print(collection)
mapping = {} #字典
for i, value in enumerate(collection):
    if i % 2 == 0:
        print(value)
        mapping[i]=value
print(mapping)

[4 7 9 0 9 0 6 8 2 2]
4
9
9
6
2
{0: 4, 2: 9, 4: 9, 6: 6, 8: 2}

(2) sorted （返回一个新的排好序的列表）

list1 = [1,8,0,9,2,3,8]
list2 = sorted(list1)
list1 is list2

False

(3) zip （将多个列表、元组或其他序列成对组合成一个元组列表）

seq1=['Sky','John','Neo']
seq2=[10,10,8]
zipped = zip(seq1, seq2)
out = list(zipped)
print(out)

[('Sky', 10), ('John', 10), ('Neo', 8)]

seq3 = [True, False]
zipped2 = zip(seq1, seq2, seq3)#可以处理多个！个数取决于最短的那个
out2 = list(zipped2)
print(out2)

[('Sky', 10, True), ('John', 10, False)]

for i, (a, b) in enumerate(zip(seq1, seq2)): #同时迭代多个序列，结合enumerate
    print('{0}: {1}, {2}'.format(i,a,b))

0: Sky, 10
1: John, 10
2: Neo, 8

pitchers = [('Sky', 'Wu'), 
            ('John', 'Huang'), 
            ('Lily','Zhang')]
first_name, last_name = zip(*pitchers) #类似解压缩的功能
print(first_name)
print(last_name)

('Sky', 'John', 'Lily')
('Wu', 'Huang', 'Zhang')

(4) reversed（从后向前迭代）

list(reversed(range(10)))

[9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

字典

其他名称：哈希映射或关联数组。
它是键值对的⼤⼩可变集合，键和值都是Python对象

（1）创建方法

empty_dict={}

d1 = {'a':'some value','b':[1,2,3,4]}

d1

{'a': 'some value', 'b': [1, 2, 3, 4]}

（2）访问方法

d1[7] = 'an integer'
d1

{'a': 'some value', 'b': [1, 2, 3, 4], 7: 'an integer'}

d1['a']

'some value'

（3）是否包含？

'b' in d1

True

（4）删除

del
pop （返回一个值，同时也删除键）

d1['c']='some value'
d1['dummy']='another value'

d1

{'a': 'some value',
 'b': [1, 2, 3, 4],
 7: 'an integer',
 'c': 'some value',
 'dummy': 'another value'}

del d1['c']
d1

{'a': 'some value',
 'b': [1, 2, 3, 4],
 7: 'an integer',
 'dummy': 'another value'}

ret = d1.pop('dummy')

d1

{'a': 'some value', 'b': [1, 2, 3, 4], 7: 'an integer'}

ret

'another value'

（5）取键或值

list(d1.keys())

['a', 'b', 7]

list(d1.values())

['some value', [1, 2, 3, 4], 'an integer']

（6）与另一个字典融合（update）

d2 = {'Chinese':'89','English':'140'}

d1.update(d2)
d1

{'a': 'some value',
 'b': [1, 2, 3, 4],
 7: 'an integer',
 'Chinese': '89',
 'English': '140'}

（7）用序列创建字典

mapping = {}
key_list = ['Sky', 'John', 'Victoria', 'Tom']
value_list = [100,90,80,70]
for key, value in zip(key_list, value_list):
    mapping[key] = value
print(mapping)

{'Sky': 100, 'John': 90, 'Victoria': 80, 'Tom': 70}

mapping2 = dict(zip(key_list,value_list))
print(mapping2)

{'Sky': 100, 'John': 90, 'Victoria': 80, 'Tom': 70}

（8）默认值（使用get函数）

dic1 = mapping2.copy()

dic1

{'Sky': 100, 'John': 90, 'Victoria': 80, 'Tom': 70}

if 'Sky' in dic1:
    value = dic1['Sky']
else:
    value = 90

同上面一样的写法

value = dic1.get('Sky',90)
print(value)

（9）分类（使用setdefault函数）

words = ['apple','ace','bat','bar','cat','catch','dog','doom']
by_letter = {}
for word in words:
    letter = word[0]
    if letter not in by_letter:
        by_letter[letter]=[word]
    else:
        by_letter[letter].append(word)

by_letter

{'a': ['apple', 'ace'],
 'b': ['bat', 'bar'],
 'c': ['cat', 'catch'],
 'd': ['dog', 'doom']}

同上面一样的写法

by_letter2 = {}
for word in words:
    letter=word[0]
    by_letter2.setdefault(letter,[]).append(word)

by_letter2

{'a': ['apple', 'ace'],
 'b': ['bat', 'bar'],
 'c': ['cat', 'catch'],
 'd': ['dog', 'doom']}

另一种写法：使用collections模块的defaultdict

from collections import defaultdict
by_letter3 = defaultdict(list)
for word in words:
    by_letter3[word[0]].append(word)

by_letter3

defaultdict(list,
            {'a': ['apple', 'ace'],
             'b': ['bat', 'bar'],
             'c': ['cat', 'catch'],
             'd': ['dog', 'doom']})

（10）有效的键的类型

键
- 不可变的标量类型
- 元组（因为元组也不可变）
可以用hash()检验是否是可哈希（可用作字典的键）

hash('Sky')

-5444718028939046860

hash([1,2,3])

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-65-35e31e935e9e> in <module>
----> 1 hash([1,2,3])


TypeError: unhashable type: 'list'

hash(tuple([1,2,3]))

529344067295497451

Set

特殊的字典：只有键没有值

（1）创建

set函数
花括号

# set
set([1,1,2,2,2,2,3,4,5,5,5,5])

{1, 2, 3, 4, 5}

# {}
{1,2,1,1,2,3,4,4,4,5,5,5}

{1, 2, 3, 4, 5}

（2）交集，并集，差分，对称差

a = {1,2,3,4,5}
b = {3,4,5,6,7,8}

(a) 并集的两种方式

a.union(b)

{1, 2, 3, 4, 5, 6, 7, 8}

a | b

{1, 2, 3, 4, 5, 6, 7, 8}

(b)交集的两种方式

a.intersection(b)

{3, 4, 5}

a & b

{3, 4, 5}

(c) 常用方式

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-04D2iQhh-1602751803059)(attachment:image.png)]

(5) 用元组更新

my_data = [1,2,3,4]

my_set = {my_data}

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-75-cc3f4f7e7ff8> in <module>
----> 1 my_set = {my_data}


TypeError: unhashable type: 'list'

my_set = {tuple(my_data)}

my_set

{(1, 2, 3, 4)}

（6）检测一个集合是否是另一个的子集（issubset）或父集（issuperset）

a_set = {1,2,3,4,5}

{1,2,3}.issubset(a_set)

True

{1,2,3,4,5,6,7}.issuperset(a_set)

True

列表、集合和字典推导式

可以简单进行筛选

1. 列表

[expr for val in collection if condition]

等同于

result = [] #新建一个列表
for val in collection:
    if condition:
        result.append(expr)

字典和集合与列表类似，写法如下：
2. 字典

{key_expr : value_expr for value in collection if condition}

3. 集合

{expr for value in collection if condition}

strings = ['apple','banana','cat','dog','inform','perform']

[x.upper() for x in strings if len(x) > 3]

['APPLE', 'BANANA', 'INFORM', 'PERFORM']

strings2 = {'a':'apple','b':'banana','c':'cat','d':'dog','e':'inform','f':'perform'}

{key:strings2[key] for key in strings2 if key == strings2[key][0]} #for循环读取字典返回的是key

{'a': 'apple', 'b': 'banana', 'c': 'cat', 'd': 'dog'}

strings3 = {'apple','banana','cat','dog','inform','perform'}

{value for value in strings3 if len(value)<5}

{'cat', 'dog'}

# 普通方法得到长度
{len(x) for x in strings}

{3, 5, 6, 7}

* 使用map只得到长度

set(map(len,strings))

{3, 5, 6, 7}

* 创建一个单词和单词序号的映射表

loc_mapping = {x : i for i, x in enumerate(strings3)}

loc_mapping

{'banana': 0, 'inform': 1, 'perform': 2, 'apple': 3, 'cat': 4, 'dog': 5}

嵌套列表推导式

嵌套好几个for循环

all_data = [['John', 'Emily', 'Michael', 'Mike'],
            ['Maria','Juan','Steven','Javier']]

# 普通方法
names_of_interest = []
for names in all_data:
    enough_es = [name for name in names if name.count('e') >= 2]
    names_of_interest.extend(enough_es)

names_of_interest

['Steven']

# 嵌套列表推导式
result = [name for names in all_data for name in names if name.count('e') >= 2]

result

['Steven']

some_tuples = [(1,2,3),(4,5,6),(7,8,9)]

flattened = [x for tuples in some_tuples for x in tuples ]

flattened

[1, 2, 3, 4, 5, 6, 7, 8, 9]

Function

函数用def声明，用return返回值
可以返回多个值（其实就是返回一个tuple，然后将tuple的值分配给不同的variable）
- 也可以返回字典
```
def f():
    """Return the three fixed values 5, 6 and 7 in a dict keyed 'a', 'b', 'c'."""
    a, b, c = 5, 6, 7
    return dict(a=a, b=b, c=c)
```
函数也是对象！
- 将函数扔进一个列表，就可以用for循环遍历，下面有例子
- 函数也可以作为一个输入，也就是其他函数的参数
lambda函数
柯里化（currying）
- 通过“部分参数应用”(partial argument application) 从现有函数派生出新函数的技术
生成器

def f():
    """Return the three fixed values 5, 6 and 7 in a dict keyed 'a', 'b', 'c'."""
    a, b, c = 5, 6, 7
    return dict(a=a, b=b, c=c)

f()

{'a': 5, 'b': 6, 'c': 7}

函数也是对象

states = ['  Alabama     ', 'Georgia!', 'Georgi#a', 'geor??gia', 'FLORIDA','south carolina##','West virginia']

# 普通方法
import re
def clean_strings(strings):
    """Return a new list with a cleaned-up copy of every string in ``strings``.

    Each value is stripped of leading/trailing whitespace, has the
    characters ``!``, ``#`` and ``?`` removed, and is then converted to
    title case (first letter of each word upper-case, the rest lower-case).
    """
    return [
        # Order matters: strip first, then drop punctuation, then title-case.
        re.sub('[!#?]', '', raw.strip()).title()
        for raw in strings
    ]

clean_strings(states)

['Alabama',
 'Georgia',
 'Georgia',
 'Georgia',
 'Florida',
 'South Carolina',
 'West Virginia']

# 利用函数也是对象的性质
def remove_punctuation(value):
    """Return ``value`` with every ``?``, ``#`` and ``!`` character deleted."""
    punctuation = re.compile('[?#!]')
    return punctuation.sub('', value)

clean_ops = [str.strip, remove_punctuation, str.title]

def clean_strings2(strings, ops):
    """Apply every callable in ``ops``, in order, to each string in ``strings``.

    Returns a new list holding the fully transformed values. Each callable
    takes one value and returns the transformed value.
    """
    def _run_pipeline(text):
        # Feed the output of each operation into the next one.
        for op in ops:
            text = op(text)
        return text

    return [_run_pipeline(item) for item in strings]

clean_strings2(states, clean_ops)

['Alabama',
 'Georgia',
 'Georgia',
 'Georgia',
 'Florida',
 'South Carolina',
 'West Virginia']

函数作为其他函数的参数

for x in map(remove_punctuation, states):
    print(x)

  Alabama     
Georgia
Georgia
georgia
FLORIDA
south carolina
West virginia

lambda函数

def short_function(x):
    """Return ``x`` multiplied by 2."""
    doubled = x * 2
    return doubled

equiv_anon = lambda x: x * 2

def apply_to_list(some_list, f):
    """Return a new list containing ``f(x)`` for every ``x`` in ``some_list``.

    This exists to demonstrate passing a lambda as an argument; for a fixed
    operation a plain comprehension such as ``[x*2 for x in some_list]``
    would do the same job.
    """
    return list(map(f, some_list))

ints = [4,0,1,5,6]
apply_to_list(ints, lambda x: x * 2)

[8, 0, 2, 10, 12]

strings = ['foo', 'card', 'bar', 'aaaa', 'abab']

strings.sort(key=lambda x: len(set(list(x)))) # list(x) will transform the string in to a list. 
                                              # For example, 'foo' -> ['f', 'o', 'o']

strings

['aaaa', 'foo', 'abab', 'bar', 'card']

柯里化 (currying)

def add_num(x, y):
    """Return the result of ``x + y``."""
    total = x + y
    return total

add_five = lambda y : add_num(5, y)

add_five(1)

生成器（yield关键字）

def squares(n=10):
    """Generate the squares 1, 4, 9, ... up to ``n ** 2``.

    Nothing runs (including the message below) until the first value is
    requested from the generator.
    """
    upper = n ** 2
    print('Generating squares from 1 to {0}'.format(upper))
    yield from (k ** 2 for k in range(1, n + 1))

gen = squares() # gen就是一个生成器了

gen #看output的generator object

<generator object squares at 0x000001FC31DC0270>

for x in gen:
    print(x, end=' ')

Generating squares from 1 to 100
1 4 9 16 25 36 49 64 81 100

# 更为简洁的写法
gen2 = (x ** 2 for x in range(10))

gen2

<generator object <genexpr> at 0x000001FC31DDC040>

生成器表达式也可以取代列表推导式，作为函数参数

sum(x ** 2 for x in range(3)) # 0**2 + 1**2 + 2**2 = 5

dict((i,i**2) for i in range(5))

{0: 0, 1: 1, 2: 4, 3: 9, 4: 16}

itertools模块

常用函数
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Ujs3bw2i-1602751803065)(attachment:image.png)]

import itertools

first_letter = lambda x: x[0]

first_letter

<function __main__.<lambda>(x)>

names = ['Alan', 'Adam','Wes','Will','Albert','Steven']

for letter, lnames in itertools.groupby(names, key=first_letter):
    print(letter,list(lnames))

A ['Alan', 'Adam']
W ['Wes', 'Will']
A ['Albert']
S ['Steven']

错误和异常处理

def attempt_float(x):
    """Try to convert ``x`` to a float; return ``x`` unchanged on failure.

    Any ordinary error raised by ``float(x)`` (``ValueError``, ``TypeError``,
    ...) is swallowed and the original value is handed back.
    """
    try:
        return float(x)
    # ``except Exception`` rather than a bare ``except:`` so that
    # KeyboardInterrupt / SystemExit are not silently swallowed.
    except Exception:
        return x

attempt_float('1.2345')

1.2345

attempt_float('something')

'something'

可能只处理某个类型的错误，别的不处理

def attempt_float2(x):  # convert x to float, falling back to x itself on ValueError
    try:
        return float(x)
    except ValueError:  # only ValueError is handled; other errors (e.g. TypeError) propagate
        return x

attempt_float2((1,2))

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-190-32826742a5f3> in <module>
----> 1 attempt_float2((1,2))


<ipython-input-189-5aff4a682a3c> in attempt_float2(x)
      1 def attempt_float2(x):
      2     try:
----> 3         return float(x)
      4     except ValueError:
      5         return x


TypeError: float() argument must be a string or a number, not 'tuple'

可以用元组包含多个异常

def attempt_float3(x):
    """Convert ``x`` to a float, or return ``x`` unchanged.

    ``x`` is returned as-is when ``float(x)`` raises ``ValueError`` or
    ``TypeError``; any other exception propagates to the caller.
    """
    try:
        converted = float(x)
    except (ValueError, TypeError):
        converted = x
    return converted

用finally使得无论try的代码是否成功都可以执行某段代码

f = open(path, 'w')
try:
    write_to_file(f)
finally:
    f.close()

else中的代码只会在try代码块成功（没有抛出异常）的情况下执行

f = open(path,'w')
try:
    write_to_file(f)
except:
    print('Failed')
else:
    print('Succeeded')
finally:
    f.close()

IPython的异常

	%run	examples/ipython_bug.py

使用%run一个脚本或一条语句时抛出异常，IPython默认打印完整的调用栈（traceback）
%xmode控制打印信息的数量
%debug和%pdb magics可以用来调试

文件和操作系统

path='C:/Users/Sky/Desktop/test2.txt'

f=open(path,encoding='utf-8')

for line in f:
    print(line)

He is a handsome guy

Her is the best computer programmer

It will learn machine learning very fast and well

f.close()

lines = [x.rstrip()for x in open(path,encoding='utf-8')]

lines

['He is a handsome guy',
 'Her is the best computer programmer',
 'It will learn machine learning very fast and well']

使用with语句更好！因为with可以自动关闭文件

with open(path,encoding='utf-8') as f:
    for line in f:
        print(line)

He is a handsome guy

Her is the best computer programmer

It will learn machine learning very fast and well

文件的读/写模式
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-SKGwcQEm-1602751803067)(attachment:image.png)]

读方法：

read：从文件返回字符
seek：将文件位置移动到指定的位置，eg.seek(4)
tell：给出当前读取到的位置

写方法：

write或writelines
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-ymnEFBBc-1602751803069)(attachment:image.png)]

f = open(path,encoding='utf-8')

f.read(10)

'He is a ha'

f2 = open(path,'rb')

f2.read(10)

b'He is a ha'

f.tell()

f2.tell()

# 用sys模块检查默认的编码
import sys

sys.getdefaultencoding()

'utf-8'

f.seek(2)

f.read(3)

' is'

f.close()
f2.close()

lines=[x for x in open(path)]

lines

['He is a handsome guy\n',
 'Her is the best computer programmer\n',
 'It will learn machine learning very fast and well']

# Read the current contents first, then append them back to the same file.
# Reading lazily from the file while appending to it (as a single
# expression) leaves the read handle unclosed and makes the result depend
# on buffering, so the two steps are kept separate.
with open(path, encoding='utf-8') as source:
    existing_lines = source.readlines()
with open(path, 'a', encoding='utf-8') as handle:
    handle.writelines(existing_lines)

with open(path, encoding='utf-8') as f:
    lines = f.readlines()

lines

['He is a handsome guy\n',
 'Her is the best computer programmer\n',
 'It will learn machine learning very fast and wellHe is a handsome guy\n',
 'Her is the best computer programmer\n',
 'It will learn machine learning very fast and well']