数据结构和算法
解压序列赋值给多变量
>>> data = ['ACME', 50, 91.1, (2012, 12, 21)]
>>> name, shares, price, date = data
>>> name
'ACME'
>>> date
(2012, 12, 21)
>>> name, shares, price, (year, mon, day) = data
>>> year
2012
>>> _, shares, price, _ = data #解压一部分丢弃一部分时可使用任意变量占位
>>> price
91.1
>>> s = 'hello' #除了列表、元组、也可以用于字符串、文件对象、迭代器生成器
>>> a, b, c, d, e = s
>>> b
'e'
>>>
解压可迭代对象赋值给多变量
>>> record = ('Dave', 'dave@xxx.com', '0433-xxxx', '04333-yyyy')
>>> name, email, *phone = record
>>> name
'Dave'
>>> phone
['0433-xxxx', '04333-yyyy'] #此方法解压出的phone永远是列表类型
>>>
>>> line = 'nobody:*:-2:-2:Unprivileged User:/var/empty:/usr/bin/false'
>>> uname, *fields, homedir, sh = line.split(':') #字符串分割
>>> uname
'nobody'
>>> homedir
'/var/empty'
>>> sh
'/usr/bin/false'
>>>
>>> record = ('ACME', 50, 123.45, (12, 18, 2012))
>>> name, *_, (*_, year) = record
>>> name
'ACME'
>>> year
2012
>>>
保留最后N个元素
>>> from collections import deque
>>> q = deque(maxlen=3) #新建固定大小的队列
>>> q.append(1)
>>> q.append(2)
>>> q.append(3)
>>> q
deque([1, 2, 3], maxlen=3)
>>> q.append(4) #新元素加入且队列已满,则移除最先进入队列的元素
>>> q = deque() #若不设置大小,则新建一个无限大小的队列
>>> q.append(1)
>>> q.append(2)
>>> q.appendleft(3)
>>> q
deque([3, 1, 2])
>>> q.pop()
2
>>> q
deque([3, 1])
>>> q.popleft()
3
>>> q
deque([1])
>>>
查找最大或最小的N个元素
>>> import heapq
>>> nums = [1, 8, 2, 23, 7 , -4, 18, 23, 42, 37, 2]
>>> heapq.nlargest(3, nums) #最大的三个元素
[42, 37, 23]
>>> heapq.nsmallest(3, nums) #最小的三个元素
[-4, 1, 2]
>>> portfolio = [{'name': 'xxx', 'price': 10},{'name': 'yyy', 'price': 15}, {'name': 'zzz', 'price': 20}]
>>> cheap = heapq.nsmallest(1, portfolio, key=lambda s: s['price'])
>>> cheap #根据关键字进行查找最小的元素
[{'name': 'xxx', 'price': 10}]
>>> expensive = heapq.nlargest(1, portfolio, key=lambda s: s['price'])
>>> expensive #根据关键字进行查找最大的元素
[{'name': 'zzz', 'price': 20}]
>>>
heapq的底层实现是堆:heapify()会把列表原地调整为小顶堆(并不是完整排序),heap[0]永远是最小的元素,heappop()每次弹出当前最小值
>>> heap = list(nums)
>>> heapq.heapify(heap)
>>> heap
[-4, 2, 1, 23, 7, 2, 18, 23, 42, 37, 8]
>>> heapq.heappop(heap)
-4
>>> heapq.heappop(heap)
1
>>> heapq.heappop(heap)
2
>>>
1.如果要查找的元素个数相对较小时,可以使用nlargest()和nsmallest()
2.如果仅仅想查找唯一最大最小值时,可以使用min()和max()
3.如果要查找元素个数与集合大小接近时,可以先排序再切片,例如sorted(items)[:N]或sorted(items)[-N:]
实现一个优先级队列
class PriorityQueue:
    """Priority queue that always pops the highest-priority item first.

    Items pushed with the same priority are popped in the order they were
    pushed. The implementation relies on the ``heapq`` module being
    available in the enclosing scope.
    """

    def __init__(self):
        # Heap of (-priority, index, item) entries maintained by heapq.
        self._queue = []
        # Running insertion counter, used as a tie-breaker between entries.
        self._index = 0

    def push(self, item, priority):
        """Add *item* to the queue with the given numeric *priority*."""
        # heapq is a min-heap, so the priority is negated to make the
        # largest priority come out first (ordering from high to low).
        # The index makes equal-priority items pop in insertion order and,
        # being unique, keeps tuple comparison from ever reaching *item*,
        # which may not support ordering at all.
        heapq.heappush(self._queue, (-priority, self._index, item))
        self._index += 1

    def pop(self):
        """Remove and return the item with the highest priority.

        Raises:
            IndexError: if the queue is empty.
        """
        # Entries are (-priority, index, item); the item is the last field.
        return heapq.heappop(self._queue)[-1]
class Item:
    """Minimal object that carries a name and has a readable repr."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        # !r quotes the name, so Item('foo') prints as Item('foo').
        return 'Item({!r})'.format(self.name)
def main():
    """Push four items with distinct priorities and print them as they pop."""
    q = PriorityQueue()
    q.push(Item('foo1'), 1)
    q.push(Item('foo3'), 3)
    q.push(Item('foo4'), 4)
    q.push(Item('foo2'), 2)
    # Pops come back highest priority first: foo4, foo3, foo2, foo1.
    print(q.pop())
    print(q.pop())
    print(q.pop())
    print(q.pop())
========================================================
Output:
Item('foo4')
Item('foo3')
Item('foo2')
Item('foo1')
字典中的键映射多个值
>>> from collections import defaultdict
>>> d = defaultdict(list)
>>> d['a'].append(1)
>>> d['a'].append(2)
>>> d['b'].append(4)
>>> d
defaultdict(<class 'list'>, {'a': [1, 2], 'b': [4]})
>>> d = defaultdict(set)
>>> d['a'].add(1)
>>> d['a'].add(2)
>>> d['b'].add(4)
>>> d
defaultdict(<class 'set'>, {'a': {1, 2}, 'b': {4}})
>>>
>>> d = defaultdict(list) #代码更简洁
>>> for key, value in pairs:
d[key].append(value)
字典排序
为了能控制一个字典中元素的顺序,可以使用collections中的OrderedDict
>>> from collections import OrderedDict
>>> d = OrderedDict()
>>> d['xxx'] = 1
>>> d['yyy'] = 2
>>> d['zzz'] = 3
>>> d
OrderedDict([('xxx', 1), ('yyy', 2), ('zzz', 3)])
OrderedDict的大小是普通字典的两倍,因为内部需要维护另一个链表,每次当新元素插入时,它会被放到链表的尾部。
字典运算
求最小值、最大值、排序等
使用zip()将键和值进行反转
>>> prices = {
'xxx': 45.23,
'yyy': 612.78,
'zzz': 205.55,
'mmm': 27.20,
'nnn': 10.75
}
>>> min_price = min(zip(prices.values(), prices.keys()))
>>> min_price
(10.75, 'nnn')
>>> max_price = max(zip(prices.values(), prices.keys()))
>>> max_price
(612.78, 'yyy')
>>> prices_sorted = sorted(zip(prices.values(), prices.keys()))
>>> prices_sorted
[(10.75, 'nnn'), (27.2, 'mmm'), (45.23, 'xxx'), (205.55, 'zzz'), (612.78, 'yyy')]
>>>
>>> prices = {'xxx': 45.23, 'zzz': 45.23} #值相同时,才会比较键
>>> min(zip(prices.values(), prices.keys()))
(45.23, 'xxx')
>>> max(zip(prices.values(), prices.keys()))
(45.23, 'zzz')
查找两字典的相同点
>>> a = {
'x': 1,
'y': 2,
'z': 3
}
>>> b = {
'w': 10,
'x': 11,
'y': 2
}
>>> a.keys() & b.keys()
{'x', 'y'}
>>> a.keys() - b.keys()
{'z'}
>>> a.items() & b.items()
{('y', 2)}
>>> c = {key:a[key] for key in a.keys() - {'z', 'w'}} #用现有字典删除几个指定键构造新字典
>>> c
{'x': 1, 'y': 2}
>>>
删除序列相同元素并保持顺序
def dedupe(items, key=None):
    """Yield the elements of *items* in order, skipping duplicates.

    Args:
        items: iterable to filter.
        key: optional function mapping an element to the hashable value
            used for duplicate detection. When None, the element itself is
            used and therefore must be hashable.

    Yields:
        Each element whose comparison value has not been seen before.
    """
    seen = set()
    for item in items:
        # Compare on the derived value so unhashable elements (e.g. dicts)
        # can still be de-duplicated through a suitable key function.
        val = item if key is None else key(item)
        if val not in seen:
            yield item
            seen.add(val)
def main():
    """Print a list of dicts de-duplicated by two different key functions."""
    a = [{'x': 1, 'y': 2}, {'x': 1, 'y': 3}, {'x': 1, 'y': 2}, {'x': 2, 'y': 4}]
    # Dicts are unhashable, so a key function supplies a hashable value.
    # Keyed on (x, y): only the exact repeat {'x': 1, 'y': 2} is dropped.
    ret = list(dedupe(a, key=lambda d: (d['x'], d['y'])))
    print(ret)
    # Keyed on x alone: only the first dict for each x value is kept.
    ret = list(dedupe(a, key=lambda d: d['x']))
    print(ret)


if __name__ == '__main__':
    main()
Output:
[{'x': 1, 'y': 2}, {'x': 1, 'y': 3}, {'x': 2, 'y': 4}] #消除了重复项 {'x': 1, 'y': 2}
[{'x': 1, 'y': 2}, {'x': 2, 'y': 4}] #消除了所有'x': 1的重复项
命名切片
>>> record = '0123456789011223344556677889900'
>>> SHARES = slice(10,15) #将切片命名为SHARES
>>> PRICE = slice(16,20) #将切片命名为PRICE
>>> SHARES
slice(10, 15, None)
>>> record[SHARES]
'01122'
>>> PRICE
slice(16, 20, None)
>>> record[PRICE]
'3445'
>>> a = slice(5, 50, 2) #定义切片的开始,停止和步长
>>> a
slice(5, 50, 2)
>>> a.start
5
>>> a.stop
50
>>> a.step
2
>>> s = 'HelloWorld'
>>> a.indices(len(s)) #按序列长度收敛切片边界,返回(start, stop, step)元组,不会修改a本身:(5, 50, 2) -> (5, 10, 2)
(5, 10, 2)
>>> for i in range(*a.indices(len(s))):
print(s[i])
W
r
d
序列中出现次数最多的元素
>>> words = [
'look', 'into', 'my', 'eyes', 'look', 'into','my','eyes',
'the', 'eyes','the','eyes','the','eyes','not','around','the',
'eyes','dont','look', 'around','the','eyes','look','into',
'my','eyes','you','under'
]
>>> from collections import Counter
>>> word_counts = Counter(words) #整合每个元素出现的次数
>>> word_counts
Counter({'eyes': 8, 'the': 5, 'look': 4, 'into': 3, 'my': 3, 'around': 2, 'not': 1, 'dont': 1, 'you': 1, 'under': 1})
>>> word_counts.most_common(3) #出现频率最高的3个元素
[('eyes', 8), ('the', 5), ('look', 4)]
>>> word_counts['look']
4
>>> word_counts['look'] += 1 #可以通过加法手动增加计数
>>> word_counts['look']
5
>>> morewords = ['why', 'are','you','not','looking','in','my','eyes']
>>> word_counts.update(morewords) #也可以通过update方法追加元素计数
>>> word_counts
Counter({'eyes': 9, 'look': 5, 'the': 5, 'my': 4, 'into': 3, 'not': 2, 'around': 2, 'you': 2, 'dont': 1, 'under': 1, 'why': 1, 'are': 1, 'looking': 1, 'in': 1})
>>> a = Counter(words)
>>> b = Counter(morewords)
>>> a
Counter({'eyes': 8, 'the': 5, 'look': 4, 'into': 3, 'my': 3, 'around': 2, 'not': 1, 'dont': 1, 'you': 1, 'under': 1})
>>> b
Counter({'why': 1, 'are': 1, 'you': 1, 'not': 1, 'looking': 1, 'in': 1, 'my': 1, 'eyes': 1})
>>> c = a + b #Counter实例可以进行数学运算
>>> c
Counter({'eyes': 9, 'the': 5, 'look': 4, 'my': 4, 'into': 3, 'not': 2, 'around': 2, 'you': 2, 'dont': 1, 'under': 1, 'why': 1, 'are': 1, 'looking': 1, 'in': 1})
>>> d = a - b
>>> d
Counter({'eyes': 7, 'the': 5, 'look': 4, 'into': 3, 'my': 2, 'around': 2, 'dont': 1, 'under': 1})
>>>
通过某个关键字排序字典列表
>>> rows = [
{'fname':'Brian', 'lname': 'Jones', 'uid': 1003},
{'fname':'David', 'lname': 'Beazley', 'uid': 1002},
{'fname':'John', 'lname': 'Cleese', 'uid': 1001},
{'fname':'Big', 'lname': 'Jones', 'uid': 1004}
]
>>> from operator import itemgetter
>>> rows_by_fname = sorted(rows, key=itemgetter('fname')) #使用itemgetter()
>>> rows_by_fname
[{'fname': 'Big', 'lname': 'Jones', 'uid': 1004}, {'fname': 'Brian', 'lname': 'Jones', 'uid': 1003}, {'fname': 'David', 'lname': 'Beazley', 'uid': 1002}, {'fname': 'John', 'lname': 'Cleese', 'uid': 1001}]
>>> rows_by_uid = sorted(rows, key=itemgetter('uid'))
>>> rows_by_uid
[{'fname': 'John', 'lname': 'Cleese', 'uid': 1001}, {'fname': 'David', 'lname': 'Beazley', 'uid': 1002}, {'fname': 'Brian', 'lname': 'Jones', 'uid': 1003}, {'fname': 'Big', 'lname': 'Jones', 'uid': 1004}]
>>> rows_by_lfname = sorted(rows, key=itemgetter('lname', 'fname')) #itemgetter支持多个keys
>>> rows_by_lfname
[{'fname': 'David', 'lname': 'Beazley', 'uid': 1002}, {'fname': 'John', 'lname': 'Cleese', 'uid': 1001}, {'fname': 'Big', 'lname': 'Jones', 'uid': 1004}, {'fname': 'Brian', 'lname': 'Jones', 'uid': 1003}]
>>> rows_by_fname = sorted(rows, key=lambda r: r['fname']) #itemgetter可以用lambda表达式代替,但是itemgetter运行快一些
>>> rows_by_fname
[{'fname': 'Big', 'lname': 'Jones', 'uid': 1004}, {'fname': 'Brian', 'lname': 'Jones', 'uid': 1003}, {'fname': 'David', 'lname': 'Beazley', 'uid': 1002}, {'fname': 'John', 'lname': 'Cleese', 'uid': 1001}]
>>> rows_by_lfname = sorted(rows, key=lambda r: (r['lname'],r['fname']))
>>> rows_by_lfname
[{'fname': 'David', 'lname': 'Beazley', 'uid': 1002}, {'fname': 'John', 'lname': 'Cleese', 'uid': 1001}, {'fname': 'Big', 'lname': 'Jones', 'uid': 1004}, {'fname': 'Brian', 'lname': 'Jones', 'uid': 1003}]
>>> min(rows, key=itemgetter('uid')) #同样适用于min和max
{'fname': 'John', 'lname': 'Cleese', 'uid': 1001}
>>> max(rows, key=itemgetter('uid'))
{'fname': 'Big', 'lname': 'Jones', 'uid': 1004}
>>>
排序不支持原生比较的对象
>>> class User:
def __init__(self, user_id):
self.user_id = user_id
def __repr__(self):
return 'User({})'.format(self.user_id)
>>> users = [User(23), User(3), User(99)]
>>> sorted(users, key=lambda u: u.user_id)
[User(3), User(23), User(99)]
>>> from operator import attrgetter
>>> sorted(users, key=attrgetter('user_id')) #attrgetter可以代替lambda,运行更快且可以多字段比较
[User(3), User(23), User(99)]
>>> min(users, key=attrgetter('user_id')) #同样适用于min和max
User(3)
>>> max(users, key=attrgetter('user_id'))
User(99)
通过某个字段记录分组
>>> rows = [
{'addr': 'xxx', 'date': 'xx/xx'},
{'addr': 'yyy', 'date': 'xx/yy'},
{'addr': 'zzz', 'date': 'xx/zz'},
{'addr': 'iii', 'date': 'xx/ii'},
{'addr': 'jjj', 'date': 'xx/zz'},
{'addr': 'kkk', 'date': 'xx/zz'},
{'addr': 'mmm', 'date': 'xx/xx'},
{'addr': 'nnn', 'date': 'xx/yy'},
]
>>> from operator import itemgetter
>>> from itertools import groupby
>>> rows.sort(key=itemgetter('date')) #必须先根据指定字段进行排序,才能做后续分组操作
>>> for date, items in groupby(rows, key=itemgetter('date')):
print(date)
for i in items:
print('', i)
xx/ii
{'addr': 'iii', 'date': 'xx/ii'}
xx/xx
{'addr': 'xxx', 'date': 'xx/xx'}
{'addr': 'mmm', 'date': 'xx/xx'}
xx/yy
{'addr': 'yyy', 'date': 'xx/yy'}
{'addr': 'nnn', 'date': 'xx/yy'}
xx/zz
{'addr': 'zzz', 'date': 'xx/zz'}
{'addr': 'jjj', 'date': 'xx/zz'}
{'addr': 'kkk', 'date': 'xx/zz'}
>>>
过滤序列元素
列表推导,如果输入非常大会占用大量内存
>>> mylist = [1, 4, -5, 10, -7, 2, 3, -1]
>>> [n for n in mylist if n > 0]
[1, 4, 10, 2, 3]
>>> [n for n in mylist if n < 0]
[-5, -7, -1]
生成器表达式
>>> pos = (n for n in mylist if n > 0)
>>> pos
<generator object <genexpr> at 0x000002CC5F62A7B0>
>>> a = list(pos)
>>> a
[1, 4, 10, 2, 3]
如果过滤时需要处理异常,可以将过滤代码放到函数中再使用filter()函数,filter()返回一个迭代器,需要用list()转换为列表类型
>>> value = ['1', '2', '-3', '-', '4', 'N/A', '5']
>>> def is_int(val):
try:
x = int(val)
return True
except ValueError:
return False
>>> ivals = list(filter(is_int, value))
>>> ivals
['1', '2', '-3', '4', '5']
还可以在过滤时转换数据
>>> mylist
[1, 4, -5, 10, -7, 2, 3, -1]
>>> import math
>>> [math.sqrt(n) for n in mylist if n > 0]
[1.0, 2.0, 3.1622776601683795, 1.4142135623730951, 1.7320508075688772]
>>>
还可以在过滤时将不符合条件的值用新值代替
>>> mylist
[1, 4, -5, 10, -7, 2, 3, -1]
>>> clip_neg = [n if n > 0 else 0 for n in mylist]
>>> clip_neg
[1, 4, 0, 10, 0, 2, 3, 0]
>>> clip_pos = [n if n < 0 else 0 for n in mylist]
>>> clip_pos
[0, 0, -5, 0, -7, 0, 0, -1]
itertools.compress()
先创建Boolean序列,指定哪些元素符合条件,然后compress()函数根据这个序列去选择对应位置为True的元素,compress()返回一个迭代器,需要用list()转换为列表类型
>>> addr = [
'xxx',
'yyy',
'zzz',
'iii',
'jjj',
'kkk',
'mmm',
'nnn',
]
>>> counts = [0, 3, 10, 4, 1, 7, 6, 1]
>>> from itertools import compress
>>> more5 = [n > 5 for n in counts]
>>> more5
[False, False, True, False, False, True, True, False]
>>> list (compress(addr, more5))
['zzz', 'kkk', 'mmm']
>>>
从字典中提取子集
字典推导
>>> prices = {
'xxx': 45.23,
'yyy': 612.78,
'zzz': 205.55,
'mmm': 37.20,
'nnn': 10.75
}
>>> p1 = {key: value for key, value in prices.items() if value > 200}
>>> p1
{'yyy': 612.78, 'zzz': 205.55}
>>> tech_names = {'xxx', 'yyy', 'zzz', 'iii'}
>>> p2 = {key: value for key, value in prices.items() if key in tech_names}
>>> p2
{'xxx': 45.23, 'yyy': 612.78, 'zzz': 205.55}
>>>
映射名称到序列元素
>>> from collections import namedtuple
>>> Subscriber = namedtuple('Subscriber', ['addr', 'joined'])
>>> sub = Subscriber('xxxxx', 'yyyyyy')
>>> sub
Subscriber(addr='xxxxx', joined='yyyyyy')
>>> sub.addr
'xxxxx'
>>> sub.joined
'yyyyyy'
>>> len(sub)
2
>>> addr, joined = sub
>>> addr
'xxxxx'
>>> joined
'yyyyyy'
>>>
用命名元组替代下标操作
下标操作:
>>> def compute_cost(records):
total = 0.0
for rec in records:
total += rec[1] * rec[2]
return total
命名元组操作:
>>> from collections import namedtuple
>>> Stock = namedtuple('Stock', ['name', 'shares', 'price'])
>>> def compute_cost(records):
total = 0.0
for rec in records:
s = Stock(*rec)
total += s.shares * s.price #用命名元组使代码表意更清晰
return total
命名元组可以作为字典的替代,比字典存储节省内存空间且高效,但命名元组不可更改
>>> Stock = namedtuple('Stock', ['name', 'shares', 'price'])
>>> s = Stock('xxx', 100, 123.45)
>>> s
Stock(name='xxx', shares=100, price=123.45)
>>> s.shares = 75
Traceback (most recent call last):
File "<pyshell#519>", line 1, in <module>
s.shares = 75
AttributeError: can't set attribute
可以使用_replace()方法:
>>> s = s._replace(shares = 75)
>>> s
Stock(name='xxx', shares=75, price=123.45)
>>>
转换并同时计算数据
>>> nums = [1, 2, 3, 4, 5]
>>> s = sum(x * x for x in nums)
>>> s
55
>>>
合并多个字典或映射
假设从两个字典中查找,先从a中找,如果找不到再从b中找
>>> a = {'x': 1, 'z': 3}
>>> b = {'y': 2, 'z': 4}
>>> from collections import ChainMap
>>> c = ChainMap(a, b)
>>> c['x'] #从a中找到的
1
>>> c['y'] #从b中找到的
2
>>> c['z'] #a和b中都有'z',返回的是第一个映射a中的值
3
>>> c['z'] = 10 #对ChainMap的更新、删除操作总是只影响第一个字典
>>> list(c.values())
[2, 10, 1]
>>> a
{'x': 1, 'z': 10}
>>> b
{'y': 2, 'z': 4}
>>>