11.切片
slice()
record = '....................100          .......513.25     ..........'
cost = int(record[20:32]) * float(record[40:48])
更Pythonic的方式是给切片命名,避免代码中到处出现含义不明的硬编码下标
SHARES = slice(20,32)
PRICE = slice(40,48)
cost = int(record[SHARES]) * float(record[PRICE])
>>> a = slice(5, 50, 2)
>>> a.start
5
>>> a.stop
50
>>> a.step
2
可以用indices(size)方法将切片映射到长度为size的序列上,返回的(start, stop, step)元组已按序列边界截断(例如这里的stop由50变为10),从而避免索引越界
>>> s = 'HelloWorld'
>>> a.indices(len(s))
(5, 10, 2)
12.找出序列中出现频率最高的元素
collections.Counter()
>>> words = [
... 'look', 'into', 'my', 'eyes', 'look', 'into', 'my', 'eyes',
... 'the', 'eyes', 'the', 'eyes', 'the', 'eyes', 'not', 'around', 'the',
... 'eyes', "don't", 'look', 'around', 'the', 'eyes', 'look', 'into',
... 'my', 'eyes', "you're", 'under'
... ]
>>> from collections import Counter
>>> word_counts = Counter(words)
>>> word_counts.most_common(3)
[('eyes', 8), ('the', 5), ('look', 4)]
>>> word_counts
Counter({'eyes': 8, 'the': 5, 'look': 4, 'into': 3, 'my': 3, 'around': 2, "you're": 1, "don't": 1, 'under': 1, 'not': 1})
如果需要再添加另一个序列的统计,可以用update()
>>> morewords = ['why','are','you','not','looking','in','my','eyes']
>>> word_counts.update(morewords)
>>> _  # 交互式终端中'_'保存最近一次显示的结果(update()返回None,不会覆盖它),在此即word_counts
Counter({'eyes': 9, 'the': 5, 'look': 4, 'my': 4, 'into': 3, 'not': 2, 'around': 2, "you're": 1, "don't": 1, 'in': 1, 'you': 1, 'looking': 1, 'are': 1, 'under': 1, 'why': 1})
对Counter()可以直接进行+、-运算:相加会合并计数,相减会扣除计数,并丢弃结果中计数不大于0的元素(如下面a-b结果中的'not')
>>> a = Counter(words)
>>> b = Counter(morewords)
>>> a
Counter({'eyes': 8, 'the': 5, 'look': 4, 'into': 3, 'my': 3, 'around': 2, "you're": 1, "don't": 1, 'under': 1, 'not': 1})
>>> b
Counter({'eyes': 1, 'looking': 1, 'are': 1, 'in': 1, 'not': 1, 'you': 1, 'my': 1, 'why': 1})
>>> a+b
Counter({'eyes': 9, 'the': 5, 'look': 4, 'my': 4, 'into': 3, 'not': 2, 'around': 2, "you're": 1, "don't": 1, 'in': 1, 'why': 1, 'looking': 1, 'are': 1, 'under': 1, 'you': 1})
>>> a-b
Counter({'eyes': 7, 'the': 5, 'look': 4, 'into': 3, 'my': 2, 'around': 2, "you're": 1, "don't": 1, 'under': 1})
13.对含有公共key的dict排序
operator.itemgetter()
>>> rows = [
... {'fname':'Brian', 'lname': 'Jones', 'uid': 1003},
... {'fname':'David', 'lname': 'Beazley', 'uid': 1002},
... {'fname':'John', 'lname': 'Cleese', 'uid': 1001},
... {'fname':'Big', 'lname': 'Jones', 'uid': 1004}
... ]
>>> from operator import itemgetter
>>> sorted(rows, key=itemgetter('uid'))
[{'lname': 'Cleese', 'uid': 1001, 'fname': 'John'}, {'lname': 'Beazley', 'uid': 1002, 'fname': 'David'}, {'lname': 'Jones', 'uid': 1003, 'fname': 'Brian'}, {'lname': 'Jones', 'uid': 1004, 'fname': 'Big'}]
#itemgetter也支持多个参数
>>> sorted(rows, key=itemgetter('lname','fname'))
[{'lname': 'Beazley', 'uid': 1002, 'fname': 'David'}, {'lname': 'Cleese', 'uid': 1001, 'fname': 'John'}, {'lname': 'Jones', 'uid': 1004, 'fname': 'Big'}, {'lname': 'Jones', 'uid': 1003, 'fname': 'Brian'}]
当然,也可以用lambda表达式代替itemgetter()作为key
>>> sorted(rows, key=lambda r: r['fname'])
[{'lname': 'Jones', 'uid': 1004, 'fname': 'Big'}, {'lname': 'Jones', 'uid': 1003, 'fname': 'Brian'}, {'lname': 'Beazley', 'uid': 1002, 'fname': 'David'}, {'lname': 'Cleese', 'uid': 1001, 'fname': 'John'}]
>>> sorted(rows, key=lambda r: (r['lname'],r['fname']))
[{'lname': 'Beazley', 'uid': 1002, 'fname': 'David'}, {'lname': 'Cleese', 'uid': 1001, 'fname': 'John'}, {'lname': 'Jones', 'uid': 1004, 'fname': 'Big'}, {'lname': 'Jones', 'uid': 1003, 'fname': 'Brian'}]
但是itemgetter()方法通常运行会更快
14.对象的排序
operator.attrgetter
>>> class User(object):
...     def __init__(self, user_id):
...         self.user_id = user_id
...     def __repr__(self):
...         return 'User({})'.format(self.user_id)
...
>>> users = [User(23), User(3), User(99)]
>>> users
[User(23), User(3), User(99)]
常用方式
>>> sorted(users, key=lambda u: u.user_id)
[User(3), User(23), User(99)]
还有另一种选择
>>> from operator import attrgetter
>>> sorted(users, key=attrgetter('user_id'))
[User(3), User(23), User(99)]
和itemgetter()相似,attrgetter()方法通常运行会更快
15.基于某个域将记录分组
itertools.groupby()
rows = [
{'address':'5412 CLARK', 'date': '07/01/2012'},
{'address':'5148 CLARK', 'date': '07/04/2012'},
{'address':'5800 58TH', 'date': '07/02/2012'},
{'address':'2122 CLARK', 'date': '07/03/2012'},
{'address':'5645 RAVENSWOOD', 'date': '07/02/2012'},
{'address':'1060 ADDISON', 'date': '07/02/2012'},
{'address':'4801 BROADWAY', 'date': '07/01/2012'},
{'address':'1039 GRANVILLE', 'date': '07/04/2012'},
]
>>> from itertools import groupby
>>> rows.sort(key=itemgetter('date'))
>>> for date, items in groupby(rows, key=itemgetter('date')):
...     print date
...     for i in items:
...         print i
...
07/01/2012
{'date': '07/01/2012', 'address': '5412 CLARK'}
{'date': '07/01/2012', 'address': '4801 BROADWAY'}
07/02/2012
{'date': '07/02/2012', 'address': '5800 58TH'}
{'date': '07/02/2012', 'address': '5645 RAVENSWOOD'}
{'date': '07/02/2012', 'address': '1060 ADDISON'}
07/03/2012
{'date': '07/03/2012', 'address': '2122 CLARK'}
07/04/2012
{'date': '07/04/2012', 'address': '5148 CLARK'}
{'date': '07/04/2012', 'address': '1039 GRANVILLE'}
由于groupby()只会把连续出现且key相同的元素归为一组,所以必须先按同一个key对数据排序,否则相同key的记录会被拆成多个组
也可以采用1.6节的方法defaultdict()
from collections import defaultdict
rows_by_date = defaultdict(list)
for row in rows:
    rows_by_date[row['date']].append(row)