数据分析中常用的Python技巧

数据分析中常用的Python技巧

1. 条件表达式

import math

def get_log(x):
    # 普通写法
    if x > 0:
        y = math.log(x)
    else:
        y = float('nan')
    return y
x = 5
log_val1 = get_log(x)
# 使用条件表达式
log_val2 = math.log(x) if x > 0 else float('nan')

print(log_val1)
print(log_val2)
1.6094379124341003
1.6094379124341003

2. 列表推导式

print('找出1000内的偶数(for循环):')
l1 = []
for i in range(1000):
    if i % 2 == 0:
        l1.append(i)
print(l1)
找出1000内的偶数(for循环):
[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 160, 162, 164, 166, 168, 170, 172, 174, 176, 178, 180, 182, 184, 186, 188, 190, 192, 194, 196, 198, 200, 202, 204, 206, 208, 210, 212, 214, 216, 218, 220, 222, 224, 226, 228, 230, 232, 234, 236, 238, 240, 242, 244, 246, 248, 250, 252, 254, 256, 258, 260, 262, 264, 266, 268, 270, 272, 274, 276, 278, 280, 282, 284, 286, 288, 290, 292, 294, 296, 298, 300, 302, 304, 306, 308, 310, 312, 314, 316, 318, 320, 322, 324, 326, 328, 330, 332, 334, 336, 338, 340, 342, 344, 346, 348, 350, 352, 354, 356, 358, 360, 362, 364, 366, 368, 370, 372, 374, 376, 378, 380, 382, 384, 386, 388, 390, 392, 394, 396, 398, 400, 402, 404, 406, 408, 410, 412, 414, 416, 418, 420, 422, 424, 426, 428, 430, 432, 434, 436, 438, 440, 442, 444, 446, 448, 450, 452, 454, 456, 458, 460, 462, 464, 466, 468, 470, 472, 474, 476, 478, 480, 482, 484, 486, 488, 490, 492, 494, 496, 498, 500, 502, 504, 506, 508, 510, 512, 514, 516, 518, 520, 522, 524, 526, 528, 530, 532, 534, 536, 538, 540, 542, 544, 546, 548, 550, 552, 554, 556, 558, 560, 562, 564, 566, 568, 570, 572, 574, 576, 578, 580, 582, 584, 586, 588, 590, 592, 594, 596, 598, 600, 602, 604, 606, 608, 610, 612, 614, 616, 618, 620, 622, 624, 626, 628, 630, 632, 634, 636, 638, 640, 642, 644, 646, 648, 650, 652, 654, 656, 658, 660, 662, 664, 666, 668, 670, 672, 674, 676, 678, 680, 682, 684, 686, 688, 690, 692, 694, 696, 698, 700, 702, 704, 706, 708, 710, 712, 714, 716, 718, 720, 722, 724, 726, 728, 730, 732, 734, 736, 738, 740, 742, 744, 746, 748, 750, 752, 754, 756, 758, 760, 762, 764, 766, 768, 770, 772, 774, 776, 778, 780, 782, 784, 786, 788, 790, 792, 794, 796, 798, 800, 802, 804, 806, 808, 810, 812, 814, 816, 818, 820, 822, 824, 826, 828, 830, 832, 834, 836, 838, 840, 842, 844, 846, 848, 850, 852, 854, 856, 858, 860, 862, 864, 866, 868, 870, 872, 874, 876, 878, 880, 882, 884, 886, 888, 890, 892, 894, 896, 898, 900, 902, 904, 906, 908, 910, 912, 914, 916, 918, 920, 922, 924, 926, 928, 930, 932, 934, 936, 938, 940, 942, 944, 946, 948, 950, 952, 954, 956, 958, 960, 962, 964, 966, 968, 970, 972, 974, 976, 978, 980, 982, 984, 986, 988, 990, 992, 994, 996, 998]
print('找出1000内的偶数(列表推导式):')
l2 = [i for i in range(1000) if i % 2 == 0]
print(l2)
找出1000内的偶数(列表推导式):
[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 160, 162, 164, 166, 168, 170, 172, 174, 176, 178, 180, 182, 184, 186, 188, 190, 192, 194, 196, 198, 200, 202, 204, 206, 208, 210, 212, 214, 216, 218, 220, 222, 224, 226, 228, 230, 232, 234, 236, 238, 240, 242, 244, 246, 248, 250, 252, 254, 256, 258, 260, 262, 264, 266, 268, 270, 272, 274, 276, 278, 280, 282, 284, 286, 288, 290, 292, 294, 296, 298, 300, 302, 304, 306, 308, 310, 312, 314, 316, 318, 320, 322, 324, 326, 328, 330, 332, 334, 336, 338, 340, 342, 344, 346, 348, 350, 352, 354, 356, 358, 360, 362, 364, 366, 368, 370, 372, 374, 376, 378, 380, 382, 384, 386, 388, 390, 392, 394, 396, 398, 400, 402, 404, 406, 408, 410, 412, 414, 416, 418, 420, 422, 424, 426, 428, 430, 432, 434, 436, 438, 440, 442, 444, 446, 448, 450, 452, 454, 456, 458, 460, 462, 464, 466, 468, 470, 472, 474, 476, 478, 480, 482, 484, 486, 488, 490, 492, 494, 496, 498, 500, 502, 504, 506, 508, 510, 512, 514, 516, 518, 520, 522, 524, 526, 528, 530, 532, 534, 536, 538, 540, 542, 544, 546, 548, 550, 552, 554, 556, 558, 560, 562, 564, 566, 568, 570, 572, 574, 576, 578, 580, 582, 584, 586, 588, 590, 592, 594, 596, 598, 600, 602, 604, 606, 608, 610, 612, 614, 616, 618, 620, 622, 624, 626, 628, 630, 632, 634, 636, 638, 640, 642, 644, 646, 648, 650, 652, 654, 656, 658, 660, 662, 664, 666, 668, 670, 672, 674, 676, 678, 680, 682, 684, 686, 688, 690, 692, 694, 696, 698, 700, 702, 704, 706, 708, 710, 712, 714, 716, 718, 720, 722, 724, 726, 728, 730, 732, 734, 736, 738, 740, 742, 744, 746, 748, 750, 752, 754, 756, 758, 760, 762, 764, 766, 768, 770, 772, 774, 776, 778, 780, 782, 784, 786, 788, 790, 792, 794, 796, 798, 800, 802, 804, 806, 808, 810, 812, 814, 816, 818, 820, 822, 824, 826, 828, 830, 832, 834, 836, 838, 840, 842, 844, 846, 848, 850, 852, 854, 856, 858, 860, 862, 864, 866, 868, 870, 872, 874, 876, 878, 880, 882, 884, 886, 888, 890, 892, 894, 896, 898, 900, 902, 904, 906, 908, 910, 912, 914, 916, 918, 920, 922, 924, 926, 928, 930, 932, 934, 936, 938, 940, 942, 944, 946, 948, 950, 952, 954, 956, 958, 960, 962, 964, 966, 968, 970, 972, 974, 976, 978, 980, 982, 984, 986, 988, 990, 992, 994, 996, 998]

3. Python常用容器类型

  • list
l = [1, 'a', 2, 'b']
print(type(l))
print('修改前:', l)

# 修改list的内容
l[0] = 3
print('修改后:', l)

# 末尾添加元素
l.append(4)
print('添加后:', l)

# 遍历list
print('遍历list(for循环):')
for item in l:
    print(item)

# 通过索引遍历list
print('遍历list(while循环):')
i = 0
while i != len(l):
    print(l[i])
    i += 1

# 列表合并
print('列表合并(+):', [1, 2] + [3, 4])

# 列表重复
print('列表重复(*):', [1, 2] * 5)

# 判断元素是否在列表中
print('判断元素存在(in):', 1 in [1, 2])
<class 'list'>
修改前: [1, 'a', 2, 'b']
修改后: [3, 'a', 2, 'b']
添加后: [3, 'a', 2, 'b', 4]
遍历list(for循环):
3
a
2
b
4
遍历list(while循环):
3
a
2
b
4
列表合并(+): [1, 2, 3, 4]
列表重复(*): [1, 2, 1, 2, 1, 2, 1, 2, 1, 2]
判断元素存在(in): True
  • tuple
t = (1, 'a', 2, 'b')
print(type(t))

#元组的内容不能修改,否则会报错
# t[0] = 3 

# 遍历tuple
print('遍历list(for循环):')
for item in t:
    print(item)

# 通过索引遍历tuple
print('遍历tuple(while循环):')
i = 0
while i != len(t):
    print(t[i])
    i += 1

# 解包 unpack
a, b, _, _ = t
print('unpack: ', c)

# 确保unpack接收的变量个数和tuple的长度相同,否则报错
# 经常出现在函数返回值的赋值时
# a, b, c = t
  • dictionary
d = {'小象学院': 'http://www.chinahadoop.cn/',
    '百度': 'https://www.baidu.com/',
    '阿里巴巴': 'https://www.alibaba.com/',
    '腾讯': 'https://www.tencent.com/'}

print('通过key获取value: ', d['小象学院'])

# 遍历key
print('遍历key: ')
for key in d.keys():
    print(key)

# 遍历value
print('遍历value: ')
for value in d.values():
    print(value)

# 遍历item
print('遍历item: ')
for key, value in d.items():
    print(key + ': ' + value)

# format输出格式
print('format输出格式:')
for key, value in d.items():
    print('{}的网址是{}'.format(key, value))
  • set
print('创建set:')
my_set = {1, 2, 3}
print(my_set)
my_set = set([1, 2, 3, 2])
print(my_set)

print('添加单个元素:')
my_set.add(3)
print('添加3', my_set)

my_set.add(4)
print('添加4', my_set)

print('添加多个元素:')
my_set.update([4, 5, 6])
print(my_set)

4. Counter

  • 初始化
import collections

c1 = collections.Counter(['a', 'b', 'c', 'a', 'b', 'b'])
c2 = collections.Counter({'a':2, 'b':3, 'c':1})
c3 = collections.Counter(a=2, b=3, c=1)

print(c1)
print(c2)
print(c3)
Counter({'b': 3, 'a': 2, 'c': 1})
Counter({'b': 3, 'a': 2, 'c': 1})
Counter({'b': 3, 'a': 2, 'c': 1})
  • 更新内容
# 注意这里是做“加法”,不是“替换”
c1.update({'a': 4, 'c': -2, 'd': 4})
print(c1)
Counter({'a': 6, 'd': 4, 'b': 3, 'c': -1})
  • 访问内容
print('a=', c1['a'])
print('b=', c1['b'])
# 对比和dict的区别
print('e=', c1['e'])
a= 6
b= 3
e= 0
  • element()方法
for element in c1.elements():
    print(element)
d
d
d
d
b
b
b
a
a
a
a
a
a
  • most_common()方法
c1.most_common(3)
[('a', 6), ('d', 4), ('b', 3)]

5. defaultdict

# 统计每个字母出现的次数
s = 'chinadoop'

# 使用Counter
print(collections.Counter(s))
Counter({'o': 2, 'd': 1, 'c': 1, 'p': 1, 'a': 1, 'n': 1, 'h': 1, 'i': 1})
# 使用dict
counter = {}
for c in s:
    if c not in counter:
        counter[c] = 1
    else:
        counter[c] += 1

print(counter.items())
dict_items([('d', 1), ('c', 1), ('p', 1), ('a', 1), ('o', 2), ('n', 1), ('h', 1), ('i', 1)])
# 使用defaultdict
counter2 = collections.defaultdict(int)
for c in s:
    counter2[c] += 1
print(counter2.items())
dict_items([('d', 1), ('c', 1), ('p', 1), ('a', 1), ('o', 2), ('n', 1), ('h', 1), ('i', 1)])
# 记录相同元素的列表
colors = [('yellow', 1), ('blue', 2), ('yellow', 3), ('blue', 4), ('red', 1)]
d = collections.defaultdict(list)
for k, v in colors:
    d[k].append(v)

print(d.items())
dict_items([('blue', [2, 4]), ('yellow', [1, 3]), ('red', [1])])

6. map()函数

import math

print('示例1,获取两个列表对应位置上的最小值:')
l1 = [1, 3, 5, 7, 9]
l2 = [2, 4, 6, 8, 10]
mins = map(min, l1, l2)
print(mins)

# map()函数操作时,直到访问数据时才会执行
for item in mins:
    print(item)

print('示例2,对列表中的元素进行平方根操作:')
squared = map(math.sqrt, l2)
print(squared)
print(list(squared))
示例1,获取两个列表对应位置上的最小值:
<map object at 0x0000000004E26940>
1
3
5
7
9
示例2,对列表中的元素进行平方根操作:
<map object at 0x0000000004E26A58>
[1.4142135623730951, 2.0, 2.449489742783178, 2.8284271247461903, 3.1622776601683795]

7. 匿名函数 lambda

# my_func = lambda a, b, c: a * b
# print(my_func)
# print(my_func(1, 2, 3))

# 结合map
print('lambda结合map')
l1 = [1, 3, 5, 7, 9]
l2 = [2, 4, 6, 8, 10]
result = map(lambda x, y: x * 2 + y, l1, l2)
print(list(result))
lambda结合map
[4, 10, 16, 22, 28]

8. Python操作CSV数据文件

import csv

with open('grades.csv') as csvfile:
    grades_data = list(csv.DictReader(csvfile))

print('记录个数:', len(grades_data))
print('前2条记录:', grades_data[:2])
print('列名:', list(grades_data[0].keys()))
记录个数: 2315
前2条记录: [{'assignment5_grade': '47.710397816995446', 'assignment1_grade': '92.73394640624123', 'assignment2_grade': '83.03055176561709', 'assignment1_submission': '2015-11-02 06:55:34.282000000', 'assignment4_grade': '53.01155312999494', 'assignment6_grade': '38.16831825359636', 'assignment4_submission': '2015-11-16 01:21:24.663000000', 'assignment3_submission': '2015-11-12 08:58:33.998000000', 'assignment5_submission': '2015-11-20 13:24:59.692000000', 'student_id': 'B73F2C11-70F0-E37D-8B10-1D20AFED50B1', 'assignment2_submission': '2015-11-09 02:22:58.938000000', 'assignment6_submission': '2015-11-22 18:31:15.934000000', 'assignment3_grade': '67.16444141249367'}, {'assignment5_grade': '49.5883128141676', 'assignment1_grade': '86.79082085792986', 'assignment2_grade': '86.29082085792986', 'assignment1_submission': '2015-11-29 14:57:44.429000000', 'assignment4_grade': '55.0981253490751', 'assignment6_grade': '44.62948153275085', 'assignment4_submission': '2015-12-13 17:32:30.941000000', 'assignment3_submission': '2015-12-10 08:54:55.904000000', 'assignment5_submission': '2015-12-19 23:26:39.285000000', 'student_id': '98A0FAE0-A19A-13D2-4BB5-CFBFD94031D1', 'assignment2_submission': '2015-12-06 17:41:18.449000000', 'assignment6_submission': '2015-12-21 17:07:24.275000000', 'assignment3_grade': '69.7726566863439'}]
列名: ['assignment5_grade', 'assignment1_grade', 'assignment2_grade', 'assignment1_submission', 'assignment4_grade', 'assignment6_grade', 'assignment4_submission', 'assignment3_submission', 'assignment5_submission', 'student_id', 'assignment2_submission', 'assignment6_submission', 'assignment3_grade']
avg_assign1 = sum([float(row['assignment1_grade']) for row in grades_data]) / len(grades_data) 
print('assignment1平均分数:', avg_assign1)
assignment1平均分数: 74.5357320747794
assign1_sub_month = set(row['assignment1_submission'][:7] for row in grades_data)
print(assign1_sub_month)
{'2016-02', '2015-09', '2016-01', '2016-04', '2016-03', '2016-06', '2016-08', '2015-10', '2016-05', '2016-07', '2015-12', '2015-11'}
  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值