数据分析案例01
xiaoyao
1.USA.gov Data from Bitly–来自Bitly的USA.gov数据
数据背景:2011年,URL缩短服务Bitly跟美国政府网站USA.gov合作,提供了一份从生成.gov或.mil短链接的用户那里收集来的匿名数据。在2011年,除实时数据之外,还可以下载文本文件形式的每小时快照。2017年这项服务已经关闭。 以每小时快照为例,文件中各行的格式为JSON(即JavaScript Object Notation,这是一种常用的Web数据格式)。
# Notebook setup: fixed RNG seed for reproducible examples, plotting and
# display defaults, and silenced warnings for cleaner output cells.
import os
import warnings

import numpy as np
from numpy.random import randn
import matplotlib.pyplot as plt
import pandas as pd

np.random.seed(123)
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4)
pd.options.display.max_rows = 20
warnings.filterwarnings('ignore')
"""
In [5]: path = 'datasets/bitly_usagov/example.txt'
In [6]: open(path).readline()
Out[6]: '{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11
(KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1,
"tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l":
"orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r":
"http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u":
"http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc":
1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'
"""
'\nIn [5]: path = \'datasets/bitly_usagov/example.txt\'\n\nIn [6]: open(path).readline()\nOut[6]: \'{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11\n(KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1,\n"tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l":\n"orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r":\n"http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u":\n"http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc":\n1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n\'\n'
import json

# Each line of the hourly-snapshot file is one self-contained JSON record;
# parse the whole file into a list of dicts.
# Fix: use a context manager so the file handle is closed deterministically
# (the original `open(path)` inside the comprehension leaked it).
path = 'datasets/bitly_usagov/example.txt'
with open(path) as fh:
    records = [json.loads(line) for line in fh]

records[0]
{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
'c': 'US',
'nk': 1,
'tz': 'America/New_York',
'gr': 'MA',
'g': 'A6qOVH',
'h': 'wfLQtf',
'l': 'orofrog',
'al': 'en-US,en;q=0.8',
'hh': '1.usa.gov',
'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991',
't': 1331923247,
'hc': 1331822918,
'cy': 'Danvers',
'll': [42.576698, -70.954903]}
使用纯python代码对时区进行计数
# NOTE: this deliberately raises KeyError('tz') — not every record contains
# a 'tz' key (e.g. the heartbeat records); the guarded version follows.
time_zones = [ rec[ 'tz' ] for rec in records]
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-9-f3fbbc37f129> in <module>
----> 1 time_zones = [rec['tz'] for rec in records]
<ipython-input-9-f3fbbc37f129> in <listcomp>(.0)
----> 1 time_zones = [rec['tz'] for rec in records]
KeyError: 'tz'
# Collect time zones, skipping records that lack the 'tz' field entirely
# (records with an unknown zone still appear, as empty strings).
time_zones = []
for record in records:
    if 'tz' in record:
        time_zones.append(record['tz'])
time_zones[:10]
['America/New_York',
'America/Denver',
'America/New_York',
'America/Sao_Paulo',
'America/New_York',
'America/New_York',
'Europe/Warsaw',
'',
'',
'']
def get_counts(sequence):
    """Tally how many times each distinct value occurs in *sequence*.

    Returns a plain dict mapping value -> occurrence count.
    """
    tally = {}
    for item in sequence:
        tally[item] = tally.get(item, 0) + 1
    return tally
将此过程写入到函数中,是为了获得更高的复用性,体现出函数式编程
from collections import defaultdict
def get_counts2(sequence):
    """Tally occurrences like get_counts, letting defaultdict supply zeros."""
    tally = defaultdict(int)
    for item in sequence:
        tally[item] += 1
    return tally
counts = get_counts( time_zones)
counts[ 'America/New_York' ]
1251
len ( time_zones)
3440
# Iterating a dict yields its keys, so enumerate prints (index, timezone).
for indexed_tz in enumerate(counts):
    print(indexed_tz)
(0, 'America/New_York')
(1, 'America/Denver')
(2, 'America/Sao_Paulo')
(3, 'Europe/Warsaw')
(4, '')
(5, 'America/Los_Angeles')
(6, 'Asia/Hong_Kong')
(7, 'Europe/Rome')
(8, 'Africa/Ceuta')
(9, 'Europe/Madrid')
(10, 'Asia/Kuala_Lumpur')
(11, 'Asia/Nicosia')
(12, 'Europe/London')
(13, 'Pacific/Honolulu')
(14, 'America/Chicago')
(15, 'Europe/Malta')
(16, 'Europe/Lisbon')
(17, 'Europe/Paris')
(18, 'Europe/Copenhagen')
(19, 'America/Mazatlan')
(20, 'Europe/Dublin')
(21, 'Europe/Brussels')
(22, 'America/Vancouver')
(23, 'Europe/Amsterdam')
(24, 'Europe/Prague')
(25, 'Europe/Stockholm')
(26, 'America/Anchorage')
(27, 'Asia/Bangkok')
(28, 'Europe/Berlin')
(29, 'America/Rainy_River')
(30, 'Europe/Budapest')
(31, 'Asia/Tokyo')
(32, 'Europe/Vienna')
(33, 'America/Phoenix')
(34, 'Asia/Jerusalem')
(35, 'Asia/Karachi')
(36, 'America/Bogota')
(37, 'America/Indianapolis')
(38, 'America/Montreal')
(39, 'Asia/Calcutta')
(40, 'Europe/Skopje')
(41, 'Asia/Beirut')
(42, 'Australia/NSW')
(43, 'Chile/Continental')
(44, 'America/Halifax')
(45, 'America/Edmonton')
(46, 'Europe/Bratislava')
(47, 'America/Recife')
(48, 'Africa/Cairo')
(49, 'Asia/Istanbul')
(50, 'Asia/Novosibirsk')
(51, 'Europe/Moscow')
(52, 'Europe/Sofia')
(53, 'Europe/Ljubljana')
(54, 'America/Mexico_City')
(55, 'Europe/Helsinki')
(56, 'Europe/Bucharest')
(57, 'Europe/Zurich')
(58, 'America/Puerto_Rico')
(59, 'America/Monterrey')
(60, 'Europe/Athens')
(61, 'America/Winnipeg')
(62, 'Europe/Riga')
(63, 'America/Argentina/Buenos_Aires')
(64, 'Asia/Dubai')
(65, 'Europe/Oslo')
(66, 'Asia/Yekaterinburg')
(67, 'Asia/Manila')
(68, 'America/Caracas')
(69, 'Asia/Riyadh')
(70, 'America/Montevideo')
(71, 'America/Argentina/Mendoza')
(72, 'Asia/Seoul')
(73, 'Europe/Uzhgorod')
(74, 'Australia/Queensland')
(75, 'Europe/Belgrade')
(76, 'America/Costa_Rica')
(77, 'America/Lima')
(78, 'Asia/Pontianak')
(79, 'America/Chihuahua')
(80, 'Europe/Vilnius')
(81, 'America/Managua')
(82, 'Africa/Lusaka')
(83, 'America/Guayaquil')
(84, 'Asia/Harbin')
(85, 'Asia/Amman')
(86, 'Africa/Johannesburg')
(87, 'America/St_Kitts')
(88, 'Pacific/Auckland')
(89, 'America/Santo_Domingo')
(90, 'America/Argentina/Cordoba')
(91, 'Asia/Kuching')
(92, 'Europe/Volgograd')
(93, 'America/La_Paz')
(94, 'Africa/Casablanca')
(95, 'Asia/Jakarta')
(96, 'America/Tegucigalpa')
def top_counts(count_dict, n=10):
    """Return the n largest (count, key) pairs, sorted ascending by count."""
    pairs = sorted((count, key) for key, count in count_dict.items())
    return pairs[-n:]
top_counts( counts)
[(33, 'America/Sao_Paulo'),
(35, 'Europe/Madrid'),
(36, 'Pacific/Honolulu'),
(37, 'Asia/Tokyo'),
(74, 'Europe/London'),
(191, 'America/Denver'),
(382, 'America/Los_Angeles'),
(400, 'America/Chicago'),
(521, ''),
(1251, 'America/New_York')]
from collections import Counter

# Counter replaces both hand-rolled tallies above: it counts and ranks
# in one step.
counts = Counter(time_zones)
counts.most_common(10)
[('America/New_York', 1251),
('', 521),
('America/Chicago', 400),
('America/Los_Angeles', 382),
('America/Denver', 191),
('Europe/London', 74),
('Asia/Tokyo', 37),
('Pacific/Honolulu', 36),
('Europe/Madrid', 35),
('America/Sao_Paulo', 33)]
使用pandas对时区进行计数
import pandas as pd

# Build a DataFrame straight from the list of JSON dicts; keys missing
# from a record simply become NaN in that row.
frame = pd.DataFrame(records)
frame.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3560 entries, 0 to 3559
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 a 3440 non-null object
1 c 2919 non-null object
2 nk 3440 non-null float64
3 tz 3440 non-null object
4 gr 2919 non-null object
5 g 3440 non-null object
6 h 3440 non-null object
7 l 3440 non-null object
8 al 3094 non-null object
9 hh 3440 non-null object
10 r 3440 non-null object
11 u 3440 non-null object
12 t 3440 non-null float64
13 hc 3440 non-null float64
14 cy 2919 non-null object
15 ll 2919 non-null object
16 _heartbeat_ 120 non-null float64
17 kw 93 non-null object
dtypes: float64(4), object(14)
memory usage: 500.8+ KB
frame. head( )
a c nk tz gr g h l al hh r u t hc cy ll _heartbeat_ kw 0 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... US 1.0 America/New_York MA A6qOVH wfLQtf orofrog en-US,en;q=0.8 1.usa.gov http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/... http://www.ncbi.nlm.nih.gov/pubmed/22415991 1.331923e+09 1.331823e+09 Danvers [42.576698, -70.954903] NaN NaN 1 GoogleMaps/RochesterNY US 0.0 America/Denver UT mwszkS mwszkS bitly NaN j.mp http://www.AwareMap.com/ http://www.monroecounty.gov/etc/911/rss.php 1.331923e+09 1.308262e+09 Provo [40.218102, -111.613297] NaN NaN 2 Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... US 1.0 America/New_York DC xxr3Qb xxr3Qb bitly en-US 1.usa.gov http://t.co/03elZC4Q http://boxer.senate.gov/en/press/releases/0316... 1.331923e+09 1.331920e+09 Washington [38.9007, -77.043098] NaN NaN 3 Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)... BR 0.0 America/Sao_Paulo 27 zCaLwp zUtuOu alelex88 pt-br 1.usa.gov direct http://apod.nasa.gov/apod/ap120312.html 1.331923e+09 1.331923e+09 Braz [-23.549999, -46.616699] NaN NaN 4 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... US 0.0 America/New_York MA 9b6kNl 9b6kNl bitly en-US,en;q=0.8 bit.ly http://www.shrewsbury-ma.gov/selco/ http://www.shrewsbury-ma.gov/egov/gallery/1341... 1.331923e+09 1.273672e+09 Shrewsbury [42.286499, -71.714699] NaN NaN
frame[ 'tz' ] [ : 10 ]
0 America/New_York
1 America/Denver
2 America/New_York
3 America/Sao_Paulo
4 America/New_York
5 America/New_York
6 Europe/Warsaw
7
8
9
Name: tz, dtype: object
这里frame的输出形式是摘要视图(summary view), 主要用于较大的DataFrame对象。对于Series,可以使用value_counts方法
# value_counts ranks time zones by frequency; note the unnamed entry —
# records with an unknown zone carry tz == '' (empty string).
tz_counts = frame[ 'tz' ] . value_counts( )
tz_counts[ : 10 ]
America/New_York 1251
521
America/Chicago 400
America/Los_Angeles 382
America/Denver 191
Europe/London 74
Asia/Tokyo 37
Pacific/Honolulu 36
Europe/Madrid 35
America/Sao_Paulo 33
Name: tz, dtype: int64
# Distinguish the two flavours of "no time zone": rows missing the field
# entirely become 'Missing'; empty strings become 'Unknown'.
clean_tz = frame['tz'].fillna('Missing').replace('', 'Unknown')
tz_counts = clean_tz.value_counts()
tz_counts[:10]
America/New_York 1251
Unknown 521
America/Chicago 400
America/Los_Angeles 382
America/Denver 191
Missing 120
Europe/London 74
Asia/Tokyo 37
Pacific/Honolulu 36
Europe/Madrid 35
Name: tz, dtype: int64
plt. figure( figsize= ( 10 , 4 ) )
<Figure size 720x288 with 0 Axes>
<Figure size 720x288 with 0 Axes>
type ( tz_counts)
pandas.core.series.Series
可视化示例数据中排名前十的时区
import seaborn as sns
subset = tz_counts[ : 10 ]
sns. barplot( y= subset. index, x= subset. values)
<matplotlib.axes._subplots.AxesSubplot at 0x2af1a108bc8>
frame[ 'a' ] [ 1 ]
'GoogleMaps/RochesterNY'
frame[ 'a' ] [ 50 ]
'Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2'
type ( frame[ 'a' ] [ 50 ] )
str
frame[ 'a' ] [ 51 ]
'Mozilla/5.0 (Linux; U; Android 2.2.2; en-us; LG-P925/V10e Build/FRG83G) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'
frame[ 'a' ] [ 51 ] [ : 50 ]
'Mozilla/5.0 (Linux; U; Android 2.2.2; en-us; LG-P9'
"""
描述
Python split() 通过指定分隔符对字符串进行切片,如果参数 num 有指定值,则分隔 num+1 个子字符串
语法
split() 方法语法:
str.split(str="", num=string.count(str)).
str.split(“o”)[0]得到的是第一个o之前的内容
str.split(“o”)[1]得到的是第一个o和第二个o之间的内容
str.split(“o”)[3]得到的是第三个o后和第四个o前之间的内容
str.split("[")[0]得到的是第一个 [ 之前的内容
"""
# The first whitespace-delimited token of the user-agent string is enough
# to identify the browser family.
agents = frame.a.dropna()
results = pd.Series([agent.split()[0] for agent in agents])
results[:5]
0 Mozilla/5.0
1 GoogleMaps/RochesterNY
2 Mozilla/4.0
3 Mozilla/5.0
4 Mozilla/5.0
dtype: object
results. value_counts( ) [ : 8 ]
Mozilla/5.0 2594
Mozilla/4.0 601
GoogleMaps/RochesterNY 121
Opera/9.80 34
TEST_INTERNET_AGENT 24
GoogleProducer 21
Mozilla/6.0 5
BlackBerry8520/5.0.0.681 4
dtype: int64
# Keep only rows that have a user-agent string, then copy so the later
# column assignment does not trigger a SettingWithCopy warning on a view.
cframe = frame[ frame. a. notnull( ) ]
cframe = cframe. copy( )
cframe. head( )
a c nk tz gr g h l al hh r u t hc cy ll _heartbeat_ kw 0 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... US 1.0 America/New_York MA A6qOVH wfLQtf orofrog en-US,en;q=0.8 1.usa.gov http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/... http://www.ncbi.nlm.nih.gov/pubmed/22415991 1.331923e+09 1.331823e+09 Danvers [42.576698, -70.954903] NaN NaN 1 GoogleMaps/RochesterNY US 0.0 America/Denver UT mwszkS mwszkS bitly NaN j.mp http://www.AwareMap.com/ http://www.monroecounty.gov/etc/911/rss.php 1.331923e+09 1.308262e+09 Provo [40.218102, -111.613297] NaN NaN 2 Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ... US 1.0 America/New_York DC xxr3Qb xxr3Qb bitly en-US 1.usa.gov http://t.co/03elZC4Q http://boxer.senate.gov/en/press/releases/0316... 1.331923e+09 1.331920e+09 Washington [38.9007, -77.043098] NaN NaN 3 Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)... BR 0.0 America/Sao_Paulo 27 zCaLwp zUtuOu alelex88 pt-br 1.usa.gov direct http://apod.nasa.gov/apod/ap120312.html 1.331923e+09 1.331923e+09 Braz [-23.549999, -46.616699] NaN NaN 4 Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi... US 0.0 America/New_York MA 9b6kNl 9b6kNl bitly en-US,en;q=0.8 bit.ly http://www.shrewsbury-ma.gov/selco/ http://www.shrewsbury-ma.gov/egov/gallery/1341... 1.331923e+09 1.273672e+09 Shrewsbury [42.286499, -71.714699] NaN NaN
# Label each row by whether its user-agent string mentions 'Windows'.
cframe[ 'os' ] = np. where( cframe[ 'a' ] . str . contains( 'Windows' ) ,
'Windows' , 'Not Windows' )
cframe[ 'os' ] [ : 5 ]
0 Windows
1 Not Windows
2 Windows
3 Not Windows
4 Windows
Name: os, dtype: object
# Cross-tabulate time zone against OS: unstack pivots the os level into
# columns, and fillna(0) covers (tz, os) pairs that never occurred.
by_tz_os = cframe. groupby( [ 'tz' , 'os' ] )
agg_counts = by_tz_os. size( ) . unstack( ) . fillna( 0 )
agg_counts[ : 10 ]
os Not Windows Windows tz 245.0 276.0 Africa/Cairo 0.0 3.0 Africa/Casablanca 0.0 1.0 Africa/Ceuta 0.0 2.0 Africa/Johannesburg 0.0 1.0 Africa/Lusaka 0.0 1.0 America/Anchorage 4.0 1.0 America/Argentina/Buenos_Aires 1.0 0.0 America/Argentina/Cordoba 0.0 1.0 America/Argentina/Mendoza 0.0 1.0
# argsort over the per-tz row totals gives positions in ascending order of
# total count; the tail of this ordering indexes the busiest time zones.
indexer = agg_counts. sum ( 1 ) . argsort( )
indexer[ : 10 ]
tz
24
Africa/Cairo 20
Africa/Casablanca 21
Africa/Ceuta 92
Africa/Johannesburg 87
Africa/Lusaka 53
America/Anchorage 54
America/Argentina/Buenos_Aires 57
America/Argentina/Cordoba 26
America/Argentina/Mendoza 55
dtype: int64
type ( agg_counts)
pandas.core.frame.DataFrame
# take() selects rows by position: the last 10 argsort positions are the
# ten most common time zones (still in ascending order of total).
count_subset = agg_counts. take( indexer[ - 10 : ] )
count_subset
os Not Windows Windows tz America/Sao_Paulo 13.0 20.0 Europe/Madrid 16.0 19.0 Pacific/Honolulu 0.0 36.0 Asia/Tokyo 2.0 35.0 Europe/London 43.0 31.0 America/Denver 132.0 59.0 America/Los_Angeles 130.0 252.0 America/Chicago 115.0 285.0 245.0 276.0 America/New_York 339.0 912.0
agg_counts. sum ( 1 ) . nlargest( 10 )
tz
America/New_York 1251.0
521.0
America/Chicago 400.0
America/Los_Angeles 382.0
America/Denver 191.0
Europe/London 74.0
Asia/Tokyo 37.0
Pacific/Honolulu 36.0
Europe/Madrid 35.0
America/Sao_Paulo 33.0
dtype: float64
plt. figure( )
<Figure size 720x432 with 0 Axes>
<Figure size 720x432 with 0 Axes>
type ( count_subset)
pandas.core.frame.DataFrame
# Flatten the (tz x os) table into tidy long form for seaborn: stack the
# os columns into rows, name the value column 'total', reset the index.
count_subset = count_subset.stack().rename('total').reset_index()
count_subset[:10]
tz os total 0 America/Sao_Paulo Not Windows 13.0 1 America/Sao_Paulo Windows 20.0 2 Europe/Madrid Not Windows 16.0 3 Europe/Madrid Windows 19.0 4 Pacific/Honolulu Not Windows 0.0 5 Pacific/Honolulu Windows 36.0 6 Asia/Tokyo Not Windows 2.0 7 Asia/Tokyo Windows 35.0 8 Europe/London Not Windows 43.0 9 Europe/London Windows 31.0
sns. barplot( x= 'total' , y= 'tz' , hue= 'os' , data= count_subset)
<matplotlib.axes._subplots.AxesSubplot at 0x2af1b8198c8>
def norm_total(group):
    """Attach a 'normed_total' column: each row's share of the group total.

    Mutates and returns *group* (as expected by DataFrame.groupby().apply).
    """
    totals = group['total']
    group['normed_total'] = totals / totals.sum()
    return group
results = count_subset. groupby( 'tz' ) . apply ( norm_total)
plt. figure( )
<Figure size 720x432 with 0 Axes>
<Figure size 720x432 with 0 Axes>
sns. barplot( x= 'normed_total' , y= 'tz' , hue= 'os' , data= results)
<matplotlib.axes._subplots.AxesSubplot at 0x2af1b6c5788>
# Equivalent normalization without apply(): transform('sum') broadcasts
# each group's total back onto its own rows, so plain division works.
g = count_subset. groupby( 'tz' )
results2 = count_subset. total / g. total. transform( 'sum' )