pandas—举例

In [1]:
import json
path='usagov_bitly_data2012-03-16-1331923249.txt'
records=[json.loads(line) for line in open(path)]
records[0]
Out[1]:
{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
 'al': 'en-US,en;q=0.8',
 'c': 'US',
 'cy': 'Danvers',
 'g': 'A6qOVH',
 'gr': 'MA',
 'h': 'wfLQtf',
 'hc': 1331822918,
 'hh': '1.usa.gov',
 'l': 'orofrog',
 'll': [42.576698, -70.954903],
 'nk': 1,
 'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
 't': 1331923247,
 'tz': 'America/New_York',
 'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}
In [5]:
time_zone=[rec['tz'] for rec in records if 'tz' in rec]
time_zone[:10]   #获取时区
Out[5]:
['America/New_York',
 'America/Denver',
 'America/New_York',
 'America/Sao_Paulo',
 'America/New_York',
 'America/New_York',
 'Europe/Warsaw',
 '',
 '',
 '']
In [17]:
from pandas import DataFrame,Series
import pandas as pd
import numpy as np
frame=DataFrame(records)
frame.head(15)
Out[17]:
 _heartbeat_aalccyggrhhchhkwlllnkrttzu
0NaNMozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...en-US,en;q=0.8USDanversA6qOVHMAwfLQtf1.331823e+091.usa.govNaNorofrog[42.576698, -70.954903]1.0http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/...1.331923e+09America/New_Yorkhttp://www.ncbi.nlm.nih.gov/pubmed/22415991
1NaNGoogleMaps/RochesterNYNaNUSProvomwszkSUTmwszkS1.308262e+09j.mpNaNbitly[40.218102, -111.613297]0.0http://www.AwareMap.com/1.331923e+09America/Denverhttp://www.monroecounty.gov/etc/911/rss.php
2NaNMozilla/4.0 (compatible; MSIE 8.0; Windows NT ...en-USUSWashingtonxxr3QbDCxxr3Qb1.331920e+091.usa.govNaNbitly[38.9007, -77.043098]1.0http://t.co/03elZC4Q1.331923e+09America/New_Yorkhttp://boxer.senate.gov/en/press/releases/0316...
3NaNMozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8)...pt-brBRBrazzCaLwp27zUtuOu1.331923e+091.usa.govNaNalelex88[-23.549999, -46.616699]0.0direct1.331923e+09America/Sao_Paulohttp://apod.nasa.gov/apod/ap120312.html
4NaNMozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...en-US,en;q=0.8USShrewsbury9b6kNlMA9b6kNl1.273672e+09bit.lyNaNbitly[42.286499, -71.714699]0.0http://www.shrewsbury-ma.gov/selco/1.331923e+09America/New_Yorkhttp://www.shrewsbury-ma.gov/egov/gallery/1341...
5NaNMozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...en-US,en;q=0.8USShrewsburyaxNK8cMAaxNK8c1.273673e+09bit.lyNaNbitly[42.286499, -71.714699]0.0http://www.shrewsbury-ma.gov/selco/1.331923e+09America/New_Yorkhttp://www.shrewsbury-ma.gov/egov/gallery/1341...
6NaNMozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1...pl-PL,pl;q=0.8,en-US;q=0.6,en;q=0.4PLLubanwcndER77zkpJBR1.331923e+091.usa.govNaNbnjacobs[51.116699, 15.2833]0.0http://plus.url.google.com/url?sa=z&n=13319232...1.331923e+09Europe/Warsawhttp://www.nasa.gov/mission_pages/nustar/main/...
7NaNMozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/2...bg,en-us;q=0.7,en;q=0.3NoneNaNwcndERNaNzkpJBR1.331923e+091.usa.govNaNbnjacobsNaN0.0http://www.facebook.com/1.331923e+09 http://www.nasa.gov/mission_pages/nustar/main/...
8NaNOpera/9.80 (X11; Linux zbov; U; en) Presto/2.1...en-US, enNoneNaNwcndERNaNzkpJBR1.331923e+091.usa.govNaNbnjacobsNaN0.0http://www.facebook.com/l.php?u=http%3A%2F%2F1...1.331923e+09 http://www.nasa.gov/mission_pages/nustar/main/...
9NaNMozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...pt-BR,pt;q=0.8,en-US;q=0.6,en;q=0.4NoneNaNzCaLwpNaNzUtuOu1.331923e+091.usa.govNaNalelex88NaN0.0http://t.co/o1Pd0WeV1.331923e+09 http://apod.nasa.gov/apod/ap120312.html
10NaNMozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2)...en-us,en;q=0.5USSeattlevNJS4HWAu0uD9q1.319564e+091.usa.govNaNo_4us71ccioa[47.5951, -122.332603]1.0direct1.331923e+09America/Los_Angeleshttps://www.nysdot.gov/rexdesign/design/commun...
11NaNMozilla/5.0 (Macintosh; U; Intel Mac OS X 10.4...en-us,en;q=0.5USWashingtonwG7OIHDCA0nRz41.331816e+091.usa.govNaNdarrellissa[38.937599, -77.092796]0.0http://t.co/ND7SoPyo1.331923e+09America/New_Yorkhttp://oversight.house.gov/wp-content/uploads/...
12NaNMozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.2)...en-us,en;q=0.5USAlexandriavNJS4HVAu0uD9q1.319564e+091.usa.govNaNo_4us71ccioa[38.790901, -77.094704]1.0direct1.331923e+09America/New_Yorkhttps://www.nysdot.gov/rexdesign/design/commun...
131.331923e+09NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
14NaNMozilla/5.0 (Windows; U; Windows NT 6.1; en-US...en-us,en;q=0.5USMarietta2rOUYcGA2rOUYc1.255770e+091.usa.govNaNbitly[33.953201, -84.5177]1.0direct1.331923e+09America/New_Yorkhttp://toxtown.nlm.nih.gov/index.php
In [8]:
tz_counts=frame['tz'].value_counts()
tz_counts
Out[8]:
America/New_York                  1251
                                   521
America/Chicago                    400
America/Los_Angeles                382
America/Denver                     191
Europe/London                       74
Asia/Tokyo                          37
Pacific/Honolulu                    36
Europe/Madrid                       35
America/Sao_Paulo                   33
Europe/Berlin                       28
Europe/Rome                         27
America/Rainy_River                 25
Europe/Amsterdam                    22
America/Indianapolis                20
America/Phoenix                     20
Europe/Warsaw                       16
America/Mexico_City                 15
Europe/Stockholm                    14
Europe/Paris                        14
America/Vancouver                   12
Pacific/Auckland                    11
America/Puerto_Rico                 10
Asia/Hong_Kong                      10
Europe/Prague                       10
Europe/Oslo                         10
Europe/Helsinki                     10
Europe/Moscow                       10
Asia/Istanbul                        9
Asia/Calcutta                        9
                                  ... 
Europe/Belgrade                      2
Africa/Johannesburg                  1
America/Lima                         1
Africa/Lusaka                        1
Australia/Queensland                 1
Asia/Riyadh                          1
America/Tegucigalpa                  1
Asia/Novosibirsk                     1
America/La_Paz                       1
America/Montevideo                   1
Asia/Kuching                         1
America/Mazatlan                     1
America/Argentina/Mendoza            1
Asia/Nicosia                         1
Europe/Ljubljana                     1
America/Costa_Rica                   1
Asia/Yekaterinburg                   1
America/Santo_Domingo                1
Asia/Pontianak                       1
Europe/Uzhgorod                      1
America/Argentina/Cordoba            1
Europe/Sofia                         1
Africa/Casablanca                    1
Asia/Manila                          1
Europe/Volgograd                     1
Europe/Skopje                        1
America/Caracas                      1
America/Argentina/Buenos_Aires       1
America/Monterrey                    1
America/St_Kitts                     1
Name: tz, Length: 97, dtype: int64
In [23]:
clean_tz=frame['tz'].fillna('Missing')
clean_tz[clean_tz=='']='Unknown'        #处理缺失数据
print(clean_tz)
tz_counts=clean_tz.value_counts()
tz_counts[:10]
0          America/New_York
1            America/Denver
2          America/New_York
3         America/Sao_Paulo
4          America/New_York
5          America/New_York
6             Europe/Warsaw
7                   Unknown
8                   Unknown
9                   Unknown
10      America/Los_Angeles
11         America/New_York
12         America/New_York
13                  Missing
14         America/New_York
15           Asia/Hong_Kong
16           Asia/Hong_Kong
17         America/New_York
18           America/Denver
19              Europe/Rome
20             Africa/Ceuta
21         America/New_York
22         America/New_York
23         America/New_York
24            Europe/Madrid
25        Asia/Kuala_Lumpur
26             Asia/Nicosia
27        America/Sao_Paulo
28                  Unknown
29                  Unknown
               ...         
3530    America/Los_Angeles
3531                Unknown
3532       America/New_York
3533       America/New_York
3534        America/Chicago
3535        America/Chicago
3536                Unknown
3537    America/Tegucigalpa
3538    America/Los_Angeles
3539    America/Los_Angeles
3540         America/Denver
3541    America/Los_Angeles
3542    America/Los_Angeles
3543                Missing
3544        America/Chicago
3545        America/Chicago
3546    America/Los_Angeles
3547       America/New_York
3548        America/Chicago
3549       Europe/Stockholm
3550       America/New_York
3551                Unknown
3552        America/Chicago
3553       America/New_York
3554       America/New_York
3555       America/New_York
3556        America/Chicago
3557         America/Denver
3558    America/Los_Angeles
3559       America/New_York
Name: tz, Length: 3560, dtype: object
Out[23]:
America/New_York       1251
Unknown                 521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Missing                 120
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35
Name: tz, dtype: int64
In [32]:
%pylab
tz_counts[:10].plot(kind='barh',rot=0)
Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib
Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x19f6f8d7278>
In [37]:
results=Series([x.split()[0] for x in frame.a.dropna()])
print(results[:5])
results.value_counts()[:6]    #数一下各有多少个
0               Mozilla/5.0
1    GoogleMaps/RochesterNY
2               Mozilla/4.0
3               Mozilla/5.0
4               Mozilla/5.0
dtype: object
Out[37]:
Mozilla/5.0               2594
Mozilla/4.0                601
GoogleMaps/RochesterNY     121
Opera/9.80                  34
TEST_INTERNET_AGENT         24
GoogleProducer              21
dtype: int64
In [38]:
#统计Windows用户
cframe=frame[frame.a.notnull()]
operating_system=np.where(cframe['a'].str.contains('Windows'),'Windows','Not Windows')
operating_system[:5]
Out[38]:
array(['Windows', 'Not Windows', 'Windows', 'Not Windows', 'Windows'],
      dtype='<U11')
In [47]:
by_tz_os=cframe.groupby(['tz',operating_system])
by_tz_os.size().unstack().fillna(0)   # 牛!
Out[47]:
 Not WindowsWindows
tz  
 245.0276.0
Africa/Cairo0.03.0
Africa/Casablanca0.01.0
Africa/Ceuta0.02.0
Africa/Johannesburg0.01.0
Africa/Lusaka0.01.0
America/Anchorage4.01.0
America/Argentina/Buenos_Aires1.00.0
America/Argentina/Cordoba0.01.0
America/Argentina/Mendoza0.01.0
America/Bogota1.02.0
America/Caracas0.01.0
America/Chicago115.0285.0
America/Chihuahua1.01.0
America/Costa_Rica0.01.0
America/Denver132.059.0
America/Edmonton2.04.0
America/Guayaquil2.00.0
America/Halifax1.03.0
America/Indianapolis8.012.0
America/La_Paz0.01.0
America/Lima0.01.0
America/Los_Angeles130.0252.0
America/Managua0.03.0
America/Mazatlan1.00.0
America/Mexico_City7.08.0
America/Monterrey1.00.0
America/Montevideo0.01.0
America/Montreal3.06.0
America/New_York339.0912.0
.........
Europe/Berlin9.019.0
Europe/Bratislava1.02.0
Europe/Brussels1.03.0
Europe/Bucharest1.03.0
Europe/Budapest0.05.0
Europe/Copenhagen2.03.0
Europe/Dublin1.02.0
Europe/Helsinki2.08.0
Europe/Lisbon1.07.0
Europe/Ljubljana0.01.0
Europe/London43.031.0
Europe/Madrid16.019.0
Europe/Malta0.02.0
Europe/Moscow1.09.0
Europe/Oslo2.08.0
Europe/Paris4.010.0
Europe/Prague3.07.0
Europe/Riga1.01.0
Europe/Rome8.019.0
Europe/Skopje0.01.0
Europe/Sofia0.01.0
Europe/Stockholm2.012.0
Europe/Uzhgorod0.01.0
Europe/Vienna3.03.0
Europe/Vilnius0.02.0
Europe/Volgograd0.01.0
Europe/Warsaw1.015.0
Europe/Zurich4.00.0
Pacific/Auckland3.08.0
Pacific/Honolulu0.036.0

97 rows × 2 columns

In [7]:
#MovieLens 1M数据集
unames=['user_id','gender','age','occupation','zip']
users=pd.read_table('users.dat',sep='::',header=None,names=unames)
rnames=['user_id','movie_id','rating','timestamp']
ratings=pd.read_table('ratings.dat',sep='::',header=None,names=rnames)
mnames=['movie_id','title','genres']
movies=pd.read_table('movies.dat',sep='::',header=None,names=mnames)
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  This is separate from the ipykernel package so we can avoid doing imports until
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:5: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  """
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:7: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  import sys
In [8]:
users.head()
Out[8]:
 user_idgenderageoccupationzip
01F11048067
12M561670072
23M251555117
34M45702460
45M252055455
In [9]:
ratings[:5]
Out[9]:
 user_idmovie_idratingtimestamp
0111935978300760
116613978302109
219143978301968
3134084978300275
4123555978824291
In [10]:
movies[:5]
Out[10]:
 movie_idtitlegenres
01Toy Story (1995)Animation|Children's|Comedy
12Jumanji (1995)Adventure|Children's|Fantasy
23Grumpier Old Men (1995)Comedy|Romance
34Waiting to Exhale (1995)Comedy|Drama
45Father of the Bride Part II (1995)Comedy
In [12]:
data=pd.merge(pd.merge(ratings,users),movies)
data
Out[12]:
 user_idmovie_idratingtimestampgenderageoccupationziptitlegenres
0111935978300760F11048067One Flew Over the Cuckoo's Nest (1975)Drama
1211935978298413M561670072One Flew Over the Cuckoo's Nest (1975)Drama
21211934978220179M251232793One Flew Over the Cuckoo's Nest (1975)Drama
31511934978199279M25722903One Flew Over the Cuckoo's Nest (1975)Drama
41711935978158471M50195350One Flew Over the Cuckoo's Nest (1975)Drama
51811934978156168F18395825One Flew Over the Cuckoo's Nest (1975)Drama
61911935982730936M11048073One Flew Over the Cuckoo's Nest (1975)Drama
72411935978136709F25710023One Flew Over the Cuckoo's Nest (1975)Drama
82811933978125194F25114607One Flew Over the Cuckoo's Nest (1975)Drama
93311935978557765M45355421One Flew Over the Cuckoo's Nest (1975)Drama
103911935978043535M18461820One Flew Over the Cuckoo's Nest (1975)Drama
114211933978038981M25824502One Flew Over the Cuckoo's Nest (1975)Drama
124411934978018995M451798052One Flew Over the Cuckoo's Nest (1975)Drama
134711934977978345M18494305One Flew Over the Cuckoo's Nest (1975)Drama
144811934977975061M25492107One Flew Over the Cuckoo's Nest (1975)Drama
154911934978813972M181277084One Flew Over the Cuckoo's Nest (1975)Drama
165311935977946400M25096931One Flew Over the Cuckoo's Nest (1975)Drama
175411935977944039M50156723One Flew Over the Cuckoo's Nest (1975)Drama
185811935977933866M25230303One Flew Over the Cuckoo's Nest (1975)Drama
195911934977934292F50155413One Flew Over the Cuckoo's Nest (1975)Drama
206211934977968584F35398105One Flew Over the Cuckoo's Nest (1975)Drama
218011934977786172M56149327One Flew Over the Cuckoo's Nest (1975)Drama
228111935977785864F25060640One Flew Over the Cuckoo's Nest (1975)Drama
238811935977694161F45102476One Flew Over the Cuckoo's Nest (1975)Drama
248911935977683596F56985749One Flew Over the Cuckoo's Nest (1975)Drama
259511935977626632M45098201One Flew Over the Cuckoo's Nest (1975)Drama
269611933977621789F251678028One Flew Over the Cuckoo's Nest (1975)Drama
279911932982791053F11019390One Flew Over the Cuckoo's Nest (1975)Drama
28102119351040737607M351920871One Flew Over the Cuckoo's Nest (1975)Drama
2910411932977546620M251200926One Flew Over the Cuckoo's Nest (1975)Drama
.................................
1000179493330843962757020M251594040Home Page (1999)Documentary
10001804802221821014866656M56140601Juno and Paycock (1930)Drama
1000181481223082962932391M181425301Detroit 9000 (1973)Action|Crime
100018248746244962781918F25470808Condition Red (1995)Action|Drama|Thriller
1000183505914344962484364M451622652Stranger, The (1994)Action
1000184594714344957190428F451697215Stranger, The (1994)Action
1000185507718683962417299M25220037Truce, The (1996)Drama|War
1000186594418681957197520F181027606Truce, The (1996)Drama|War
100018751054043962337582M50718977Brother Minister: The Assassination of Malcolm...Documentary
100018851854044963402617F35444485Brother Minister: The Assassination of Malcolm...Documentary
100018955324045959619841M251727408Brother Minister: The Assassination of Malcolm...Documentary
100019055434043960127592M251797401Brother Minister: The Assassination of Malcolm...Documentary
1000191522025433961546137M25791436Six Ways to Sunday (1997)Comedy
1000192575425434958272316F18160640Six Ways to Sunday (1997)Comedy
100019352275913961475931M181064050Tough and Deadly (1995)Action|Drama|Thriller
100019457955911958145253M25192688Tough and Deadly (1995)Action|Drama|Thriller
1000195531336565960920392M56055406Lured (1947)Crime
1000196532824384960838075F25491740Outside Ozona (1998)Drama|Thriller
1000197533433233960796159F561346140Chain of Fools (2000)Comedy|Crime
100019853341271960795494F561346140Silence of the Palace, The (Saimt el Qusur) (1...Drama
1000199533433825960796159F561346140Song of Freedom (1936)Drama
1000200542018433960156505F11914850Slappy and the Stinkers (1998)Children's|Comedy
100020154332863960240881F351745014Nemesis 2: Nebula (1995)Action|Sci-Fi|Thriller
1000202549435304959816296F351794306Smoking/No Smoking (1993)Comedy
1000203555621983959445515M45692103Modulations (1998)Documentary
1000204594921985958846401M181747901Modulations (1998)Documentary
1000205567527033976029116M351430030Broken Vessels (1998)Drama
1000206578028451958153068M181792886White Boys (1999)Drama
1000207585136075957756608F182055410One Little Indian (1973)Comedy|Drama|Western
1000208593829094957273353M25135401Five Wives, Three Secretaries and Me (1998)Documentary

1000209 rows × 10 columns

In [13]:
mean_ratings=data.pivot_table('rating',index='title',columns='gender',aggfunc='mean')
mean_ratings.head()
Out[13]:
genderFM
title  
$1,000,000 Duck (1971)3.3750002.761905
'Night Mother (1986)3.3888893.352941
'Til There Was You (1997)2.6756762.733333
'burbs, The (1989)2.7934782.962085
...And Justice for All (1979)3.8285713.689024
In [23]:
ratings_by_title=data.groupby('title').size()  #根据电影名字分组
ratings_by_title[:10]
Out[23]:
title
$1,000,000 Duck (1971)                37
'Night Mother (1986)                  70
'Til There Was You (1997)             52
'burbs, The (1989)                   303
...And Justice for All (1979)        199
1-900 (1994)                           2
10 Things I Hate About You (1999)    700
101 Dalmatians (1961)                565
101 Dalmatians (1996)                364
12 Angry Men (1957)                  616
dtype: int64
In [31]:
active_titles=ratings_by_title.index[ratings_by_title>=250]
active_titles       #评分数据大于250条的电影名称
Out[31]:
Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
       '101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
       '13th Warrior, The (1999)', '2 Days in the Valley (1996)',
       '20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
       '2010 (1984)',
       ...
       'X-Men (2000)', 'Year of Living Dangerously (1982)',
       'Yellow Submarine (1968)', 'You've Got Mail (1998)',
       'Young Frankenstein (1974)', 'Young Guns (1988)',
       'Young Guns II (1990)', 'Young Sherlock Holmes (1985)',
       'Zero Effect (1998)', 'eXistenZ (1999)'],
      dtype='object', name='title', length=1216)
In [39]:
mean_ratings=mean_ratings.loc[active_titles]
mean_ratings
Out[39]:
genderFM
title  
'burbs, The (1989)2.7934782.962085
10 Things I Hate About You (1999)3.6465523.311966
101 Dalmatians (1961)3.7914443.500000
101 Dalmatians (1996)3.2400002.911215
12 Angry Men (1957)4.1843974.328421
13th Warrior, The (1999)3.1120003.168000
2 Days in the Valley (1996)3.4888893.244813
20,000 Leagues Under the Sea (1954)3.6701033.709205
2001: A Space Odyssey (1968)3.8255814.129738
2010 (1984)3.4468093.413712
28 Days (2000)3.2094242.977707
39 Steps, The (1935)3.9655174.107692
54 (1998)2.7017542.782178
7th Voyage of Sinbad, The (1958)3.4090913.658879
8MM (1999)2.9062502.850962
About Last Night... (1986)3.1886793.140909
Absent Minded Professor, The (1961)3.4693883.446809
Absolute Power (1997)3.4691363.327759
Abyss, The (1989)3.6592363.689507
Ace Ventura: Pet Detective (1994)3.0000003.197917
Ace Ventura: When Nature Calls (1995)2.2696632.543333
Addams Family Values (1993)3.0000002.878531
Addams Family, The (1991)3.1861703.163498
Adventures in Babysitting (1987)3.4557823.208122
Adventures of Buckaroo Bonzai Across the 8th Dimension, The (1984)3.3085113.402321
Adventures of Priscilla, Queen of the Desert, The (1994)3.9890713.688811
Adventures of Robin Hood, The (1938)4.1666673.918367
African Queen, The (1951)4.3242324.223822
Age of Innocence, The (1993)3.8270683.339506
Agnes of God (1985)3.5348843.244898
.........
White Men Can't Jump (1992)3.0287773.231061
Who Framed Roger Rabbit? (1988)3.5693783.713251
Who's Afraid of Virginia Woolf? (1966)4.0297034.096939
Whole Nine Yards, The (2000)3.2965523.404814
Wild Bunch, The (1969)3.6363644.128099
Wild Things (1998)3.3920003.459082
Wild Wild West (1999)2.2754492.131973
William Shakespeare's Romeo and Juliet (1996)3.5326093.318644
Willow (1988)3.6586833.453543
Willy Wonka and the Chocolate Factory (1971)4.0639533.789474
Witness (1985)4.1158543.941504
Wizard of Oz, The (1939)4.3550304.203138
Wolf (1994)3.0740742.899083
Women on the Verge of a Nervous Breakdown (1988)3.9343073.865741
Wonder Boys (2000)4.0437963.913649
Working Girl (1988)3.6067423.312500
World Is Not Enough, The (1999)3.3375003.388889
Wrong Trousers, The (1993)4.5882354.478261
Wyatt Earp (1994)3.1470593.283898
X-Files: Fight the Future, The (1998)3.4894743.493797
X-Men (2000)3.6823103.851702
Year of Living Dangerously (1982)3.9512203.869403
Yellow Submarine (1968)3.7142863.689286
You've Got Mail (1998)3.5424243.275591
Young Frankenstein (1974)4.2899634.239177
Young Guns (1988)3.3717953.425620
Young Guns II (1990)2.9347832.904025
Young Sherlock Holmes (1985)3.5147063.363344
Zero Effect (1998)3.8644073.723140
eXistenZ (1999)3.0985923.289086

1216 rows × 2 columns

In [45]:
top_female_ratings=mean_ratings.sort_values(by='F',ascending=False)
top_female_ratings[:10]      #女性最喜欢的十大电影排行
Out[45]:
genderFM
title  
Close Shave, A (1995)4.6444444.473795
Wrong Trousers, The (1993)4.5882354.478261
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)4.5726504.464589
Wallace & Gromit: The Best of Aardman Animation (1996)4.5631074.385075
Schindler's List (1993)4.5626024.491415
Shawshank Redemption, The (1994)4.5390754.560625
Grand Day Out, A (1992)4.5378794.293255
To Kill a Mockingbird (1962)4.5366674.372611
Creature Comforts (1990)4.5138894.272277
Usual Suspects, The (1995)4.5133174.518248
In [51]:
#找出男女分歧最大的电影
mean_ratings['diff']=mean_ratings['M']-mean_ratings['F']
sorted_by_diff=mean_ratings.sort_values(by='diff')     #sort_values排序
print(sorted_by_diff[:5])
print(sorted_by_diff[::-1][:5])
sorted_by_diff[-5:]           #男女差别较大且 男性更喜欢的电影
gender                            F         M      diff
title                                                  
Dirty Dancing (1987)       3.790378  2.959596 -0.830782
Jumpin' Jack Flash (1986)  3.254717  2.578358 -0.676359
Grease (1978)              3.975265  3.367041 -0.608224
Little Women (1994)        3.870588  3.321739 -0.548849
Steel Magnolias (1989)     3.901734  3.365957 -0.535777
gender                                         F         M      diff
title                                                               
Good, The Bad and The Ugly, The (1966)  3.494949  4.221300  0.726351
Kentucky Fried Movie, The (1977)        2.878788  3.555147  0.676359
Dumb & Dumber (1994)                    2.697987  3.336595  0.638608
Longest Day, The (1962)                 3.411765  4.031447  0.619682
Cable Guy, The (1996)                   2.250000  2.863787  0.613787
Out[51]:
genderFMdiff
title   
Cable Guy, The (1996)2.2500002.8637870.613787
Longest Day, The (1962)3.4117654.0314470.619682
Dumb & Dumber (1994)2.6979873.3365950.638608
Kentucky Fried Movie, The (1977)2.8787883.5551470.676359
Good, The Bad and The Ugly, The (1966)3.4949494.2213000.726351
In [57]:
#不考虑性别因素 只找出分歧最大的电影
rating_std_by_title=data.groupby('title')['rating'].std()   
print(rating_std_by_title[:5])       #每个电影打分的方差
rating_std_by_title=rating_std_by_title.loc[active_titles]
print(type(rating_std_by_title))
rating_std_by_title.sort_values(ascending=False)[:5]
title
$1,000,000 Duck (1971)           1.092563
'Night Mother (1986)             1.118636
'Til There Was You (1997)        1.020159
'burbs, The (1989)               1.107760
...And Justice for All (1979)    0.878110
Name: rating, dtype: float64
<class 'pandas.core.series.Series'>
Out[57]:
title
Dumb & Dumber (1994)                     1.321333
Blair Witch Project, The (1999)          1.316368
Natural Born Killers (1994)              1.307198
Tank Girl (1995)                         1.277695
Rocky Horror Picture Show, The (1975)    1.260177
Name: rating, dtype: float64
In [1]:
#全美婴儿姓名分析
names1880=pd.read_csv('yob1880.txt',names=['name','sex','births'])
names1880.head()
Out[1]:
 namesexbirths
0MaryF7065
1AnnaF2604
2EmmaF2003
3ElizabethF1939
4MinnieF1746
In [62]:
names1880.groupby('sex')['births'].sum()
Out[62]:
sex
F     90993
M    110493
Name: births, dtype: int64
In [2]:
years=range(1880,2011)
pieces=[]
columns=['names','sex','births']
for year in years:
    path='yob%d.txt' %year
    frame=pd.read_csv(path,names=columns)
    frame['year']=year
    pieces.append(frame)
#print(pieces)
names=pd.concat(pieces,ignore_index=True) 
In [78]:
names.head()
Out[78]:
 namessexbirthsyear
0MaryF70651880
1AnnaF26041880
2EmmaF20031880
3ElizabethF19391880
4MinnieF17461880
In [3]:
total_births=names.pivot_table('births',index='year',columns='sex',aggfunc=sum)
total_births.head()
Out[3]:
sexFM
year  
188090993110493
188191955100748
1882107851113687
1883112322104632
1884129021114445
In [87]:
import matplotlib.pyplot as plt
total_births.plot(title="Total births by sex and year")
plt.show()

In [29]:
def add_prop(group):   #名字占比
    births=group.births.astype(float)
    group['prop']=births/births.sum()
    return group
names=names.groupby(['year','sex']).apply(add_prop)
names.head()
Out[29]:
 namessexbirthsyearprop
0MaryF706518800.077643
1AnnaF260418800.028618
2EmmaF200318800.022013
3ElizabethF193918800.021309
4MinnieF174618800.019188
In [31]:
def get_top1000(group):    #获取每年各男女前1000的
    return group.sort_values(by='births',ascending=False)[:1000]
grouped=names.groupby(['year','sex'])
top1000=grouped.apply(get_top1000)
top1000[:10]
Out[31]:
   namessexbirthsyearprop
yearsex      
1880F0MaryF706518800.077643
1AnnaF260418800.028618
2EmmaF200318800.022013
3ElizabethF193918800.021309
4MinnieF174618800.019188
5MargaretF157818800.017342
6IdaF147218800.016177
7AliceF141418800.015540
8BerthaF132018800.014507
9SarahF128818800.014155
In [32]:
boys=top1000[top1000['sex']=='M']
girls=top1000[top1000['sex']=='F']
total_births=top1000.pivot_table('births',index='year',columns='names',aggfunc=sum)
total_births.head()   #每年叫这些名字的人数
D:\Anaconda3\lib\site-packages\pandas\core\reshape\pivot.py:135: FutureWarning: 'year' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
  grouped = data.groupby(keys)
Out[32]:
namesAadenAaliyahAaravAaronAarushAbAbagailAbbAbbeyAbbie...ZoaZoeZoeyZoieZolaZollieZonaZoraZulaZuri
year                     
1880NaNNaNNaN102.0NaNNaNNaNNaNNaN71.0...8.023.0NaNNaN7.0NaN8.028.027.0NaN
1881NaNNaNNaN94.0NaNNaNNaNNaNNaN81.0...NaN22.0NaNNaN10.0NaN9.021.027.0NaN
1882NaNNaNNaN85.0NaNNaNNaNNaNNaN80.0...8.025.0NaNNaN9.0NaN17.032.021.0NaN
1883NaNNaNNaN105.0NaNNaNNaNNaNNaN79.0...NaN23.0NaNNaN10.0NaN11.035.025.0NaN
1884NaNNaNNaN97.0NaNNaNNaNNaNNaN98.0...13.031.0NaNNaN14.06.08.058.027.0NaN

5 rows × 6868 columns

In [33]:
import matplotlib.pyplot as plt
subset=total_births[['John','Harry','Mary','Marilyn']]
subset.plot(subplots=True,figsize=(12,10),grid=True,title="Number of births per year")
plt.show()

In [39]:
table=top1000.pivot_table('prop',index='year',columns='sex',aggfunc=sum) #前1000名字占比
table.plot(title='Sum of table1000.prop by year and sex',yticks=np.linspace(0,1.2,13),xticks=range(1880,2020,10))
plt.show()   #每年前1000个名字占比逐渐减少 说明命名更加多样化
D:\Anaconda3\lib\site-packages\pandas\core\reshape\pivot.py:135: FutureWarning: 'year' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
  grouped = data.groupby(keys)
D:\Anaconda3\lib\site-packages\pandas\core\reshape\pivot.py:135: FutureWarning: 'sex' is both a column name and an index level.
Defaulting to column but this will raise an ambiguity error in a future version
  grouped = data.groupby(keys)

In [44]:
df=boys[boys['year']==2010]
df[:5]
df.info()
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1000 entries, (2010, M, 1676644) to (2010, M, 1677645)
Data columns (total 5 columns):
names     1000 non-null object
sex       1000 non-null object
births    1000 non-null int64
year      1000 non-null int64
prop      1000 non-null float64
dtypes: float64(1), int64(2), object(2)
memory usage: 2.0+ MB
In [52]:
#最后一个字母的变革
get_last_letter=lambda x:x[-1]
last_letters=names.names.map(get_last_letter)   #map匿名函数
last_letters.name='last_letter'
last_letters.head()
Out[52]:
0    y
1    a
2    a
3    h
4    e
Name: last_letter, dtype: object
In [55]:
table=names.pivot_table('births',index=last_letters,columns=['sex','year'],aggfunc=sum)
subtable=table.reindex(columns=[1910,1960,2010],level='year')
subtable.head()
Out[55]:
sexFM
year191019602010191019602010
last_letter      
a108376.0691247.0670605.0977.05204.028438.0
bNaN694.0450.0411.03912.038859.0
c5.049.0946.0482.015476.023125.0
d6750.03729.02607.022111.0262112.044398.0
e133569.0435013.0313833.028655.0178823.0129012.0
In [56]:
subtable.sum()   #男女各年总数
Out[56]:
sex  year
F    1910     396416.0
     1960    2022062.0
     2010    1759010.0
M    1910     194198.0
     1960    2132588.0
     2010    1898382.0
dtype: float64
In [59]:
letter_prop=subtable/subtable.sum().astype(float)
fig,axes=plt.subplots(2,1,figsize=(10,8))
letter_prop['M'].plot(kind='bar',rot=0,ax=axes[0],title='Male')
letter_prop['F'].plot(kind='bar',rot=0,ax=axes[1],title='Female',legend=False)
plt.show()
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值