Test Python notebook

Introductory examples

1.usa.gov data from bit.ly

In [1]:
%pwd
Out[1]:
u'C:\\Users\\Pomodori\\CodeDreams'
In [2]:
%cd ../book_scripts
[Error 2] The system cannot find the file specified: u'../book_scripts'
C:\Users\Pomodori\CodeDreams
In [3]:
path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'
In [4]:
open(path).readline()
---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-4-bcaecf00da5d> in <module>()
----> 1 open(path).readline()

IOError: [Errno 2] No such file or directory: 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'
In [ ]:
import json
path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'
records = [json.loads(line) for line in open(path)]
In [ ]:
records[0]
In [ ]:
records[0]['tz']
In [ ]:
print(records[0]['tz'])

Counting time zones in pure Python

In [ ]:
time_zones = [rec['tz'] for rec in records]
In [ ]:
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
In [ ]:
time_zones[:10]
In [ ]:
def get_counts(sequence):
    """Tally how many times each distinct item occurs in `sequence`.

    Pure-Python equivalent of collections.Counter, shown here before the
    stdlib version is introduced. Returns a plain dict of item -> count.
    """
    counts = {}
    for item in sequence:
        # dict.get with a default of 0 collapses the explicit if/else branch
        counts[item] = counts.get(item, 0) + 1
    return counts
In [ ]:
from collections import defaultdict

def get_counts2(sequence):
    """Tally occurrences of each item in `sequence` using defaultdict.

    defaultdict(int) initializes any missing key to 0, so no membership
    check is needed before incrementing. Returns a defaultdict of
    item -> count (compares equal to a plain dict).
    """
    tally = defaultdict(int)
    for item in sequence:
        tally[item] = tally[item] + 1
    return tally
In [ ]:
counts = get_counts(time_zones)
In [ ]:
counts['America/New_York']
In [ ]:
len(time_zones)
In [ ]:
def top_counts(count_dict, n=10):
    """Return the n highest-count (count, key) pairs, in ascending order.

    Pairs are (count, key) so the lexicographic sort orders primarily by
    count; slicing the tail keeps the n largest.
    """
    pairs = sorted((count, key) for key, count in count_dict.items())
    return pairs[-n:]
In [ ]:
top_counts(counts)
In [ ]:
from collections import Counter
In [ ]:
counts = Counter(time_zones)
In [ ]:
counts.most_common(10)

Counting time zones with pandas

In [ ]:
%matplotlib inline
In [ ]:
from __future__ import division
from numpy.random import randn
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4)
In [ ]:
import json
path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'
lines = open(path).readlines()
records = [json.loads(line) for line in lines]
In [ ]:
from pandas import DataFrame, Series
import pandas as pd

frame = DataFrame(records)
frame
In [ ]:
frame['tz'][:10]
In [ ]:
tz_counts = frame['tz'].value_counts()
tz_counts[:10]
In [ ]:
clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()
tz_counts[:10]
In [ ]:
plt.figure(figsize=(10, 4))
In [ ]:
tz_counts[:10].plot(kind='barh', rot=0)
In [ ]:
frame['a'][1]
In [ ]:
frame['a'][50]
In [ ]:
frame['a'][51]
In [ ]:
results = Series([x.split()[0] for x in frame.a.dropna()])
results[:5]
In [ ]:
results.value_counts()[:8]
In [ ]:
cframe = frame[frame.a.notnull()]
In [ ]:
operating_system = np.where(cframe['a'].str.contains('Windows'),
                            'Windows', 'Not Windows')
operating_system[:5]
In [ ]:
by_tz_os = cframe.groupby(['tz', operating_system])
In [ ]:
agg_counts = by_tz_os.size().unstack().fillna(0)
agg_counts[:10]
In [ ]:
# Use to sort in ascending order
indexer = agg_counts.sum(1).argsort()
indexer[:10]
In [ ]:
count_subset = agg_counts.take(indexer)[-10:]
count_subset
In [ ]:
plt.figure()
In [ ]:
count_subset.plot(kind='barh', stacked=True)
In [ ]:
plt.figure()
In [5]:
normed_subset = count_subset.div(count_subset.sum(1), axis=0)
normed_subset.plot(kind='barh', stacked=True)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-f3e1a1f283fc> in <module>()
----> 1 normed_subset = count_subset.div(count_subset.sum(1), axis=0)
      2 normed_subset.plot(kind='barh', stacked=True)

NameError: name 'count_subset' is not defined

MovieLens 1M data set

In [ ]:
import pandas as pd
encoding = 'latin1'

upath = os.path.expanduser('movielens/ml-1m/users.dat')
rpath = os.path.expanduser('movielens/ml-1m/ratings.dat')
mpath = os.path.expanduser('movielens/ml-1m/movies.dat')

unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
mnames = ['movie_id', 'title', 'genres']

users = pd.read_csv(upath, sep='::', header=None, names=unames, encoding=encoding)
ratings = pd.read_csv(rpath, sep='::', header=None, names=rnames, encoding=encoding)
movies = pd.read_csv(mpath, sep='::', header=None, names=mnames, encoding=encoding)
In [ ]:
users[:5]
In [ ]:
ratings[:5]
In [ ]:
movies[:5]
In [ ]:
ratings
In [ ]:
data = pd.merge(pd.merge(ratings, users), movies)
data
In [ ]:
data.ix[0]
In [ ]:
mean_ratings = data.pivot_table('rating', rows='title',
                                cols='gender', aggfunc='mean')
mean_ratings[:5]
In [ ]:
ratings_by_title = data.groupby('title').size()
In [ ]:
ratings_by_title[:5]
In [ ]:
active_titles = ratings_by_title.index[ratings_by_title >= 250]
In [ ]:
active_titles[:10]
In [ ]:
mean_ratings = mean_ratings.ix[active_titles]
mean_ratings
In [ ]:
mean_ratings = mean_ratings.rename(index={'Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)':
                           'Seven Samurai (Shichinin no samurai) (1954)'})
In [ ]:
top_female_ratings = mean_ratings.sort_index(by='F', ascending=False)
top_female_ratings[:10]

Measuring rating disagreement

In [ ]:
mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']
In [ ]:
sorted_by_diff = mean_ratings.sort_index(by='diff')
sorted_by_diff[:15]
In [ ]:
# Reverse order of rows, take first 15 rows
sorted_by_diff[::-1][:15]
In [ ]:
# Standard deviation of rating grouped by title
rating_std_by_title = data.groupby('title')['rating'].std()
# Filter down to active_titles
rating_std_by_title = rating_std_by_title.ix[active_titles]
# Order Series by value in descending order
rating_std_by_title.order(ascending=False)[:10]

US Baby Names 1880-2010

In [ ]:
from __future__ import division
from numpy.random import randn
import numpy as np
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(12, 5))
np.set_printoptions(precision=4)
%pwd
In [ ]:
!head -n 10 baby_names/names/yob1880.txt
In [ ]:
import pandas as pd
names1880 = pd.read_csv('baby_names/names/yob1880.txt', names=['name', 'sex', 'births'])
names1880
In [ ]:
names1880.groupby('sex').births.sum()
In [ ]:
# 2010 is the last available year right now
years = range(1880, 2011)

pieces = []
columns = ['name', 'sex', 'births']

for year in years:
    path = 'baby_names/names/yob%d.txt' % year
    frame = pd.read_csv(path, names=columns)

    frame['year'] = year
    pieces.append(frame)

# Concatenate everything into a single DataFrame
names = pd.concat(pieces, ignore_index=True)
In [ ]:
names = pd.read_pickle('baby_names/all_names')
In [ ]:
names
In [ ]:
total_births = names.pivot_table('births', rows='year',
                                 cols='sex', aggfunc=sum)
In [ ]:
total_births.tail()
In [ ]:
total_births.plot(title='Total births by sex and year')
In [ ]:
def add_prop(group):
    """Add a 'prop' column: each row's births as a fraction of the group total.

    Intended to be used with groupby(...).apply so proportions sum to 1
    within every (year, sex) group. The float cast guards against Python 2
    integer floor division, which would zero out every proportion.
    Mutates and returns `group`.
    """
    births = group['births'].astype(float)
    group['prop'] = births / births.sum()
    return group
names = names.groupby(['year', 'sex']).apply(add_prop)
In [ ]:
names
In [ ]:
np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)
In [ ]:
def get_top1000(group):
    """Return the 1,000 rows of `group` with the most births.

    Fix: the original called group.sort_index(by='births', ...), an API that
    was deprecated and later removed from pandas; sort_values(by=...) is the
    supported equivalent with identical results.
    """
    return group.sort_values(by='births', ascending=False)[:1000]
grouped = names.groupby(['year', 'sex'])
top1000 = grouped.apply(get_top1000)
In [ ]:
pieces = []
for year, group in names.groupby(['year', 'sex']):
    pieces.append(group.sort_index(by='births', ascending=False)[:1000])
top1000 = pd.concat(pieces, ignore_index=True)
In [ ]:
top1000.index = np.arange(len(top1000))
In [ ]:
top1000
In [ ]:
boys = top1000[top1000.sex == 'M']
girls = top1000[top1000.sex == 'F']
In [ ]:
total_births = top1000.pivot_table('births', rows='year', cols='name',
                                   aggfunc=sum)
total_births
In [ ]:
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]
subset.plot(subplots=True, figsize=(12, 10), grid=False,
            title="Number of births per year")
Measuring the increase in naming diversity
In [ ]:
plt.figure()
In [ ]:
table = top1000.pivot_table('prop', rows='year',
                            cols='sex', aggfunc=sum)
table.plot(title='Sum of table1000.prop by year and sex',
           yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))
In [ ]:
df = boys[boys.year == 2010]
df
In [ ]:
prop_cumsum = df.sort_index(by='prop', ascending=False).prop.cumsum()
prop_cumsum[:10]
In [ ]:
prop_cumsum.values.searchsorted(0.5)
In [ ]:
df = boys[boys.year == 1900]
in1900 = df.sort_index(by='prop', ascending=False).prop.cumsum()
in1900.values.searchsorted(0.5) + 1
In [ ]:
def get_quantile_count(group, q=0.5):
    """How many of the most popular names account for a cumulative prop of q.

    Sorts the group by 'prop' descending, then finds (via binary search on the
    running cumulative sum) how many rows are needed to reach quantile q.

    Fix: the original used group.sort_index(by='prop', ...), an API removed
    from pandas; sort_values(by=...) is the supported equivalent.
    """
    group = group.sort_values(by='prop', ascending=False)
    return group.prop.cumsum().values.searchsorted(q) + 1

diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)
diversity = diversity.unstack('sex')
In [ ]:
def get_quantile_count(group, q=0.5):
    """How many of the most popular names account for a cumulative prop of q.

    NOTE(review): this is a verbatim re-definition of the same function a few
    cells earlier in the notebook — the duplicate should be removed.

    Fix: the original used group.sort_index(by='prop', ...), an API removed
    from pandas; sort_values(by=...) is the supported equivalent.
    """
    group = group.sort_values(by='prop', ascending=False)
    return group.prop.cumsum().values.searchsorted(q) + 1
diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)
diversity = diversity.unstack('sex')
diversity.head()
In [ ]:
diversity.plot(title="Number of popular names in top 50%")
The "Last letter" Revolution
In [ ]:
# extract last letter from name column
get_last_letter = lambda x: x[-1]
last_letters = names.name.map(get_last_letter)
last_letters.name = 'last_letter'

table = names.pivot_table('births', rows=last_letters,
                          cols=['sex', 'year'], aggfunc=sum)
In [ ]:
subtable = table.reindex(columns=[1910, 1960, 2010], level='year')
subtable.head()
subtable.sum()
In [ ]:
letter_prop = subtable / subtable.sum().astype(float)
In [ ]:
import matplotlib.pyplot as plt

fig, axes = plt.subplots(2, 1, figsize=(10, 8))
letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')
letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female',
                      legend=False)
In [ ]:
plt.subplots_adjust(hspace=0.25)
In [ ]:
letter_prop = table / table.sum().astype(float)

dny_ts = letter_prop.ix[['d', 'n', 'y'], 'M'].T
dny_ts.head()
In [ ]:
plt.close('all')
In [ ]:
dny_ts.plot()
Boy names that became girl names (and vice versa)
In [ ]:
all_names = top1000.name.unique()
mask = np.array(['lesl' in x.lower() for x in all_names])
lesley_like = all_names[mask]
lesley_like
In [ ]:
filtered = top1000[top1000.name.isin(lesley_like)]
filtered.groupby('name').births.sum()
In [ ]:
table = filtered.pivot_table('births', rows='year',
                             cols='sex', aggfunc='sum')
table = table.div(table.sum(1), axis=0)
table.tail()
In [ ]:
plt.close('all')
In [ ]:
table.plot(style={'M': 'k-', 'F': 'k--'})
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值