图灵程序丛书 —《数据科学入门》— Ch9-Ch10 获取数据、数据工作

最新推荐文章于 2021-03-05 12:23:08 发布

NJYR21

最新推荐文章于 2021-03-05 12:23:08 发布

阅读量447

点赞数

分类专栏： TURING -数据科学入门文章标签： python 数据工作数据处理

本文链接：https://blog.csdn.net/NJYR21/article/details/79502799

版权

TURING -数据科学入门专栏收录该内容

6 篇文章 0 订阅

订阅专栏

Ch9-Ch10 获取数据、数据工作

此系列记录《数据科学入门》学习笔记（停止）

Chap 9 获取数据（只有9.2）

9.2 读取文件

9.2.1 文本文件基础

# 'r'意味只读
file_for_reading = open('reading_file.txt', 'r')

# 'r'意味写入，会破坏已存在的文件
file_for_writing = open('reading_file.txt', 'r')

# 'a'意味添加，加入文件的末尾
file_for_appending = open('appending_file.txt', 'a')

# 关闭文件
file_for_writing.close()

# 实际操作中会经常忘记关闭文件，通过在with程序块里操作，文件会在结尾处自动关闭
with open(filename, 'r') as f:
    data = function_that_gets_data_from(f)

# 可以通过for语句对文件的行进行迭代
starts_with_bash = 0

with open('input.txt', 'r') as f:
    for line in file:
        if re.match("^#", line):
            starts_with_bash += 1

# 提取邮箱@后面的部分，并计数
from collections import Counter

def get_domain(email_address):
    """split on '@' and return the last piece"""
    return email_address.lower().split("@")[-1]

with open('C://Users//nijie//Desktop//email_addresses.txt', 'r') as f:
    domain_counts = Counter(get_domain(line.strip())
                            for line in f
                            if "@" in line)

# dahAodSAaoho@163.com
# saZhoZfg@qq.com
# saZhEohfii@dahia.agiuh
# dagihfoai@gmail.edu.cn
# dahGZSiIHInkncd@126.com
# ahdahi@163.com
# aZsaChojhi@126.com
# haigZdaiygti@163.com

domain_counts
# Counter({'126.com': 2,
#          '163.com': 3,
#          'dahia.agiuh': 1,
#          'gmail.edu.cn': 1,
#          'qq.com': 1})

9.2.2 限制的文件

import csv

with open('C://Users//nijie//Desktop//tab_delimited_stock_prices.txt', 'r', encoding="utf-8") as f:
    reader = csv.reader(f, delimiter='\t')
    for row in reader:
        date = row[0]
        symbol = row[1]
        closing_price = float(row[2])
        print(date, symbol, closing_price)

# 6/20/2014 AAPL 90.91
# 6/20/2014 MSFT 41.68
# 6/20/2014 FB 64.5
# 6/19/2014 AAPL 91.86
# 6/19/20 14 MSFT 41.51
# 6/19/2014 FB 64.34

# 当文件存在表头，以字典的形式读入，第一行为字典
with open('C://Users//nijie//Desktop//colon_delimited_stock_prices.txt', 'r', encoding="utf-8") as f:
    reader = csv.DictReader(f, delimiter=':')
    for row in reader:
        date = row['date']
        symbol = row['symbol']
        closing_price = float(row['closing_price'])
        print(date, symbol, closing_price)

# 6/20/2014 AAPL 90.91
# 6/20/2014 MSFT 41.68
# 6/20/2014 FB 64.5
# 6/19/2014 AAPL 91.86
# 6/19/2014 MSFT 41.51
# 6/19/2014 FB 64.34

Chap 10 数据工作

10.1 探索你的数据

10.1.1 探索一维数据

import math
from collections import Counter
import matplotlib.pyplot as plt
from pylab import *

def bucketize(point, bucket_size):
    """floor the point to the next lower multyple of bucket_size"""
    return bucket_size * math.floor(point / bucket_size)

def make_histogram(points, bucket_size):
    """buckets the points and counts how many in each bucket"""
    return Counter(bucketize(point, bucket_size) for point in points)

def plot_histogram(points, bucket_size, title=""):
    mpl.rcParams['font.sans-serif'] = ['SimHei']
    mpl.rcParams['axes.unicode_minus'] = False
    histogram = make_histogram(points, bucket_size)
    plt.bar(histogram.keys(), histogram.values(), width=bucket_size)
    plt.title(title)
    plt.show()

import math

def normal_cdf(x, mu=0, sigma=1):
    return (1 + math.erf((x - mu) / math.sqrt(2) / sigma)) / 2

def inverse_normal_cdf(p, mu=0, sigma=1, tolerance=0.00001):
    # 如果非标准型，先调整单位使之服从标准型
    if mu != 0 or sigma != 1:
        return mu + sigma * inverse_normal_cdf(p, tolerance=tolerance)
    
    low_z, low_p = -10, 0   # normal_cdf(-10) -> 0
    hi_z, hi_p   = 10, 1    # normal_cdf(10) -> 1
    while hi_z - low_z > tolerance:
        mid_z = (low_z + hi_z) / 2   # midpoint
        mid_p = normal_cdf(mid_z)    # 考虑cdf在那里的值
        if mid_p < p:
            # midpoint仍然太低，搜索比他大的值
            low_z, low_p = mid_z, mid_p
        elif mid_p > p:
            # midpoint仍然太高，搜索比他小的值
            hi_z, hi_p = mid_z, mid_p
        else:
            break
            
    return mid_z

import random

random.seed(0)
# -100到100之间均匀抽取
uniform = [200 * random.random() - 100 for _ in range(10000)]
# 均值为0标准差为57的正态分布
normal = [57 * inverse_normal_cdf(random.random()) for _ in range(10000)]

plot_histogram(uniform, 10, '均匀分布的直方图')
plot_histogram(normal, 10, '正态分布的直方图')

10.1.2 二维数据

def random_normal():
    """returns a random draw from a standard normal distribution"""
    return inverse_normal_cdf(random.random())

xs = [random_normal() for _ in range(1000)]
ys1 = [ x + random_normal() / 2 for x in xs]
ys2 = [-x + random_normal() / 2 for x in xs]

# ys1和ys2的分布相同
plot_histogram(ys1, 10)
plot_histogram(ys2, 10)

plt.scatter(xs, ys1, marker='+', color='k', label='ys1')
plt.scatter(xs, ys2, marker='*', color='g', label='ys2')
plt.xlabel('xs')
plt.ylabel('ys')
plt.legend(loc=9)
plt.title('差别很大的联合分布')
plt.show()

import numpy as np
print(np.corrcoef(xs, ys1)[0, 1])
print(np.corrcoef(xs, ys2)[0, 1])
# 0.9010493686379657
# -0.8920981526879997

10.1.3 多维数据

# 相关系数矩阵

def correlation(x, y):
    cov = sum((x_i - mean(x)) * (y_i - mean(y)) for x_i, y_i in zip(x, y)) / (len(x) - 1)
    disx = sum((x_i - mean(x)) ** 2 for x_i in x) / (len(x) - 1)
    disy = sum((y_i - mean(y)) ** 2 for y_i in y) / (len(x) - 1)
    return sum(cov / np.sqrt(disx) / np.sqrt(disy))

def get_column(x, j):
    return x[:, j]

def make_matrix(num_rows, num_cols, entry_fn):
    return [[entry_fn(i, j) for j in range(num_cols)] for i in range(num_rows)]

def correlation_matrix(data):
    """returns the num_columns x num_columns matrix whose (i, j)th entry is the corralation between column i and j data"""
    _, num_columns = shape(data)
    
    def matrix_entry(i, j):
        return correlation(get_column(data, i), get_column(data, j))
    
    return make_matrix(num_columns, num_columns, matrix_entry)

data = np.vstack((list(range(10)), list(range(10, 20))))
print(data)
print(correlation_matrix(data.T))
print(np.corrcoef(data))
# [[ 0  1  2  3  4  5  6  7  8  9]
#  [10 11 12 13 14 15 16 17 18 19]]
# [[0.9999999999999999, 0.9999999999999999], [0.9999999999999999, 0.9999999999999999]]
# [[1. 1.]
# [1. 1.]]

# 散点图阵
x = range(50)
x1 = [random.gauss(mu=5 + i, sigma=1) for i in range(50)]
x2 = [random.uniform(a=1 + i ** 2, b=10) for i in range(50)]
x3 = [random.gauss(mu=5 - i, sigma=1) for i in range(50)]
x4 = [random.uniform(a=5 + 2 * i, b=10) for i in range(50)]
data = (np.vstack((x1, x2, x3, x4))).T

import matplotlib.pyplot as plt

_, num_columns = shape(data)
fig, ax = plt.subplots(num_columns, num_columns)

for i in range(num_columns):
    for j in range(num_columns):
        
        # x轴上column_j对y轴上column_i的散点
        if i != j: ax[i][j].scatter(get_column(data, j), get_column(data, i))
            
        # 只有当i == j时显示序列名
        else: ax[i][j].annotate("series" + str(i), (0.5, 0.5), xycoords='axes fraction', ha="center", va='center')
            
        # 除了图的左侧和下方之外，隐藏图的标记
        if i < num_columns - 1: ax[i][j].xaxis.set_visible(False)
        if j > 0: ax[i][j].yaxis.set_visible(False)
            
# 修复右下方和左下方的图标记
# 因为只有他们是文本，是错误的
ax[-1][-1].set_xlim(ax[0][-1].get_xlim())
ax[0][0].set_ylim(ax[0][1].get_ylim())
plt.show()

10.2 清理与修改

from dateutil.parser import parse

# def parse_row(input_row, parses):
#     """given a list of parsers (some of which may be None) apply the appropriate one to each element of the input_row"""
#     return [parser(value) if parser is not None else value for value, parser in zip(input_now, parsers)]

def parse_rows_with(reader, parsers):
    """wap a redder to apply the parsers to each of its rows"""
    for row in reader:
        yield parse_row(row, parsers)
        
def try_or_none(f):
    """wraps f to return None if f raises an exception assumes f takes only one inpot"""
    def f_or_none(x):
        try: return f(x)
        except: return None
    return f_or_none

def parse_row(input_row, parsers):
    return [try_or_none(parser)(value) if parser is not None else value for value, parser in zip(input_row, parsers)]

# date symbol closing_price
# 6/20/2014 AAPL 90.91
# 6/20/2014 MSFT 41.68
6/20/3014 FB 64.5
6/19/2014 AAPL 91.86
6/19/2014 MSFT n/a
6/19/2014 FB 64.34

import csv
import dateutil.parser

data = []

with open('C://Users//nijie//Desktop//comma_delimited_stock_prices.csv', 'r', encoding='utf-8') as f:
    reader = csv.reader(f)
    for line in parse_rows_with(reader, [dateutil.parser.parse, None, float]):
        data.append(line)
        
for row in data:
    if any(x is None for x in row):
        print(row)
# [None, 'symbol', None]
# [datetime.datetime(2014, 6, 19, 0, 0), 'MSFT', None]

10.3 数据处理

# 数据是瞎打的
import datetime

data = [{'closing_price': 102.06,
         'date': datetime.datetime(2014, 8, 29, 0, 0),
         'symbol': 'AAPL'},
        {'closing_price': 156.59,
         'date': datetime.datetime(2014, 8, 29, 0, 0),
         'symbol': 'MSFT'},
        {'closing_price': 72.91,
         'date': datetime.datetime(2014, 8, 29, 0, 0),
         'symbol': 'FB'},
         {'closing_price': 105.66,
         'date': datetime.datetime(2014, 8, 30, 0, 0),
         'symbol': 'AAPL'},
        {'closing_price': 145.17,
         'date': datetime.datetime(2014, 8, 30, 0, 0),
         'symbol': 'MSFT'},
        {'closing_price': 80.43,
         'date': datetime.datetime(2014, 8, 30, 0, 0),
         'symbol': 'FB'},
         {'closing_price': 101.15,
         'date': datetime.datetime(2014, 8, 31, 0, 0),
         'symbol': 'AAPL'},
        {'closing_price': 146.32,
         'date': datetime.datetime(2014, 8, 31, 0, 0),
         'symbol': 'MSFT'},
        {'closing_price': 63.87,
         'date': datetime.datetime(2014, 8, 31, 0, 0),
         'symbol': 'FB'}
        ]

# 求AAPL最高收盘价
max_aapl_price = max(row['closing_price'] for row in data if row['symbol'] == 'AAPL')
max_aapl_price
# 105.66

# 按照股票代码分组,输出每个股票的最高收盘价
from collections import defaultdict

by_symbol = defaultdict(list)
for row in data:
    by_symbol[row['symbol']].append(row)

# 使用字典解析找到每个股票代码的最大值
max_price_by_symbol = {symbol: max(row['closing_price'] for row in grouped_rows)
                       for symbol, grouped_rows in by_symbol.items()}
max_price_by_symbol
# {'AAPL': 105.66, 'FB': 80.43, 'MSFT': 156.59}

def picker(field_name):
    """returns a function that picks field out of a dict"""
    return lambda row: row[field_name]

def pluck(field_name, rows):
    """turn a list of dicts into the list of field_name values"""
    return map(picker(field_name), rows)

def group_by(grouper, rows, value_transform=None):
    # 键是分组情况的输出，值是行的列表
    grouped = defaultdict(list)
    for row in rows:
        grouped[grouper(row)].append(row)
        
    if value_transform is None:
        return grouped
    else:
        return {key: value_transform(rows) for key, rows in grouped.items()}
    
max_price_by_symbol = group_by(picker('symbol'), data, lambda rows: max(pluck('closing_price', rows)))
max_price_by_symbol
# {'AAPL': 105.66, 'FB': 80.43, 'MSFT': 156.59}

# 计算单日百分比
def percent_price_change(yesterday, today):
    return today['closing_price'] / yesterday['closing_price'] - 1

def day_over_day_changes(grouped_rows):
    # 按日期对行进行排序
    ordered = sorted(grouped_rows, key=picker('date'))
    # ordered = sorted(grouped_rows, key=lambda x: (picker('symbol')(x), picker('date')(x)))
    
    # 对偏移量应用zip函数得到连续两天的成对表示
    return [{'symbol': today['symbol'],
             'date': today['date'],
             'change': percent_price_change(yesterday, today)}
            for yesterday, today in zip(ordered, ordered[1:])]

# 键是股票代码，值是一个‘change’字典列表
changes_by_symbol = group_by(picker('symbol'), data, day_over_day_changes)

#收集所有‘change’字典放入一个大列表中,字典套字典，需要两个for
all_changes = [change 
               for changes in changes_by_symbol.values() 
               for change in changes]

display(all_changes)
print(max(all_changes, key=picker('change')))
print(min(all_changes, key=picker('change')))
# 以下为输出结果
[{'change': 0.03527336860670194,
  'date': datetime.datetime(2014, 8, 30, 0, 0),
  'symbol': 'AAPL'},
 {'change': -0.04268408101457499,
  'date': datetime.datetime(2014, 8, 31, 0, 0),
  'symbol': 'AAPL'},
 {'change': -0.07292930583051294,
  'date': datetime.datetime(2014, 8, 30, 0, 0),
  'symbol': 'MSFT'},
 {'change': 0.007921746917407324,
  'date': datetime.datetime(2014, 8, 31, 0, 0),
  'symbol': 'MSFT'},
 {'change': 0.10314085859278577,
  'date': datetime.datetime(2014, 8, 30, 0, 0),
  'symbol': 'FB'},
 {'change': -0.2058933233867961,
  'date': datetime.datetime(2014, 8, 31, 0, 0),
  'symbol': 'FB'}]
{'symbol': 'FB', 'date': datetime.datetime(2014, 8, 30, 0, 0), 'change': 0.10314085859278577}
{'symbol': 'FB', 'date': datetime.datetime(2014, 8, 31, 0, 0), 'change': -0.2058933233867961}

# 为了组合百分比的变化，我们对每一项加1，把他们相乘再减去1,
# 例如： (1 + 10%) * (1 - 20%) - 1 = 1.1 * 0.8 - 1 = -12%

def combine_pct_changes(pct_change1, pct_change2):
    return (1 + pct_change1) * (1 + pct_change2) - 1

def overall_change(changes):
    return reduce(combine_pct_changes, pluck('change', changes))

overall_change_by_month = group_by(lambda row: row['date'].month, all_changes, overall_change)

10.3 数据调整

<table>
  <tr>
    <th width=25%, bgcolor = lightblue> 观测对象</th>
    <th width=25%, bgcolor = lightblue> 身高（英寸）</th>
    <th width=25%, bgcolor = lightblue> 身高（厘米）</th>
    <th width=25%, bgcolor = lightblue> 体重（磅）</th>
  </tr>
  <tr>
    <td bgcolor=lightgrey> A </td>
    <td> 63 </td>
    <td> 160  </td>
    <td> 150  </td>
  </tr>
  <tr>
    <td bgcolor=lightgrey> B </td>
    <td> 67 </td>
    <td> 170.2 </td>
    <td> 160  </td>
  </tr>
  <tr>
    <td bgcolor=lightgrey> C </td>
    <td> 70 </td>
    <td> 177.8  </td>
    <td> 171  </td>
  <tr>
</table>

import math

def distance(v, w):
    return math.sqrt(sum((v_i - w_i) ** 2 for v_i, w_i in zip(v, w)))

# 英寸为单位
a_to_b = distance([63, 150], [67, 160]);print(a_to_b)
a_to_c = distance([63, 150], [70, 171]);print(a_to_c)
b_to_c = distance([67, 160], [70, 171]);print(b_to_c)

print('----------------------------------------------')

# 厘米为单位
a_to_b = distance([160, 150], [170.2, 160]);print(a_to_b)
a_to_c = distance([160, 150], [177.8, 171]);print(a_to_c)
b_to_c = distance([170.2, 160], [177.8, 171]);print(b_to_c)

# 10.770329614269007
# 22.135943621178654
# 11.40175425099138
----------------------------------------------
# 14.284257068535268
# 27.52889391167034
# 13.370115930686627

10.5 降维（只有书上所有代码）

# 标准化
def get_column(data, j):
    return data[:, j]

def standard_deviation(v):
    n = len(v)
    v_bar = mean(v)
    return math.sqrt(sum((v_i - v_bar) ** 2) / (n - 1))

def make_matrix(num_rows, num_cols, entry_fn):
    return [[entry_fn(i, j) for j in range(num_cols)] for i in range(num_rows)]

def scale(data_matrix):
    """returns the means and standar deviations of each column"""
    num_rows, nun_cols = shape(data_matrix)
    means = [mean(get_column(data_matrix, j)) for j in range(num_cols)]
    stdevs = [standard_deviation(get_column(data_matrix, j)) for j in range(num_cols)]
    return mans, stdevs

def rescale(data_matrix):
    """rescales the input data so that each column has mean 0 and standard deviation 1 leaves alone columns with no deviation"""
    means, stdevs = scale(data_matrix)
    
    def rescaled(i, j):
        if stdevs[j] > 0:
            return (data_matrix[i, j] - means[j]) / stedvs[j]
# 将数据均值化为零

def de_mean_matrix(A):
    """returns the result of subtracting from every value in A the mean value of its column. 
       the resulting matrix has mean theta in every column"""
    nr, nc = shape(A)
    columns_means, _ = scale(A)
    return make_matrix(nr, nc, lambda i, j: A[i][j] - columns_mean[j])


import math

def magnitude(w):
    return math.sqrt(sum(w_i ** 2 for w_i in w))

def direction(w):
    mag = magnitude(w)
    return [w_i / mag for w_i in w]

def dot(v, w):
    return sum(v_i * w_i for v_i, w_i in zip(v, w))

def directional_variance_i(x_i, w):
    """the variance of the row x_i in the direction determined by w"""
    return dot(x_i, direction(w)) ** 2

def directional_variance(x, w):
    """the variance of the data in the direction determined w"""
    return sun(directional_variance_i(x_i, w) for x_i in x)

# 通过梯度函数可以找出使方差最大的方向
def directional_variance_gradient_i(x_i, w):
    """the contribution of row_x_i to the gradient of the direction_w variance"""
    projection_length = dot(x_i, direction(w))
    return [2 * projection_length * x_ij for x_ij in x_i]

def directional_variance_gradient(x, w):
    return vector_sum(directional_variance_gradient_i(x_i, w) for x_i in x)


        else:
            return data_matrix[i, j]
        
    num_rows, num_cols = shape(data_matrix)
    return make_matrix(num_rows, num_cols, rescaled)

def minimize_batch(target_fn, gradient_fn, theta_0, tolerance=0.000001):
    """use gradient descent to find theta that minimizes target fuction"""
    
    step_sizes = [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]
    
    theta = theta_0                     # 设定theta为初始值
    target_fn = safe(target_fn)         # target_fn的安全版
    value = target_fn(theta)            # 试图最小化的值
    
    while True:              # while true为死循环，需要用break来退出
        gradient = gradient_fn(theta)
        next_thetas = [step(theta, gradient, -step_size) for step_size in step_sizes]
    
    # 选择一个使残差函数最小的值
    next_theta = min(next_thetas, key=target_fn)
    next_value = target_fn(next_theta)
    
    # 当‘收敛’时停止
    if abs(value - next_value) < tolerance:
        return theta
    else:
        theta, value = next_theta, next_value
        
def negate(f):
    """return a function that for any input x returns -f(x)"""
    return lambda *args, **kwargs: -f(*args, **kwargs)

def negate_all(f):
    """the same when f returns a list of numbers"""
    return lambda *args, **kwargs: [-y for y in f(*args, **kwargs)]

def maximize_batch(target_fn, gradient_fn, theta-0, tolerance=0.000001):
    return minimize_batch(negate(target_fn), negate_all(gradient_fn), theta_0)

# 使directional_variance最大的方向就是第一主成分
from functools import partial

def first_princinple_component(x):
    guess = [1 for _ in x[0]]
    unscaled_maximizer = maximize_batch(
    partial(directional_variance, x),
    partial(directional_variance_gradient, x). guess)
    return direction(unscaled_maximizer)

def project(v, w):
    projection_length = dot(v, w)
    return scalar_multiply(projection_length, w)

def remove_projection_from_vector(v, w):
    return vector_substract(v, project(v, w))

def remove_projection(x, w):
    return [remove_projection_from_vector(x_i, w) for x_i in x]

def principal_component_analysis(x, num_components):
    components = []
    for _ in range(num_components):
        component = first_principal_component(x)
        components.append(component)
        x = remove_projection(x, component)
        
    return components

def transform_vector(v, components):
    return [dot(v, w) for w in components]

def transform(x, components):
    return [transform_vector(x_i, components) for x_i in x]

在学习过程中，我发现自己不太适合这本书的方法与模式，所以这本书的学习以及博客更新到此结束。

以上是Ch9、Ch10的相关内容

2018.03.10 YR