小白学python-数据清洗

数据清洗:

赔率、公路堵车模型的概念及应用

主成分分析PCA:新的特征组合

车辆数据描述:one-hot编码会使特征值大量增加(维度变高视情况而定)

Logistic回归:AUC:ROC曲线下的面积

求取素数以及赔率的代码:

import operator
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from time import time
import math


def is_prime(x):
    """Return True if ``x`` is a prime number, using trial division.

    Values below 2 (0, 1 and negative numbers) are never prime and yield
    False instead of being reported as prime or raising from ``math.sqrt``.

    :param x: integer to test.
    :return: True when ``x`` has no divisor in ``[2, floor(sqrt(x))]``.
    """
    if x < 2:
        return False
    # A composite number always has a divisor no larger than its square root;
    # all() stops at the first divisor found instead of building a full list.
    return all(x % d for d in range(2, int(math.sqrt(x)) + 1))


def is_prime3(x):
    """Test ``x`` against the primes collected so far in ``p_list2``.

    Relies on the module-level list ``p_list2``. Each stored value is tried
    as a divisor until one exceeds ``sqrt(x)``. When no stored value divides
    ``x``, ``x`` is appended to ``p_list2`` and True is returned; otherwise
    the list is left untouched and False is returned.

    :param x: integer to test.
    :return: True if no value in ``p_list2`` up to ``sqrt(x)`` divides ``x``.
    """
    for known in p_list2:
        if known > math.sqrt(x):
            # No divisor can exist beyond the square root.
            break
        if x % known == 0:
            return False
    p_list2.append(x)
    return True


if __name__ == "__main__":
    # Script: time several ways of listing the primes below ``b``, then
    # estimate the probability (and fair odds) of drawing a prime from [a, b].
    a = 2
    b = 1000000

    # # Method 1: direct computation with a nested comprehension
    # t = time()
    # p = [p for p in range(a, b) if 0 not in [p % d for d in range(2, int(math.sqrt(p)) + 1)]]
    # print(time() - t)
    # print(p)

    # # Method 2: filter with a named predicate
    # t = time()
    # p = list(filter(is_prime, range(a, b)))
    # print(time() - t)
    # print(p)
    #
    # # Method 3: filter with a lambda
    # t = time()
    # is_prime2 = (lambda x: 0 not in [x % i for i in range(2, int(math.sqrt(x)) + 1)])
    # p = list(filter(is_prime2, range(a, b)))
    # print(time() - t)
    # print(p)
    #
    # Method 4: explicit loop that divides only by the primes found so far
    t = time()
    p_list = []
    for i in range(2, b):
        flag = True
        for p in p_list:
            if p > math.sqrt(i):
                break
            if i % p == 0:
                flag = False
                break
        if flag:
            p_list.append(i)
    print(time() - t)
    print(p_list)

    # Method 5: the same idea through a predicate and filter.
    # filter() is lazy in Python 3, so it must be consumed with list();
    # otherwise is_prime3 never runs, p_list2 stays empty and the timing
    # measures nothing.
    p_list2 = []
    t = time()
    list(filter(is_prime3, range(2, b)))
    print(time() - t)
    print(p_list2)

    print('---------------------')
    # Interval under study (an alternative that was tried: a, b = 1180, 1230).
    a = 1600
    b = 1700
    p_list2 = []
    # Materialise the filter result so numpy builds a 1-D integer array
    # rather than a 0-d object array wrapping the iterator.
    p = np.array(list(filter(is_prime3, range(2, b+1))))
    p = p[p >= a]
    print(p)
    p_rate = float(len(p)) / float(b-a+1)
    print('素数的概率:', p_rate, '\t')
    print('公正赔率:', 1/p_rate)
    print('合数的概率:', 1-p_rate, '\t')
    print('公正赔率:', 1 / (1-p_rate))

NagelSchreckenberg公路堵车模型代码:

#!/usr/bin/python
# -*- coding:utf-8 -*-

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt


def clip(x, path):
    """Wrap every entry of ``x`` that is at or beyond ``path`` back in place.

    Entries greater than or equal to ``path`` are replaced by their remainder
    modulo ``path``; smaller entries are left unchanged. ``x`` is modified in
    place and nothing is returned.

    :param x: mutable indexable sequence of positions.
    :param path: length used as the modulus.
    """
    for idx, position in enumerate(x):
        if position >= path:
            x[idx] = position % path


if __name__ == "__main__":
    # Script: simulate cars on a ring road and scatter-plot every car's
    # position at every time step.
    mpl.rcParams['font.sans-serif'] = [u'SimHei']
    mpl.rcParams['axes.unicode_minus'] = False

    path = 5000     # length of the ring road
    n = 100         # number of cars on the road
    v0 = 50          # initial speed of every car
    p = 0.15         # probability of a random slow-down
    Times = 3000     # number of simulated time steps

    np.random.seed(0)
    x = np.random.rand(n) * path
    x.sort()
    # The ``np.float`` alias was removed in NumPy 1.24; the builtin ``float``
    # selects the same float64 dtype on every NumPy version.
    v = np.tile([v0], n).astype(float)

    plt.figure(figsize=(10, 8), facecolor='w')
    for t in range(Times):
        plt.scatter(x, [t]*n, s=1, c='k', alpha=0.05)
        for i in range(n):
            if x[(i+1)%n] > x[i]:
                d = x[(i+1) % n] - x[i]   # gap to the car ahead
            else:
                # The car ahead has already wrapped past the end of the ring.
                d = path - x[i] + x[(i+1) % n]
            if v[i] < d:
                # Room ahead: speed up, or slow down with probability p.
                if np.random.rand() > p:
                    v[i] += 1
                else:
                    v[i] -= 1
            else:
                # Too close: drop the speed to just under the gap.
                v[i] = d - 1
        v = v.clip(0, 150)
        x += v
        clip(x, path)
    plt.xlim(0, path)
    plt.ylim(0, Times)
    plt.xlabel(u'车辆位置', fontsize=16)
    plt.ylabel(u'模拟时间', fontsize=16)
    plt.title(u'环形公路车辆堵车模拟', fontsize=20)
    plt.tight_layout(pad=2)
    plt.show()

模糊查询与替换:

#!/usr/bin/python
# -*- encoding: utf-8

import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process


def enum_row(row):
    """Print the ``'state'`` field of ``row``.

    :param row: mapping-like record that has a ``'state'`` key.
    """
    state_value = row['state']
    print(state_value)


def find_state_code(row):
    """Print the best fuzzy match for the row's state among ``states``.

    Rows whose ``'state'`` equals 0 are skipped. For every other row the
    result of ``process.extractOne`` (called with ``score_cutoff=80``) is
    printed. Relies on the module-level ``states`` collection.

    :param row: mapping-like record that has a ``'state'`` key.
    """
    raw_state = row['state']
    if raw_state == 0:
        return
    best_match = process.extractOne(raw_state, states, score_cutoff=80)
    print(best_match)


def capital(str):
    """Return ``str`` with its first character upper-cased and the rest lower-cased.

    :param str: the text to convert.
    :return: the result of ``str.capitalize()``.
    """
    capitalized = str.capitalize()
    return capitalized


def correct_state(row):
    """Return a normalised state name for ``row``.

    The row's ``'state'`` value is fuzzy-matched against the module-level
    ``states`` with ``process.extractOne`` (``score_cutoff=80``). When a match
    is returned, its name is split on single spaces, each word is
    capitalised, and the words are re-joined. A state equal to 0, or one with
    no match, is returned unchanged.

    :param row: mapping-like record that has a ``'state'`` key.
    :return: the corrected state name, or the original value.
    """
    original = row['state']
    if original == 0:
        return original
    match = process.extractOne(original, states, score_cutoff=80)
    if not match:
        return original
    words = match[0].split(' ')
    return ' '.join(word.capitalize() for word in words)


def fill_state_code(row):
    """Return the code mapped to the row's fuzzy-matched state.

    The row's ``'state'`` value is matched against the module-level ``states``
    with ``process.extractOne`` (``score_cutoff=80``); the matched name is then
    looked up in the module-level ``state_to_code`` mapping. A state equal to
    0, or one with no match, yields an empty string.

    :param row: mapping-like record that has a ``'state'`` key.
    :return: the mapped code, or ``''``.
    """
    raw_state = row['state']
    if raw_state == 0:
        return ''
    match = process.extractOne(raw_state, states, score_cutoff=80)
    return state_to_code[match[0]] if match else ''


if __name__ == "__main__":
    # Script: load a sales sheet, append a totals row, correct misspelled
    # state names by fuzzy matching, attach state codes, aggregate per code
    # and write the result back to Excel.
    pd.set_option('display.width', 200)
    # ``sheetname`` was renamed to ``sheet_name`` in pandas 0.21 and the old
    # keyword has since been removed.
    data = pd.read_excel('sales.xlsx', sheet_name='sheet1', header=0)
    print('data.head() = \n', data.head())
    print('data.tail() = \n', data.tail())
    print('data.dtypes = \n', data.dtypes)
    print('data.columns = \n', data.columns)
    for c in data.columns:
        print(c)
    # A bare ``print`` is a no-op expression in Python 3; call it to emit the
    # intended blank line.
    print()
    data['total'] = data['Jan'] + data['Feb'] + data['Mar']
    print(data.head())
    print(data['Jan'].sum())
    print(data['Jan'].min())
    print(data['Jan'].max())
    print(data['Jan'].mean())

    print('=============')
    # Append a totals row, shown step by step first.
    s1 = data[['Jan', 'Feb', 'Mar', 'total']].sum()
    print(s1)
    s2 = pd.DataFrame(data=s1)
    print(s2)
    print(s2.T)
    print(s2.T.reindex(columns=data.columns))
    # The same thing in compact form; columns that were not summed get 0.
    s = pd.DataFrame(data=data[['Jan', 'Feb', 'Mar', 'total']].sum()).T
    s = s.reindex(columns=data.columns, fill_value=0)
    print(s)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # equivalent that works on old and new versions alike.
    data = pd.concat([data, s], ignore_index=True)
    # Label the row just appended (the last one) instead of assuming the
    # sheet holds exactly 15 data rows.
    data = data.rename(index={data.index[-1]: 'Total'})
    print(data.tail())

    # Using apply
    print('==============apply的使用==========')
    data.apply(enum_row, axis=1)

    state_to_code = {"VERMONT": "VT", "GEORGIA": "GA", "IOWA": "IA", "Armed Forces Pacific": "AP", "GUAM": "GU",
                     "KANSAS": "KS", "FLORIDA": "FL", "AMERICAN SAMOA": "AS", "NORTH CAROLINA": "NC", "HAWAII": "HI",
                     "NEW YORK": "NY", "CALIFORNIA": "CA", "ALABAMA": "AL", "IDAHO": "ID",
                     "FEDERATED STATES OF MICRONESIA": "FM",
                     "Armed Forces Americas": "AA", "DELAWARE": "DE", "ALASKA": "AK", "ILLINOIS": "IL",
                     "Armed Forces Africa": "AE", "SOUTH DAKOTA": "SD", "CONNECTICUT": "CT", "MONTANA": "MT",
                     "MASSACHUSETTS": "MA",
                     "PUERTO RICO": "PR", "Armed Forces Canada": "AE", "NEW HAMPSHIRE": "NH", "MARYLAND": "MD",
                     "NEW MEXICO": "NM",
                     "MISSISSIPPI": "MS", "TENNESSEE": "TN", "PALAU": "PW", "COLORADO": "CO",
                     "Armed Forces Middle East": "AE",
                     "NEW JERSEY": "NJ", "UTAH": "UT", "MICHIGAN": "MI", "WEST VIRGINIA": "WV", "WASHINGTON": "WA",
                     "MINNESOTA": "MN", "OREGON": "OR", "VIRGINIA": "VA", "VIRGIN ISLANDS": "VI",
                     "MARSHALL ISLANDS": "MH",
                     "WYOMING": "WY", "OHIO": "OH", "SOUTH CAROLINA": "SC", "INDIANA": "IN", "NEVADA": "NV",
                     "LOUISIANA": "LA",
                     "NORTHERN MARIANA ISLANDS": "MP", "NEBRASKA": "NE", "ARIZONA": "AZ", "WISCONSIN": "WI",
                     "NORTH DAKOTA": "ND",
                     "Armed Forces Europe": "AE", "PENNSYLVANIA": "PA", "OKLAHOMA": "OK", "KENTUCKY": "KY",
                     "RHODE ISLAND": "RI",
                     "DISTRICT OF COLUMBIA": "DC", "ARKANSAS": "AR", "MISSOURI": "MO", "TEXAS": "TX", "MAINE": "ME"}
    # A concrete list of the state names, used as fuzzy-matching candidates.
    states = list(state_to_code)
    print(fuzz.ratio('Python Package', 'PythonPackage'))
    print(process.extract('Mississippi', states))
    print(process.extract('Mississipi', states, limit=1))
    print(process.extractOne('Mississipi', states))
    data.apply(find_state_code, axis=1)

    print('Before Correct State:\n', data['state'])
    data['state'] = data.apply(correct_state, axis=1)
    print('After Correct State:\n', data['state'])
    data.insert(5, 'State Code', np.nan)
    data['State Code'] = data.apply(fill_state_code, axis=1)
    print(data)

    # group by
    print('==============group by================')
    print(data.groupby('State Code'))
    print('All Columns:\n')
    print(data.groupby('State Code').sum())
    print('Short Columns:\n')
    print(data[['State Code', 'Jan', 'Feb', 'Mar', 'total']].groupby('State Code').sum())

    # Write the result to a file.
    # NOTE(review): recent pandas releases no longer ship a writer for the
    # legacy ``.xls`` format; confirm the installed version supports it or
    # switch the target to ``.xlsx``.
    data.to_excel('sales_result.xls', sheet_name='Sheet1', index=False)

回归:

线性回归:

1、高斯分布

2、最大似然估计MLE

3、最小二乘法的本质

Logistic回归:

分类问题的首选算法

多分类:Softmax回归

目标函数

技术点:

1、梯度下降算法

2、最大似然估计

3、特征选择

Python 对于初学者来说是一个很好的选择,因为它有着简洁明了的语法和丰富的资源。以下是一个适合 Python 小白的学习计划: 1. **基础知识**: - 学习基础语法:变量、数据类型(整型、浮点型、字符串等)、运算符、控制流(条件语句、循环)。 - Python 基本结构:函数定义和调用,模块和包的理解。 2. **文本处理与文件操作**: - 使用 `open()` 函数读写文件,掌握基本的文件操作模式 (`r`, `w`, `a` 等)。 - 正则表达式(re模块)的学习,用于文本处理和搜索替换。 3. **数据分析入门**: - 安装并熟悉 NumPy 和 Pandas 库,用于数据处理和分析。 - 初步了解列表推导式、Pandas DataFrame 结构以及数据清洗。 4. **函数编程**: - 学习高阶函数、闭包和装饰器等概念,理解如何利用它们提高代码复用性和可读性。 5. **面向对象编程**: - 掌握类和对象的概念,学会封装、继承和多态的基本使用。 - 学会使用 Python 内置的 OOP 工具如 `super()` 和魔术方法 (`__init__`, `__str__`, `__del__` 等)。 6. **Python Web框架** (选修): - 如果对 Web 开发感兴趣,可以选择 Flask 或 Django 中的一个作为入门,了解 HTTP 请求响应、路由和视图函数。 7. **实战项目**: - 通过实际项目应用所学知识,如爬虫、数据分析小工具或简单的网页应用。 8. **持续学习与巩固**: - 阅读 Python 书籍和官方文档,关注 Python 新版本更新。 - 参加在线论坛或社区交流,解决遇到的问题。 - 定期做练习题和小项目,保持动手实践的习惯。 记得边学边做,理论结合实践是最好的学习方式。在每个阶段结束后,都不要忘记总结回顾和做一些自我测试。祝你在 Python 学习旅程中顺利!
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值