Python for Beginners
# Compute yesterday's and last week's dates using timedelta arithmetic.
from datetime import datetime, timedelta

today = datetime.now()
one_day = timedelta(days=1)
yesterday = today - one_day
print('Yesterday was: ' + str(yesterday))
# BUG FIX: the keyword argument is 'weeks', not 'week' —
# timedelta(week=1) raises TypeError.
one_week = timedelta(weeks=1)
last_week = today - one_week
print('Last week was: ' + str(last_week))
# Display the individual components of the current date and time.
from datetime import datetime

current_date = datetime.now()
print('Day: ' + str(current_date.day))
print('Month: ' + str(current_date.month))
print('Year: ' + str(current_date.year))
print('Hour: ' + str(current_date.hour))
# BUG FIX: the label was misspelled 'Minuter'.
print('Minute: ' + str(current_date.minute))
print('Second: ' + str(current_date.second))
# Parse a date string typed by the user into a datetime object.
from datetime import datetime

raw_input_text = input('When is your birthday(dd/mm/yyyy)?')
parsed = datetime.strptime(raw_input_text, '%d/%m/%Y')
print('Birthday: ' + str(parsed))
# 利用了字典和匿名函数的特性实现switch功能
'''
switch语法
switch(expression){
case value :
语句块
break;
case value
语句块
break;
...
default :
语句块
}
'''
# A dict of lambdas acts as a dispatch table, emulating a switch statement.
switch = {
    0: lambda x: x + 2,
    1: lambda x: x ** 2,
    2: lambda x: abs(x),
}
print('switch字典的输出结果为:{0}'.format(switch[0](-34)))
数组与列表的区别
数组存储的值要有相同的类型,列表可存储任何类型的值,可以发现在使用中大部分时间都是用到列表,除非你开始学习机器学习,编写自己的模型,会更多的用到数组。
# Using the array module: an array stores values of one single type,
# unlike a list, which can hold anything.
from array import array

scores = array('d')  # 'd' = C double, i.e. a float array
scores.append(97)
scores.append(98)
# BUG FIX: the print statement was accidentally duplicated.
print(scores)
VSCode
编辑多选项可以使用 Ctrl + D
# 虚拟环境
# 安装虚拟环境
pip install virtualenv
# 创建虚拟环境
# Windows
python -m venv <目录名>
# OSX/Linux
virtualenv <目录名>
# 激活虚拟环境
# cmd
<目录名>\Scripts\Activate.bat
# PowerShell
<目录名>\Scripts\Activate.ps1
# bash shell
../<目录名>/Scripts/activate
# OSX/Linux
<目录名>/bin/activate
# 关闭虚拟环境
<目录名>\Scripts\deactivate.bat
# 使用环境变量
# 永远不要将敏感信息写入代码
# pip install python-dotenv
# .env文件
DATABASE = Sample_Connection_String
# app.py
# app.py: read configuration from the environment instead of hard-coding it.
from dotenv import load_dotenv
import os
load_dotenv()  # copies key=value pairs from the .env file into os.environ
# getenv returns None if the key is absent (e.g. when .env misspells the key)
database = os.getenv('DATABASE')
print(database)
# Decorator: wrap a function with log messages before and after the call.
def logger(func):
    def wrapper():
        print('Logging execution')
        # BUG FIX: was 'fun()' (NameError); the wrapped callable is 'func'.
        func()
        print('Done logging')
    return wrapper

@logger
def sample():
    print('-- Inside sample function')

sample()
More Python for Beginners
# Sorting a list of dicts by a chosen field, using a named key function.
def sorter(item):
    """Return the value that orders an entry: its 'name' field."""
    return item['name']

presenters = [
    {'name': 'Susan', 'age': 50},
    {'name': 'Christopher', 'age': 47},
]
presenters.sort(key=sorter)
print(presenters)
# The same sort, with an inline lambda instead of a named key function.
presenters = [
    {'name': 'Susan', 'age': 50},
    {'name': 'Christopher', 'age': 47},
]
presenters.sort(key=lambda entry: entry['name'])
print(presenters)
# Classes and constructors: an attribute managed by a property getter/setter.
class Presenter():
    def __init__(self, name):
        # Assigning self.name routes through the @name.setter below.
        self.name = name
    @property
    def name(self):
        print('In the getter')
        # __name is name-mangled to _Presenter__name (pseudo-private storage).
        return self.__name
    @name.setter
    def name(self, value):
        print('In the setter')
        self.__name = value

presenter = Presenter('Chris')    # constructor -> setter
presenter.name = 'Christopher'    # setter
print(presenter.name)             # getter
# Class inheritance
class Person:
    def __init__(self, name):
        self.name = name
    def say_hello(self):
        print('Hello, ' + self.name)

class Student(Person):
    def __init__(self, name, school):
        super().__init__(name)  # call the parent constructor to set name
        self.school = school
    def sing_school_song(self):
        print('Ode to ' + self.school)
    def say_hello(self):  # overriding does NOT automatically call the parent
        super().say_hello()  # explicitly run the parent's say_hello()
        print('I am rather tired')
    def __str__(self):  # __str__ is triggered when the object is printed
        return f'{self.name} attends {self.school}'

student = Student('Christopher', 'UVM')
student.say_hello()
student.sing_school_song()
print(student)  # triggers the __str__ magic method
# isinstance tests whether an object belongs to a class, honouring inheritance
isinstance(student, Student)  # True
isinstance(student, Person)  # True
# issubclass tests the class relationship itself
issubclass(Student, Person)  # True
# Multiple inheritance: inherit from more than one class.
# Java and C# forbid it because it can quickly become confusing.
# Logging capability
class Loggable:
    def __init__(self):
        self.title = ''
    def log(self):
        print('Log message from ' + self.title)

# Database-connection capability
class Connection:
    def __init__(self):
        self.server = ''
    def connect(self):
        print('Connecting to database on ' + self.server)

# Framework code: act on whichever capabilities the object inherited
def framework(item):
    if isinstance(item, Connection):  # does it inherit Connection?
        item.connect()
    if isinstance(item, Loggable):  # does it inherit Loggable?
        item.log()

# A database class combining both capabilities
class SqlDatabase(Connection, Loggable):
    def __init__(self):
        # NOTE(review): sets both inherited attributes directly instead of
        # calling super().__init__(); works because both are assigned here.
        self.title = 'Sql Connection Demo'
        self.server = 'Some_Server'

# Instance
sql_connection = SqlDatabase()
framework(sql_connection)
# File-system management with pathlib (Python 3.6+)
from pathlib import Path

# Current working directory
cwd = Path.cwd()
print('Current working directory:\n' + str(cwd))

# Join a directory and a file name into a full path
new_file = Path.joinpath(cwd, 'new_file.txt')
print('Full path:\n' + str(new_file))

# Does the file exist?
print('Does that file exist? ' + str(new_file.exists()))

# Parent directory
parent = cwd.parent
# Is it a directory?
print('Is this a directory? ' + str(parent.is_dir()))
# Is it a file?
print('Is this a file? ' + str(parent.is_file()))

# List the sub-directories inside the parent directory
print('\n----- directory contents -----')
for child in parent.iterdir():
    # BUG FIX: original read 'is child.is_dir():' (SyntaxError); 'if' intended.
    if child.is_dir():
        print(child)
demo_file = Path(Path.joinpath(cwd, 'demo.txt'))
# File name (final path component)
print('file name: ' + demo_file.name)
# Extension, including the leading dot
print('file suffix: ' + demo_file.suffix)
# Name of the containing folder
print('file folder: ' + demo_file.parent.name)
# Size in bytes; NOTE: stat() raises FileNotFoundError if demo.txt is missing
print('file size: ' + str(demo_file.stat().st_size))
# 文件I/O
# 打开文件
stream = open(file_name, mode, buffer_size)
# Modes
r - 以只读方式打开文件。文件的指针将会放在文件的开头。这是默认模式。
w - 打开一个文件只用于写入。如果该文件已存在则将其覆盖。如果该文件不存在,创建新文件。
a - 打开一个文件用于追加。如果该文件已存在,文件指针将会放在文件的结尾。不存在,创建新文件进行写入。
x - 写入的文件必须是新文件,如果已存在则会引发错误。
+ - 可读写模式
t - 文本方式,默认
b - 二进制方式
# Opens 'demo.txt' read-only in text mode (the defaults); the file must exist.
stream = open('demo.txt')
# Is the stream readable?
print(stream.readable())
# Read a single character
print(stream.read(1))
# Read the rest of the current line
print(stream.readline())
# Close the stream
stream.close()
# Writing a file; 'wt' = write text (truncates any existing contents).
with open('output.txt', 'wt') as out:
    # Is the stream writable?
    print(str(out.writable()))
    # A single character...
    out.write('H')
    # ...then several strings in one call, no loop needed...
    out.writelines(['ello', ' ', 'world'])
    # ...and finally a newline. Leaving the with-block closes and
    # flushes the stream, persisting the buffered data.
    out.write('\n')
# Stream-positioning operations
stream = open('output.txt', 'wt')  # truncate and open for writing
# Write a 5-character string
stream.write('demo!')
# Report the file pointer position (5 after the write above)
print(str(stream.tell()))
# Move the cursor back to the start of the stream
stream.seek(0)
# The next four characters overwrite the existing ones ('demo' -> 'cool')
stream.write('cool')
# Flush: push buffered data to the file immediately and empty the buffer
stream.flush()
stream.close()
# Open and write inside try/finally
# Important: this guarantees the file is closed even if the write raises
try:
    stream = open('output.txt', 'wt')
    stream.write('Lorem ipsum dolar')
finally:
    stream.close()
# The with-statement is the idiomatic shorthand for the same guarantee
with open('output.txt', 'wt') as stream:
    stream.write('Lorem ipsum dolar')
# Asynchronous programming
# Synchronous version first, for comparison
from timeit import default_timer  # measures elapsed wall-clock time
import requests
def load_data(delay):
    # Blocks for ~delay seconds while the server holds the response
    print(f'Starting {delay} second timer')
    text = requests.get(f'https://httpbin.org/delay/{delay}').text
    print(f'Completed {delay} second timer')
    return text
def run_demo():
    start_time = default_timer()
    two_data = load_data(2)    # runs to completion first...
    three_data = load_data(3)  # ...then this: ~5 seconds total
    elapsed_time = default_timer() - start_time
    print(f'The operation took {elapsed_time:.2} seconds')
def main():
    run_demo()
main()
# Asynchronous version
from timeit import default_timer
import aiohttp  # async HTTP client library
import asyncio
async def load_data(session, delay):
    print(f'Starting {delay} second timer')
    async with session.get(f'http://httpbin.org/delay/{delay}') as resp:
        text = await resp.text()
        print(f'Completed {delay} second timer')
        return text
async def main():
    # Start the timer
    start_time = default_timer()
    # Creating a single session
    async with aiohttp.ClientSession() as session:
        # Setup our tasks and get them running
        two_task = asyncio.create_task(load_data(session, 2))
        three_task = asyncio.create_task(load_data(session, 3))
        # Simulate other processing
        await asyncio.sleep(1)
        print('Doing other work')
        # Let's go get our values
        two_result = await two_task
        three_result = await three_task
        # Print our results; the requests overlapped, so ~3 s instead of ~5 s
        elapsed_time = default_timer() - start_time
        print(f'The operation took {elapsed_time:.2} seconds')
asyncio.run(main())
Even more Python for Beginners Data science tools
# Pandas
import pandas as pd
# A Series is a one-dimensional array, similar to a Python list
airports = pd.Series([
    'Seattle-Tacoma',
    'Dulles',
    'London Heathrow',
    'Schiphol',
    'Changi',
    'Pearson',
    'Narita'
])
# Reference a single value of a Series by index
airports[2]
# Loop through every value in the Series
for value in airports:
    print(value)
# DataFrame
# With Pandas we mostly work with two-dimensional data
# A DataFrame stores a two-dimensional array
airports = pd.DataFrame([
    ['Seatte-Tacoma', 'Seattle', 'USA'],
    ['Dulles', 'Washington', 'USA'],
    ['London Heathrow', 'London', 'United Kingdom'],
    ['Schiphol', 'Amsterdam', 'Netherlands'],
    ['Changi', 'Singapore', 'Singapore'],
    ['Pearson', 'Toronto', 'Canada'],
    ['Narita', 'Tokyo', 'Japan']
])
# Use the columns parameter to name the columns
airports = pd.DataFrame([
    ['Seatte-Tacoma', 'Seattle', 'USA'],
    ['Dulles', 'Washington', 'USA'],
    ['London Heathrow', 'London', 'United Kingdom'],
    ['Schiphol', 'Amsterdam', 'Netherlands'],
    ['Changi', 'Singapore', 'Singapore'],
    ['Pearson', 'Toronto', 'Canada'],
    ['Narita', 'Tokyo', 'Japan']
],
columns = ['Name', 'City', 'Country']
)
# DataFrame inspection operations
import pandas as pd
airports = pd.DataFrame([
    ['Seatte-Tacoma', 'Seattle', 'USA'],
    ['Dulles', 'Washington', 'USA'],
    ['Heathrow', 'London', 'United Kingdom'],
    ['Schiphol', 'Amsterdam', 'Netherlands'],
    ['Changi', 'Singapore', 'Singapore'],
    ['Pearson', 'Toronto', 'Canada'],
    ['Narita', 'Tokyo', 'Japan']
],
columns = ['Name', 'City', 'Country']
)
# First 3 rows
airports.head(3)
# Last 3 rows
airports.tail(3)
# Number of rows and columns
airports.shape
# More detailed information
airports.info()
'''
返回的信息包括:
- 行数和索引值的范围
- 列数
- 每列的信息,是否为空值,数据类型
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 3 columns):
Name 7 non-null object
City 7 non-null object
Country 7 non-null object
dtypes: object(3)
memory usage: 148.0+ bytes
'''
# DataFrame query operations
import pandas as pd
airports = pd.DataFrame([
    ['Seatte-Tacoma', 'Seattle', 'USA'],
    ['Dulles', 'Washington', 'USA'],
    ['London Heathrow', 'London', 'United Kingdom'],
    ['Schiphol', 'Amsterdam', 'Netherlands'],
    ['Changi', 'Singapore', 'Singapore'],
    ['Pearson', 'Toronto', 'Canada'],
    ['Narita', 'Tokyo', 'Japan']
],
columns = ['Name', 'City', 'Country']
)
# Return a single column
airports['City']
# Return several columns
airports[['Name', 'Country']] # pass the column names as a list
# Values by integer position
airports.iloc[0,0] # 'Seatte-Tacoma'
airports.iloc[2,2] # 'United Kingdom'
# All rows and all columns
airports.iloc[:,:]
# All columns of the first 2 rows
airports.iloc[0:2,:]
# All rows of the first 2 columns
airports.iloc[:,0:2]
# All rows of the 1st and 3rd columns
airports.iloc[:,[0,2]]
# Columns selected by name
airports.loc[:,['Name','Country']]
# Reading and writing CSV files
import pandas as pd
# read_csv reads the contents of a csv file into a DataFrame
airports_df = pd.read_csv('Data/airports.csv')
# Skip rows that fail to parse
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed in
# 2.0; recent pandas uses on_bad_lines='skip' — confirm the pandas version.
airports_df = pd.read_csv(
    'Data/airportsInvalidRows.csv',
    error_bad_lines=False
)
# Files whose first row is not a header of column names
airports_df = pd.read_csv(
    'Data/airportsNoHeaderRows.csv',
    header=None
)
# Use the names parameter to supply the column names
airports_df = pd.read_csv(
    'Data/airportsNoHeaderRows.csv',
    header=None,
    names=['Name', 'City', 'Country']
)
# Values missing from the file appear as NaN in the DataFrame
# Write the DataFrame contents out to a CSV file
airports_df.to_csv('Data/MyNewCSVFile.csv')
# Specify index=False to leave the index column out of the csv file
airports_df.to_csv(
    'Data/MyNewCSVFileNoIndex.csv',
    index=False
)
# DataFrame column operations
# Drop a column (generic form; returns a new DataFrame)
DataFrameName.drop(columns=['columnname'])
# Assign the result (without Actual_arr_time) to a new df;
# the original df still contains the Actual_arr_time column
new_df = delays_df.drop(columns=['Actual_arr_time'])
# Use inplace=True to drop the column from the original DataFrame itself
delays_df.drop(columns=['Actual_arr_time'], inplace=True)
# Slice assignment: copy just two columns into a new DataFrame
desc_df = delays_df.loc[:, ['Origin_airport','Dest_airport']]
# Pre-processing missing values and duplicate rows
# Inspect missing values: info() reports the non-null count per column
# BUG FIX: original read 'delays.df.info()' (AttributeError); the
# variable is delays_df.
delays_df.info()
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 16 columns):
FL_DATE 300000 non-null object
OP_UNIQUE_CARRIER 300000 non-null object
TAIL_NUM 299660 non-null object
OP_CARRIER_FL_NUM 300000 non-null int64
ORIGIN 300000 non-null object
DEST 300000 non-null object
CRS_DEP_TIME 300000 non-null int64
DEP_TIME 296825 non-null float64
DEP_DELAY 296825 non-null float64
CRS_ARR_TIME 300000 non-null int64
ARR_TIME 296574 non-null float64
ARR_DELAY 295832 non-null float64
CRS_ELAPSED_TIME 300000 non-null int64
ACTUAL_ELAPSED_TIME 295832 non-null float64
AIR_TIME 295832 non-null float64
DISTANCE 300000 non-null int64
dtypes: float64(6), int64(5), object(5)
memory usage: 30.9+ MB
'''
TAIL_NUM、DEP_TIME、DEP_DELAY、ARR_DELAY、ACTUAL_ELAPSED_TIME和AIR_TIME有缺失值。
# dropna returns a copy with rows containing null/missing values removed;
# the original DataFrame is left untouched
delay_no_nulls_df = delays_df.dropna()
'''
<class 'pandas.core.frame.DataFrame'>
Int64Index: 295832 entries, 0 to 299999
Data columns (total 16 columns):
FL_DATE 295832 non-null object
OP_UNIQUE_CARRIER 295832 non-null object
TAIL_NUM 295832 non-null object
OP_CARRIER_FL_NUM 295832 non-null int64
ORIGIN 295832 non-null object
DEST 295832 non-null object
CRS_DEP_TIME 295832 non-null int64
DEP_TIME 295832 non-null float64
DEP_DELAY 295832 non-null float64
CRS_ARR_TIME 295832 non-null int64
ARR_TIME 295832 non-null float64
ARR_DELAY 295832 non-null float64
CRS_ELAPSED_TIME 295832 non-null int64
ACTUAL_ELAPSED_TIME 295832 non-null float64
AIR_TIME 295832 non-null float64
DISTANCE 295832 non-null int64
dtypes: float64(6), int64(5), object(5)
memory usage: 32.7+ MB
'''
# inplace=True removes the rows from delays_df itself
delays_df.dropna(inplace=True)
'''
<class 'pandas.core.frame.DataFrame'>
Int64Index: 295832 entries, 0 to 299999
Data columns (total 16 columns):
FL_DATE 295832 non-null object
OP_UNIQUE_CARRIER 295832 non-null object
TAIL_NUM 295832 non-null object
OP_CARRIER_FL_NUM 295832 non-null int64
ORIGIN 295832 non-null object
DEST 295832 non-null object
CRS_DEP_TIME 295832 non-null int64
DEP_TIME 295832 non-null float64
DEP_DELAY 295832 non-null float64
CRS_ARR_TIME 295832 non-null int64
ARR_TIME 295832 non-null float64
ARR_DELAY 295832 non-null float64
CRS_ELAPSED_TIME 295832 non-null int64
ACTUAL_ELAPSED_TIME 295832 non-null float64
AIR_TIME 295832 non-null float64
DISTANCE 295832 non-null int64
dtypes: float64(6), int64(5), object(5)
memory usage: 32.7+ MB
'''
# Find duplicate rows: duplicated() flags a row True when it repeats an
# earlier row.
# BUG FIX: original read 'aipports_df' (NameError); the variable is
# airports_df.
airports_df.duplicated()
# drop_duplicates removes the duplicate rows; inplace=True mutates airports_df
airports_df.drop_duplicates(inplace=True)
# Scikit-learn
# 切分数据
import pandas as pd
# 导入CSV
delays_df = pd.read_csv('Data/Lots_of_flight_data.csv')
# 查看数据的行列数
# delays_df.shape
# 创建X DataFrame,其中只包含我们要用来训练模型的特性数据
X = delays_df.loc[:,['DISTANCE', 'CRS_ELAPSED_TIME']]
# 创建y DataFrame,其中只包含我们要用模型预测的值
y = delays_df.loc[:,['ARR_DELAY']]
# 切分训练数据和测试数据
'''
使用Scikit-learn train_test_split将30%的数据放在测试DataFrame中
另外的70%的行放入训练DataFrame中,用来训练我们的模型
注意:通过为random_state指定一个值,我们可以确保如果再次运行代码,相同的行将被移动到测试数据帧中。
这使得我们的结果是可重复的。
'''
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
X,
y,
test_size=0.3,
random_state=42
)
# Training a linear-regression model
import pandas as pd
from sklearn.model_selection import train_test_split
# Load the CSV file
delays_df = pd.read_csv('Data/Lots_of_flight_data.csv')
# Remove rows containing null values
delays_df.dropna(inplace=True)
# Put the feature columns into the X DataFrame
X = delays_df.loc[:,['DISTANCE', 'CRS_ELAPSED_TIME']]
# Put the label column into the y DataFrame
y = delays_df.loc[:,['ARR_DELAY']]
# Split into training data and test data
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42
)
# LinearRegression.fit trains the model on the data in X_train and y_train
from sklearn.linear_model import LinearRegression
regressor = LinearRegression() # create the Scikit-learn LinearRegression object
regressor.fit(X_train, y_train) # train the model with the fit method
# LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
# The regressor object now holds the trained linear-regression model
# Testing the model
# LinearRegression.predict makes the trained model predict for the test data
# The test features are stored in X_test
# The predictions are stored in y_pred
y_pred = regressor.predict(X_test)
# 可以看到预测出来的y_pred与y_test中还是存在差距,需要调整模型。
# 评估模型的准确性
'''
现在我们已经有了一个经过训练的模型
可以使用许多方法来检查模型的准确性
但所有这些指标都是基于科学计算
我们可以使用Scikit-learn和numpy完成大部分工作
均方误差(MSE)
MSE是模型在预测观察结果时执行的平均误差。MSE越低,模型越好。
MSE是实际观测值与模型预测值之间的平均平方差。
MSE = mean((actuals - predicteds)^2)
'''
from sklearn import metrics
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
# Mean Squared Error: 2250.4445141530855
'''
均方根误差(RMSE)
RMSE是模型在预测观察结果时执行的平均误差。RMSE越低,模型越好。
从数学上讲,RMSE是均方误差的平方根
RMSE = sqrt(MSE)
我们可以使用包含大量数学函数的numpy库来计算MSE的平方根
'''
import numpy as np
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
# Choosing between NumPy and Pandas
type(y_pred)
# numpy.ndarray
type(y_test)
# pandas.core.frame.DataFrame
# Creating a one-dimensional numpy array resembles creating a pandas Series,
# but uses two different constructors: array() and Series()
import numpy as np
airports_array = np.array(['Pearson','Changi','Narita'])
print(airports_array)
print(airports_array[2])
# ['Pearson' 'Changi' 'Narita']
# Narita
airports_series = pd.Series(['Pearson','Changi','Narita'])
print(airports_series)
print(airports_series[2])
# 0 Pearson
# 1 Changi
# 2 Narita
# dtype: object
# Narita
# Two-dimensional arrays
# Difference: printing a Pandas object shows explicit numeric indices,
# while NumPy keeps them implicit
airports_array = np.array([
    ['YYZ','Pearson'],
    ['SIN','Changi'],
    ['NRT','Narita']])
print(airports_array)
print(airports_array[0,0])
# [['YYZ' 'Pearson']
# ['SIN' 'Changi']
# ['NRT' 'Narita']]
# YYZ
airports_df = pd.DataFrame([['YYZ','Pearson'],['SIN','Changi'],['NRT','Narita']])
print(airports_df)
print(airports_df.iloc[0,0])
# 0 1
# 0 YYZ Pearson
# 1 SIN Changi
# 2 NRT Narita
# YYZ
# When DataFrame features are needed, convert the numpy object to pandas;
# numpy and pandas objects convert back and forth — use type() to check
# which one you hold, so the features of both libraries can be combined
predicted_df = pd.DataFrame(y_pred)
predicted_df.head()