# -*- coding: utf-8 -*-
"""
Created on Fri Jul 15 00:52:07 2016
@author: Alan Guo
"""
a = 1
a.bit_length()
type(a)
a.as_integer_ration()
import decimal
from decimal import Decimal
decimal.getcontext()
decimal.getcontext().prec = 4
e = Decimal(1) / Decimal(11)
t = 'this is a string object'
t.capitalize()
t.split()
t.find('string')
t.replace(' ','|')
import re # regular expression
series = """
'01/18/2014 13:00:00', 100,'1st'
"""
dt = re.compile("'[0-9/:\s]+'")
result = dt.findall(series)
from datetime import datetime
pydt = datetime.strptime(result[0].replace("'",""),
'%m/%d/%Y %H:%M:%S')
pydt
datetime.datetime(2014, 1, 18, 13, 0)
print pydt
# 日期格式转化
date='Wed, 11 Apr 2012 09:37:05 +0800'
dd=datetime.strptime(date,'%a, %d %b %Y %H:%M:%S %z')
dd.strftime('%Y-%m-%d %H:%M:%S')
'''%a Abbreviated weekday name
%A Full weekday name
%b Abbreviated month name
%B Full month name
%c Date and time representation appropriate for locale
%d Day of month as decimal number (01 - 31)
%H Hour in 24-hour format (00 - 23)
%I Hour in 12-hour format (01 - 12)
%j Day of year as decimal number (001 - 366)
%m Month as decimal number (01 - 12)
%M Minute as decimal number (00 - 59)
%p Current locale's A.M./P.M. indicator for 12-hour clock
%S Second as decimal number (00 - 59)
%U Week of year as decimal number, with Sunday as first day of week (00 - 51)
%w Weekday as decimal number (0 - 6; Sunday is 0)
%W Week of year as decimal number, with Monday as first day of week (00 - 51)
%x Date representation for current locale
%X Time representation for current locale
%y Year without century, as decimal number (00 - 99)
%Y Year with century, as decimal number
%z, %Z Time-zone name or abbreviation; no characters if time zone is unknown
%% Percent sign
'''
# tuple
t = (1, 2.5, 'data')
t[2]
t.count('data')
t.index(1) # 找到元素的位置
# list
I = [1, 2.5, 'data']
l = list(t)
l.append([4,3]) # append list at the end
# [1, 2.5, 'data', [4, 3]]
l.extend([1.0, 1.5, 2.0]) # append the elements of list
# [..... [4, 3], 1.0, 1.5, 2.0]
l.insert(1, 'insert') # insert object before index position
# [1, 'insert', 2.5,...]
l.remove('data') # remove first occurence of object
# [1, 'insert', 2.5, [4, 3]...]
p = l.pop(3) # remove and return object at index
print l, p
# [1, 'insert', 2.5, 1.0...] [4,3]
l[2:5] # 3rd to 5th elements
for i in range(1, 10):
if i % 2 == 0: # % is for modulo
print "%d is even" % i
elif i % 3 == 0:
print "%d is multiple of 3" % i
else:
print "%d is odd" % i
total = 0
while total < 100:
total += 1
print (total)
m = [i**2 for i in range(5)]
m
def even(x):
return x % 2 == 0
map (even, range(10))
map(lambda x: x ** 2, range(10))
filter(even, range(15))
reduce(lambda x, y: x + y, range(15))
# Dict
d = {'Name': 'Angela Merkel',
'Country': 'Germany',
'Profession': 'Chancelor',
'Age': 60
}
type(d)
d.keys()
d.values()
d.items()
for item in d.iteritems():
print(item)
# Sets
s = set(['u', 'd', 'ud', 'du', 'd', 'du'])
t = set(['d', 'dd', 'uu', 'u'])
s.union(t) # all of s and t
s.intersection(t) # both in s and t
s.difference(t) # in s but not t
s.symmetric_difference(t) # in either one but not both
from random import randint
l = [randint(0, 10) for i in range(1000)]
# 1,000 random integers between 0 and 10
len(l) # number of elements in l
s = set(l)
from copy import deepcopy
v = [0.5, 0.75, 1.0, 1.5, 2.0]
m = 3 * [deepcopy(v), ]
import numpy as np
a = np.array([0, 0.5, 1.0, 1.5, 2.0])
type(a)
a[:2] # indexing as with list objects in 1 dimension
a.sum()
a.std()
a.cumsum()
np.sqrt(a)
b = np.array([a, a*2])
b[0] # first row
b[0, 2] # third element of first row
b.sum()
b.sum(axis=0) # column-wise sum
b.sum(axis=1)
b.tolist()
b.flatten() ;
b.flat # 用来循环,遍历, iterable
b.min(0) # or b.min(1)
b.prod()
b.cumprod()
b.any() ; b.all()
b.size # 元素的总数量
c = np.zeros((2, 3, 4), dtype='i',order='C') # also: np.ones()
d = np.ones_like(c, dtype='f', order='C') # also: np.zeros_like()
x = np.random.standard_normal((5,1000000))
y = 2 * x + 3 # linear equation y = a * x + b
C = np.array((x, y), order='C')
F = np.array((x, y), order='F')
x = 0.0; y = 0.0 # memory cleanup
C[:2].round(2)
a = np.arange(0, 4)
b = np.arange(1, 5)
np.add(a, b)
np.add(a, b, a)
x1 = 3; x2 = 2
x1//x2 # = 1 (floor divide)
a = np.arange(0, 12)
a.shape = 3,4
a.shape
a.reshape(12,)
a.shape = 4,-1
a.reshape(3,-1)
a.tofile('a.bin')
b = np.fromfile("a.bin", dtype = np.int32)
np.save("a.npy", a)
c = np.load('a.npy')
np.savez(" ",a, b, ) # 保存多数组至压缩文件
a = np.arange(0, 12, 0.5).reshape(4, -1)
np.savetxt("a.txt",a) # 默认以空格分隔
np.loadtxt('a.txt')
np.savetxt('a.txt', a, fmt='%d', delimiter=',')
np.loadtxt('a.txt', delimiter=',') #读入的时候也需要指定
a = np.arange(8)
b = np.add.accumulate(a)
c = a + b
f = file('result.npy', 'wb')
np.save(f, a) # 按顺序将a, b, c保存进文件对象f
np.save(f, b)
np.save(f, c)
f.close()
f = file('result.np', 'rb')
np.load(f) # 顺序从文件对象f中读取内容
np.load(f)
np.load(f)
# Data Visualization
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
# %matplotlib inline
np.random.seed(1000)
y = np.random.standard_normal(20)
x = range(len(y))
plt.plot(x, y) # 也可写成 plt.plot(y)
plt.plot(y.cumsum())
plt.grid(True) # adds a grid
plt.axis('tight') # adjusts the axis ranges
plt.plot(y.cumsum())
plt.grid(True)
plt.xlim(-1, 20)
plt.ylim(np.min(y.cumsum())-1, np.max(y.cumsum())+1)
"""
===============================
"""
np.random.seed(2000)
y = np.random.standard_normal((20, 2)).cumsum(axis=0)
plt.figure(figsize=(7, 4))
# the figsize parameter defines the size of the figure in (width, height)
plt.plot(y.cumsum(), 'b', lw=1.5)
plt.plot(y.cumsum(), 'yo')
plt.grid(True)
plt.axis('tight')
plt.xlabel('index')
plt.ylabel('value')
plt.title('A Simple Plot')
plt.figure(figsize=(7,4))
plt.plot(y[:, 0], lw=1.5, label='1st')
plt.plot(y[:, 1], lw=1.5, label='2nd')
plt.plot(y, 'ro')
plt.grid(True)
plt.legend(loc=0) # 0: best possible
plt.axis('tight')
plt.xlabel('index')
plt.ylabel('value')
plt.title('A Simple Plot')
fig, ax1 = plt.subplots()
# Plot first data set using first (left) axis
plt.plot(y[:, 0], 'b', lw=1.5, label='1st')
plt.plot(y[:, 0], 'ro')
plt.grid(True)
plt.legend(loc=8)
plt.axis('tight')
plt.xlabel('index')
plt.ylabel('value 1st')
plt.title('A Simple Plot')
ax2 = ax1.twinx()
# plot second data set using second (right) axis
plt.plot(y[:, 1], 'g', lw=1.5, label='2nd')
plt.plot(y[:, 1], 'ro')
plt.legend(loc=0)
plt.ylabel('value 2nd')
fig, ax1 = plt.subplots()
plt.figure(figsize=(7,5))
plt.subplot(211)
plt.plot(y[:, 0], lw=1.5, label='1st')
plt.plot(y[:, 0], 'ro')
plt.grid(True)
plt.legend(loc=0)
plt.axis('tight')
plt.ylabel('value')
plt.title('A Simple Plot')
plt.subplot(212)
plt.plot(y[:, 1], 'g', lw=1.5, label='2nd')
plt.plot(y[:, 1], 'ro')
plt.grid(True)
plt.legend(loc=0)
plt.axis('tight')
plt.xlabel('index')
plt.ylabel('value')
plt.bar(np.arange(len(y)), y[:,1], width=0.5,
color='g', label='2nd')
## Other Plot Styles
# Scatter Plot
y = np.random.standard_normal((1000,2))
plt.figure(figsize=(7,5))
plt.plot(y[:,0], y[:,1], 'ro')
plt.grid(True)
plt.xlabel('1st')
plt.ylabel('2nd')
plt.title('Scatter Plot')
plt.figure(figsize=(7,5))
plt.scatter(y[:, 0], y[:, 1], marker='o', color='r')
plt.grid(True)
plt.xlabel('1st')
plt.ylabel('2nd')
plt.title('Scatter Plot')
c = np.random.randint(0, 10, len(y))
plt.figure(figsize=(7,5))
plt.scatter(y[:,0], y[:,1],c=c,marker='o')
plt.colorbar()
plt.grid(True)
plt.xlabel('1st')
plt.ylabel('2nd')
plt.title('Scatter Plot')
# Histogram
plt.figure(figsize=(7,4))
plt.hist(y, label=['1st', '2nd'], bins=25,color=['r','b'])
# Stacked hist
plt.hist(y, label=['1st', '2nd'], color=['b','g'],
stacked=True, bins=20)
# box plot
fig,ax = plt.subplots(figsize=(7,4))
plt.boxplot(y)
plt.grid(True)
plt.setp(ax, xticklabels=['1st', '2nd'])
##
import matplotlib.finance as mpf
start = (2014, 5, 1)
end = (2014, 6, 30)
quotes = mpf.quotes_historical_yahoo_ochl('^GDAXI', start, end)
quotes[:2]
fig, ax = plt.subplots(figsize=(8,5))
fig.subplots_adjust(bottom=0.2)
mpf.candlestick_ochl(ax, quotes, width=0.6, colorup='r', colordown='g')
plt.grid(True)
ax.xaxis_date()
# dates on the x-axis
ax.autoscale_view()
plt.setp(plt.gca().get_xticklabels(),rotation=30)
###
quotes = np.array(mpf.quotes_historical_yahoo_ochl('YHOO',start,end))
fig, (ax1, ax2) = plt.subplots(2,sharex=True, figsize=(8,6))
mpf.candlestick_ochl(ax1, quotes, width=0.6, colorup='r', colordown='g')
ax1.set_title('Yahoo Inc.')
ax1.set_ylabel('index level')
ax1.grid(True)
ax1.xaxis_date()
plt.bar(quotes[:, 0]-0.25, quotes[:,5],width=0.5)
ax2.set_ylabel('volume')
ax2.grid(True)
ax2.autoscale_view()
plt.setp(plt.gca().get_xticklabels(),rotation=30)
## 3D Plotting
strike = np.linspace(50, 150, 24)
ttm = np.linspace(0.5, 2.5, 24)
strike, ttm = np.meshgrid(strike, ttm)
iv = (strike - 100)**2 / (100 * strike)/ttm
# generate fake implied volatilities
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(9, 6))
ax = fig.gca(projection='3d')
surf = ax.plot_surface(strike, ttm, iv, rstride=2, cstride=2,
cmap=plt.cm.coolwarm, linewidth=0.5,
antialiased=True)
ax.set_xlabel('strike')
ax.set_ylabel('time-to-maturity')
ax.set_zlabel('implied volatility')
fig.colorbar(surf, shrink=0.5, aspect=5)
## ############## Volatility surface
fig = plt.figure(figsize=(8,5))
ax = fig.add_subplot(111, projection='3d')
ax.view_init(30, 60)
ax.scatter(strike, ttm, iv, zdir='z', s=25,
c='b', marker='^')
ax.set_xlabel('strike')
ax.set_ylabel('time-to-maturity')
ax.set_zlabel('implied volatility')
#########################################
# pandas basic
import numpy as np
import pandas as pd
df = pd.DataFrame([10, 20, 30, 40], columns=['numbers'],
index=['a', 'b', 'c', 'd'])
df
df.index # the index value
df.columns # the column names
df.ix['c'] # selection via index
df.ix[['a', 'd']] # selection of multiple indices
df.ix[['a', 'd']] # selection via Index object
df.sum() # sum per column
df.max()
df.min()
df.apply(lambda x: x**2) # square of every element
df**2 # agian square, this time Numpy-like
df['floats'] = (1.5, 2.5, 3.5, 4.5)
# new column is generated
df['floats'] # selection of new column
df['names'] = pd.DataFrame(['Yves', 'Guido', 'Felix', 'Francesc'],
index=['d', 'a', 'b', 'c'])
df.append({'number':100, 'floats':5.75, 'names':'Henry'},
ignore_index=True)
# temporary object, df not changed, and index not changed
df = df.append(pd.DataFrame({'numbers':100, 'float':5.75,
'names':'Henry'}, index=['z']))
# it's better this way
df
df = df.join(pd.DataFrame([1, 4, 9, 16, 25],
index=['a', 'b', 'c', 'd', 'y'],
columns=['squares']),
how = 'outer')
df[['numbers', 'squares']].mean() # column-wise mean
df[['numbers', 'squares']].std() # column-wise std
df2[df2['E'].isin(['two','four'])]
df.sub(s, axis='index') # s也是个index
######
a = np.random.standard_normal((9, 4))
a.round(6)
df = pd.DataFrame(a)
df.columns = [['No1', 'No2', 'No3', 'No4']]
df['No3'][3] ## value in column No2 at index 3
dates = pd.date_range('2015-1-1', periods=9, freq='M')
dates
df.index = dates
### from dataframe to array
np.array(df).round(6)
### basic functions with dataframe
df.sum()
df.mean()
df.cumsum()
df.describe()
np.sqrt(df)
np.sqrt(df).sum()
%matplotlib inline
df.cumsum().plot(lw=2.0)
df.plot(secondary_y='weight', figsize=(14,7))
#### 画两个y轴
type(df)
df['No1']
import matplotlib.pyplot as plt
df['No1'].cumsum().plot(style='r', lw=2.)
plt.xlabel('date')
plt.ylabel('value')
### GroupBy Operations
df['Quarter'] = ['Q1', 'Q1','Q1', 'Q2', 'Q2', 'Q2', 'Q3', 'Q3', 'Q3']
groups = df.groupby('Quarter')
groups.mean()
groups.max()
groups.size()
df['Odd_Even'] = ['Odd', 'Even', 'Odd', 'Even', 'Odd', 'Even',
'Odd', 'Even', 'Odd']
groups = df.groupby(['Quarter', 'Odd_Even'])
groups.size()
groups.mean()
## Read financial data
import pandas.io.data as web
DAX = web.DataReader(name='SPY', data_source='yahoo',
start='2000-1-1')
DAX.info()
DAX.tail()
DAX['Close'].plot(figsize=(8,5))
%time DAX['Return'] = np.log(DAX['Close']/DAX['Close'].shift(1))
DAX[['Close', 'Return']].tail()
DAX[['Close', 'Return']].plot(subplots=True, color='b',
figsize=(8,5))
DAX['42d'] = pd.rolling(DAX['Close'], window=42).mean()
DAX['252d'] = pd.rolling(DAX['Close'], window=252).mean()
DAX[['Close', '42d', '252d']].tail.()
DAX[['Close', '42d', '252d']].plot(figsize=(8, 5))
###
import math
## Regression Analysis
import pandas as pd
from urllib import urlretrieve
es_url = 'http://www.stoxx.com/download/historical_values/hbrbcpe.txt'
vs_url = 'http://www.stoxx.com/download/historical_values/h_vstoxx.txt'
urlretrieve(es_url, './data/es.txt')
urlretrieve(vs_url, './data/vs.txt')
!s -o ./data/*.txt
# Windows: use dir
lines = open('./data/es.txt','r').readlines()
lines = [line.replace(' ', '') for line in lines]
for line in lines[3883:3890]:
print line[41:]
## To make the data set easier to import
new_file = open('./data/es50.txt', 'w')
# opens a new file
new_file.writelines('date' + lines[3][:-1]
+ ';DEL'+ lines[3][-1])
# writes the corrected third line of the original file
# as first line of new file
new_file.writelines(lines[4:])
# write the remaining lines of the original file
new_file.close()
##
new_lines = open('./data/es50.txt','r').readlines()
new_lines[:5]
# now import with read_csv function
es = pd.read_csv('./data/es50.txt', index_col=0,
parse_dates=True, sep=';', dayfirst=True)
np.round(es.tail())
## now delete helper column
del es['DEL']
es.info()
## directly import from url
cols = ['SX5P', 'SX5E', 'SXXP', 'SXXE', 'SXXF',
'SXXA', 'DKSF', 'DKXF']
es = pd.read_csv(es_url, index_col=0, parse_dates=True,
sep=';', dayfirst=True, header=None, skiprows=4, names=cols)
es.tail()
######
vs = pd.read_csv('./data/vs.txt', index_col=0, header=2,
parse_dates=True, sep=',', dayfirst=True)
vs.info()
### creat a new dataframe
import datetime as dt
data = pd.DataFrame({'EUROSTOXX' :
es['SX5E'][es.index > dt.datetime(1999, 1, 1)]})
data = data.join(pd.DataFrame({'VSTOXX' :
vs['V2TX'][vs.index > dt.datetime(1999, 1, 1)]}))
data = data.fillna(method='ffill')
data.info()
data.tail()
data.plot(subplots=True, grid=True, style='b', figsize=(8, 6))
rets = np.log(data / data.shift(1))
rets.head()
rets.plot(subplots=True, grid=True, style='b', figsize=(8,6))
rets = rets.dropna()
ix_infs = np.where(np.isinf(rets))
rets.drop(rets.index[ix_infs[0]],inplace=True)
#方法一:直接
del DF['column-name'] # 直接删除
# 方法二:采用drop方法,有下面三种等价的表达式:
DF= DF.drop('column_name', 1);
DF.drop('column_name',axis=1, inplace=True)
DF.drop([DF.columns[[0,1, 3]]], axis=1,inplace=True) # Note: zero indexed
### regression
xdat = rets['EUROSTOXX']
ydat = rets['VSTOXX']
model = pd.ols(y=ydat, x=xdat)
model
model.beta
plt.plot(xdat, ydat, 'r.')
ax = plt.axis() # grab axis values
x = np.linspace(ax[0], ax[1] + 0.01)
plt.plot(x, model.beta[1] + model.beta[0]*x, 'b', lw=2)
plt.grid(True)
plt.axis('tight')
plt.xlabel('EURO STOXX 50 returns')
plt.ylabel('VSTOXX returns')
## Rolling correlation
rets.corr()
pd.rolling_corr(rets['EUROSTOXX'], rets['VSTOXX'],
window=252).plot(grid=True, style='b')
##High-Frequency Data
import numpy as np
import pandas as pd
import datetime as dt
from urllib import urlretrieve
%matplotlib inline
## web-based API
url1 = 'http://hopey.netfonds.no/posdump.php?'
url2 = 'date=%s%s%s&paper=AAPL.O&csv_format=csv'
url = url1 + url2
year = '2014'
month = '09'
days = ['22', '23', '24', '25']
# dates might need to be updated
AAPL = pd.DataFrame()
for day in days:
AAPL = AAPL.append(pd.read_csv(url%(year, month, day),
index_col=0, header=0, parse_dates=True))
AAPL.columns = ['bid', 'bdepth', 'bdeptht',
'offer', 'odepth', 'odeptht']
# shorter column names
AAPL.info()
#### writing objects to disk
path = 'D:\wd.python\forise\'
import numpy as np
from random import gauss
a = [gauss(1.5, 2) for i in range(1000000)]
# generation of normally distributed randoms
import pickle
pkl_file = open('data.pkl', 'wb')
# open file for writing
# Note: existing file might be overwritten
%time pickle.dump(a, pkl_file)
pkl_file
pkl_file.close()
# read data to memory
pkl_file = open(path + 'data.pkl', 'rb') # open file for reading
%time b = pickle.load(pkl_file)
b[:5]
np.allclose(np.array(a), np.array(b))
## write two objects
pkl_file = open(path + 'data.pkl', 'wb')
pickle.dump(np.array(a), pkl_file)
pickle.dump(np.array(a)**2, pkl_file)
## read two objects from one file, FIFO rule,
## load twice
### reading and writing text files
rows = 5000
a = np.random.standard_normal((rows, 5)) # dummy data
a.round(4)
import pandas as pd
t = pd.date_range(start='2014/1/1', periods=rows, freq='H')
# set of hourly datetime range
csv_file = open(path + 'data.csv', 'w')
# open file for writing
header = 'data,no1,no2,no3,no4.no5\n'
csv_file.write(header)
for t_, (no1, no2, no3, no4, no5) in zip(t,a):
s = '%s,%f,%f,%f,%f,%f\n' % (t_, no1, no2, no3, no4, no5)
csv_file.write(s)
csv_file = open(path + 'data.csv', 'r') # open file for reading
for i in range(5):
print csv_file.readline(),
# or we can read all the content by readlines
csv_file = open(path + 'data.csv', 'r')
content = csv_file.readlines()
for line in content[:5]:
print line,
csv_file.close()
### SQL Databases
import sqlite3 as sq3
query = 'CREATE TABLE numbs (Date date, No1 real, No2 real)'
con = sq3.connect(path + 'numbs.db')
con.execute(query)
con.commit() # to make the query more effective, call the method commit
## create data
import datetime as dt
con.execute('INSERT INTO numbs VALUES(?, ?, ?)',
(dt.datetime.now(), 0.12, 7.3))
data = np.random.standard_normal((10000, 2)).round(5)
for row in data:
con.execute('INSERT INTO numbs VALUES(?, ?, ?)'
(dt.datetime.now(), row[0], row[1]))
con.commit()
### Writing and Reading NumPy Arrays
import numpy as np
dtimes = np.arange('2015-01-01 10:00:00', '2021-12-31 22:00:00',
dtype='datetime64[m]') # minute intervals
len(dtimes)
dty = np.dtype([('Date', 'datetime64[m]'), ('No1', 'f'), ('No2', 'f')])
data = np.zeros(len(dtimes), dtype=dty)
data['Date'] = dtimes
a = np.random.standard_normal((len(dtimes), 2)).round(5)
data['No1'] = a[:,0]
data['No2'] = a[:,1]
%time np.save(path + 'array', data) # suffix .npy as added
%time np.load(path + 'array.npy')
### Chapter 9 : Mathematical tools
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from datetime import datetime
from xlrd import xldate_as_tuple
d=datetime(xldate_as_tuple(x,0)) # x就是你那串数字的变量
# 用Pandas完成Excel中常见的任务
# http://python.jobbole.com/80972/
df.query('name == ["Kulas Inc","Barton LLC"]').head()
df[df["sku"].map(lambda x: x.startswith('B1'))]
df[df['sku'].str.contains('B1')].head()
## 如何利用字典循环, loop with dictionary
col_names = list('abcd')
dic = {}
for col in col_names:
temp = range(len(col_names)) # could be a Series or DataFrame
dic.update({col:temp})
df = pd.DataFrame(dic, index=range(len(col_names)))
dic.keys() # list in python 2, but Dict in python 3
dic.values() # list w
# 批量更改列名
df.columns
# Index([u'$a', u'$b', u'$c', u'$d', u'$e'], dtype=object)
df.rename(columns=lambda x: x[1:], inplace=True)
df = df.rename(columns=lambda x: x.replace('$', '')
df = df.rename(columns={'$a': 'a', '$b': 'b'})
df.columns
# Index([u'a', u'b', u'c', u'd', u'e'], dtype=object)
# NaN
df = pd.DataFrame(data=np.nan, columns=list('abcd'))
# http://www.jb51.net/article/49457.htm
# sqlalchemy, python 和 mysql 交互
from sqlalchemy.types import String
data.to_sql('data_dtype', engine, dtype={'Col_1': String})
pd.read_sql_table('data', engine, index_col='id')
pd.read_sql_table('data', engine, columns=['Col_1', 'Col_2'])
pd.read_sql_table('data', engine, parse_dates=['Date'])
pd.read_sql_table('data', engine, parse_dates={'Date': '%Y-%m-%d'})
pd.read_sql_query('SELECT * FROM data', engine)
pd.read_sql_query("SELECT id, Col_1, Col_2 FROM data WHERE id = 42;", engine)
# 对于pandas.Series的操作, descriptive statistics
Series.cumprod()
Series.cummin()
Series.cummax(), .abs(), .all(),.any(),.autocorr(),
.between(),.clip(),.clip_lower(),.clip_upper(),.count(),.cov(),
Series.describe()
Series.diff(), .factorize(), .kurt(),.mad(),
Series.nlargest(),.nsmallest() # Faster than .sort_values(ascending=False\True).head(n)
Series.pct_change, .prod(), .quantile(), .rank(), .sem()
Series.var(),
Series.unique() # 生成array,不重复的数 还有.nunique(),number of unique()
Series.is_monotonic, is_unique,is_monotonic_increasing/decreasing
Series.value_counts
# 日期时间方法处理汇总
import datetime
now = datetime.datetime.now() # datetime
import time
time.time() # 时间戳 timestamp
time.localtime() # time tuple
datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
datetime.date.now().date() # date
datetime.date.today() # date
# 获取明天/前N天
datetime.date.today() + datetime.timedelta(days=1) # 明天
# 获取当天开始和结束时间
datetime.datetime.combine(datetime.date.today(), datetime.time.min()) # datetime.time.max
# 获取两个datetime的时间差
(datetime.datetime(2017,3,16,21,30) - datetime.datetime.now()).total_seconds()
## datetime.timedelta().total_seconds()
# 获取本周/本月/上月最后一天
today = datetime.date.today()
sunday = today + datetime.timedelta(6 - today.weekday())
# 本月
import calendar
today = datetime.date.today()
_,last_day_num = calendar.monthrange(today.year, today.month) # 返回tuple(本月第一天是星期几,这个月总共多少天)
last_day = datetime.date(today.year, today.month, last_day_num)
# 获取上个月的最后一天
today = datetime.date.today()
first = datetime.date(day=1, month=today.month, year=today.year)
lastMonth = first - datetime.timedelta(days=1)
# string 转换
datetime.datetime.strptime("2014-12-31 18:20:10","%Y-%m-%d %H:%M:%S")
datetime.datetime.now().timetuple() # datetime to timetuple
# timetuple to datetime
# timetuple -> timestamp -> datetime
# datetime -> date
datetime.datetime.now().date()
# date -> datetime
datetime.date.today()
today = datetime.date.today()
datetime.datetime.combine(today,datetime.time())
datetime.datetime.combine(today,datetime.time.min)
# datetime <=> timestamp
# datetime -> timestamp
now = datetime.datetime.now()
timestamp = time.mktime(now.timetuple())
# timestamp -> datetime
datetime.datetime.fromtimestamp(time.time())
# Pandsas string method / working with text data
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s.str.lower()
s.str.upper()
s.str.len()
idx = pd.Index([' jack', 'jill ', ' jesse ', 'frank'])
idx.str.strip() # 用来去掉空格
idx.str.lstrip()
idx.str.rstrip()
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_') # 两次str
Python 学习笔记
最新推荐文章于 2023-05-17 15:21:24 发布