Python 学习笔记

# -*- coding: utf-8 -*-
"""
Created on Fri Jul 15 00:52:07 2016

@author: Alan Guo

"""
a = 1
a.bit_length()
type(a)
a.as_integer_ration()




import decimal
from decimal import Decimal
decimal.getcontext()
decimal.getcontext().prec = 4
e = Decimal(1) / Decimal(11)

t = 'this is a string object'
t.capitalize()
t.split()
t.find('string')
t.replace(' ','|')

import re  # regular expression
series = """
'01/18/2014 13:00:00', 100,'1st'
"""
dt = re.compile("'[0-9/:\s]+'")

result = dt.findall(series)

from datetime import datetime 
pydt = datetime.strptime(result[0].replace("'",""),
                          '%m/%d/%Y %H:%M:%S')
pydt
datetime.datetime(2014, 1, 18, 13, 0)
print pydt
# 日期格式转化
date='Wed, 11 Apr 2012 09:37:05 +0800'
dd=datetime.strptime(date,'%a, %d %b %Y %H:%M:%S %z')
dd.strftime('%Y-%m-%d %H:%M:%S')

'''%a Abbreviated weekday name   
%A Full weekday name   
%b Abbreviated month name   
%B Full month name   
%c Date and time representation appropriate for locale   
%d Day of month as decimal number (01 - 31)   
%H Hour in 24-hour format (00 - 23)   
%I Hour in 12-hour format (01 - 12)   
%j Day of year as decimal number (001 - 366)   
%m Month as decimal number (01 - 12)   
%M Minute as decimal number (00 - 59)   
%p Current locale's A.M./P.M. indicator for 12-hour clock   
%S Second as decimal number (00 - 59)   
%U Week of year as decimal number, with Sunday as first day of week (00 - 51)   
%w Weekday as decimal number (0 - 6; Sunday is 0)   
%W Week of year as decimal number, with Monday as first day of week (00 - 51)   
%x Date representation for current locale   
%X Time representation for current locale   
%y Year without century, as decimal number (00 - 99)   
%Y Year with century, as decimal number   
%z, %Z Time-zone name or abbreviation; no characters if time zone is unknown   
%% Percent sign  
'''

# tuple 

t = (1, 2.5, 'data')
t[2]
t.count('data')
t.index(1)  # 找到元素的位置

# list
I = [1, 2.5, 'data']
l = list(t)

l.append([4,3]) # append list at the end
# [1, 2.5, 'data', [4, 3]]
l.extend([1.0, 1.5, 2.0]) # append the elements of list
# [..... [4, 3], 1.0, 1.5, 2.0]
l.insert(1, 'insert') # insert object before index position
# [1, 'insert', 2.5,...]
l.remove('data') # remove first occurence of object
# [1, 'insert', 2.5, [4, 3]...]
p = l.pop(3) # remove and return object at index
print l, p
# [1, 'insert', 2.5, 1.0...] [4,3]
l[2:5] # 3rd to 5th elements



for i in range(1, 10):
    if i % 2 == 0: # % is for modulo
        print "%d is even" % i
    elif i % 3 == 0:
        print "%d is multiple of 3" % i
    else:
        print "%d is odd" % i
             
total = 0
while total < 100:
    total += 1 
print (total)

m = [i**2 for i in range(5)]
m

def even(x):
    return x % 2 == 0

map (even, range(10))

map(lambda x: x ** 2, range(10))

filter(even, range(15))

reduce(lambda x, y: x + y, range(15))

# Dict
d = {'Name': 'Angela Merkel',
     'Country': 'Germany',
     'Profession': 'Chancelor',
     'Age': 60
     }
type(d)    
d.keys()
d.values()
d.items()

for item in d.iteritems():
    print(item)

# Sets
s = set(['u', 'd', 'ud', 'du', 'd', 'du'])
t = set(['d', 'dd', 'uu', 'u'])
s.union(t)  # all of s and t
s.intersection(t)  # both in s and t
s.difference(t)  # in s but not t
s.symmetric_difference(t)  # in either one but not both

from random import randint
l = [randint(0, 10) for i in range(1000)]
    # 1,000 random integers between 0 and 10
len(l)  # number of elements in l

s = set(l)

from copy import deepcopy
v = [0.5, 0.75, 1.0, 1.5, 2.0]
m = 3 * [deepcopy(v), ]

import numpy as np
a = np.array([0, 0.5, 1.0, 1.5, 2.0])
type(a)
a[:2] # indexing as with list objects in 1 dimension
a.sum()
a.std()
a.cumsum()
np.sqrt(a)

b = np.array([a, a*2])
b[0]  # first row
b[0, 2]  # third element of first row
b.sum()
b.sum(axis=0)  # column-wise sum
b.sum(axis=1)
b.tolist()
b.flatten() ; 
b.flat # 用来循环,遍历, iterable
b.min(0) # or b.min(1)
b.prod()
b.cumprod()
b.any() ; b.all()
b.size # 元素的总数量

c = np.zeros((2, 3, 4), dtype='i',order='C') # also: np.ones()
d = np.ones_like(c, dtype='f', order='C') # also: np.zeros_like()


x = np.random.standard_normal((5,1000000))
y = 2 * x + 3 # linear equation y = a * x + b
C = np.array((x, y), order='C')
F = np.array((x, y), order='F')
x = 0.0; y = 0.0  # memory cleanup
C[:2].round(2)

a = np.arange(0, 4)
b = np.arange(1, 5)
np.add(a, b)
np.add(a, b, a)

x1 = 3; x2 = 2
x1//x2 # = 1 (floor divide)

a = np.arange(0, 12)
a.shape = 3,4
a.shape
a.reshape(12,)
a.shape = 4,-1
a.reshape(3,-1)
a.tofile('a.bin')
b = np.fromfile("a.bin", dtype = np.int32)

np.save("a.npy", a)
c = np.load('a.npy')

np.savez("  ",a, b,  ) # 保存多数组至压缩文件

a = np.arange(0, 12, 0.5).reshape(4, -1)
np.savetxt("a.txt",a)  # 默认以空格分隔
np.loadtxt('a.txt')

np.savetxt('a.txt', a, fmt='%d', delimiter=',')
np.loadtxt('a.txt', delimiter=',')   #读入的时候也需要指定

a = np.arange(8)
b = np.add.accumulate(a)
c = a + b
f = file('result.npy', 'wb')
np.save(f, a)  # 按顺序将a, b, c保存进文件对象f
np.save(f, b)
np.save(f, c)
f.close()
f = file('result.np', 'rb')
np.load(f)  # 顺序从文件对象f中读取内容
np.load(f)
np.load(f)


#  Data Visualization

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
# %matplotlib inline
np.random.seed(1000)
y = np.random.standard_normal(20)
x = range(len(y))
plt.plot(x, y)  # 也可写成 plt.plot(y)

plt.plot(y.cumsum())
plt.grid(True)  # adds a grid
plt.axis('tight')  # adjusts the axis ranges

plt.plot(y.cumsum())
plt.grid(True)
plt.xlim(-1, 20)
plt.ylim(np.min(y.cumsum())-1, np.max(y.cumsum())+1)
"""
===============================

"""
np.random.seed(2000)
y = np.random.standard_normal((20, 2)).cumsum(axis=0)

plt.figure(figsize=(7, 4))
# the figsize parameter defines the size of the figure in (width, height)
plt.plot(y.cumsum(), 'b', lw=1.5)
plt.plot(y.cumsum(), 'yo')
plt.grid(True)
plt.axis('tight')
plt.xlabel('index')
plt.ylabel('value')
plt.title('A Simple Plot')

plt.figure(figsize=(7,4))
plt.plot(y[:, 0], lw=1.5, label='1st')
plt.plot(y[:, 1], lw=1.5, label='2nd')
plt.plot(y, 'ro')
plt.grid(True)
plt.legend(loc=0) # 0: best possible
plt.axis('tight')
plt.xlabel('index')
plt.ylabel('value')
plt.title('A Simple Plot')

fig, ax1 = plt.subplots() 
# Plot first data set using first (left) axis
plt.plot(y[:, 0], 'b', lw=1.5, label='1st')
plt.plot(y[:, 0], 'ro')
plt.grid(True)
plt.legend(loc=8)
plt.axis('tight')
plt.xlabel('index')
plt.ylabel('value 1st')
plt.title('A Simple Plot')
ax2 = ax1.twinx()
# plot second data set using second (right) axis
plt.plot(y[:, 1], 'g', lw=1.5, label='2nd')
plt.plot(y[:, 1], 'ro')
plt.legend(loc=0)
plt.ylabel('value 2nd')
fig, ax1 = plt.subplots()

plt.figure(figsize=(7,5))
plt.subplot(211)
plt.plot(y[:, 0], lw=1.5, label='1st')
plt.plot(y[:, 0], 'ro')
plt.grid(True)
plt.legend(loc=0)
plt.axis('tight')
plt.ylabel('value')
plt.title('A Simple Plot')
plt.subplot(212)
plt.plot(y[:, 1], 'g', lw=1.5, label='2nd')
plt.plot(y[:, 1], 'ro')
plt.grid(True)
plt.legend(loc=0)
plt.axis('tight')
plt.xlabel('index')
plt.ylabel('value')

plt.bar(np.arange(len(y)), y[:,1], width=0.5,
        color='g', label='2nd')
        
##  Other Plot Styles
# Scatter Plot
y = np.random.standard_normal((1000,2))
plt.figure(figsize=(7,5))
plt.plot(y[:,0], y[:,1], 'ro')
plt.grid(True)
plt.xlabel('1st')
plt.ylabel('2nd')
plt.title('Scatter Plot')

plt.figure(figsize=(7,5))
plt.scatter(y[:, 0], y[:, 1], marker='o', color='r')
plt.grid(True)
plt.xlabel('1st')
plt.ylabel('2nd')
plt.title('Scatter Plot')

c = np.random.randint(0, 10, len(y))
plt.figure(figsize=(7,5))
plt.scatter(y[:,0], y[:,1],c=c,marker='o')
plt.colorbar()
plt.grid(True)
plt.xlabel('1st')
plt.ylabel('2nd')
plt.title('Scatter Plot')

# Histogram
plt.figure(figsize=(7,4))
plt.hist(y, label=['1st', '2nd'], bins=25,color=['r','b'])

# Stacked hist 
plt.hist(y, label=['1st', '2nd'], color=['b','g'],
         stacked=True, bins=20)

#  box plot
fig,ax = plt.subplots(figsize=(7,4))
plt.boxplot(y)
plt.grid(True)
plt.setp(ax, xticklabels=['1st', '2nd'])

##  
import matplotlib.finance as mpf
start = (2014, 5, 1)
end = (2014, 6, 30)
quotes = mpf.quotes_historical_yahoo_ochl('^GDAXI', start, end)
quotes[:2]

fig, ax = plt.subplots(figsize=(8,5))
fig.subplots_adjust(bottom=0.2)
mpf.candlestick_ochl(ax, quotes, width=0.6, colorup='r', colordown='g')
plt.grid(True)
ax.xaxis_date()
#  dates on the x-axis
ax.autoscale_view()
plt.setp(plt.gca().get_xticklabels(),rotation=30)

###
quotes = np.array(mpf.quotes_historical_yahoo_ochl('YHOO',start,end))
fig, (ax1, ax2) = plt.subplots(2,sharex=True, figsize=(8,6))
mpf.candlestick_ochl(ax1, quotes, width=0.6, colorup='r', colordown='g')
ax1.set_title('Yahoo Inc.')
ax1.set_ylabel('index level')
ax1.grid(True)
ax1.xaxis_date()
plt.bar(quotes[:, 0]-0.25, quotes[:,5],width=0.5)
ax2.set_ylabel('volume')
ax2.grid(True)
ax2.autoscale_view()
plt.setp(plt.gca().get_xticklabels(),rotation=30)

## 3D Plotting

strike = np.linspace(50, 150, 24)
ttm = np.linspace(0.5, 2.5, 24)
strike, ttm = np.meshgrid(strike, ttm)
iv = (strike - 100)**2 / (100 * strike)/ttm
# generate fake implied volatilities
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(9, 6))
ax = fig.gca(projection='3d')

surf = ax.plot_surface(strike, ttm, iv, rstride=2, cstride=2,
                       cmap=plt.cm.coolwarm, linewidth=0.5,
                       antialiased=True)
ax.set_xlabel('strike')
ax.set_ylabel('time-to-maturity')
ax.set_zlabel('implied volatility')

fig.colorbar(surf, shrink=0.5, aspect=5)

## ##############  Volatility surface
fig = plt.figure(figsize=(8,5)) 
ax = fig.add_subplot(111, projection='3d')
ax.view_init(30, 60)

ax.scatter(strike, ttm, iv, zdir='z', s=25,
           c='b', marker='^')

ax.set_xlabel('strike')
ax.set_ylabel('time-to-maturity')                      
ax.set_zlabel('implied volatility')

#########################################
# pandas basic

import numpy as np
import pandas as pd
df = pd.DataFrame([10, 20, 30, 40], columns=['numbers'],
                  index=['a', 'b', 'c', 'd'])
df
df.index # the index value
df.columns # the column names
df.ix['c'] # selection via index
df.ix[['a', 'd']] # selection of multiple indices
df.ix[['a', 'd']] # selection via Index object
df.sum() # sum per column
df.max()
df.min()
df.apply(lambda x: x**2) # square of every element
df**2 # agian square, this time Numpy-like
df['floats'] = (1.5, 2.5, 3.5, 4.5)
# new column is generated
df['floats']  # selection of new column

df['names'] = pd.DataFrame(['Yves', 'Guido', 'Felix', 'Francesc'],
                            index=['d', 'a', 'b', 'c'])

df.append({'number':100, 'floats':5.75, 'names':'Henry'},
           ignore_index=True)
# temporary object, df not changed, and index not changed

df = df.append(pd.DataFrame({'numbers':100, 'float':5.75,
                             'names':'Henry'}, index=['z']))                       
# it's better this way
df
df = df.join(pd.DataFrame([1, 4, 9, 16, 25], 
                     index=['a', 'b', 'c', 'd', 'y'],
                    columns=['squares']),
                    how = 'outer')                         
df[['numbers', 'squares']].mean() # column-wise mean
df[['numbers', 'squares']].std()  # column-wise std

df2[df2['E'].isin(['two','four'])]
df.sub(s, axis='index') # s也是个index
######
a = np.random.standard_normal((9, 4))              
a.round(6)
df = pd.DataFrame(a)
df.columns = [['No1', 'No2', 'No3', 'No4']]
df['No3'][3] ## value in column No2 at index 3
dates = pd.date_range('2015-1-1', periods=9, freq='M')
dates 
df.index = dates
###  from dataframe to array
np.array(df).round(6)
### basic functions with dataframe
df.sum()
df.mean()
df.cumsum()
df.describe()
np.sqrt(df)
np.sqrt(df).sum()

%matplotlib inline
df.cumsum().plot(lw=2.0)
df.plot(secondary_y='weight', figsize=(14,7))
#### 画两个y轴
type(df)
df['No1']

import matplotlib.pyplot as plt
df['No1'].cumsum().plot(style='r', lw=2.)
plt.xlabel('date')
plt.ylabel('value')

### GroupBy Operations
df['Quarter'] = ['Q1', 'Q1','Q1', 'Q2', 'Q2', 'Q2', 'Q3', 'Q3', 'Q3']
groups = df.groupby('Quarter')
groups.mean()
groups.max()
groups.size()

df['Odd_Even'] = ['Odd', 'Even', 'Odd', 'Even', 'Odd', 'Even',
                    'Odd', 'Even', 'Odd']
groups = df.groupby(['Quarter', 'Odd_Even'])
groups.size()
groups.mean()

##  Read financial data
import pandas.io.data as web
DAX = web.DataReader(name='SPY', data_source='yahoo',
                     start='2000-1-1')
DAX.info()
DAX.tail()
DAX['Close'].plot(figsize=(8,5))

%time DAX['Return'] = np.log(DAX['Close']/DAX['Close'].shift(1))
DAX[['Close', 'Return']].tail()
DAX[['Close', 'Return']].plot(subplots=True, color='b',
                                figsize=(8,5))
DAX['42d'] = pd.rolling(DAX['Close'], window=42).mean()
DAX['252d'] = pd.rolling(DAX['Close'], window=252).mean()
DAX[['Close', '42d', '252d']].tail.()
DAX[['Close', '42d', '252d']].plot(figsize=(8, 5))

###
import math


## Regression Analysis
import pandas as pd
from urllib import urlretrieve

es_url = 'http://www.stoxx.com/download/historical_values/hbrbcpe.txt'
vs_url = 'http://www.stoxx.com/download/historical_values/h_vstoxx.txt'
urlretrieve(es_url, './data/es.txt')
urlretrieve(vs_url, './data/vs.txt')
!s -o ./data/*.txt
# Windows: use dir

lines = open('./data/es.txt','r').readlines()
lines = [line.replace(' ', '') for line in lines]
for line in lines[3883:3890]:
    print line[41:]

## To make the data set easier to import
new_file = open('./data/es50.txt', 'w')
# opens a new file
new_file.writelines('date' + lines[3][:-1]
                    + ';DEL'+ lines[3][-1])
# writes the corrected third line of the original file
# as first line of new file
new_file.writelines(lines[4:])                    
# write the remaining lines of the original file
new_file.close()

##
new_lines = open('./data/es50.txt','r').readlines()
new_lines[:5]
# now import with read_csv function
es = pd.read_csv('./data/es50.txt', index_col=0,
                parse_dates=True, sep=';', dayfirst=True)
np.round(es.tail())           
## now delete helper column
del es['DEL']   
es.info()  
## directly import from url
cols = ['SX5P', 'SX5E', 'SXXP', 'SXXE', 'SXXF',
        'SXXA', 'DKSF', 'DKXF']
es = pd.read_csv(es_url, index_col=0, parse_dates=True,
                 sep=';', dayfirst=True, header=None, skiprows=4, names=cols)
es.tail()
######
vs = pd.read_csv('./data/vs.txt', index_col=0, header=2,
                 parse_dates=True, sep=',', dayfirst=True)
vs.info()

### creat a new dataframe
import datetime as dt
data = pd.DataFrame({'EUROSTOXX' :
                    es['SX5E'][es.index > dt.datetime(1999, 1, 1)]})
data = data.join(pd.DataFrame({'VSTOXX' :
                vs['V2TX'][vs.index > dt.datetime(1999, 1, 1)]}))
data = data.fillna(method='ffill')                
data.info()
data.tail()
data.plot(subplots=True, grid=True, style='b', figsize=(8, 6))

rets = np.log(data / data.shift(1))
rets.head()
rets.plot(subplots=True, grid=True, style='b', figsize=(8,6))
rets = rets.dropna()
ix_infs = np.where(np.isinf(rets))
rets.drop(rets.index[ix_infs[0]],inplace=True)
#方法一:直接
del DF['column-name'] # 直接删除
# 方法二:采用drop方法,有下面三种等价的表达式:
DF= DF.drop('column_name', 1);
DF.drop('column_name',axis=1, inplace=True)
DF.drop([DF.columns[[0,1, 3]]], axis=1,inplace=True)   # Note: zero indexed

### regression
xdat = rets['EUROSTOXX']
ydat = rets['VSTOXX']
model = pd.ols(y=ydat, x=xdat)
model
model.beta

plt.plot(xdat, ydat, 'r.')
ax = plt.axis() # grab axis values
x = np.linspace(ax[0], ax[1] + 0.01)
plt.plot(x, model.beta[1] + model.beta[0]*x, 'b', lw=2)
plt.grid(True)
plt.axis('tight')
plt.xlabel('EURO STOXX 50 returns')
plt.ylabel('VSTOXX returns')

## Rolling correlation
rets.corr()
pd.rolling_corr(rets['EUROSTOXX'], rets['VSTOXX'],
                window=252).plot(grid=True, style='b')

##High-Frequency Data
import numpy as np
import pandas as pd
import datetime as dt
from urllib import urlretrieve
%matplotlib inline
## web-based API 
url1 = 'http://hopey.netfonds.no/posdump.php?'
url2 = 'date=%s%s%s&paper=AAPL.O&csv_format=csv'
url = url1 + url2

year = '2014'
month = '09'
days = ['22', '23', '24', '25'] 
# dates might need to be updated
AAPL = pd.DataFrame()
for day in days:
    AAPL = AAPL.append(pd.read_csv(url%(year, month, day),
                       index_col=0, header=0, parse_dates=True))
AAPL.columns = ['bid', 'bdepth', 'bdeptht',
                'offer', 'odepth', 'odeptht']
    # shorter column names
AAPL.info()

#### writing objects to disk
path = 'D:\wd.python\forise\'
import numpy as np
from random import gauss
 
a = [gauss(1.5, 2) for i in range(1000000)]
# generation of normally distributed randoms
import pickle
pkl_file = open('data.pkl', 'wb')
# open file for writing
# Note: existing file might be overwritten

%time pickle.dump(a, pkl_file)
pkl_file
pkl_file.close()

# read data to memory
pkl_file = open(path + 'data.pkl', 'rb') # open file for reading
%time b = pickle.load(pkl_file)
b[:5]

np.allclose(np.array(a), np.array(b))

## write two objects
pkl_file = open(path + 'data.pkl', 'wb')
pickle.dump(np.array(a), pkl_file)
pickle.dump(np.array(a)**2, pkl_file)
## read two objects from one file, FIFO rule,
## load twice

### reading and writing text files
rows = 5000
a = np.random.standard_normal((rows, 5)) # dummy data
a.round(4)

import pandas as pd
t = pd.date_range(start='2014/1/1', periods=rows, freq='H')
    # set of hourly datetime range

csv_file = open(path + 'data.csv', 'w')
    # open file for writing 
header = 'data,no1,no2,no3,no4.no5\n'
csv_file.write(header)
for t_, (no1, no2, no3, no4, no5) in zip(t,a):
    s = '%s,%f,%f,%f,%f,%f\n' % (t_, no1, no2, no3,  no4, no5)
    csv_file.write(s)
    
csv_file = open(path + 'data.csv', 'r') # open file for reading
for i in range(5):
    print csv_file.readline(),
# or we can read all the content by readlines
csv_file = open(path + 'data.csv', 'r')
content = csv_file.readlines()
for line in content[:5]:
    print line,
csv_file.close()

###  SQL Databases
import sqlite3 as sq3
query = 'CREATE TABLE numbs (Date date, No1 real, No2 real)'
con = sq3.connect(path + 'numbs.db')
con.execute(query)
con.commit()  # to make the query more effective, call the method commit

## create data
import datetime as dt
con.execute('INSERT INTO numbs VALUES(?, ?, ?)',
            (dt.datetime.now(), 0.12, 7.3))
data = np.random.standard_normal((10000, 2)).round(5)
for row in data:
    con.execute('INSERT INTO numbs VALUES(?, ?, ?)'
                (dt.datetime.now(), row[0], row[1]))
con.commit()

### Writing and Reading NumPy Arrays
import numpy as np
dtimes = np.arange('2015-01-01 10:00:00', '2021-12-31 22:00:00',
                   dtype='datetime64[m]') # minute intervals
len(dtimes)
dty = np.dtype([('Date', 'datetime64[m]'), ('No1', 'f'), ('No2', 'f')])
data = np.zeros(len(dtimes), dtype=dty)
data['Date'] = dtimes

a = np.random.standard_normal((len(dtimes), 2)).round(5)
data['No1'] = a[:,0]
data['No2'] = a[:,1]

%time np.save(path + 'array', data) # suffix .npy as added
%time np.load(path + 'array.npy')



### Chapter 9 : Mathematical tools
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


from datetime import datetime
from xlrd import xldate_as_tuple

d=datetime(xldate_as_tuple(x,0)) # x就是你那串数字的变量

# 用Pandas完成Excel中常见的任务
# http://python.jobbole.com/80972/
df.query('name == ["Kulas Inc","Barton LLC"]').head()
df[df["sku"].map(lambda x: x.startswith('B1'))]
df[df['sku'].str.contains('B1')].head()

## 如何利用字典循环, loop with dictionary
col_names = list('abcd')
dic = {}
for col in col_names:
	temp = range(len(col_names)) # could be a Series or DataFrame
	dic.update({col:temp})
df = pd.DataFrame(dic, index=range(len(col_names)))
dic.keys() # list in python 2, but Dict in python 3
dic.values() # list w

# 批量更改列名
df.columns
# Index([u'$a', u'$b', u'$c', u'$d', u'$e'], dtype=object)
df.rename(columns=lambda x: x[1:], inplace=True)
df = df.rename(columns=lambda x: x.replace('$', '')
df = df.rename(columns={'$a': 'a', '$b': 'b'})
df.columns
# Index([u'a', u'b', u'c', u'd', u'e'], dtype=object)
# NaN
df = pd.DataFrame(data=np.nan, columns=list('abcd'))	

# http://www.jb51.net/article/49457.htm
# sqlalchemy, python 和 mysql 交互
from sqlalchemy.types import String

data.to_sql('data_dtype', engine, dtype={'Col_1': String})
pd.read_sql_table('data', engine, index_col='id')
pd.read_sql_table('data', engine, columns=['Col_1', 'Col_2'])
pd.read_sql_table('data', engine, parse_dates=['Date'])
pd.read_sql_table('data', engine, parse_dates={'Date': '%Y-%m-%d'})

pd.read_sql_query('SELECT * FROM data', engine)
pd.read_sql_query("SELECT id, Col_1, Col_2 FROM data WHERE id = 42;", engine)

# 对于pandas.Series的操作, descriptive statistics
Series.cumprod()
Series.cummin()
Series.cummax(), .abs(), .all(),.any(),.autocorr(),
.between(),.clip(),.clip_lower(),.clip_upper(),.count(),.cov(),
Series.describe()
Series.diff(), .factorize(), .kurt(),.mad(),
Series.nlargest(),.nsmallest() # Faster than .sort_values(ascending=False\True).head(n)
Series.pct_change, .prod(), .quantile(), .rank(), .sem()
Series.var(), 
Series.unique() # 生成array,不重复的数 还有.nunique(),number of unique()
Series.is_monotonic, is_unique,is_monotonic_increasing/decreasing
Series.value_counts

# 日期时间方法处理汇总
import datetime
now = datetime.datetime.now() # datetime
import time
time.time() # 时间戳 timestamp
time.localtime() # time tuple
datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
datetime.date.now().date() # date
datetime.date.today() # date
# 获取明天/前N天
datetime.date.today() + datetime.timedelta(days=1) # 明天
# 获取当天开始和结束时间
datetime.datetime.combine(datetime.date.today(), datetime.time.min()) # datetime.time.max
# 获取两个datetime的时间差
(datetime.datetime(2017,3,16,21,30) - datetime.datetime.now()).total_seconds() 
## datetime.timedelta().total_seconds()
# 获取本周/本月/上月最后一天
today = datetime.date.today()
sunday = today + datetime.timedelta(6 - today.weekday())
# 本月
import calendar
today = datetime.date.today()
_,last_day_num = calendar.monthrange(today.year, today.month) # 返回tuple(本月第一天是星期几,这个月总共多少天)
last_day = datetime.date(today.year, today.month, last_day_num)
# 获取上个月的最后一天
today = datetime.date.today()
first = datetime.date(day=1, month=today.month, year=today.year)
lastMonth = first - datetime.timedelta(days=1)
# string 转换
datetime.datetime.strptime("2014-12-31 18:20:10","%Y-%m-%d %H:%M:%S")
datetime.datetime.now().timetuple() # datetime to timetuple
# timetuple to datetime
# timetuple -> timestamp -> datetime
# datetime -> date
datetime.datetime.now().date()
# date -> datetime
datetime.date.today()
today = datetime.date.today()
datetime.datetime.combine(today,datetime.time())
datetime.datetime.combine(today,datetime.time.min)
# datetime <=> timestamp
# datetime -> timestamp
now = datetime.datetime.now()
timestamp = time.mktime(now.timetuple())
# timestamp -> datetime
datetime.datetime.fromtimestamp(time.time())

# Pandsas string method / working with text data 
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s.str.lower()
s.str.upper()
s.str.len()
idx = pd.Index([' jack', 'jill ', ' jesse ', 'frank'])
idx.str.strip() # 用来去掉空格
idx.str.lstrip()
idx.str.rstrip()
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_') # 两次str



评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值