from lxml.html import parse
from urllib2 import urlopen
import pandas as pd
def unpack(row, kind='td'):
    """Return the text content of every ``kind`` element found under ``row``.

    ``row`` must provide ``findall`` (it is queried with the path
    ``'.//<kind>'``), and each element it returns must provide
    ``text_content()``.  The texts are returned as a list in the order
    ``findall`` yields them; no stripping is done here.

    kind -- tag name to look for; defaults to ``'td'`` (the script also
            passes ``'th'`` for header rows).
    """
    cells = row.findall('.//%s' % kind)
    return [cell.text_content() for cell in cells]
# Yahoo Finance option-chain page for AAPL.  The ``date`` query parameter is
# an integer (presumably a Unix timestamp selecting the expiry - TODO confirm).
optionurl = 'http://finance.yahoo.com/q/op?s=AAPL&date=1429833600'
# Close the HTTP response as soon as lxml has consumed it, even when parsing
# raises, so the underlying connection is not leaked.
response = urlopen(optionurl)
try:
    parsed = parse(response)
finally:
    response.close()
doc = parsed.getroot()
# Every <table> element found in the document; the sections below pick the
# call and put tables out of this list by position.
tables = doc.findall('.//table')
########################################
# shared helpers for the call / put tables
########################################
def _clean_header(raw_names):
    """Turn the raw text of the header cells into column names.

    A cell whose text contains no newline is used verbatim.  A cell with
    three or more lines contributes its third line (index 2), stripped of
    surrounding whitespace.  A cell with exactly two lines has no third
    line; instead of raising IndexError it falls back to its first
    non-blank line (or '' when both lines are blank).
    """
    names = []
    for raw in raw_names:
        parts = raw.split('\n')
        if len(parts) == 1:
            names.append(parts[0])
        elif len(parts) > 2:
            names.append(parts[2].strip())
        else:
            non_blank = [part.strip() for part in parts if part.strip()]
            names.append(non_blank[0] if non_blank else '')
    return names


def _parse_option_table(table):
    """Return ``(header, records)`` extracted from one <table> element.

    ``header`` is built from the <th> cells of the first <tr>.
    ``records`` holds, for every <tr> from the third one onwards, the list
    of its stripped <td> texts.  A table without any <tr> raises IndexError
    when the first row is accessed.
    """
    rows = table.findall('.//tr')
    header = _clean_header(unpack(rows[0], 'th'))
    # The first two rows are skipped: row 0 supplied the header above; row 1
    # is presumably a secondary header/filter row - TODO confirm against the
    # page markup.
    records = [[item.strip() for item in unpack(row, 'td')]
               for row in rows[2:]]
    return header, records


########################################
# process call option data
########################################
table = tables[1]
header, calldata = _parse_option_table(table)
calls = pd.DataFrame(calldata, columns=header)
########################################
# process put option data
########################################
table = tables[2]
header, putdata = _parse_option_table(table)
puts = pd.DataFrame(putdata, columns=header)
# Related links:
# http://python4econ.blogspot.hk/2013/02/building-stock-options-historical.html
# http://www.pythoncentral.io/python-beautiful-soup-example-yahoo-finance-scraper/
# http://blog.nag.com/2013/10/implied-volatility-using-pythons-pandas.html
# http://nycdatascience.com/students-work/python-multiprocessing-and-monte-carlo-option-pricing/