我不认为你用来提取信息的方法是最稳健的,但我修改了你的代码,使它能够抓取你需要的信息。我更新了正则表达式以匹配括号,并在末尾添加了一段代码,用于去掉条目中的空格:
import urllib
import re
# Regular expressions applied to the HTML downloaded from Yahoo Finance.
# NOTE(review): keystat and date were each split across two lines in the
# source, and every pattern here is reduced to a bare capture group --
# presumably the HTML markup that anchored them was lost. Restore it before
# relying on the matches.
keystat = r'(.+?)'  # values on the key statistics page
date = r'(.+?)'  # obtain the date; only works for income statement
total = r'(.+?) '  # obtain data for any totals from statements
# Obtain data for any entries on statements that are not totals; the value
# may optionally be wrapped in parentheses.
# NOTE(review): this literal was split across two lines in the source and
# looks truncated -- presumably the surrounding HTML markup was lost.
entry = r'(\(?.+?\)?)'


def keystatfunc(symbol):
    """Download the Yahoo Finance key statistics page for *symbol*.

    Args:
        symbol: Ticker symbol; it is concatenated into the request URL
            unescaped.

    Returns:
        A list: every match of the market-cap pattern, followed by matches
        1 through 30 of the module-level ``keystat`` pattern.
    """
    url = 'http://finance.yahoo.com/q/ks?s=' + symbol + '+Key+Statistics'
    htmlfile = urllib.urlopen(url)
    try:
        htmltext = htmlfile.read()
    finally:
        # Release the connection even if reading the body fails.
        htmlfile.close()
    # NOTE(review): bare capture group -- the markup that identified the
    # market-cap element appears to be missing from this pattern.
    regex = '(.+?)'
    marketcap = re.findall(regex, htmltext)
    keystats = re.findall(keystat, htmltext)
    # One list with all the data taken from the key statistics page.
    return marketcap + keystats[1:31]
def statement(symbol, period, statementtype):
    """Download a financial statement page from Yahoo Finance and scrape it.

    Args:
        symbol: Ticker symbol; it is concatenated into the request URL
            unescaped.
        period: ``"quarterly"`` or ``"annual"``.
        statementtype: ``"is"`` (income statement), ``"bs"`` (balance sheet)
            or ``"cf"`` (cash flow statement).

    Returns:
        A list holding the matches of the module-level ``date`` pattern,
        then those of ``total``, then those of ``entry`` with space
        characters removed from each match.

    Raises:
        ValueError: If *period* or *statementtype* is not one of the values
            listed above.
    """
    # Validate before any network access; previously an unrecognised
    # combination left ``url`` unbound.
    if statementtype not in ("bs", "is", "cf"):
        raise ValueError(
            "statementtype must be 'is', 'bs' or 'cf', got %r" % (statementtype,)
        )
    if period not in ("quarterly", "annual"):
        raise ValueError(
            "period must be 'quarterly' or 'annual', got %r" % (period,)
        )

    url = 'http://finance.yahoo.com/q/' + statementtype + '?s=' + symbol
    # Only annual requests carry the '&annual' suffix. The quarterly "is" and
    # "cf" branches used to append it as well, making them identical to the
    # annual ones.
    if period == "annual":
        url += '&annual'

    htmlfile = urllib.urlopen(url)
    try:
        htmltext = htmlfile.read()
    finally:
        # Release the connection even if reading the body fails.
        htmlfile.close()

    dates = re.findall(date, htmltext)
    totals = re.findall(total, htmltext)
    entries = re.findall(entry, htmltext)
    # Drop the space characters left inside each scraped entry.
    entriesFixed = [e.replace(' ', '') for e in entries]
    return dates + totals + entriesFixed
# Example run: print the key statistics scraped for the "goog" ticker.
# Parenthesised so the line parses on Python 3 as well as Python 2.
print(keystatfunc("goog"))