一、便捷数据获取
(一)网络数据如何获取
1、抓取
- urllib内建模块(urllib.request)
- Requests第三方库
- Scrapy框架
2、解析
- BeautifulSoup库
- re模块
(1)用Python获取数据
import requests
import re
import pandas as pd
def retrieve_dji_list():
    """Scrape the Dow Jones 30 table from the CNN Money page.

    Returns:
        A list of dicts, one per regex match in the page, each with the keys
        'code', 'name' and 'price' (price converted to float). An empty list
        is returned when the HTTP request fails or nothing matches.
    """
    try:
        # A timeout keeps the call from hanging forever on a dead connection.
        r = requests.get('https://money.cnn.com/data/dow30/', timeout=30)
    except (requests.exceptions.RequestException, ConnectionError) as err:
        # requests raises its own exception hierarchy, which the built-in
        # ConnectionError alone does not cover. Return early: there is no
        # response object to parse after a failed request.
        print(err)
        return []
    # Raw string so that `\/` and `\n` reach the regex engine unchanged.
    search_pattern = re.compile(
        r'class="wsod_symbol">(.*?)<\/a>.*<span.*">(.*?)<\/span>.*\n.*class="wsod_stream">(.*?)<\/span>'
    )
    return [
        # Strip thousands separators so prices such as "1,234.56" convert.
        {'code': code, 'name': name, 'price': float(price.replace(',', ''))}
        for code, name, price in search_pattern.findall(r.text)
    ]
# Fetch the scraped records, load them into a DataFrame (one row per record,
# one column per dict key) and print the table.
dji_list = retrieve_dji_list()
djidf = pd.DataFrame(dji_list)
print(djidf)
import requests
import re
import json
import pandas as pd
def retrieve_quotes_historical(stock_code):
    """Scrape the historical price records embedded in a Yahoo Finance page.

    Args:
        stock_code: Ticker symbol inserted into the history-page URL.

    Returns:
        A list of the JSON records found under "HistoricalPriceStore", in the
        reverse of the order they appear in the page, excluding any record
        that carries a 'type' key. An empty list is returned when the request
        fails or the page does not contain the expected JSON fragment.
    """
    url = 'https://finance.yahoo.com/quote/%s/history?p=%s' % (stock_code, stock_code)
    try:
        # A timeout keeps the call from hanging forever on a dead connection.
        r = requests.get(url, timeout=30)
    except (requests.exceptions.RequestException, ConnectionError) as err:
        # requests raises its own exception hierarchy, which the built-in
        # ConnectionError alone does not cover. Return early: there is no
        # response object to parse after a failed request.
        print(err)
        return []
    m = re.search('"HistoricalPriceStore":{"prices":(.*?),"isPending"', r.text)
    if m is None:
        return []
    quotes = json.loads(m.group(1))[::-1]
    # NOTE(review): records with a 'type' key are presumably non-price events
    # (e.g. dividends) -- confirm against the page payload.
    return [item for item in quotes if 'type' not in item]
# Download the AXP records and tabulate them.
quotes = retrieve_quotes_historical('AXP')
quotesdf_ori = pd.DataFrame(quotes)
# Drop the 'adjclose' column. errors='ignore' keeps this step from raising
# KeyError when no records came back and the frame therefore has no columns.
quotesdf = quotesdf_ori.drop(['adjclose'], axis=1, errors='ignore')
print(quotesdf)
(2)直接下载数据
a. CSV格式数据的存取
import pandas as pd
# Load quotes from a CSV file; 'axp.csv' is a relative path, so it is looked
# up in the current working directory. This rebinds the name `quotesdf`.
quotesdf = pd.read_csv('axp.csv')
print(quotesdf)
# Fetch the AXP records again and save them as CSV in the working directory.
# to_csv is called with its defaults, so the row index is written as the
# first column of 'stockAXP.csv'.
quotes = retrieve_quotes_historical('AXP')
df = pd.DataFrame(quotes)
df.to_csv('stockAXP.csv')
b. Excel格式数据的存取
df