版权声明:未经允许,随意转载,请附上本文链接谢谢(づ ̄3 ̄)づ╭❤~
https://blog.csdn.net/xiaoduan_/article/details/80835265
新浪股票信息爬取
老师想要通过数据做股票预测,我就帮老师爬取了一点数据。
大一时候写的代码了,比较蠢。
不做解释了,只贴上当时的代码。留作纪念吧。
1. 爬取股票列表
如果要爬取各个股票的信息,首先要有股票的代码,所以先爬取股票的编号,为后续做准备。
# -*- coding:UTF-8 -*-
import requests
from lxml import etree
from pymongo import MongoClient
import re
import pandas as pd
# Scrape the stock list page, store every (name, code) pair in MongoDB and
# dump the raw link texts to a CSV file.
client = MongoClient()
db = client.gupiao
my_set = db.gupiao_number

url = 'http://quote.eastmoney.com/stocklist.html#sz'
# A timeout keeps the script from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
r.encoding = r.apparent_encoding
s = etree.HTML(r.text)
# The split below treats each link text as "name(code)".
number_list = s.xpath('//*[@id="quotesearch"]//a[@target]/text()')

# Numbering starts at 1 so the first record is reported as record 1
# (the previous counter reported it as record 2).
for count, i in enumerate(number_list, 1):
    name = re.split(r'\(|\)', i)[0:2]
    if len(name) < 2:
        # No "(code)" part in this link text, so there is no code to store.
        print("跳过无法解析的条目:{}".format(i))
        continue
    print("正在插入第{}条数据".format(count))
    # insert_one replaces the deprecated Collection.insert.
    my_set.insert_one(dict(name=name[0], number=name[1]))

# Leftover probe of the third entry; guarded so a short list cannot raise.
gupiao = re.match(r'(.*)(\d)', number_list[2]) if len(number_list) > 2 else None
print(len(number_list))
df = pd.DataFrame(number_list)
df.to_csv('number_list.csv', encoding='utf-8-sig')
2. 通过股票编号爬取各个股票近十年的各个季度的股票数据
# -*- coding:UTF-8 -*-
from datetime import datetime
import requests
from lxml import etree
from pymongo import MongoClient
# MongoDB handles shared by the functions below.
client = MongoClient()

# Source collection holding the stock codes (read through ``cursor``).
db_number = client['gupiao']
my_number = db_number['gupiao_number']
cursor = my_number.find()

# Target collection that receives the crawled per-stock documents.
db = client['gupiaoDB']
my_set = db['fulldata']

# Stock codes copied out of ``cursor`` by get_gupiao_number().
my_number_list = list()
# Module-level placeholder; get_data() binds its own local of the same name.
my_number_list_new = list()
def get_gupiao_number():
    """Copy the ``number`` field of every cursor document into ``my_number_list``.

    Prints a confirmation message followed by the current timestamp.
    """
    my_number_list.extend(record['number'] for record in cursor)
    print("获取股票编号成功")
    finished_at = datetime.now()
    print(finished_at.strftime('%Y-%m-%d %H:%M:%S\n'))
def get_data(resume_after='300285'):
    """Crawl the financial indicator pages of the pending stocks into MongoDB.

    For each stock the 2017 page is fetched first; its year links are then
    followed through get_usual_data() and everything is merged into one
    document that is inserted into ``my_set``.

    Args:
        resume_after: Stock code after which crawling resumes (the codes up
            to and including it are skipped). Pass ``None`` to crawl every
            code in ``my_number_list``. If the code is not in the list a
            notice is printed and the crawl starts from the beginning.
    """
    if resume_after is not None and resume_after in my_number_list:
        pending = my_number_list[my_number_list.index(resume_after) + 1:]
    else:
        if resume_after is not None:
            print("未找到股票{},从头开始爬取".format(resume_after))
        pending = list(my_number_list)

    total = len(pending)
    # enumerate gives the position directly instead of repeated list.index()
    # scans, which are O(n) each and wrong when a code appears twice.
    for position, i in enumerate(pending):
        try:
            url = 'http://money.finance.sina.com.cn/corp/go.php/vFD_FinancialGuideLine/' \
                  'stockid/{}/ctrl/2017/displaytype/4.phtml'.format(i)
            # Timeout errors are handled by the surrounding except like any
            # other failure for this stock.
            r = requests.get(url, timeout=30)
            r.encoding = r.apparent_encoding
            s = etree.HTML(r.text)
            # The first link is dropped; the 2017 data comes from this page.
            year_list = s.xpath('//*[@id="con02-1"]/table/td/a/text()')[1:]
            print("爬取进程:{:.2%}\n".format((position + 1) / total))
            print("本次运行已经保存{}支股票\n".format(position + 1))
            print("剩余{}支股票\n".format(total - position))
            print("正在爬取{}号股票\n".format(i))

            dict_2017 = get_2017_data(i, 2017, s)
            if not dict_2017:
                continue
            full_data_list = dict(dict_2017)

            for j in year_list:
                try:
                    dict_usua = get_usual_data(i, j)
                except Exception as e:
                    print("异常:{}".format(e))
                    continue
                # get_usual_data() returns None for a page without a stock
                # name; skip it instead of failing inside dict.update().
                if dict_usua:
                    full_data_list.update(dict_usua)

            # insert_one replaces the deprecated Collection.insert.
            my_set.insert_one(full_data_list)
            print("{}号股票所有数据保存成功\t".format(i))
            print(datetime.now().strftime('%Y-%m-%d %H:%M:%S\n'))
        except Exception as e:
            # Best effort: report the failure and move on to the next stock.
            print("异常:{}".format(e))
            continue
def get_2017_data(gupiao_number, year, s):
    """Extract three quarter columns from an already parsed indicator page.

    Args:
        gupiao_number: Stock code, stored under the ``id`` key.
        year: Year used to build the ``<year>-12-31``, ``<year>-9-30`` and
            ``<year>-6-30`` keys and the log messages.
        s: Parsed page object exposing ``xpath()``.

    Returns:
        A dict with ``id``, ``name`` and one ``{indicator: value}`` dict per
        quarter column that contains data, or ``None`` when the page has no
        stock name.
    """
    my_gupiao_name = s.xpath('//*[@id="toolbar"]/div[1]/h1/a/text()')
    if not my_gupiao_name:
        print("\t{}号股票{}年数据为空\t".format(gupiao_number, year))
        print(datetime.now().strftime('\t%Y-%m-%d %H:%M:%S\n'))
        return None

    cell = '//*[@id="con02-1"]/table[2]//tr/td[{}]'
    # Indicator names come from the links in the first column.
    data_1 = s.xpath(cell.format(1) + '/a/text()')

    time = {'id': gupiao_number, 'name': my_gupiao_name[0]}
    for column, day in ((2, '12-31'), (3, '9-30'), (4, '6-30')):
        # The first text node of each value column is skipped.
        values = s.xpath(cell.format(column) + '/text()')[1:]
        key = '{}-{}'.format(year, day)
        if values:
            time[key] = dict(zip(data_1, values))
        else:
            # Report the requested year rather than a hard-coded 2017.
            print('{}号股票{}数据不存在'.format(gupiao_number, key))

    print("\t{}号股票{}年数据返回成功\t".format(gupiao_number, year))
    print(datetime.now().strftime('\t%Y-%m-%d %H:%M:%S\n'))
    return time
def get_usual_data(gupiao_number, year):
    """Download one year's indicator page for a stock and extract its quarters.

    Args:
        gupiao_number: Stock code, inserted into the URL and stored under
            the ``id`` key.
        year: Year inserted into the URL and used to build the
            ``<year>-12-31`` / ``-9-30`` / ``-6-30`` / ``-3-31`` keys.

    Returns:
        A dict with ``id``, ``name`` and one ``{indicator: value}`` dict per
        quarter column that contains data, or ``None`` when the page has no
        stock name.

    Raises:
        requests.RequestException: if the page cannot be fetched in time.
    """
    url = 'http://money.finance.sina.com.cn/corp/go.php/vFD_FinancialGuideLine/' \
          'stockid/{}/ctrl/{}/displaytype/4.phtml'.format(gupiao_number, year)
    r = requests.get(url, timeout=30)
    r.encoding = r.apparent_encoding
    s = etree.HTML(r.text)

    my_gupiao_name = s.xpath('//*[@id="toolbar"]/div[1]/h1/a/text()')
    if not my_gupiao_name:
        print("\t{}号股票{}年数据为空\t".format(gupiao_number, year))
        print(datetime.now().strftime('\t%Y-%m-%d %H:%M:%S\n'))
        return None

    cell = '//*[@id="con02-1"]/table[2]//tr/td[{}]'
    # Indicator names come from the links in the first column.
    data_1 = s.xpath(cell.format(1) + '/a/text()')

    time = {'id': gupiao_number, 'name': my_gupiao_name[0]}
    for column, day in ((2, '12-31'), (3, '9-30'), (4, '6-30'), (5, '3-31')):
        # The first text node of each value column is skipped.
        values = s.xpath(cell.format(column) + '/text()')[1:]
        if values:
            time['{}-{}'.format(year, day)] = dict(zip(data_1, values))
        else:
            print("\t{}号股票{}-{}数据为空\t".format(gupiao_number, year, day))

    # Reported once, after the per-quarter checks.
    print("\t{}号股票{}年数据返回成功\t".format(gupiao_number, year))
    print(datetime.now().strftime('\t%Y-%m-%d %H:%M:%S\n'))
    return time
# Script entry point: load the stock codes first, then crawl them.
if __name__ == "__main__":
    get_gupiao_number()
    get_data()
3. 通过股票列表爬取股票近期的复权数据
import random
import requests
from datetime import datetime
from lxml import etree
import pymysql
import pandas as pd
def get_gupiao_list(csv_path=r'C:\Users\15810\Desktop\python_code\Pctest\gupiao.csv'):
    """Load stock codes and names from the third and fourth CSV columns.

    The file is parsed once and both columns are read as text, so stock
    codes keep their leading zeros (e.g. ``000001``) instead of being
    converted to integers.

    Args:
        csv_path: Path of the UTF-8 encoded CSV file. Defaults to the
            location the script was originally written for.

    Returns:
        A ``(gupiao_full_data, df_number_list)`` tuple: a dict mapping each
        stock code to its name, and the list of codes in file order.
    """
    frame = pd.read_csv(csv_path, encoding='utf-8', usecols=[2, 3], dtype=str)
    # usecols keeps the file's column order: codes first, then names.
    df_number_list = frame.iloc[:, 0].tolist()
    df_name_list = frame.iloc[:, 1].tolist()
    gupiao_full_data = dict(zip(df_number_list, df_name_list))
    return gupiao_full_data, df_number_list
def get_data(gupiao_full_data, df_number_list, table_name):
    """Crawl the adjusted-price history of every stock into a MySQL table.

    Creates the table, then for each stock reads the year and quarter
    selectors from the 2017 Q4 page and calls tar_data() for every
    year/quarter combination.

    Args:
        gupiao_full_data: Dict whose keys are the stock codes to crawl.
        df_number_list: List of stock codes, used for progress reporting.
        table_name: Name of the MySQL table that receives the rows.
    """
    ua_list = [
        # One User-Agent string, written as two adjacent literals.
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
        'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
        'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)',
    ]
    # random.choice returns the string itself; str(random.choices(...))
    # would send the repr of a one-element list as the header value.
    ua = {'User-Agent': random.choice(ua_list)}
    creat_table(table_name)  # create the table

    url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_FuQuanMarketHistory/stockid/{}.phtml?year=2017&jidu=4'
    total = len(df_number_list)
    for i in gupiao_full_data:
        position = df_number_list.index(i) + 1
        print(datetime.now().strftime('%Y-%m-%d %H:%M:%S\n'))
        print("正在爬取第{}只股票".format(position))
        print("进度{:.2%}".format(position / total))
        r = requests.get(url.format(i), headers=ua)
        r.encoding = r.apparent_encoding
        s = etree.HTML(r.text)
        year = s.xpath('//*[@id="con02-4"]/table[1]/tr/td/form/select[@name="year"]/option/text()')
        jidu = s.xpath('//*[@id="con02-4"]/table[1]/tr/td/form/select[@name="jidu"]/option/text()')
        # Quarters are addressed by number 1..N, not by the option text.
        for j in year:
            for k in range(1, len(jidu) + 1):
                print("\t正在爬取{}股票{}年第{}季度数据".format(i, j, k))
                tar_data(i, j, k, table_name)
def tar_data(gupiao_number, year, jidu, table_name):
    """Fetch one stock/year/quarter page and insert its rows into MySQL.

    Each inserted row is ``[stock code, date, 7 table values, stock name]``.
    When the page has no linked dates, the date is taken from the first of
    every 8 text cells. Otherwise cells come in groups of 9, the first two
    are dropped and the linked date is used. If the number of linked dates
    does not match the number of rows, the quarter is skipped.

    NOTE: the stock name is looked up in the module-level
    ``gupiao_full_data`` dict, which must be defined before this is called.

    Args:
        gupiao_number: Stock code used in the URL and as the first column.
        year: Year query parameter.
        jidu: Quarter query parameter.
        table_name: Name of the MySQL table that receives the rows.
    """
    ua_list = [
        # One User-Agent string, written as two adjacent literals.
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
        'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
        'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)',
    ]
    # random.choice returns the string itself; str(random.choices(...))
    # would send the repr of a one-element list as the header value.
    ua = {'User-Agent': random.choice(ua_list)}
    url = 'http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_FuQuanMarketHistory/' \
          'stockid/{}.phtml?year={}&jidu={}'
    r = requests.get(url.format(gupiao_number, year, jidu), headers=ua)
    r.encoding = r.apparent_encoding
    s = etree.HTML(r.text)

    rows_xpath = '//*[@id="FundHoldSharesTable"]/tr[1]/following-sibling::*'
    data = s.xpath(rows_xpath + '/td/div/text()')  # table cells
    date = [x.strip() for x in s.xpath(rows_xpath + '/td/div/a/text()')]  # linked dates

    if not date:
        # 8 cells per row; the first one is the date and needs trimming.
        width = 8
        full_data = [data[n * width:(n + 1) * width] for n in range(len(data) // width)]
        for row in full_data:
            row[0] = row[0].strip()
    else:
        # 9 cells per row; the first two are dropped, the date comes from
        # the link text.
        width = 9
        full_data = [data[n * width:(n + 1) * width][2:] for n in range(len(data) // width)]
        if len(date) != len(full_data):
            print("\t{}号股票{}年数据第{}季度跳过成功".format(gupiao_number, year, jidu))
            print(datetime.now().strftime('%Y-%m-%d %H:%M:%S\n'))
            return
        for row, day in zip(full_data, date):
            row.insert(0, day)

    for row in full_data:
        row.insert(0, str(gupiao_number))  # stock code goes first
        row.append(gupiao_full_data[gupiao_number])  # stock name goes last
        insert_data(table_name, row)

    print("\t数据插入成功")
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S\n'))
def creat_table(table_name):
    """Create the adjusted-price table in the ``fuquan`` database if missing.

    Uses ``CREATE TABLE IF NOT EXISTS`` so that re-running the crawler
    against an existing table does not abort with a "table already exists"
    error. The connection is always closed, even when the statement fails.

    Args:
        table_name: Name of the table; it is interpolated into the SQL text
            inside backticks, so it must come from trusted code.
    """
    connection = pymysql.connect(host='localhost',
                                 user='root',
                                 password='0000',
                                 db='fuquan',
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor
                                 )
    try:
        with connection.cursor() as cursor:
            # NOTE(review): the volume and amount columns are FLOAT, which
            # may round large values - confirm this precision is acceptable.
            sql = r'CREATE TABLE IF NOT EXISTS `{}` (' \
                  '`股票号` varchar(255) NOT NULL ,' \
                  '`日期` date NOT NULL ,' \
                  '`开盘价` float NULL ,' \
                  '`最高价` float NULL ,' \
                  '`收盘价` float NULL ,' \
                  '`最低价` float NULL ,' \
                  '`交易量(股)` float NULL ,' \
                  '`交易金额(元)` float NULL ,' \
                  '`复权因子` float NULL ,' \
                  '`股票名` varchar(255) NULL ,PRIMARY KEY (`股票号`, `日期`));'.format(table_name)
            cursor.execute(sql)
            print("创建{}表成功".format(table_name))
    finally:
        connection.close()
def insert_data(table_name, values):
    """Insert one ten-column row into ``table_name`` and commit it.

    A new connection to the ``fuquan`` database is opened for the call and
    closed again afterwards, whether or not the insert succeeds.

    Args:
        table_name: Name of the target table.
        values: The ten column values, in the order of the column list.
    """
    statement = (
        'INSERT INTO `{}` (`股票号`,`日期`, `开盘价`, `最高价`, `收盘价`,`最低价`, `交易量(股)`, `交易金额(元)`, `复权因子`,`股票名`) '
        'VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    ).format(table_name)
    settings = dict(
        host='localhost',
        user='root',
        password='0000',
        db='fuquan',
        charset='utf8mb4',
        cursorclass=pymysql.cursors.DictCursor,
    )
    connection = pymysql.connect(**settings)
    try:
        with connection.cursor() as cur:
            cur.execute(statement, values)
        connection.commit()
    finally:
        connection.close()
# Script entry point. The names assigned here are module-level globals;
# tar_data() reads ``gupiao_full_data`` from this scope.
if __name__ == "__main__":
    table_name = "fuquan_table"
    gupiao_full_data, df_number_list = get_gupiao_list()
    get_data(gupiao_full_data, df_number_list, table_name)