Python爬虫
需求
- https://data.eastmoney.com/xg/xg/default.html
从东方财富网(东财)的上述页面爬取新股上市内容
将前5页内容保存在数据库中,表名stock_new,除选中字段外还要加入id和created_time(yyyy-mm-dd)字段。
- 生成柱状图,统计各个板块上市数量,x轴显示红字部分,规则如下:
60开头的为主板
0开头的为中小板
3开头的为创业板
688开头的为科创板
8或4开头的为北交所
解决方案
import cx_Oracle
# Connect to the Oracle database.
# NOTE(review): user name, password and DSN are hard-coded literals here.
db_conn = cx_Oracle.connect('scott', '123456', '127.0.0.1:1521/orcl')
# Create the cursor that the rest of the script executes its SQL through.
db_cursor=db_conn.cursor()
from datetime import time
import time
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
# Wall-clock start; the elapsed time is printed at the end of the script.
start_time = datetime.now()

# Launch Edge. The 'detach' experimental option is set so the browser
# window is not torn down together with the script.
options = webdriver.EdgeOptions()
options.add_experimental_option('detach', True)
driver = webdriver.Edge(options=options)
driver.get("https://data.eastmoney.com/xg/xg/default.html")

list_value = []

# Creation date written into every stored row, formatted as 'yyyy-mm-dd'.
now_time = datetime.now().strftime('%Y-%m-%d')

# get_info() stops once its page counter reaches end_page, so counting up
# from start_page = 0 it processes exactly end_page pages. The requirement
# is the first 5 pages (the previous value of 4 stopped one page short).
end_page = 5
start_page = 0
def max_id(cursor=None):
    """Return the largest id stored in stock2, or 0 when there is none.

    ``max(id)`` over an empty table yields NULL, which the driver hands
    back as ``None``. Callers add 1 to the result to obtain the next id,
    so ``None`` is mapped to 0 instead of being returned as-is.

    Args:
        cursor: DB-API cursor to run the query on. When omitted, the
            module-level ``db_cursor`` is used.

    Returns:
        The current maximum id, or 0 if the table holds no ids.
    """
    if cursor is None:
        cursor = db_cursor
    cursor.execute('select max(id) from stock2')
    row = cursor.fetchone()
    if row is None or row[0] is None:
        return 0
    return row[0]
# Seed the running row id with the value returned by max_id(); it is passed
# to the top-level get_info(...) call further down.
# NOTE(review): this name shadows the builtin `id`.
id = max_id()
def get_info(max_num, num, Id):
    """Store the stock rows of the currently displayed page, then move on.

    Reads the rows of the data table on the page loaded in the module-level
    ``driver``, inserts one stock2 record per usable row, commits, and,
    unless the page limit has been reached, hands over to ``click_next_a``
    to load and process the next page.

    Args:
        max_num: Page count at which processing stops.
        num: Number of pages already processed before this call.
        Id: Last id already used; each stored row gets the next integer.

    Returns:
        None.
    """
    # Positions of the wanted cells inside a table row, listed in the same
    # order as the columns of the INSERT statement below.
    wanted_cells = (0, 1, 7, 8, 11, 15, 16, 17, 21)
    insert_sql = """INSERT INTO stock2 (id, StockCode, StockName, SubLimit, IssuePrice, SubDate, IssuePERatio, InRatio, WinningRate, ProfitPerLot, CreatedTime)
    VALUES (:id, :StockCode, :StockName, :SubLimit, :IssuePrice, :SubDate, :IssuePERatio, :InRatio, :WinningRate, :ProfitPerLot, :CreatedTime)
    """

    table_body = (driver.find_element(By.CLASS_NAME, 'dataview-body')
                  .find_element(By.TAG_NAME, 'tbody'))
    rows = []
    for stock_tr in table_body.find_elements(By.TAG_NAME, 'tr'):
        tds = stock_tr.find_elements(By.TAG_NAME, 'td')
        # A row without all expected cells cannot be mapped to the columns;
        # skip it rather than fail with IndexError.
        if len(tds) <= wanted_cells[-1]:
            continue
        Id = Id + 1
        rows.append((Id, *(tds[i].text for i in wanted_cells), now_time))

    # One batched round trip per page; executemany is not called with an
    # empty batch.
    if rows:
        db_cursor.executemany(insert_sql, rows)
    # Commit the transaction for this page.
    db_conn.commit()

    # Exit condition: stop once the requested number of pages is done.
    num = num + 1
    print("page_num:", num)
    if num >= max_num:
        return
    click_next_a(num, Id)
def click_next_a(number, Id):
    """Click the next-page link and continue scraping on the new page.

    Args:
        number: Count of pages processed so far; forwarded to get_info.
        Id: Last row id used so far; forwarded to get_info.
    """
    driver.find_element(By.LINK_TEXT, '下一页').click()
    # Fixed pause before reading the table again; presumably gives the next
    # page time to load.
    time.sleep(5)
    get_info(end_page, number, Id)
# Start scraping on the page that is already loaded; get_info moves on to
# the following pages itself.
try:
    get_info(end_page, start_page, id)
finally:
    # Close the cursor and the connection even if scraping fails part-way,
    # so the database session is not left open.
    db_cursor.close()
    db_conn.close()
end_time = datetime.now()
print(end_time - start_time)
print("Finished")
page_num: 1
page_num: 2
page_num: 3
page_num: 4
0:00:39.097058
Finished
-- Table definition for the scraped new-stock rows.
-- The DROP needs its own terminator; without it the statement runs into
-- the CREATE TABLE below when both are executed as one script.
drop table stock2;
CREATE TABLE stock2
(
    id number,
    StockCode VARCHAR2(50),
    StockName VARCHAR2(50),
    SubLimit varchar2(20),
    IssuePrice varchar2(20),
    SubDate varchar2(20),
    IssuePERatio varchar2(20),
    InRatio varchar2(20),
    WinningRate varchar2(20),
    ProfitPerLot varchar2(20),
    CreatedTime varchar2(20)
);
matplotlib的使用
import matplotlib.pyplot as plt
import numpy as np
plt.rcParams['font.sans-serif'] = ['SimHei']  # use the SimHei font for the Chinese labels
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly

# Randomly generated counts that stand in for the real listing numbers.
# NOTE(review): these are simulated values, not counts derived from the
# scraped stock codes.
np.random.seed(0)  # fixed seed so every run produces the same data
data = {
    '主板': np.random.randint(50, 150),
    '中小板': np.random.randint(30, 100),
    '创业板': np.random.randint(40, 120),
    '科创板': np.random.randint(20, 80),
    '北交所': np.random.randint(10, 50),
}

# Bar colour per board.
colors = {
    '主板': 'red',
    '中小板': 'blue',
    '创业板': 'green',
    '科创板': 'yellow',
    '北交所': 'orange',
}

# Draw the bar chart. The requirement is that the x-axis shows the board
# names, so the bars are vertical: one categorical x position per board.
fig, ax = plt.subplots()
for label, value in data.items():
    ax.bar(label, value, color=colors[label], label=label)

# Axis titles: boards along x, listing counts along y.
ax.set_xlabel('板块')
ax.set_ylabel('上市数量')

# Add the legend.
ax.legend()

# Show the figure.
plt.show()