# 爬取东方财富财报数据 — scrape financial-report data from Eastmoney (data.eastmoney.com)
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time
import pandas as pd
# Filesystem location of the chromedriver binary used to launch Chrome.
CHROMEDRIVER_PATH = "/usr/local/bin/chromedriver"
# Upper bound (seconds) for every explicit wait issued through ``wait``.
MAX_WAIT_SECONDS = 10

# One shared Chrome session for the whole script, maximised right after launch.
browser = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH)
browser.maximize_window()

# Explicit-wait helper bound to the shared browser session.
wait = WebDriverWait(
    driver=browser,
    timeout=MAX_WAIT_SECONDS,
)
def _jump_to_page(page):
    """Type ``page`` into the pager's "go to" box and submit it.

    Uses the module-level ``wait`` helper, so a missing pager control raises
    ``TimeoutException`` to the caller.
    """
    page_box = wait.until(
        EC.presence_of_element_located((By.ID, "PageContgopage"))
    )
    page_box.clear()
    # Pass the page number as text rather than relying on implicit conversion.
    page_box.send_keys(str(page))
    go_button = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "#PageCont >a.btn_link"))
    )
    go_button.click()
    # Fixed pause kept from the original flow before polling for the table.
    # NOTE(review): the source's indentation was lost; this pause is assumed
    # to belong to the page-jump branch only - confirm.
    time.sleep(2)


def index_page(page):
    """Scrape one page of the report table into a DataFrame.

    Loads the report listing, jumps to ``page`` when it is greater than 1,
    waits for the ``#dt_1`` table, then reads the text of every ``td`` cell
    and the ``href`` of every ``a.red`` link inside that table.

    :param page: page number to scrape (1 is the page shown after loading).
    :return: DataFrame holding the table columns plus a ``url`` column, or
        ``None`` when the page could not be loaded or parsed.
    """
    # An earlier revision pointed at .../bbsj/201806/lrb.html, which used a
    # different column list.
    url = "http://data.eastmoney.com/bbsj/202006/yjkb.html"
    columns = [
        "序号", "股票代码", "股票简称", "相关", "每股收益", "营业收入(元)",
        "去年同期(元)", "同比增长", "季度环比增长", "净利润", "去年同期",
        "同比增长", "季度环比增长", "每股净资产", "净资产收益率", "所处行业",
        "公告日期",
    ]
    try:
        browser.get(url=url)
        print("正在爬去第%s页" % page)
        # Page 1 is what the URL shows; any later page needs an explicit jump.
        if page > 1:
            _jump_to_page(page)
        wait.until(EC.presence_of_element_located((By.ID, "dt_1")))
        # Make sure the pager marks the requested page as the active one
        # before reading the table.
        wait.until(
            EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR, "#PageCont>span.at"), str(page)
            )
        )

        # find_element(By...) replaces the find_element_by_* shortcuts, which
        # newer Selenium releases no longer provide.
        table = browser.find_element(By.CSS_SELECTOR, "#dt_1")
        cell_texts = [td.text for td in table.find_elements(By.TAG_NAME, "td")]

        # The number of cells in the first row gives the width of each record.
        col = len(table.find_elements(By.CSS_SELECTOR, "tr:first-child td"))
        if col == 0:
            # A zero width would make the chunking step below fail.
            print("page %s: table has no cells in its first row" % page)
            return None
        rows = [
            cell_texts[start:start + col]
            for start in range(0, len(cell_texts), col)
        ]

        link_urls = [
            anchor.get_attribute("href")
            for anchor in table.find_elements(By.CSS_SELECTOR, "#dt_1 a.red")
        ]

        df_table = pd.DataFrame(rows, columns=columns)
        # Raises ValueError when the link count differs from the row count;
        # that is reported by the generic handler below.
        df_table["url"] = link_urls
        return df_table
    except TimeoutException:
        print("page %s: timed out waiting for the page to load" % page)
        return None
    except Exception as exc:
        # Best-effort boundary: report the failure instead of hiding it, but
        # still let the caller continue with the remaining pages.
        print("page %s: scraping failed: %r" % (page, exc))
        return None
def main():
    """Scrape pages 1 to 4 and write the combined table to an Excel file.

    Pages for which ``index_page`` returns ``None`` are skipped. The shared
    browser session is closed once scraping finishes, even if it fails.
    """
    frames = []
    try:
        for page in range(1, 5):
            df_table = index_page(page)
            if df_table is not None:
                frames.append(df_table)
    finally:
        # Release the Chrome session so no browser/driver is left running.
        browser.quit()

    # Concatenate once at the end; pd.concat rejects an empty list, so fall
    # back to an empty frame when every page failed.
    all_data = pd.concat(frames) if frames else pd.DataFrame()
    all_data.to_excel("2020年6月上市公司财报数据.xlsx")
# Entry point: the scrape runs as soon as this file is executed (or imported).
main()