2020-08-31

爬取东方财富财报数据
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time
import pandas as pd

# Shared Chrome session used by the scraping code in this file; the
# chromedriver binary is taken from this hard-coded path.
# NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of a
# `Service` object -- confirm against the installed Selenium version.
browser = webdriver.Chrome(executable_path="/usr/local/bin/chromedriver")

browser.maximize_window() # maximize the browser window
wait = WebDriverWait(browser,10) # explicit waits give up after at most 10 seconds
def index_page(page):
    """Scrape one page of the Eastmoney financial-report table.

    Loads the report URL in the module-level ``browser``, jumps to *page*
    through the pager's input box when ``page > 1``, waits for the ``#dt_1``
    table and for the pager's active item to show *page*, then reads the text
    of every ``<td>`` in the table and the ``href`` of every ``a.red`` link.

    NOTE(review): the original listing lost its indentation, so which waits
    belong inside the ``page > 1`` branch is reconstructed -- here only the
    jump-to-page steps are conditional. Confirm against the live page.

    :param page: page number to scrape (values above 1 trigger a pager jump).
    :return: a ``pandas.DataFrame`` with one row per table row plus a ``url``
        column, or ``None`` when any step fails (the error is printed so the
        failure is visible instead of being silently dropped).
    """
    url = "http://data.eastmoney.com/bbsj/202006/yjkb.html"
    # Column labels for the table at ``url``. "同比增长" and "季度环比增长" each
    # appear twice (once for revenue, once for net profit), so selecting
    # either label from the resulting frame returns two columns.
    columns = ["序号", "股票代码", "股票简称", "相关", "每股收益", "营业收入(元)", "去年同期(元)", "同比增长",
               "季度环比增长", "净利润", "去年同期", "同比增长", "季度环比增长", "每股净资产", "净资产收益率", "所处行业", "公告日期"]
    try:
        browser.get(url=url)
        print("正在爬去第%s页" % page)
        # Page 1 is what the URL shows by default; any other page is reached
        # by typing the number into the pager box and pressing its button.
        if page > 1:
            page_box = wait.until(
                EC.presence_of_element_located((By.ID, "PageContgopage")))
            page_box.clear()
            page_box.send_keys(str(page))
            submit = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "#PageCont >a.btn_link")))
            submit.click()
            # Fixed pause kept from the original: gives the table time to
            # refresh before it is read.
            time.sleep(2)
        wait.until(EC.presence_of_element_located((By.ID, "dt_1")))
        wait.until(EC.text_to_be_present_in_element(
            (By.CSS_SELECTOR, "#PageCont>span.at"), str(page)))

        # find_element(By..., ...) replaces the find_element_by_* helpers,
        # which were removed in Selenium 4.
        table = browser.find_element(By.CSS_SELECTOR, "#dt_1")
        cell_texts = [td.text for td in table.find_elements(By.TAG_NAME, "td")]
        # The width of the first row tells how to cut the flat cell list.
        width = len(table.find_elements(By.CSS_SELECTOR, "tr:first-child td"))
        rows = _chunk_rows(cell_texts, width)
        link_urls = [link.get_attribute("href")
                     for link in table.find_elements(By.CSS_SELECTOR, "#dt_1 a.red")]

        df_table = pd.DataFrame(rows, columns=columns)
        # Raises ValueError (handled below) if the number of links does not
        # match the number of rows.
        df_table["url"] = link_urls
        return df_table
    except Exception as exc:
        # Best-effort scraping: a failed page must not abort the whole run,
        # but the reason is reported rather than swallowed.
        print("第%s页爬取失败: %r" % (page, exc))
        return None


def _chunk_rows(cells, width):
    """Split the flat list *cells* into consecutive rows of *width* items."""
    if width <= 0:
        raise ValueError("table has no cells in its first row; page may not have loaded")
    return [cells[i:i + width] for i in range(0, len(cells), width)]

def main(first_page=1, last_page=4, output_path="2020年6月上市公司财报数据.xlsx"):
    """Scrape a range of pages and write the combined table to an Excel file.

    :param first_page: first page number to scrape (inclusive).
    :param last_page: last page number to scrape (inclusive).
    :param output_path: path of the ``.xlsx`` file to write.

    Pages for which ``index_page`` returns ``None`` are skipped. The
    module-level ``browser`` is closed once scraping finishes, whether or not
    it succeeded, so no Chrome/chromedriver process is left running.
    """
    frames = []
    try:
        for page in range(first_page, last_page + 1):
            df_table = index_page(page)
            if df_table is not None:
                frames.append(df_table)
    finally:
        browser.quit()
    # Concatenate once at the end instead of growing a frame inside the loop;
    # with no successful pages an empty workbook is written, as before.
    all_data = pd.concat(frames) if frames else pd.DataFrame()
    all_data.to_excel(output_path)


main()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值