一、分析网页、选择爬取方式
1.使用urllib/scrapy尝试
虽然可以很肯定的是对于此种大量数据的网页是采用了动态加载的方式,但是还是抱有一丝侥幸心理,最终结果是能够使用xpath在线解析的数据,在爬取下来的源码当中没有找到,故而肯定是动态加载。对于此种,笔者掌握了两种方式进行处理,其一是通过开发者工具查找请求url,通过循环构造url进行爬取,其二是通过selenium的强大可见即可爬进行爬取。前者效率高,是我一般会采用的方式
2.选择动态加载爬取方式
我在开发者工具中找到了对应的包,但是包的url的构造一直没有搞清楚(cb参数),故而我放弃了此种方法,采用了selenium大法
二、爬取网页
1.爬取逻辑介绍
https://data.eastmoney.com/bbsj/yjyg/001300.html
可以参看该url,bbsj代表年报季报板块 yjyg代表业绩预告,001300代表个股股票代码。同理,我们可以发现当yjyg变为lrb那么出来的表就是lrb。至此我们发现了url的构造规律。通过分析,我规划了一下我的爬取逻辑。首先先使用selenium爬取下所有个股的股票代码,便于进行后一步操作(这一步比较简单,故而就不放代码了)。然后我再从文件中读取每一个个股代码,构造url进行请求
2.整体代码
from selenium import webdriver
from time import sleep
from lxml import etree
import pandas as pd
import numpy as np
#chrome_options = webdriver.ChromeOptions()
#chrome_options.add_argument('--headless')
#broswer = webdriver.Chrome(chrome_options=())
# CSV header rows for the four EastMoney report tables.
# Column order must match the <td> order of the scraped tables on the site.
# yjyg = earnings forecast, zcfz = balance sheet, lrb = income statement, xjll = cash-flow statement.
yjyg = ['股票代码','截止日期','预测指标','业绩变动','预测数值(元)','业绩变动幅度','业绩变动原因','预告类型','上年同期值(元)','公告日期']
zcfz = ['股票代码','报告期','总资产(元)' ,'总资产同比(%)', '固定资产(元)','货币资金(元)','同比(%)','应收账款(元)','同比(%)','存货(元)','同比(%)','总负债(元)','总负债同比(%)','应付账款(元)','同比(%)','预收账款(元)','同比(%)','股东权益合计(元)','股东权益同比(%)','资产负债率','公告日期']
lrb = ['股票代码','报告期','净利润(元)','净利润同比(%)','扣非归母净利润(元)','扣非归母净利润同比(元)','营业总收入(元)','营业总收入同比(%)','营业支出(元)','营业支出同比(%)','销售费用(元)','管理费用(元)','财务费用(元)','营业总支出(元)','营业总支出同比(%)','营业利润(元)','营业利润同比(%)','利润总额(元)','公告日期']
xjll = ['股票代码','报告期','净现金流(元)','净现金流同比(%)','经营性现金流量净额(元)','经营性现金流量净额占比(%)','销售商品、提供劳务收到的现金金额(元)','销售商品、提供劳务收到的现金金额占比(%)','投资性现金流量净额(元)','投资性现金流净额净额占比(%)','取得投资收益收到的现金金额(元)','取得投资收益收到的现金占比(%)','购建固定资产、无形资产和其他的长期资产支付的现金金额(元)','购建固定资产、无形资产和其他的长期资产支付的现金金额占比(%)','融资性现金流量净额(元)','融资性现金流量净额占比(%)','公告日期']
def test1():
    """Smoke test: load one earnings-report (yjbb) page and print all table cell texts.

    Launches a non-headless Chrome, waits for the JS-rendered table, then
    extracts every ``<td>`` text node via lxml.
    """
    url = 'https://data.eastmoney.com/bbsj/yjbb/000920.html'
    driver = webdriver.Chrome("chromedriver.exe")
    try:
        driver.get(url)
        sleep(3)  # crude wait for the dynamically loaded table to render
        html = etree.HTML(driver.page_source)
        cells = html.xpath('//tr/td/text()')
        print(cells)
    finally:
        # Original never closed the driver, leaking a browser + chromedriver
        # process on every call; quit() tears down both.
        driver.quit()
def code_Input(path=r"E:\python_work\爬虫\dfcf_1.csv"):
    """Read the comma-separated stock-code file and return the non-empty codes.

    Args:
        path: file containing stock codes separated by commas
              (defaults to the original hard-coded location, so existing
              callers are unaffected).

    Returns:
        list[str]: every non-empty comma-separated token, in file order.
    """
    with open(path, 'r', encoding='utf-8') as fp:
        raw = fp.read()
    # Trailing/duplicate commas produce '' tokens; drop them.
    return [code for code in raw.split(',') if code != '']
def fpTest():
    """Placeholder kept for interface compatibility; intentionally does nothing."""
def getYjyg(code, fpYjyg):
    """Scrape the earnings-forecast (yjyg) table for one stock and append CSV rows.

    Args:
        code: stock code, e.g. ``'000920'``.
        fpYjyg: open, writable file handle for the output CSV.

    Only the first page is scraped (no pagination here, unlike the other tables).
    """
    url = 'https://data.eastmoney.com/bbsj/yjyg/' + code + '.html'
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')  # no visible browser window
    # Disable image loading to speed up page rendering.
    prefs = {"profile.managed_default_content_settings.images": 2}
    chrome_options.add_experimental_option("prefs", prefs)
    broswer = webdriver.Chrome(chrome_options=chrome_options)
    try:
        broswer.get(url)
        sleep(3)  # crude wait for the JS-rendered table
        html = etree.HTML(broswer.page_source)
        for row in html.xpath('//div[@class="dataview-body"]//tr'):
            # string(.) flattens the styling <span> tags nested inside <td>.
            content = [td.xpath('string(.)') for td in row.xpath('./td')]
            if not content:
                continue  # header / decorative rows have no <td>
            fpYjyg.write(code.strip())
            fpYjyg.write(',')
            # Swap ASCII commas for full-width ones so the CSV stays parseable,
            # and strip embedded newlines from the forecast-reason column.
            fpYjyg.write(','.join(
                cell.replace(',', ',').replace("\n", '') for cell in content))
            fpYjyg.write('\n')
    finally:
        # Original only called close() on the success path, leaking the browser
        # whenever scraping raised; quit() also terminates chromedriver.
        broswer.quit()
def getZcfz(code, fpZcfz):
    """Scrape the balance-sheet (zcfz) table for one stock, following pagination.

    Args:
        code: stock code, e.g. ``'000920'``.
        fpZcfz: open, writable file handle for the output CSV.
    """
    url = 'https://data.eastmoney.com/bbsj/zcfz/' + code + '.html'
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')  # no visible browser window
    # Disable image loading to speed up page rendering.
    prefs = {"profile.managed_default_content_settings.images": 2}
    chrome_options.add_experimental_option("prefs", prefs)
    broswer = webdriver.Chrome(chrome_options=chrome_options)
    try:
        broswer.get(url)
        sleep(3)  # crude wait for the JS-rendered table
        while True:
            html = etree.HTML(broswer.page_source)
            for row in html.xpath('//div[@class="dataview-body"]//tr'):
                # string(.) flattens the styling <span> tags nested inside <td>.
                content = [td.xpath('string(.)') for td in row.xpath('./td')]
                if not content:
                    continue  # header / decorative rows have no <td>
                fpZcfz.write(code.strip())
                fpZcfz.write(',')
                # Swap ASCII commas for full-width ones so the CSV stays parseable.
                fpZcfz.write(','.join(cell.replace(',', ',') for cell in content))
                fpZcfz.write('\n')
            try:
                # Advance to the next page; a missing link means the last page.
                broswer.find_element_by_link_text("下一页").click()
                sleep(3)
            except Exception:  # was a bare except: also swallowed KeyboardInterrupt
                break
    finally:
        # Original only called close() on the success path, leaking the browser
        # whenever scraping raised; quit() also terminates chromedriver.
        broswer.quit()
def getLrb(code, fpLrb):
    """Scrape the income-statement (lrb) table for one stock, following pagination.

    Args:
        code: stock code, e.g. ``'000920'``.
        fpLrb: open, writable file handle for the output CSV.
    """
    url = 'https://data.eastmoney.com/bbsj/lrb/' + code + '.html'
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')  # no visible browser window
    # Disable image loading to speed up page rendering.
    prefs = {"profile.managed_default_content_settings.images": 2}
    chrome_options.add_experimental_option("prefs", prefs)
    broswer = webdriver.Chrome(chrome_options=chrome_options)
    try:
        broswer.get(url)
        sleep(3)  # crude wait for the JS-rendered table
        while True:
            html = etree.HTML(broswer.page_source)
            for row in html.xpath('//div[@class="dataview-body"]//tr'):
                # string(.) flattens the styling <span> tags nested inside <td>.
                content = [td.xpath('string(.)') for td in row.xpath('./td')]
                if not content:
                    continue  # header / decorative rows have no <td>
                fpLrb.write(code.strip())
                fpLrb.write(',')
                # Swap ASCII commas for full-width ones so the CSV stays parseable.
                fpLrb.write(','.join(cell.replace(',', ',') for cell in content))
                fpLrb.write('\n')
            try:
                # Advance to the next page; a missing link means the last page.
                broswer.find_element_by_link_text("下一页").click()
                sleep(3)
            except Exception:  # was a bare except: also swallowed KeyboardInterrupt
                break
    finally:
        # Original only called close() on the success path, leaking the browser
        # whenever scraping raised; quit() also terminates chromedriver.
        broswer.quit()
def getXjll(code, fpXjll):
    """Scrape the cash-flow (xjll) table for one stock, following pagination.

    Args:
        code: stock code, e.g. ``'000920'``.
        fpXjll: open, writable file handle for the output CSV.
    """
    url = 'https://data.eastmoney.com/bbsj/xjll/' + code + '.html'
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')  # no visible browser window
    # Disable image loading to speed up page rendering.
    prefs = {"profile.managed_default_content_settings.images": 2}
    chrome_options.add_experimental_option("prefs", prefs)
    broswer = webdriver.Chrome(chrome_options=chrome_options)
    try:
        broswer.get(url)
        sleep(3)  # crude wait for the JS-rendered table
        while True:
            html = etree.HTML(broswer.page_source)
            for row in html.xpath('//div[@class="dataview-body"]//tr'):
                # string(.) flattens the styling <span> tags nested inside <td>.
                content = [td.xpath('string(.)') for td in row.xpath('./td')]
                if not content:
                    continue  # header / decorative rows have no <td>
                fpXjll.write(code.strip())
                fpXjll.write(',')
                # Swap ASCII commas for full-width ones so the CSV stays parseable.
                fpXjll.write(','.join(cell.replace(',', ',') for cell in content))
                fpXjll.write('\n')
            try:
                # Advance to the next page; a missing link means the last page.
                broswer.find_element_by_link_text("下一页").click()
                sleep(3)
            except Exception:  # was a bare except: also swallowed KeyboardInterrupt
                break
    finally:
        # Original only called close() on the success path, leaking the browser
        # whenever scraping raised; quit() also terminates chromedriver.
        broswer.quit()
if __name__ == '__main__':
    code_list = code_Input()

    def _open_with_header(path, columns):
        """Open *path* for writing and emit the CSV header row; return the handle."""
        fp = open(path, 'w', encoding='utf-8')
        fp.write(','.join(columns))
        fp.write('\n')
        return fp

    # Raw strings avoid accidental escape sequences in the Windows paths.
    base = r'E:\python_work\爬虫\东方财富爬取数据'
    fpZcfz = _open_with_header(base + r'\资产负债表.csv', zcfz)
    fpLrb = _open_with_header(base + r'\利润表.csv', lrb)
    fpXjll = _open_with_header(base + r'\现金流量表.csv', xjll)
    fpYjyg = _open_with_header(base + r'\业绩预告.csv', yjyg)

    done_indices = []  # 1-based positions of codes scraped successfully
    done_codes = []    # the corresponding (stripped) stock codes
    try:
        for i, raw_code in enumerate(code_list):
            code = raw_code.strip()
            try:
                getYjyg(code, fpYjyg)
                getXjll(code, fpXjll)
                getZcfz(code, fpZcfz)
                getLrb(code, fpLrb)
                done_indices.append(i + 1)
                done_codes.append(code)
                print("第" + str(i + 1) + '(' + code + ')' + '个已成功')
            except Exception:  # was a bare except: also swallowed KeyboardInterrupt
                print(done_indices)
                # Rewrite the full success log so a crash mid-run leaves a record
                # of everything scraped so far.
                with open(base + r'\logdemo.txt', 'w', encoding='utf-8') as fp:
                    for idx, ok_code in zip(done_indices, done_codes):
                        fp.write("第" + str(idx) + '(' + ok_code + ')' + '个已成功')
                        fp.write('\n')
                # Append the failure so it can be retried later.
                with open(base + r'\logdemoError.txt', 'a+', encoding='utf-8') as fp:
                    fp.write("第" + str(i + 1) + '(' + code + ')' + '个出问题')
                    fp.write("\n")
                with open(base + r'\logdemoErrorlist.txt', 'a+', encoding='utf-8') as fp:
                    fp.write(code + ',')
    finally:
        # Ensure the output CSVs are flushed and closed even if the loop dies.
        for fp in (fpZcfz, fpLrb, fpXjll, fpYjyg):
            fp.close()
由于爬取数量较大,为了避免出现问题就终止运行的情况,笔者进行了简单的异常处理,毕竟是给自己爬,感觉就没必要自定义异常类的一些操作了,只要能提高程序健壮性就欧克。其次,笔者也进行了一些提升效率的操作。例如,修改driver请求的chrome_option参数,从而避免弹出浏览器(去除头)、加载图片。除此之外,为了方便之后对出现问题的地方重新爬取,我也进行了错误信息的读写操作。
3.出现问题总结
3.1 selenium版本问题
selenium 4.0之后有些函数是被弃用了的。由于selenium爬取速度过慢,故而我借了儿子的电脑进行爬取,他的电脑上新装的selenium是4.0版本之后的,直接跑我的代码会出现问题。这里我没有选择修改代码,而是对他的selenium进行了降级——先删除再安装。具体过程如下。
cd进入anaconda的script文件夹
然后
pip uninstall selenium
pip install selenium==3.3.1
就🆗了
3.2 xpath解析时出现问题
这里要注意由于该网站的某些数据需要进行变色,所以有些td标签下为了使数据修改样式,还专门放了多个span标签,这些需要特别注意。具体解决方法可以参看我的代码
3.3 资产负债表
资产负债表有些个股表头是不一样的,这里由于数量不多,笔者没有处理。