A Python script for downloading data from cninfo (巨潮资讯网)

Downloading financial report data from cninfo by hand is tedious, so I put together a simple Python script. The main part of the code is shown below; it drives the site with Selenium (PhantomJS) and fetches the report PDFs with the wget package:

    import os
    import re
    import time
    from urllib import request

    import wget
    from selenium import webdriver
    from selenium.webdriver.common.action_chains import ActionChains
    from selenium.common.exceptions import NoSuchElementException

    # stockNumber (the 6-digit stock code as a string) is assumed to be set by the caller
    driver = webdriver.PhantomJS(executable_path='path/to/phantomjs')  # point this at your local PhantomJS binary
    if int(stockNumber) >= 600000:
        dst_url = 'http://www.cninfo.com.cn/cninfo-new/disclosure/sse'
    else:
        dst_url = 'http://www.cninfo.com.cn/cninfo-new/disclosure/szse'
    
    driver.get(dst_url)
    prefixpath = "path/to/save/folder/"  # destination directory for the downloaded reports
    driver.find_element_by_class_name("input-stock").send_keys(stockNumber)
    driver.find_element_by_xpath("//ul[@id='stock_list']/li[1]/a").click()
  
    prefixpathname = prefixpath+stockNumber+"/"
    if not os.path.exists(prefixpathname):
        os.mkdir(prefixpathname)
    #driver.find_element_by_xpath("//ul[@id='stock_list']/li[1]").send_keys(Keys.ENTER)
    # Switch to the newly opened browser window
    for handle in driver.window_handles:
        driver.switch_to_window(handle)
        #print('current url:%s'%driver.current_url)
    time.sleep(1)
    urldata = driver.find_element_by_xpath("//ul[@id='category_list']")
    #print('%s'%urldata.text)
    chain = ActionChains(driver)
    move_element = driver.find_element_by_xpath("//div[@class='search-condition c5 drop-down']/a/div")
    chain.move_to_element(move_element).perform()
    driver.find_element_by_xpath("//div[@class='search-condition c5 drop-down']/a/div").click()

    urldata = driver.find_element_by_xpath("//div[@class='search-condition c5 drop-down']/a/div")
    #print('%s'%urldata.text)
    driver.find_element_by_xpath("//ul[@id='category_list']/li[1]/a").click()
    # Select the desired report categories: annual report, semi-annual report, etc.
    ... ...
    # Read the "shown / total" counters; to download everything, keep clicking
    # '更多' (More) until all entries are displayed
    urldata = driver.find_element_by_xpath("//div[@id='con-div-his-fulltext']/div[@class='stat-right']")
    rslt = re.findall(r'\d+', urldata.text)
    while rslt[0] != rslt[1]:
        driver.find_element_by_link_text('更多').click()
        # Wait for the page to respond
        time.sleep(1)
        urldata = driver.find_element_by_xpath("//div[@id='con-div-his-fulltext']/div[@class='stat-right']")
        rslt = re.findall(r'\d+', urldata.text)
    listNum = int(rslt[0])
    
    #listNum
    if(listNum != 0):
        for indexValue in range(1,listNum+1):
            findXpathStr = "//ul[@id='ul_his_fulltext']/li[%d]/div[@class='t3']/dd/span/a"%indexValue
            #print('%s'%findXpathStr)
            urlTextGet = driver.find_element_by_xpath(findXpathStr)
            print('%s'%urlTextGet.text)
            
            driver.find_element_by_xpath(findXpathStr).click()
            time.sleep(1)
            for handle in driver.window_handles:
                driver.switch_to_window(handle)
            time.sleep(1)
            #print('%s'%driver.current_url)
            urldata1 = driver.find_element_by_class_name("year")
            try:
                urldata2 = driver.find_element_by_class_name("new_day")
            except NoSuchElementException:
                urldata2 = driver.find_element_by_class_name("day")
            timeStr = '%s%s'%(urldata1.text,urldata2.text)
            #print('%s%s'%(urldata1.text,urldata2.text))
            nameUrl = driver.find_element_by_xpath('//div[@class="bd-top"]/h2')
            #print('%s'%nameUrl.text.replace(name,'').strip())
            #srcUrl = driver.find_element_by_xpath('//div[@class="bd-ct"]/iframe.src')
            #print('%s'%srcUrl.text)
            #downloadfilename = './new/%s%s.pdf'%(nameUrl.text.replace(name,'').strip(),timeStr)
            nameUrlList = nameUrl.text.split('\n')
            if(len(nameUrlList)>1):
                downloadfilename = '%s%s%s.pdf'%(prefixpathname,nameUrlList[1].strip(),timeStr)
            else:
                downloadfilename = '%s%s%s.pdf'%(prefixpathname,nameUrlList[0].strip(),timeStr)

            pageRequest = request.urlopen(driver.current_url)
            pageRead = pageRequest.read().decode('utf-8')
            #pageRequest.readlines().decode('utf-8')
            findlinkSuccess = 0
            for eachline in pageRead.split('\n'):
                webDownloadURL = re.findall('src="(.+)"',eachline)
                if(len(webDownloadURL)>0) and re.search('iframe',eachline) and re.search('pdf',eachline.lower()):
                   wgetURL = webDownloadURL[0]
                   #print('%s'%wgetURL)
                   findlinkSuccess = 1
                   break
            if os.path.exists(downloadfilename):
                print('%s already exists'%downloadfilename)
            elif findlinkSuccess == 1:
                wget.download(wgetURL,downloadfilename)
            else:
                print('invalid link, skipping')
            # Close the current announcement window
            driver.close()
            
            # Switch back to the previous (announcement list) window
            for handle in driver.window_handles:
                driver.switch_to_window(handle)
            time.sleep(1)

    driver.close()    
    driver.quit()
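
The fixed time.sleep(1) calls above are fragile when the site responds slowly. Below is a minimal sketch of doing the same waits with Selenium's explicit waits (WebDriverWait and expected_conditions, both part of the Selenium package already used here); wait_for is a hypothetical helper name, and the XPath is one the script already queries:

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    def wait_for(driver, xpath, timeout=10):
        # Block until the element located by xpath is present, or raise TimeoutException
        return WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, xpath)))

    # e.g. instead of time.sleep(1) before reading the category list:
    # urldata = wait_for(driver, "//ul[@id='category_list']")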


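The script pulls the PDF link out of the announcement page with a line-by-line regex. Here is a sketch of the same extraction done with the standard library's html.parser, which is less sensitive to how the HTML happens to be formatted; IframeSrcFinder is a hypothetical helper class, and pageRead, downloadfilename are the variables already defined in the script above:

    from html.parser import HTMLParser

    import wget

    class IframeSrcFinder(HTMLParser):
        # Remember the src of the first <iframe> whose src looks like a PDF link
        def __init__(self):
            super().__init__()
            self.pdf_src = None

        def handle_starttag(self, tag, attrs):
            if tag == 'iframe' and self.pdf_src is None:
                src = dict(attrs).get('src') or ''
                if 'pdf' in src.lower():
                    self.pdf_src = src

    finder = IframeSrcFinder()
    finder.feed(pageRead)                      # pageRead: page source fetched earlier
    if finder.pdf_src:
        wget.download(finder.pdf_src, downloadfilename)
    else:
        print('invalid link, skipping')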