# 利用Requests库进行股票数据定向爬虫_r语言东方财富数据爬虫-CSDN博客

#从东方财富网获得所有的股票代码,然后通过百度股票获得各个股票的信息
import requests
from bs4 import BeautifulSoup
import re
import time
#import traceback '''测试时使用过'''


def getHTMLText(url,code='utf-8'):
    '''Download ``url`` and return the response body as text.

    Args:
        url: address of the page to fetch.
        code: encoding used to decode the body (default ``'utf-8'``).
            Passing a known encoding skips the slower detection that
            ``apparent_encoding`` would perform.

    Returns:
        The decoded response text.

    Raises:
        Whatever ``requests.get`` raises on connection problems or timeout
        (30 s), and the error raised by ``raise_for_status`` for 4xx/5xx
        responses.
    '''
    response = requests.get(url, timeout=30)
    # Fail loudly on HTTP error codes instead of returning an error page.
    response.raise_for_status()
    response.encoding = code
    return response.text

def getStockList(lst,stockURL):
    '''Collect stock codes from the listing page at ``stockURL`` into ``lst``.

    Every ``<a>`` tag of the page is inspected. If its ``href`` contains a
    code of the form ``sh``/``sz`` followed by six digits, the first such
    code is appended to ``lst``. Anchors without a string ``href`` or
    without a code are skipped.

    Args:
        lst: list that is extended in place with the codes found.
        stockURL: URL of the page that lists all stocks.

    Returns:
        None. If fetching or parsing the page fails, ``getStockListerror``
        is printed and ``lst`` keeps whatever had been appended so far.
    '''
    try:
        # Decode as GB2312 up front rather than paying for encoding
        # detection on this page.
        html = getHTMLText(stockURL,'GB2312')
        soup = BeautifulSoup(html,'html.parser')
        for anchor in soup.find_all('a'):
            href = anchor.attrs.get('href')
            if not isinstance(href, str):
                continue  # anchor without a usable href
            match = re.search(r'[s][hz]\d{6}', href)
            if match:
                lst.append(match.group(0))
    except Exception:
        # Best-effort: report the failure and return. ``Exception`` (not a
        # bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
        print('getStockListerror')
            
    

def _parseStockInfo(html):
    '''Build the info dict for one stock page, or return None if the page is unusable.'''
    soup = BeautifulSoup(html,'html.parser')
    stockInfo = soup.find('div',attrs={'class':'stock-bets'})
    if stockInfo is None:
        return None  # page has no quote block
    name = stockInfo.find(attrs={'class':'bets-name'})
    if name is None:
        return None
    nameParts = name.text.split()
    if not nameParts:
        return None
    infoDict = {'股票名称': nameParts[0]}
    keyList = stockInfo.find_all('dt')
    valueList = stockInfo.find_all('dd')
    if len(valueList) < len(keyList):
        return None  # a label without a value: treat the page as malformed
    # Labels (<dt>) and values (<dd>) are paired by position.
    for key, val in zip(keyList, valueList):
        infoDict[key.text] = val.text
    return infoDict


def getStockInfo(lst,stockURL,fpath):
    '''Fetch the detail page of every stock in ``lst`` and append its data to ``fpath``.

    For each code the URL ``stockURL + code + '.html'`` is downloaded and
    parsed, and the resulting dict is appended to ``fpath`` as one
    ``str(dict)`` line (UTF-8). A stock whose page cannot be fetched,
    parsed or written is skipped. After every stock, successful or not, a
    ``已完成`` percentage is printed on the same console line.

    Args:
        lst: stock codes to look up.
        stockURL: base URL of the per-stock pages.
        fpath: path of the output file, opened in append mode per record.

    Returns:
        None.
    '''
    total = len(lst)
    for count, stock in enumerate(lst, start=1):
        url = stockURL + stock +'.html'
        try:
            html = getHTMLText(url)
            infoDict = _parseStockInfo(html) if html else None
            if infoDict is not None:
                with open(fpath,'a',encoding='utf-8') as f:
                    f.write(str(infoDict) + '\n')
        except Exception:
            # Best-effort crawl: one failing stock (the site often answers
            # 502) must not abort the run. ``Exception`` instead of a bare
            # ``except`` lets Ctrl-C / SystemExit stop the crawl.
            pass
        # Progress is reported once per stock regardless of the outcome.
        print('\r已完成:{:.2f}%'.format(count*100/total),end='')
            
def main():
    '''Run the crawl: collect the stock codes, fetch the first 100, report timing.'''
    started = time.perf_counter()

    list_page = 'http://quote.eastmoney.com/stock_list.html' # page listing every stock code
    detail_base = 'https://gupiao.baidu.com/stock/' # prefix of the per-stock pages
    target_path = 'e://stockinfo1.txt' # file that receives the results

    codes = []
    getStockList(codes, list_page)
    wanted = codes[:100] # only this many stocks are fetched and written
    print('共有', len(codes), '支股票')

    getStockInfo(wanted, detail_base, target_path)

    elapsed = time.perf_counter() - started
    print('\n用时:{:.2f}s'.format(elapsed))

# Start the crawl only when this file is executed as a script, so that
# importing the module does not trigger network requests and file writes.
if __name__ == '__main__':
    main()