Python crawler example (requests + BeautifulSoup + regular expressions), beginner level: fetching price information by stock code

Based on the example in parts 46-48 of the Bilibili MOOC 【Python网络爬虫与信息提取】 (Python Web Crawling and Information Extraction), Beijing Institute of Technology

## 2020/02/23 It looks like my IP got blocked; requests still failed even after trying proxy IPs. To be improved later.
## 2020/02/23 Switched to a different site and the script runs normally again; the proxy IPs ended up not being needed.
import requests  # HTTP library
from bs4 import BeautifulSoup  # HTML parsing
import re  # regular expressions
import traceback
# import bs4     # unused in this version
# import lxml    # only needed if BeautifulSoup is switched to the 'lxml' parser
# import random  # unused in this version
# import fakeip  # local proxy-IP helper; only referenced by the commented-out proxy code

proxies = {}  # filled by fakeip.outputip() when the proxy code below is enabled
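# A minimal sketch of how the proxies dict would be used, assuming a
# placeholder proxy address (not a working proxy):
# proxies = {'http': 'http://1.2.3.4:8080', 'https': 'http://1.2.3.4:8080'}
# r = requests.get(url, proxies=proxies, timeout=2)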

headers = {  # counter anti-crawling: send browser-like headers and a cookie; set adapted from https://blog.csdn.net/Guanhai1617/article/details/104120581
    'authority': 's.taobao.com',  # left over from the Taobao example the header set came from
    'cache-control': 'max-age=0',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36',
    'sec-fetch-dest': 'document',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'same-site',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'referer': '*',  # redacted in the original post
    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'cookie': '*',  # redacted in the original post
}
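# Note: 'cookie' and 'referer' above were redacted ('*') in the original post;
# paste your own values from the browser DevTools Network panel before running.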


# Fetch page content with requests
def getHTMLText(url, code='utf-8'):
    # global proxies
    try:
        r = requests.get(url, timeout=2, headers=headers)
        print(r.request.url)  # print the URL that was actually requested
        r.raise_for_status()  # raise an exception if status_code != 200
        r.encoding = code
        return r.text
    except:
        print("getHTMLText Error")
        # proxies = fakeip.outputip()   # sometimes hangs
        traceback.print_exc()  # print the traceback on failure
        return ""  # return an empty string so callers can safely test html == ""


# Build the list of stock codes
def getStockList(lst, stockListURL):
    html = getHTMLText(stockListURL, 'GB2312')  # the Eastmoney list page uses GB2312 encoding
    soup = BeautifulSoup(html, 'html.parser')
    a = soup.find_all('a')  # stock links appear as plain <a> tags
    for i in a:
        try:
            href = i.attrs['href']
            stockNum = re.findall(r"\d{6}", href)[0]  # stock codes are six digits
            lst.append(stockNum)
        except:
            continue  # skip <a> tags without an href or without a six-digit code
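
# Illustrative example (hypothetical href): for
# href = 'http://quote.eastmoney.com/sh600519.html',
# re.findall(r"\d{6}", href) returns ['600519'], which becomes the list entry.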


# Print price information for each listed stock and save it to a file
def getStockInfo(lst, stockInfoURL, fpath):
    count = 0
    summary_list = []
    subset = lst[-1:-50:-1]  # only the last 49 codes, in reverse order (demo subset)
    for stock in subset:
        url = stockInfoURL + stock  # + "/index/"
        html = getHTMLText(url, 'GB2312')
        print(stock)
        try:
            if html == "":
                continue
            soup = BeautifulSoup(html, 'html.parser')

            # name = (soup.find(id="pageStockName")).string     # for stock_info_url = 'http://stockpage.10jqka.com.cn/'
            # price = soup.find(name='div',attrs={'class':'txt-phra'})
            # price = price.find_all('strong')[0].string

            name = (soup.find(name='p', attrs={'class': 'fz24'})).string  # stock name sits in <p class="fz24">
            price = soup.find(name='td', attrs={'class': 'tl'}).find_next_sibling(name='td').string  # price is in the <td> right after <td class="tl">
            summary_list.append([stock, name, price])
            print([stock, name, price])
            count = count + 1
            print('\rProgress: {:.2f}%'.format(count * 100 / len(subset)), end='')  # live progress; \r returns the cursor to the line start, end='' suppresses the newline
        except:
            # traceback.print_exc()
            count = count + 1
            print('\rProgress: {:.2f}%'.format(count * 100 / len(subset)), end='')
            continue
    try:
        with open(fpath, 'a', encoding='utf-8') as f:  # append mode, so reruns add to the existing file
            for i in summary_list:
                for j in range(3):
                    f.write(str(i[j]) + "   ")
                f.write('\n')
    except:
        traceback.print_exc()
        print("Failed to open file!")


def main():
    # global proxies
    stock_list_url = 'http://quote.eastmoney.com/stock_list.html'
    # stock_info_url = 'http://stockpage.10jqka.com.cn/'
    stock_info_url = 'http://data.10jqka.com.cn/market/lhbgg/code/'
    output_file_path = 'E://stock.txt'  # Windows path; change as needed
    # proxies = fakeip.outputip()  # set up proxy IPs
    stocklist = []
    getStockList(stocklist, stock_list_url)
    getStockInfo(stocklist, stock_info_url, output_file_path)


if __name__ == '__main__':
    main()
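
The fakeip module imported at the top is the author's local helper, not a published package. Here is a minimal sketch of what its outputip() might look like, assuming a hard-coded pool of proxy addresses (the addresses below are placeholders, not working proxies):

# fakeip.py -- hypothetical sketch; fill _POOL with real proxy addresses
import random

_POOL = ['1.2.3.4:8080', '5.6.7.8:3128']  # placeholder proxy pool

def outputip():
    # Pick one proxy at random and return it in the dict format
    # that requests.get(..., proxies=...) expects.
    ip = random.choice(_POOL)
    return {'http': 'http://' + ip, 'https': 'http://' + ip}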
