python爬虫学习(股票数据爬取)

最近在学习python爬虫方面的知识,下面是做的一个简单爬虫示例。
从沪深A股中爬取股票列表中的数据到本地
用到的库:requests库,BeautifulSoup
爬取的链接为:http://quote.stockstar.com/stock/ranklist_a_3_1_1.html
代码如下:

import re
import requests
from bs4 import BeautifulSoup

def getHTML(url, code='utf-8'):
    """Fetch *url* and return its body decoded with *code*.

    Parameters
    ----------
    url : str
        Page to download.
    code : str
        Character encoding used to decode the response (default 'utf-8').

    Returns
    -------
    str
        The page text, or "" when the request fails.
    """
    print("in fun 1")
    try:
        # A timeout keeps a dead/slow server from hanging the crawl forever.
        r = requests.get(url, timeout=10)
        r.raise_for_status()  # turn HTTP 4xx/5xx into an exception
        r.encoding = code
        return r.text
    except requests.RequestException:
        # Catch network/HTTP failures only; the old bare `except` would
        # also swallow KeyboardInterrupt and hide real programming bugs.
        print("get error")
        return ""
# Inspect the page source and collect the ranking table; num is the number
# of result pages to scrape.
def getStockInfo(infolist, url, fpath, num=1):
    """Scrape *num* pages of the stock ranking list.

    The table header line is appended to the file at *fpath*; every data
    row is appended to *infolist* as a list of cell strings (a cell may be
    None when its <td> holds no plain text).

    Parameters
    ----------
    infolist : list
        Output accumulator for the table rows (mutated in place).
    fpath : str
        Path of the text file that receives the header line.
    num : int
        Number of pages to fetch (default 1).

    Returns
    -------
    None on success, "" after the first page that fails to parse.
    """
    print("in fun 2")
    # Use a distinct name for the page counter: the old code reused `i`
    # for both the page loop and the header-column loop.
    for page in range(1, num + 1):
        myurl = url + str(page) + ".html"
        print(myurl)
        html = getHTML(myurl, 'GB2312')  # the site serves GB2312 pages
        try:
            soup = BeautifulSoup(html, 'html.parser')
            headlist = soup.find('thead', attrs={'class': 'tbody_right'})
            print(headlist)
            # .string is the plain text between the tags (may be None).
            ls = [td.string for td in headlist.find_all('td')]
            print(ls)
            with open(fpath, 'a', encoding='utf-8') as f:  # table header
                for cell in ls:
                    # Pad every column to 10 chars.  The old
                    # `if len != 10` test silently DROPPED any header
                    # that was exactly 10 characters long, and crashed
                    # on None cells.
                    f.write((cell or '').ljust(10) + '\t')
                f.write("\n")
            tbody = soup.find('tbody', attrs={'class': 'tbody_right'})  # limit search scope
            trs = tbody.find_all('tr')  # every data row
            for tr in trs:
                each = [td.string for td in tr.find_all('td')]
                infolist.append(each)
        except AttributeError:
            # find() returned None: the download failed or the page
            # layout changed.  (Narrowed from the old bare `except`.)
            print("error")
            return ""
def outputfile(infolist, fpath):  # dump the collected rows
    """Append every row of *infolist* to *fpath* as a padded text table.

    Each cell is left-justified to 10 characters and followed by a tab;
    each row ends with a newline.  None cells (empty <td> tags) are
    written as blanks instead of crashing len().
    """
    with open(fpath, 'a', encoding='utf-8') as f:
        for row in infolist:
            for cell in row:
                # ljust pads short cells and leaves long ones intact.
                # The old `if len(td) != 10` test silently dropped any
                # cell that was exactly 10 characters wide, misaligning
                # every column after it.
                f.write((cell or '').ljust(10) + '\t')
            f.write("\n")

def main():
    """Entry point: crawl the A-share ranking list and write it to disk."""
    url = "http://quote.stockstar.com/stock/ranklist_a_3_1_"
    num = 2  # number of ranking pages to scrape
    infolist = []
    path = r"g:/project/file1.txt"
    # Bug fix: `num` used to be defined but never passed, so only the
    # first page was ever fetched regardless of its value.
    getStockInfo(infolist, url, path, num)
    outputfile(infolist, path)

if __name__ == '__main__':
    main()
已标记关键词 清除标记
©️2020 CSDN 皮肤主题: 大白 设计师:CSDN官方博客 返回首页