python-(beautifulsoup)爬虫-3

最新推荐文章于 2024-08-22 15:35:37 发布

十四呀

最新推荐文章于 2024-08-22 15:35:37 发布

阅读量285

点赞数

分类专栏： python 文章标签：爬虫 csv

本文链接：https://blog.csdn.net/u011445855/article/details/79081088

版权

python 专栏收录该内容

8 篇文章 0 订阅

订阅专栏

刚刚学习了 beautifulsoup，就想自己练练手，写得不是很好，也参考了网上的一些写法。
主要是爬取天天基金上的基金的涨跌比。

from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re 
import csv
from itertools import islice

#url是天天基金的主网页
#这个函数主要是爬取第一页上的基金代码，基金详情url，基金名字
#但是有个问题没解决，就是每次只爬到003327就没有数据了，不知道为什么。。。。。
def geturls(url):
    """Scrape the fund index page and save every fund found to ``funds.csv``.

    Each ``<li>`` directly under an element with class ``num_right`` is
    expected to hold an ``<a>`` whose text contains the numeric fund code
    followed by a full-width closing parenthesis and the fund name, and
    whose ``href`` is the fund's detail page.

    The output file starts with the header row ``id,name,url`` followed by
    one row per fund. Entries that have no usable link, no numeric code or
    no name are skipped instead of aborting the whole run.

    Known limitation (reported by the original author, cause unknown): the
    listing appears to stop around fund 003327, after 2956 entries.

    Args:
        url: Address of the fund index page.

    Returns:
        None. The result is the ``funds.csv`` file in the working directory.
    """
    # NOTE(review): no parser is passed to BeautifulSoup, so the parse result
    # depends on which parser libraries are installed. That might be related
    # to the truncated listing mentioned above -- unconfirmed.
    with urlopen(url) as response:
        bsObj = BeautifulSoup(response)

    # Equivalent lookup: bsObj.find("ul", class_="num_right").findAll("li")
    items = bsObj.select('.num_right > li')
    print(len(items))

    header = ['id', 'name', 'url']
    # newline='' is required by the csv module; without it every row gets an
    # extra '\r' and reads back with an empty line after it.
    with open('funds.csv', 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        # The consumer of this file skips its first row, so the header has
        # to actually be written or the first fund is lost.
        writer.writerow(header)

        for item in items:
            anchor = item.find('a')
            # find() returns None when the <li> has no link; len() of a tag
            # is its number of children, so this also skips empty links.
            if anchor is None or len(anchor) == 0:
                continue

            href = anchor.get('href')
            fund = anchor.text
            codes = re.findall(r'\d+', fund)
            pieces = fund.split('）')
            if not href or not codes or len(pieces) < 2:
                continue

            writer.writerow([codes[0], pieces[1], href])

    return

 #这个函数是解析基金详情url，获取涨跌比       
def geturlinfo(info_dict):
    """Fetch one fund's detail page and collect its performance figures.

    Args:
        info_dict: A row from ``funds.csv``. Despite the name it is indexed
            by position: ``[0]`` is the fund code, ``[1]`` the fund name and
            ``[2]`` the detail page URL.

    Returns:
        A list of eight items: code, name, then the text of the matching
        tags at positions 3 to 8. Missing positions are filled with ``'-'``
        because some funds have no data.
        NOTE(review): the six figures are presumably the 1-month, 3-month,
        6-month, 1-year, 3-year and since-inception changes -- confirm
        against the live page layout.
    """
    # Close the HTTP response once the page has been parsed.
    with urlopen(info_dict[2]) as response:
        soup = BeautifulSoup(response)

    code = info_dict[0]
    print(code)
    name = info_dict[1]

    tags = soup.find_all(
        class_=re.compile("^ui-font-middle ui-color-(red|green) ui-num"))

    # Slicing never raises, so a page with too few matches simply yields a
    # shorter list, which is then padded to six entries.
    figures = [tag.text for tag in tags[3:9]]
    figures.extend(['-'] * (6 - len(figures)))

    return [code, name] + figures




# Driver: build funds.csv from the index page, then visit every fund's detail
# page and write one row per fund to fundinfo.csv.
data = ['code', 'name', 'month_1', 'month_3', 'month_6', 'year_1', 'year_3', 'rece']

# geturls() only produces funds.csv and returns nothing, so its result is not
# kept.
geturls('http://fund.eastmoney.com/allfund.html')

# Header row that funds.csv may start with; it is not a fund and must not be
# looked up.
funds_header = ['id', 'name', 'url']

# newline='' is what the csv module requires for both reading and writing;
# without it the output gets a blank line after every row on Windows.
with open('fundinfo.csv', 'w', newline='') as fundinfo_file, \
        open('funds.csv', 'r', newline='') as funds_file:
    write = csv.writer(fundinfo_file)
    write.writerow(data)
    read = csv.reader(funds_file)
    # Skip only blank rows and a real header row. Unconditionally dropping
    # the first row would lose a fund whenever funds.csv has no header. The
    # comparison costs nothing next to the HTTP request made for each row.
    for line in read:
        if not line or line == funds_header:
            continue
        info = geturlinfo(line)
        write.writerow(info)

十四呀

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
python-(beautifulsoup)爬虫-3

刚刚学习了 beautifulsoup，就想自己练练手，写得不是很好，也参考了网上的一些写法。主要是爬取天天基金上的基金的涨跌比。 from urllib.request import urlopen; from urllib.parse import urlparse; from bs4 import BeautifulSoup; import re; import csv; from itertoo…
复制链接

扫一扫

专栏目录