# 爬虫-股吧 (crawler for the EastMoney "Guba" stock forum)

import requests, random, re,json
from fake_useragent import UserAgent

def request_html(url, timeout=10):
    """Download *url* with a randomized User-Agent and return the body text.

    Args:
        url: Page URL to fetch.
        timeout: Seconds to wait for the server; without a timeout,
            ``requests.get`` can block the whole crawl indefinitely.

    Returns:
        The decoded response body as a string.
    """
    ua = UserAgent()
    # A random desktop UA per request lowers the chance of being blocked.
    headers = {'User-Agent': ua.random}
    response = requests.get(url=url, headers=headers, timeout=timeout).text
    return response



def parse_html(response):
    """Extract post entries from an EastMoney Guba ``default,99_N.html`` page.

    Args:
        response: Raw HTML text of one forum list page.

    Returns:
        A list of dicts, one per ``<li>`` post row, with keys:
        ``clk`` (click count), ``rev`` (reply count), ``sub1`` (bar name or
        reward label), ``sub2`` (post title), ``aut`` (author name),
        ``last`` (last-reply timestamp).

    Raises:
        IndexError: If the expected markup is absent — e.g. the server
            returned an anti-crawler page instead of the listing.
    """
    # Compile every pattern once, before the per-item loop (they were
    # previously rebuilt on each iteration).
    news = re.compile(r'<ul class="newlist" tracker-eventcode="gb_xgbsy_ lbqy_rmlbdj">(.*?)</ul>', re.S)
    item = re.compile(r'<li>(.*?)</li>', re.S)
    cite = re.compile(r'<cite>(.*?)</cite>', re.S)  # values span newlines
    # Either a bar link or a "reward" badge; alternation yields 2-group tuples.
    sub1 = re.compile(r'class="balink">(.*?)</a>|class="icon icon_list_xuanshang">(.*?)</em>', re.S)
    sub2 = re.compile(r'title="(.*?)"')
    aut = re.compile(r'<font>(.*?)</font>')
    last = re.compile(r'class="last">(.*?)<')

    news_list = news.findall(response)[0]
    lis = []
    for i in item.findall(news_list):
        dic = {}
        cites = cite.findall(i)
        dic['clk'] = cites[0].strip()   # click count
        dic['rev'] = cites[1].strip()   # reply count
        # Exactly one side of the alternation matches; the other group is ''.
        sub1_list = sub1.findall(i)[0]
        dic['sub1'] = sub1_list[0] + sub1_list[1]
        dic['sub2'] = sub2.findall(i)[0]  # first title="..." is the post title
        dic['aut'] = aut.findall(i)[0]
        dic['last'] = last.findall(i)[0]
        lis.append(dic)
    return lis


if __name__ == '__main__':
    # Crawl list pages 1..12 and dump each page's posts to gubaN.json.
    for page in range(1, 13):
        url = 'http://guba.eastmoney.com/default,99_{}.html'.format(page)
        # e.g. http://guba.eastmoney.com/default,99_1.html
        response = request_html(url)
        try:
            lis = parse_html(response)
        except IndexError:
            # parse_html raises IndexError when the expected markup is
            # missing (likely an anti-crawler page); retry once with a
            # fresh random User-Agent.  A bare except here would also have
            # swallowed KeyboardInterrupt and real bugs.
            response = request_html(url)
            lis = parse_html(response)
            print(url)
        file_name = 'guba{}.json'.format(page)
        with open(file_name, 'w', encoding='utf-8') as f:
            json.dump(lis, f, ensure_ascii=False)



  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值