Python 简易爬虫

爬取喜马拉雅男频小说频道中的这几本作品

在这里插入图片描述

import requests
import re
import csv

# HTTP request headers sent with the page fetch: a desktop Chrome User-Agent
# and a Cookie string made of several "name=value" pairs joined by "; ".
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
    ),
    'Cookie': '; '.join((
        'testcookie=yes',
        'Hm_lvt_bc3b748c21fe5cf393d26c12b2c38d99=1619717328',
        'Hm_lpvt_bc3b748c21fe5cf393d26c12b2c38d99=1619717328',
        'JJEVER=%7B%22fenzhan%22%3A%22noyq%22%7D',
        'smidV2=20210430012854effd865c944ddc429b0c481dfef3f31d0035c72a77d581610',
    )),
}


class ximalaiyaSpider:
    """Scrape album titles and links from one Ximalaya channel page into a CSV.

    The pipeline is ``getSource`` (download the page) -> ``parseSource``
    (extract ``[title, url]`` rows with regular expressions) -> ``saveData``
    (write the rows to a CSV file).
    """

    # Channel page that is downloaded and parsed.
    _CHANNEL_URL = 'https://www.ximalaya.com/channel/7/'
    # Prefix joined with the site-relative path captured from each anchor.
    _SITE_ROOT = 'https://www.ximalaya.com/'
    # Seconds to wait for the server; without a timeout requests may block
    # indefinitely.
    _REQUEST_TIMEOUT = 10

    # First <ul class="_qt"> ... </ul> element in the page.
    _LIST_RE = re.compile(r'<ul class="_qt">.*?</ul>', re.S)
    # Every album-title anchor inside that list.
    _ANCHOR_RE = re.compile(
        r'<a class="album-title line-2 lg bold kF_" title=.*?</a>', re.S)
    # Title (group 1) and href without its leading slash (group 2) of one
    # anchor.
    _FIELDS_RE = re.compile(
        r'<a class="album-title line-2 lg bold kF_" title="(.*?)" href="/(.*?)"><span.*?>.*?</span></a>',
        re.S)

    def getSource(self):
        """Download the channel page and return its HTML decoded as UTF-8.

        Raises:
            requests.RequestException: on connection failure, timeout, or an
                HTTP error status.
        """
        resp = requests.get(self._CHANNEL_URL, headers=headers,
                            timeout=self._REQUEST_TIMEOUT)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        resp.raise_for_status()
        resp.encoding = 'utf-8'
        return resp.text

    def parseSource(self, content=None):
        """Extract ``[title, absolute_url]`` pairs from the channel page.

        Args:
            content: HTML text to parse. When ``None`` (the default) the page
                is fetched with ``getSource``.

        Returns:
            A list of two-item lists, e.g.
            ``['摸金天师(紫襟演播)', 'https://www.ximalaya.com/youshengshu/4756811/']``.
            The list is empty when the page has no ``<ul class="_qt">``
            element; anchors missing a title or href are skipped.
        """
        if content is None:
            content = self.getSource()

        album_list = self._LIST_RE.search(content)
        if album_list is None:
            # Page layout changed or an unexpected page was served.
            return []

        rows = []
        for anchor in self._ANCHOR_RE.findall(album_list.group(0)):
            fields = self._FIELDS_RE.match(anchor)
            if fields is None:
                # Anchor does not have the expected title/href/span shape.
                continue
            # group(2) has no leading slash (the pattern consumes it), so it
            # can be appended directly to the site root.
            rows.append([fields.group(1), self._SITE_ROOT + fields.group(2)])
        return rows

    def saveData(self):
        """Write the parsed rows to ``喜马来雅.csv`` in the working directory.

        The file is overwritten on every call and starts with a header row;
        when nothing was parsed it contains the header row only.
        """
        content = self.parseSource()
        # newline='' lets the csv module control line endings itself.
        with open('喜马来雅.csv', 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            header1 = ["作品", '链接']
            writer.writerow(header1)
            writer.writerows(content)


def main():
    """Run the spider once: fetch, parse, and save the results as CSV."""
    spider = ximalaiyaSpider()
    spider.saveData()


if __name__ == '__main__':
    # Only scrape when executed as a script, not when imported as a module.
    main()

csv结果:

在这里插入图片描述

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值