爬取喜马拉雅男频小说中的这几本
import requests
import re
import csv
# HTTP headers sent with the page request: a desktop Chrome User-Agent
# and a fixed Cookie string.
# NOTE(review): the cookie values are hard-coded; presumably copied from a
# browser session -- confirm they are still required by the site.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/88.0.4324.104 Safari/537.36'
    ),
    'Cookie': (
        'testcookie=yes; '
        'Hm_lvt_bc3b748c21fe5cf393d26c12b2c38d99=1619717328; '
        'Hm_lpvt_bc3b748c21fe5cf393d26c12b2c38d99=1619717328; '
        'JJEVER=%7B%22fenzhan%22%3A%22noyq%22%7D; '
        'smidV2=20210430012854effd865c944ddc429b0c481dfef3f31d0035c72a77d581610'
    ),
}
class ximalaiyaSpider:
    """Scrape album titles and links from a Ximalaya channel page into a CSV.

    The page is downloaded with ``requests`` using the module-level
    ``headers``, parsed with regular expressions, and the resulting
    ``[title, url]`` rows are written to ``喜马来雅.csv``.
    """

    # Page that is scraped, and the prefix joined to each relative album link.
    _CHANNEL_URL = 'https://www.ximalaya.com/channel/7/'
    _SITE_ROOT = 'https://www.ximalaya.com/'
    # Seconds to wait on the server; without a timeout the request can hang
    # indefinitely.
    _TIMEOUT = 10

    # First album list (<ul class="_qt"> ... </ul>) in the page.
    _LIST_RE = re.compile(r'<ul class="_qt">.*?</ul>', re.S)
    # Every album anchor inside that list.
    _ANCHOR_RE = re.compile(
        r'<a class="album-title line-2 lg bold kF_" title=.*?</a>', re.S)
    # Group 1: album title. Group 2: link path without its leading slash.
    _ALBUM_RE = re.compile(
        r'<a class="album-title line-2 lg bold kF_" title="(.*?)" href="/(.*?)"><span.*?>.*?</span></a>',
        re.S)

    def getSource(self):
        """Download the channel page and return its HTML decoded as UTF-8.

        Raises:
            requests.RequestException: on a connection failure, a timeout,
                or an HTTP error status (4xx/5xx).
        """
        resp = requests.get(
            self._CHANNEL_URL, headers=headers, timeout=self._TIMEOUT)
        # Fail loudly on an error status instead of parsing an error page.
        resp.raise_for_status()
        resp.encoding = 'utf-8'
        return resp.text

    def parseSource(self, content=None):
        """Extract ``[title, url]`` pairs from the channel page HTML.

        Args:
            content: HTML text to parse. When ``None`` (the default) the
                page is downloaded with ``getSource``.

        Returns:
            A list of ``[title, absolute_url]`` lists in page order. The
            list is empty when the album list is not present in the HTML;
            anchors that do not have the expected shape are skipped.
        """
        if content is None:
            content = self.getSource()

        listing = self._LIST_RE.search(content)
        if listing is None:
            # Markup not found: report "no albums" rather than crashing on
            # a None match object.
            return []

        albums = []
        for anchor in self._ANCHOR_RE.findall(listing.group(0)):
            found = self._ALBUM_RE.match(anchor)
            if found is None:
                # Anchor without the expected href/span structure.
                continue
            albums.append([found.group(1), self._SITE_ROOT + found.group(2)])
        return albums

    def saveData(self):
        """Scrape the channel page and write the rows to ``喜马来雅.csv``.

        The file is written as UTF-8 with a header row followed by one
        row per album. Scraping happens before the file is opened, so a
        failed download does not leave a truncated file behind.
        """
        rows = self.parseSource()
        with open('喜马来雅.csv', 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(["作品", '链接'])
            writer.writerows(rows)
def main():
    """Run the spider once: scrape the channel page and save the CSV."""
    spider = ximalaiyaSpider()
    spider.saveData()


# Only scrape when executed as a script, not when imported.
if __name__ == '__main__':
    main()
csv结果: