MIDI爬取

最新推荐文章于 2021-09-14 11:06:44 发布

ashome123

最新推荐文章于 2021-09-14 11:06:44 发布

阅读量3.1k

点赞数

本文链接：https://blog.csdn.net/ashome123/article/details/107117403

版权

Python 同时被 2 个专栏收录

3 篇文章 0 订阅

订阅专栏

Python 爬虫学习

1 篇文章 0 订阅

订阅专栏

从 midiworld 网站（https://www.midiworld.com）按类别爬取 MIDI 文件，使用 requests 和 BeautifulSoup 实现：

import requests
from bs4 import BeautifulSoup
import os
import time
from time import sleep
import random
from collections import Counter
import csv


def midiworld_crawler() -> None:
   """Download MIDI files from midiworld.com, one search category at a time.

   For every category in ``styles`` the search result pages are requested
   in order (page 1, 2, ...) until a page contains no ``a[target="_blank"]``
   links. Each linked file is saved as
   ``./midiworld/<style>/<id>_<name>.midi`` and one row
   ``(id, style, name, url)`` is appended to ``./midiworld/record.csv``.

   Failures are best-effort: a page or file that cannot be fetched (network
   error or non-200 status) or saved is reported on stdout and skipped, and
   the crawl continues with the next item.
   """
   BASE_URL = 'https://www.midiworld.com/search/'
   OUT_DIR = './midiworld'
   headers = {
      # Fill in your own request headers here.
   }
   styles = ['blues', 'jazz', 'pop', 'rock', 'movie%20themes', 'christmas%20carols', 'disney%20themes', 'country', 'rap', 'punk', 'dance', 'video%20game%20themes']

   # Create the output tree. makedirs(exist_ok=True) also creates OUT_DIR and
   # avoids the check-then-create race of os.path.exists() + os.mkdir().
   os.makedirs(OUT_DIR, exist_ok=True)
   for style in styles:
      os.makedirs(OUT_DIR + '/' + style, exist_ok=True)

   # Number of files saved so far, per category.
   song_num = Counter()

   # Crawl log. The context manager guarantees the file is flushed and closed
   # even if the crawl is interrupted; newline='' is what the csv module
   # requires to avoid blank rows on Windows; utf-8 keeps non-ASCII song
   # names writable regardless of the locale encoding.
   with open(OUT_DIR + '/record.csv', 'a', newline='', encoding='utf-8') as csv_file:
      csv_writer = csv.writer(csv_file)

      for style in styles:
         print('===== 开始爬取:', style)

         for i in range(1, 1000): # page number
            page_url = BASE_URL + str(i) + '/?q=' + style
            try:
               page = requests.get(page_url, headers=headers, timeout=20)
            except requests.RequestException as err:
               # Timeouts / connection errors are treated like a bad status:
               # report, back off, and move on to the next page.
               print('获取网页失败: ', err)
               sleep(random.random()*10)
               continue
            if page.status_code != 200:
               print('获取网页失败: ', page)
               sleep(random.random()*10)
               continue

            # Short random pause between requests.
            sleep(random.random()/5)
            soup = BeautifulSoup(page.text, 'html.parser')
            links = soup.select('a[target="_blank"]')

            # A page without any matching link ends this category.
            if not links:
               break

            for link in links:
               file_url = link.get('href')
               if not file_url:
                  # Anchor without an href: nothing to download.
                  continue

               # The song title is taken from the node that immediately
               # follows the opening of the link's parent element. If that
               # node is not text, fall back to an empty name instead of
               # failing on .strip().
               title_node = link.parent.next_element
               raw_name = str(title_node) if isinstance(title_node, str) else ''
               # '/' would be interpreted as a path separator in the file name.
               name = raw_name.strip('\n').strip(' - ').replace('/', '_')
               song_id = file_url.split('/')[-1]

               try:
                  download = requests.get(file_url, headers=headers, timeout=30)
               except requests.RequestException:
                  print('下载失败: ', link)
                  sleep(random.random()*10)
                  continue
               sleep(random.random()/5)

               if download.status_code != 200:
                  print('下载失败: ', link)
                  sleep(random.random()*10)
                  continue

               path = OUT_DIR + '/' + style + '/' + str(song_id) + '_' + name + '.midi'
               try:
                  with open(path, 'wb') as f:
                     f.write(download.content)
               except (OSError, ValueError):
                  # OSError: invalid/too long file name, disk or permission
                  # problems. ValueError: embedded NUL in the name.
                  print('保存失败: ', path)
                  continue

               # Only log and count files that were actually written.
               csv_writer.writerow( (song_id, style, name, file_url) )
               song_num[style] += 1

            print('========== ' + style + '第' + str(i) + '页已完成, 累计数目: ' + str(song_num[style]))

         print('===== ' + style + '爬取完成, 共计' + str(song_num[style]) + '首\n\n')
   
   
   
   

# Script entry point: start the crawl only when this file is executed
# directly, not when it is imported as a module.
if '__main__' == __name__:
   midiworld_crawler()

ashome123

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
MIDI爬取

从 midiworld 网站按类别爬取 MIDI 文件。代码节选：import requests; from bs4 import BeautifulSoup; import os; import time; from time import sleep; import random; from collections import Counter; import csv; def midiworld_crawler(): BASE_URL = 'https://www.midiworld.com/search/' header…
复制链接

扫一扫