通过爬虫获取博客上发布的数据集。
从博客中爬取得到的数据集如下:
代码:
import requests
import json
from bs4 import BeautifulSoup
import logging
import time
# Spoof a desktop-browser User-Agent so the endpoints serve normal pages.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'
}

# Log every request and any error to a file so failures can be diagnosed later.
logging.basicConfig(
    filename='qqM.log',
    filemode='w',
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)s %(message)s',
    datefmt='%a, %d %b %Y %H:%M:%S',
)
class QQMusic:
    """Crawl album metadata from the QQ Music web pages.

    ``getAlbumLinks`` walks the paginated album-library JSONP endpoint and
    collects the public album-page URLs; ``getAlbumInfo`` scrapes one album
    page into a flat dict.  When a page cannot be scraped (copyright block,
    404, network error) ``getAlbumInfo`` returns a dict whose values are all
    the string ``"null"``, matching the original crawler's contract.
    """

    # Number of library pages to walk (the API serves 20 albums per page).
    PAGE_COUNT = 100
    # Polite delay (seconds) between successive HTTP requests.
    REQUEST_DELAY = 2
    # Abort a stalled connection instead of hanging forever.
    REQUEST_TIMEOUT = 10
    # Keys of the result dict; also used for the all-"null" failure fallback.
    _RESULT_KEYS = ('album_name', 'singer', 'music_genre', 'language',
                    'pub_time', 'pub_company', 'album_type', 'album_intro',
                    'album_info2')

    def __init__(self):
        pass

    def getAlbumLinks(self):
        """Return album page URLs collected from the first PAGE_COUNT pages.

        The endpoint answers with JSONP (``MusicJsonCallback({...})``); the
        wrapper is stripped before ``json.loads``.

        Returns:
            list[str]: ``https://y.qq.com/n/yqq/album/<album_mid>.html`` URLs.
        """
        album_links = []
        print("qqMusicSpider starts············")
        for page_num in range(self.PAGE_COUNT):  # the API pages from 0
            base_url = ('https://c.y.qq.com/v8/fcg-bin/album_library'
                        '?cmd=get_album_info&page=' + str(page_num) +
                        '&pagesize=20&sort=1')
            logging.info(base_url)
            time.sleep(self.REQUEST_DELAY)  # throttle to avoid hammering the API
            page = requests.get(base_url, headers=headers,
                                timeout=self.REQUEST_TIMEOUT)
            # Strip exactly the leading callback name and the single trailing
            # ")".  The old code blindly replaced the FIRST occurrence of "})"
            # anywhere in the body, which could corrupt a payload that happened
            # to contain "})" inside a string value.
            payload = page.text.strip()
            prefix = 'MusicJsonCallback('
            if payload.startswith(prefix) and payload.endswith(')'):
                payload = payload[len(prefix):-1]
            html_dict = json.loads(payload)
            album_list = html_dict['data']['albumlist']
            # album_mid is the path component of the public album page URL.
            for album in album_list:
                link_id = album['album_mid']
                album_links.append(
                    'https://y.qq.com/n/yqq/album/' + str(link_id) + '.html')
            print("Get album_urls of Page " + str(page_num + 1))
        return album_links

    def getAlbumInfo(self, url):
        """Scrape one album page into a flat dict.

        Args:
            url: album page URL as produced by ``getAlbumLinks``.

        Returns:
            dict with keys ``album_name``, ``singer``, ``music_genre``,
            ``language``, ``pub_time``, ``pub_company``, ``album_type``,
            ``album_intro`` and ``album_info2`` (list of per-track dicts).
            On any scraping failure every value is the string ``"null"``.
        """
        logging.info(url)
        album_dict = {}
        try:
            time.sleep(self.REQUEST_DELAY)  # throttle requests
            info_page = requests.get(url, headers=headers,
                                     timeout=self.REQUEST_TIMEOUT)
            soup = BeautifulSoup(info_page.text, 'html.parser')
            album_dict['album_name'] = soup.find_all(
                'h1', class_='data__name_txt')[0].get_text(strip=True)
            # Some albums list no singer at all; those pages use different
            # markup for the genre/date/company block.  The old code indexed
            # find_all(...)[0] unconditionally, so a missing singer raised
            # IndexError and the "no singer" branch below was unreachable.
            singer_nodes = soup.find_all('a', class_='data__singer_txt')
            singer = singer_nodes[0].get_text(strip=True) if singer_nodes else ''
            if singer:
                album_dict['singer'] = singer
                self._parse_with_singer(soup, album_dict)
            else:
                self._parse_without_singer(soup, album_dict)
            album_dict['album_intro'] = soup.find_all(
                'div', class_='about__cont')[0].get_text(strip=True)
            album_dict['album_info2'] = self._parse_tracks(soup)
            return album_dict
        except Exception:
            # Some pages fail (copyright block or page not found); per the
            # original contract return all-"null" values.  logging.exception
            # keeps the traceback (the old logging.info(e) dropped it), and
            # 'album_intro' is now included (the old fallback forgot it).
            logging.exception('failed to scrape %s', url)
            return {key: "null" for key in self._RESULT_KEYS}

    def _parse_without_singer(self, soup, album_dict):
        """Fill publication fields for an album page that lists no singer.

        The info block may have 3, 4 or 5 items depending on which of
        performer / release date / company / translated name / type exist.
        """
        info_all = soup.find_all('ul', class_='data_info__item')
        album_dict['singer'] = "null"
        album_dict['music_genre'] = "null"
        album_dict['language'] = "null"
        if len(info_all) == 4:
            album_dict['pub_time'] = info_all[1].get_text(strip=True)
            album_dict['pub_company'] = info_all[2].get_text(strip=True)
            album_dict['album_type'] = 'null'  # type item absent on these pages
        elif len(info_all) == 3:
            # Only release date, company and type are present.
            album_dict['pub_time'] = info_all[0].get_text(strip=True)
            album_dict['pub_company'] = info_all[1].get_text(strip=True)
            album_dict['album_type'] = info_all[2].get_text(strip=True)
        else:
            album_dict['pub_time'] = info_all[1].get_text(strip=True)
            album_dict['pub_company'] = info_all[2].get_text(strip=True)
            album_dict['album_type'] = info_all[4].get_text(strip=True)

    def _parse_with_singer(self, soup, album_dict):
        """Fill genre/language/publication fields when a singer is present.

        With 4 info items only one of company/type exists; they are told
        apart by the label text inside the item.
        """
        info_all = soup.find_all('li', class_='data_info__item_song')
        album_dict['music_genre'] = info_all[0].get_text(strip=True)
        album_dict['language'] = info_all[1].get_text(strip=True)
        album_dict['pub_time'] = info_all[2].get_text(strip=True)
        if len(info_all) == 4:
            unknown = info_all[3].get_text(strip=True)
            if '发行公司' in unknown:
                album_dict['pub_company'] = unknown
                album_dict['album_type'] = "无"
            if '类型' in unknown:
                album_dict['album_type'] = unknown
                album_dict['pub_company'] = "无"
        else:
            album_dict['pub_company'] = info_all[3].get_text(strip=True)
            album_dict['album_type'] = info_all[4].get_text(strip=True)

    @staticmethod
    def _parse_tracks(soup):
        """Return the track list as ``{'song', 'who_sing', 'time'}`` dicts."""
        songs = [i.get_text(strip=True)
                 for i in soup.find_all('span', class_='songlist__songname_txt')]
        artists = [i.get_text(strip=True)
                   for i in soup.find_all('div', class_='songlist__artist')]
        durations = [i.get_text(strip=True)
                     for i in soup.find_all('div', class_='songlist__time')]
        return [{'song': s, 'who_sing': a, 'time': t}
                for s, a, t in zip(songs, artists, durations)]