Scraping Douban Music with Python, requests, and Beautiful Soup: A Walkthrough

This post shows how to scrape album information from Douban Music with Python's requests and BeautifulSoup libraries: randomly pick a user-agent, prompt for a search tag and a page count, build the listing URLs, extract the album links with a regular expression, parse each album page for the title, artist, rating and other key details, and finally save the data to a file.
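Every step of the script follows the same basic pattern: send a GET request with a browser-like User-Agent header and parse the returned HTML with BeautifulSoup. A minimal sketch of that pattern (the tag URL is the one the script uses; the rest is illustrative):

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'}
resp = requests.get('https://music.douban.com/tag/', headers=headers)
if resp.status_code == 200:
    soup = BeautifulSoup(resp.content, 'lxml')
    print(soup.title.text if soup.title else 'no <title> found')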

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Import the required libraries
import requests
import re
import json
import time
import random
from bs4 import BeautifulSoup as bs

# Randomly pick a user-agent so requests don't all look identical
def process_request():
    USER_AGENTS = [
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
        'Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)',
        'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)',
        'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)',
        'Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0',
        'Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20',
        'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
    ]
    user_header = random.choice(USER_AGENTS)
    return user_header
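A quick sanity check of the helper (the pick is random, so the output varies per call):

print(process_request())  # prints one of the strings in USER_AGENTS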

# Prompt for the number of pages and the genre/tag to fetch
def main():
    user_header = process_request()
    search_keywords = input('please input your search_keywords:')
    user_in_nub = input('please input your pages:')
    for i in generate_allurl(user_in_nub, search_keywords, user_header):
        ls_url = get_allurl(i, user_header)
        for j in ls_url:
            info_text = open_url(j, user_header)
            if info_text:  # skip album pages that failed to load
                writer_to_text(info_text)

# Build the listing URLs for the chosen tag
def generate_allurl(user_in_nub, search_keywords, user_header):
    # Front page listing all available tags
    url1 = 'https://music.douban.com/tag/'
    header = {
        'Host': 'music.douban.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': user_header
    }
    time.sleep(5)  # be polite: pause before each request
    get_url = requests.get(url1, headers=header)
    soup = bs(get_url.content, 'lxml')
    # Collect every tag name from the tag tables (kept for reference, e.g. to
    # check that search_keywords is a valid tag)
    info = {}
    for i in range(len(soup.select('table.tagCol'))):
        x = soup.select('table.tagCol')[i]
        for q in range(len(x.select('tbody tr td a'))):
            lsdirx = str(i) + str(q)
            info[lsdirx] = x.select('tbody tr td a')[q].text
    a = list(info.values())
    # Listing pages for the chosen tag, 20 albums per page, e.g.
    # 'https://music.douban.com/tag/OST?start={}&type=T'
    url = 'https://music.douban.com/tag/' + search_keywords + '?start={}&type=T'
    next_page = int(user_in_nub) * 20
    for url_next in range(0, next_page, 20):
        yield url.format(url_next)
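For example, with the hypothetical inputs search_keywords = 'OST' and user_in_nub = '3', the generator yields three listing URLs:

https://music.douban.com/tag/OST?start=0&type=T
https://music.douban.com/tag/OST?start=20&type=T
https://music.douban.com/tag/OST?start=40&type=T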

# Extract the album detail-page links from a listing page with a regex
def get_allurl(generate_allurl, user_header):
    header = {
        'Host': 'music.douban.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': user_header
    }
    time.sleep(5)
    get_url = requests.get(generate_allurl, headers=header)
    if get_url.status_code == 200:
        # NOTE: the original pattern was garbled in the source; this is a
        # reconstruction that matches the /subject/ links on a tag listing page
        re_set = re.compile(r'href="(https://music\.douban\.com/subject/\d+/)"')
        re_get = re.findall(re_set, get_url.text)
        # Each album is usually linked twice (cover and title), so deduplicate
        # while preserving order
        return list(dict.fromkeys(re_get))
    return []  # failed request: return an empty list so main() can keep going
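Since the original regex was lost in the source, the reconstruction above simply captures every /subject/ link. A listing page typically links each album from both its cover and its title, which is why the helper deduplicates; an illustrative fragment:

import re

html = ('<a class="nbg" href="https://music.douban.com/subject/1234/">cover</a>'
        '<a href="https://music.douban.com/subject/1234/">Some Album</a>')
links = re.findall(r'href="(https://music\.douban\.com/subject/\d+/)"', html)
print(links)                       # the same URL twice
print(list(dict.fromkeys(links)))  # deduplicated, order preserved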

# Fetch one album page and pull out the fields we want
def open_url(re_get, user_header):
    header = {
        'Host': 'music.douban.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': user_header
    }
    time.sleep(5)
    res = requests.get(re_get, headers=header)
    if res.status_code == 200:
        info = {'title': '', 'other info': '', 'rating': '', 'rating count': '',
                'common tags': ''}
        soup = bs(res.text, 'lxml')
        if soup.select('#wrapper h1 span'):
            info['title'] = str(soup.select('#wrapper h1 span')[0].text)
        if soup.select('div#info'):
            info['other info'] = str(soup.select('div#info')[0].text).split('\n')
        if soup.select('strong.ll.rating_num'):
            info['rating'] = str(soup.select('strong.ll.rating_num')[0].text)
        if soup.select('.rating_people span'):
            info['rating count'] = str(soup.select('.rating_people span')[0].text)
        if soup.select('.tags-body'):
            info['common tags'] = soup.select('.tags-body')[0].text.split('\n')
        # select() always returns a list, so iterate directly instead of the
        # original "!= None" checks (which were always true)
        artists = soup.select('div#db-rec-artist-section div.content.clearfix dl.subject-rec-list dd a')
        for j, a in enumerate(artists):
            info['recommended artist ' + str(j)] = a.text
        albums = soup.select('div#db-rec-section div dl dd a')
        for j, a in enumerate(albums):
            info['recommended album ' + str(j)] = a.text
        return info
    return None  # caller skips pages that failed to load
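One detail worth noting: BeautifulSoup's select() always returns a list (an empty one when nothing matches), never None, so truthiness checks are the right guard and the original '!= None' comparisons were always true. For example:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<div id="info">hello</div>', 'lxml')
print(soup.select('#missing'))        # [] -- empty list, not None
print(bool(soup.select('div#info')))  # True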

def writer_to_text(text):  # append one record to the output file
    # One JSON object per line, keeping Chinese text readable (ensure_ascii=False);
    # no explicit close() needed, the with-block handles it
    with open('douban_music.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(text, ensure_ascii=False) + '\n')

if __name__ == '__main__':
    main()
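Because writer_to_text appends one JSON object per line, the output file is effectively JSON Lines; reading it back for later analysis is straightforward:

import json

with open('douban_music.txt', encoding='utf-8') as f:
    records = [json.loads(line) for line in f if line.strip()]
print(len(records), 'records loaded')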
