豆瓣音乐Top250爬取并写入数据库

最新推荐文章于 2023-09-01 21:30:00 发布

Last_xuan1

最新推荐文章于 2023-09-01 21:30:00 发布

阅读量1k

点赞数

分类专栏： # 普通爬虫

本文链接：https://blog.csdn.net/qq_43391383/article/details/86648023

版权

普通爬虫专栏收录该内容

18 篇文章 3 订阅

订阅专栏

先在谷歌浏览器的开发者工具中获取 Cookie 和 Host，让请求头更完整一些（更像真实浏览器发出的请求）。
本篇的新内容：把结果写入数据库（MongoDB），请求头的字段也比之前更多。

import os
import re
import time

import pymongo
import requests
from lxml import etree
from requests.exceptions import RequestException

# Results are written to MongoDB: collection `musicTop` inside database
# `mydb`, on the server at localhost:27017.
client = pymongo.MongoClient(host='localhost', port=27017)
mydb = client.get_database('mydb')
musicTop = mydb.get_collection('musicTop')

# Request headers that imitate a desktop Chrome session (Cookie, Host and
# User-Agent) instead of the bare defaults sent by `requests`.
#
# NOTE(review): the default Cookie below was captured from a real browser
# session and includes what look like session-identifying values (e.g.
# `dbcl2`, `ck`). It is kept only so the script behaves as before when no
# override is supplied; prefer exporting DOUBAN_COOKIE with a fresh value and
# consider removing the literal from source control.
_DEFAULT_COOKIE = '''bid=wYnIk_O7xiw; ap_v=0,6.0; _pk_ref.100001.afe6=%5B%22%22%2C%22%22%2C1548392486%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DmU93waKk3Ksu3N8cUepiXRKj8svhyQ1VR91gwgY1osd9fIl1PssUca3YEmwWPcyf%26wd%3D%26eqid%3D999cf844000c4ae2000000065c4a981f%22%5D; _pk_ses.100001.afe6=*; __utma=30149280.1390490439.1548046477.1548246212.1548392487.3; __utmc=30149280; __utmz=30149280.1548392487.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; dbcl2="138331257:+EQ1zSTvvuo"; ck=lJ_i; push_noty_num=0; push_doumail_num=0; __utmv=30149280.13833; ct=y; _pk_id.100001.afe6=9c1145ea0d7fdfe4.1548392486.1.1548392677.1548392486.; __utmb=30149280.5.10.1548392487'''

headers = {
    # The environment variable wins when set, so the cookie can be refreshed
    # without editing this file.
    'Cookie': os.environ.get('DOUBAN_COOKIE', _DEFAULT_COOKIE),
    'Host': 'music.douban.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}

def getHtml(url):
    """Download ``url`` and return the response body as text.

    The request is sent with the module-level ``headers`` and a 30 second
    timeout.

    Returns:
        The decoded body when the server answers with status 200; ``None``
        for any other status code or when ``requests`` raises a
        ``RequestException``.
    """
    try:
        response = requests.get(url, headers=headers, timeout=30)
        succeeded = response.status_code == 200
        return response.text if succeeded else None
    except RequestException:
        return None

def _first(matches, default='未知'):
    """Return the first item of ``matches``, or ``default`` when it is empty."""
    return matches[0] if matches else default


def getInfo(html):
    """Store every album linked from one Top250 list page in ``musicTop``.

    Album detail URLs are extracted from ``html``; each detail page is
    downloaded with ``getHtml`` and parsed, and one document per album is
    inserted into the module-level ``musicTop`` collection.

    Args:
        html: Text of a Top250 list page. ``None`` or an empty string (what
            ``getHtml`` yields for a failed download) is accepted and makes
            the function a no-op.

    Returns:
        None. The result is the documents inserted into ``musicTop``.

    Detail pages that cannot be downloaded, or that contain no album title,
    are skipped. Any other missing field is stored as '未知', matching the
    fallback the original code already used for the genre.
    """
    if not html:
        return

    entrance_urls = re.findall('a class="nbg" href="(.*?)"', html, re.S)
    for entrance_url in entrance_urls:
        # Go through getHtml() so detail requests get the same timeout,
        # status check and exception handling as list-page requests.
        detail = getHtml(entrance_url)
        if not detail:
            continue

        selector = etree.HTML(detail)
        name = (selector.xpath('//div[@id="wrapper"]/h1/span/text()')
                if selector is not None else [])
        if not name:
            # Without a title there is nothing meaningful to store.
            continue

        # The performer is extracted with a regex rather than XPath: the
        # XPath version ran into index errors because some pages contain an
        # extra <br/>. Note that the stricter pattern
        # '表演者:<.*?>(.*?)</a>' does not match these pages.
        author = re.findall('表演者:.*?>(.*?)</a>', detail, re.S)
        style = re.findall('流派:</span>&nbsp;(.*?)<br />', detail, re.S)
        releasetime = re.findall('发行时间:</span>&nbsp;(.*?)<br />', detail, re.S)
        score = re.findall('property="v:average">(.*?)</strong>', detail, re.S)

        # Any of these lists can be empty on a given page, so none of them
        # is indexed directly.
        data = {
            '歌名': name[0],
            '作者': _first(author),
            '流派': _first(style).strip(),
            '发行时间': _first(releasetime).strip(),
            '评分': _first(score),
        }
        musicTop.insert_one(data)

if __name__ == '__main__':
    # The chart is paginated 25 albums per page through the `start` offset:
    # 0, 25, ..., 225 covers all 250 entries.
    urls = ['https://music.douban.com/top250?start={0}'.format(i) for i in range(0, 250, 25)]
    for url in urls:
        html = getHtml(url)
        if html is None:
            # getHtml() returns None for a failed download; skip this page
            # instead of letting one bad response abort the whole crawl.
            print('skipped (download failed): {0}'.format(url))
        else:
            getInfo(html)
        # Short pause between list pages.
        time.sleep(0.5)

Last_xuan1

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
豆瓣音乐Top250爬取并写入数据库

先在谷歌浏览器获取cookie和post，让请求头nb一点from requests.exceptions import RequestExceptionfrom lxml import etreeimport requestsimport pymongoimport timeimport re#写入mongoDB数据库client = pymongo.MongoClient('...
复制链接

扫一扫

专栏目录