豆瓣音乐Top250爬取并写入数据库

先在谷歌浏览器的开发者工具中获取 Cookie 和 Host,让请求头更完整、更接近真实浏览器
新知识点:使用数据库(MongoDB)存储结果,请求头字段增多

from requests.exceptions import RequestException
from lxml import etree
import requests
import pymongo
import time
import re

# MongoDB target for the scraped records: collection "musicTop" inside
# database "mydb" on the local server (localhost:27017).
client = pymongo.MongoClient(host='localhost', port=27017)
mydb = client.mydb
musicTop = mydb.musicTop

# Request headers sent with every HTTP request made by this script:
# a Cookie string, the target Host, and a desktop Chrome User-Agent.
# NOTE(review): the Cookie value is hard-coded and looks like a captured
# browser session — presumably it expires and should not live in source
# control; confirm and consider loading it from configuration instead.
headers = {
    'Cookie': '''bid=wYnIk_O7xiw; ap_v=0,6.0; _pk_ref.100001.afe6=%5B%22%22%2C%22%22%2C1548392486%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DmU93waKk3Ksu3N8cUepiXRKj8svhyQ1VR91gwgY1osd9fIl1PssUca3YEmwWPcyf%26wd%3D%26eqid%3D999cf844000c4ae2000000065c4a981f%22%5D; _pk_ses.100001.afe6=*; __utma=30149280.1390490439.1548046477.1548246212.1548392487.3; __utmc=30149280; __utmz=30149280.1548392487.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; dbcl2="138331257:+EQ1zSTvvuo"; ck=lJ_i; push_noty_num=0; push_doumail_num=0; __utmv=30149280.13833; ct=y; _pk_id.100001.afe6=9c1145ea0d7fdfe4.1548392486.1.1548392677.1548392486.; __utmb=30149280.5.10.1548392487''',
    'Host': 'music.douban.com',
    # Two adjacent string literals: Python joins them into one User-Agent value.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}

def getHtml(url):
    """Download *url* with the module-level ``headers`` and return its text.

    The request uses a 30-second timeout. ``None`` is returned when the
    response status is not 200 or when ``requests`` raises a
    ``RequestException``.
    """
    try:
        response = requests.get(url, timeout=30, headers=headers)
        page = response.text if response.status_code == 200 else None
    except RequestException:
        page = None
    return page

def _first_match(pattern, text, default='未知'):
    """Return the first capture of *pattern* in *text*, or *default* if none."""
    found = re.findall(pattern, text, re.S)
    return found[0] if found else default


def getInfo(html):
    """Scrape every album linked from one Top250 list page into MongoDB.

    For each album link found in *html*, the detail page is downloaded with
    ``getHtml`` and one document (title, performer, genre, release date,
    rating) is inserted into the ``musicTop`` collection.

    Args:
        html: Text of a Top250 list page. ``None`` or an empty string (the
            result of a failed download) is accepted and ignored.

    Returns:
        None. The only effect is the inserted documents.
    """
    # A failed list-page download yields None; there is nothing to parse.
    if not html:
        return

    entrance_urls = re.findall('a class="nbg" href="(.*?)"', html, re.S)
    for entrance_url in entrance_urls:
        # Reuse getHtml so detail requests get the same timeout, status check
        # and exception handling as list-page requests.
        detail = getHtml(entrance_url)
        if not detail:
            continue

        names = etree.HTML(detail).xpath('//div[@id="wrapper"]/h1/span/text()')
        if not names:
            # No title means this is not a usable album page; skip it rather
            # than store a record made only of placeholders.
            continue

        # The performer is extracted with a regex rather than XPath: some
        # pages contain an extra <br/>, which made the XPath index go out of
        # range. Note that '表演者:<.*?>(.*?)</a>' does NOT work here.
        # Any field that is absent from the page falls back to '未知'
        # instead of raising IndexError.
        data = {
            '歌名': names[0],
            '作者': _first_match('表演者:.*?>(.*?)</a>', detail),
            '流派': _first_match('流派:</span>&nbsp;(.*?)<br />', detail).strip(),
            '发行时间': _first_match('发行时间:</span>&nbsp;(.*?)<br />', detail).strip(),
            '评分': _first_match('property="v:average">(.*?)</strong>', detail),
        }
        musicTop.insert_one(data)

if __name__ == '__main__':
    # Walk the ten Top250 list pages; the ``start`` offset steps by 25.
    for start in range(0, 250, 25):
        url = 'https://music.douban.com/top250?start={0}'.format(start)
        html = getHtml(url)
        if html is None:
            # getHtml returns None on a non-200 status or a request error.
            # Skip this page instead of letting one failure abort the crawl.
            print('Skipping {0}: list page could not be fetched'.format(url))
        else:
            getInfo(html)
        # Short pause between list pages to avoid hammering the server.
        time.sleep(0.5)
        
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值