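# First copy the Cookie (and POST data) from Chrome's developer tools so that the request headers look more like a real browser's.
# New in this script: a MongoDB database, and a larger set of request headers.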
from requests.exceptions import RequestException
from lxml import etree
import requests
import pymongo
import time
import re
# Scraped records are written to MongoDB: server at localhost:27017,
# database `mydb`, collection `musicTop`.
client = pymongo.MongoClient(host='localhost', port=27017)
mydb = client.mydb
musicTop = mydb.musicTop
# A fuller, more browser-like set of request headers.
# NOTE(review): the cookie below looks like a captured browser session
# (copied from Chrome, per the notes at the top of the file) -- confirm it is
# safe to keep in source control and expect it to go stale over time.
_COOKIE = '''bid=wYnIk_O7xiw; ap_v=0,6.0; _pk_ref.100001.afe6=%5B%22%22%2C%22%22%2C1548392486%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DmU93waKk3Ksu3N8cUepiXRKj8svhyQ1VR91gwgY1osd9fIl1PssUca3YEmwWPcyf%26wd%3D%26eqid%3D999cf844000c4ae2000000065c4a981f%22%5D; _pk_ses.100001.afe6=*; __utma=30149280.1390490439.1548046477.1548246212.1548392487.3; __utmc=30149280; __utmz=30149280.1548392487.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; dbcl2="138331257:+EQ1zSTvvuo"; ck=lJ_i; push_noty_num=0; push_doumail_num=0; __utmv=30149280.13833; ct=y; _pk_id.100001.afe6=9c1145ea0d7fdfe4.1548392486.1.1548392677.1548392486.; __utmb=30149280.5.10.1548392487'''

_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
)

headers = {
    'Cookie': _COOKIE,
    'Host': 'music.douban.com',
    'User-Agent': _USER_AGENT,
}
def getHtml(url):
    """Fetch *url* with the module-level ``headers`` and return the page text.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` for any other status code or when ``requests`` raises a
        ``RequestException``.
    """
    try:
        response = requests.get(url, headers=headers, timeout=30)
        # Any status other than 200 is treated the same as a failed request.
        return response.text if response.status_code == 200 else None
    except RequestException:
        return None
# Patterns are compiled once instead of being looked up for every album page.
_ENTRANCE_URL_RE = re.compile('a class="nbg" href="(.*?)"', re.S)
# Per the original author's notes: XPath on the "info" block raised
# index-out-of-range errors because some pages contain an extra <br/>, so the
# performer is extracted with a regex instead.  The looser '.*?>' is
# deliberate -- '表演者:<.*?>(.*?)</a>' does not match.
_AUTHOR_RE = re.compile('表演者:.*?>(.*?)</a>', re.S)
_STYLE_RE = re.compile('流派:</span> (.*?)<br />', re.S)
_RELEASE_TIME_RE = re.compile('发行时间:</span> (.*?)<br />', re.S)
_SCORE_RE = re.compile('property="v:average">(.*?)</strong>', re.S)

# Placeholder stored when a field cannot be found on the album page.
_UNKNOWN = '未知'


def _first_or_unknown(matches, strip=False):
    """Return the first item of *matches* (optionally stripped), or the placeholder."""
    if not matches:
        return _UNKNOWN
    value = matches[0]
    return value.strip() if strip else value


def getInfo(html):
    """Scrape every album linked from a chart page and store it in MongoDB.

    For each ``a class="nbg"`` link found in *html*, the album's detail page
    is downloaded with ``getHtml`` and one document (name, performer, genre,
    release time, score) is inserted into the ``musicTop`` collection.

    Args:
        html: Text of a chart page.  ``None`` or an empty string (e.g. a
            failed download) is accepted and results in no work.

    Returns:
        None.  Detail pages that cannot be downloaded are skipped; fields
        missing from a page are stored as ``'未知'`` instead of raising
        ``IndexError``.
    """
    if not html:
        return

    for entrance_url in _ENTRANCE_URL_RE.findall(html):
        # Reuse getHtml so detail requests get the same timeout, status check
        # and exception handling as chart-page requests.
        page = getHtml(entrance_url)
        if not page:
            continue

        selector = etree.HTML(page)
        if selector is None:
            names = []
        else:
            names = selector.xpath('//div[@id="wrapper"]/h1/span/text()')

        data = {
            '歌名': _first_or_unknown(names),
            '作者': _first_or_unknown(_AUTHOR_RE.findall(page)),
            '流派': _first_or_unknown(_STYLE_RE.findall(page), strip=True),
            '发行时间': _first_or_unknown(
                _RELEASE_TIME_RE.findall(page), strip=True),
            '评分': _first_or_unknown(_SCORE_RE.findall(page)),
        }
        musicTop.insert_one(data)
if __name__ == '__main__':
    # The Top 250 chart is paginated 25 albums per page: start=0, 25, ..., 225.
    urls = ['https://music.douban.com/top250?start={0}'.format(i) for i in range(0, 250, 25)]
    for url in urls:
        html = getHtml(url)
        if html is None:
            # getHtml returns None on a failed download; skip this page
            # instead of handing None to the parser and aborting the crawl.
            print('Failed to fetch {0}; skipping'.format(url))
        else:
            getInfo(html)
        # Pause between chart pages whether or not the fetch succeeded.
        time.sleep(0.5)