# -*- coding: utf-8 -*-
'''
Created on 2018-08-14

@author: zww
'''
import random
import re
import time

import pandas as pd
import requests
from lxml import etree
15
# Module-level accumulators, one entry per scraped comment. They are kept as
# parallel lists so they can be turned into DataFrame columns afterwards.
# NOTE(review): userid_list is not written to anywhere in this section.
username_list = []
score_list = []
date_list = []
like_list = []
content_list = []
userid_list = []
19
# Maps the rating title shown on the page to a star string; any other
# non-empty title is rendered as a single star.
_SCORE_STARS = {
    '力荐': '★★★★★',
    '推荐': '★★★★',
    '还行': '★★★',
    '较差': '★★',
}


def _first_text(nodes):
    """Return the first XPath result as ``str``, or ``''`` when there is none."""
    return str(nodes[0]) if nodes else ''


def get_content(musicId, currentPage):
    """Fetch one page of hot comments for a Douban music subject and record it.

    Every comment found on the page is appended to the module-level lists
    ``username_list``, ``score_list``, ``date_list``, ``like_list`` and
    ``content_list`` (one entry in each list per comment, so they stay
    aligned). When ``currentPage`` is 1 the global ``SongName`` is set to the
    text nodes of the page's ``<h1>`` heading.

    Args:
        musicId: Douban subject id inserted into the comments URL.
        currentPage: Page number passed as the ``p`` query parameter.

    Returns:
        None. A page that cannot be fetched or parsed is reported on stdout
        and skipped instead of raising.
    """
    global SongName

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
    cookies = {'cookies': '你的cookie'}

    url = ''.join(['https://music.douban.com/subject/',
                   str(musicId), '/comments/hot?p=', str(currentPage)])

    if currentPage == 1:
        # Make sure the global exists even if the first page cannot be
        # fetched, so later readers of SongName do not hit a NameError.
        SongName = []

    try:
        # A timeout keeps a stalled connection from hanging the whole run.
        res = requests.get(url, headers=headers, cookies=cookies, timeout=10)
    except requests.RequestException:
        print('\n o(╯□╰)o第{}页的数据爬取失败'.format(currentPage))
        print(url)
        return
    res.encoding = "utf-8"

    if res.status_code != 200:
        print('\n o(╯□╰)o第{}页的数据爬取失败'.format(currentPage))
        print(url)
        # An error page holds no comments; there is nothing to parse.
        return
    print('\n第{}页的数据爬取成功'.format(currentPage))
    print(url)

    x = etree.HTML(res.text)
    if x is None:
        # Nothing parseable came back for this page.
        return

    # The song title only needs to be fetched once.
    if currentPage == 1:
        SongName = x.xpath('//*[@id="content"]/h1/text()')

    for j in range(1, 21):  # Douban shows at most 20 comments per page
        item = '//*[@id="comments"]/ul/li[{}]/div[2]'.format(j)

        # User name
        user_name = x.xpath(item + '/h3/span[2]/a/text()')
        if not user_name:
            # No comment at this position (e.g. a short last page).
            continue

        # Rating
        score = x.xpath(item + '/h3/span[2]/span[1]/@title')
        # Date: the XPath differs slightly when the comment has no rating.
        if score:
            date = x.xpath(item + '/h3/span[2]/span[2]/text()')
        else:
            date = x.xpath(item + '/h3/span[2]/span/text()')
        # Number of likes
        like = x.xpath(item + '/h3/span[1]/span/text()')
        # Comment text
        content = x.xpath(item + '/p/span/text()')

        username_list.append(str(user_name[0]).strip())
        if score:
            # Show the Chinese rating label as stars.
            score_list.append(_SCORE_STARS.get(score[0], '★'))
        else:
            # Some users wrote a comment without giving a rating.
            score_list.append('暂无评分')
        # Missing fields become '' so all lists keep the same length.
        date_list.append(_first_text(date))
        like_list.append(_first_text(like))
        content_list.append(_first_text(content).strip())
85
def main(musicId, scrapyPage):
    """Scrape comment pages for one Douban music subject and save them as CSV.

    Calls ``get_content`` for pages ``1..scrapyPage``, builds a DataFrame from
    the module-level result lists and writes it to ``<song name>.csv`` in the
    current directory. If no song name is available the subject id is used as
    the file name instead.

    Args:
        musicId: Douban subject id of the song/album to scrape.
        scrapyPage: Number of comment pages to fetch.
    """
    for i in range(1, scrapyPage + 1):
        get_content(musicId, i)
        # Wait a random 1-2 seconds between requests to avoid an IP ban.
        time.sleep(round(random.uniform(1, 2), 2))

    infos = {'username': username_list, 'score': score_list,
             'content': content_list, 'date': date_list, 'like': like_list}
    data = pd.DataFrame(
        infos, columns=['username', 'score', 'content', 'date', 'like'])

    # SongName only exists once page 1 has been processed, and it may be an
    # empty list when the heading was not found; fall back to the subject id
    # rather than raising NameError or writing a file literally named ".csv".
    songName = ''.join(globals().get('SongName') or []) or str(musicId)
    data.to_csv(songName + ".csv")  # stored as "<song name>.csv"
    print('scrapy done!')
100
if __name__ == '__main__':
    # 3040149 is the ID of the song whose comments are scraped;
    # 100 is the number of comment pages to fetch.
    main(musicId=3040149, scrapyPage=100)