python爬取豆瓣评论_python+requests爬取豆瓣歌曲评论

最新推荐文章于 2020-12-17 02:54:56 发布

weixin_39605326

最新推荐文章于 2020-12-17 02:54:56 发布

阅读量136

点赞数

文章标签： python爬取豆瓣评论

# -*- coding: utf-8 -*-
'''
Created on 2018年8月14日

@author: zww

'''

8 importtime9 importre10 importrandom11 importrequests12 from lxml importetree13 importpandas as pd14

# Module-level column buffers for the scraped comments. Each list holds one
# field of every comment, so index i across the lists describes one comment.
username_list = []
score_list = []
date_list = []
like_list = []
content_list = []
# Declared alongside the others; nothing in the visible code appends to it.
userid_list = []

# Maps the textual rating found in the rating span's @title to a star string.
# Any other non-empty rating is rendered as a single star.
_SCORE_STARS = {
    '力荐': '★★★★★',
    '推荐': '★★★★',
    '还行': '★★★',
    '较差': '★★',
}


def _first(nodes, default=''):
    """Return the first XPath result as a string, or *default* when empty."""
    return str(nodes[0]) if nodes else default


def get_content(musicId, currentPage):
    """Fetch one page of hot comments for a Douban music subject.

    For every comment found on the page, one entry is appended to each of the
    module-level lists ``username_list``, ``score_list``, ``date_list``,
    ``like_list`` and ``content_list``, keeping them aligned by index.
    When ``currentPage`` is 1, the page heading is also stored in the global
    ``SongName`` (as the list returned by XPath).

    Args:
        musicId: Subject ID used to build the comment URL.
        currentPage: 1-based page number placed in the ``p`` query parameter.

    Returns:
        None. If the request fails or the status code is not 200, a failure
        message is printed and nothing is appended.
    """
    global SongName

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
    # Placeholder value: replace with a real cookie before running.
    cookies = {'cookies': '你的cookie'}

    url = ''.join(['https://music.douban.com/subject/',
                   str(musicId), '/comments/hot?p=', str(currentPage)])

    try:
        # A timeout keeps a stalled connection from blocking the whole crawl.
        res = requests.get(url, headers=headers, cookies=cookies, timeout=10)
    except requests.RequestException as err:
        print('\n o(╯□╰)o第{}页的数据爬取失败'.format(currentPage))
        print(url)
        print(err)
        return
    res.encoding = "utf-8"

    if res.status_code != 200:
        print('\n o(╯□╰)o第{}页的数据爬取失败'.format(currentPage))
        print(url)
        # Do not parse an error page: it carries no comment rows.
        return
    print('\n第{}页的数据爬取成功'.format(currentPage))
    print(url)

    x = etree.HTML(res.text)

    # The song title only needs to be read once, from the first page.
    if currentPage == 1:
        SongName = x.xpath('//*[@id="content"]/h1/text()')

    # Iterate over the comment items actually present instead of assuming
    # exactly 20 per page, so a short (e.g. last) page does not raise.
    for item in x.xpath('//*[@id="comments"]/ul/li'):
        user_name = item.xpath('./div[2]/h3/span[2]/a/text()')
        score = item.xpath('./div[2]/h3/span[2]/span[1]/@title')
        # Without a rating the date is found under a different span path.
        if score:
            date = item.xpath('./div[2]/h3/span[2]/span[2]/text()')
        else:
            date = item.xpath('./div[2]/h3/span[2]/span/text()')
        # Number of likes.
        like = item.xpath('./div[2]/h3/span[1]/span/text()')
        # Comment text.
        content = item.xpath('./div[2]/p/span/text()')

        username_list.append(_first(user_name).strip())
        if score:
            # Show the textual rating as stars.
            score_list.append(_SCORE_STARS.get(score[0], '★'))
        else:
            # Some users leave a comment without giving a rating.
            score_list.append('暂无评分')
        date_list.append(_first(date))
        like_list.append(_first(like))
        content_list.append(_first(content).strip())

def main(musicId, scrapyPage):
    """Crawl comment pages for one subject and save them to a CSV file.

    Calls ``get_content`` for pages 1..``scrapyPage``, sleeping a random
    1-2 seconds after each page, then writes the module-level result lists to
    ``<song name>.csv`` in the current directory.

    Args:
        musicId: Subject ID passed through to ``get_content``.
        scrapyPage: Number of comment pages to crawl, starting at page 1.
    """
    for i in range(1, scrapyPage + 1):
        get_content(musicId, i)
        # Wait a random interval between requests to avoid an IP ban.
        time.sleep(round(random.uniform(1, 2), 2))

    infos = {'username': username_list, 'score': score_list,
             'content': content_list, 'date': date_list, 'like': like_list}
    data = pd.DataFrame(
        infos, columns=['username', 'score', 'content', 'date', 'like'])

    # SongName is only assigned when page 1 was fetched; read it through
    # globals() so a failed first page does not end in a NameError.
    songName = ''.join(globals().get('SongName') or [])
    if not songName.strip():
        # No title available: fall back to the subject ID for the file name.
        songName = str(musicId)
    # A path separator in the title would point to_csv at another directory.
    songName = songName.replace('/', '_').replace('\\', '_')
    data.to_csv(songName + ".csv")  # stored as "<song name>.csv"

    print('scrapy done!')

if __name__ == '__main__':
    # Arguments: subject ID of the song, then the number of comment pages to crawl.
    main(3040149, 100)

weixin_39605326

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python爬取豆瓣评论_python+requests爬取豆瓣歌曲评论

1 #-*- coding: utf-8 -*-2 '''3 Created on 2018年8月14日45 @author: zww67 '''8 importtime9 importre10 importrandom11 importrequests12 from lxml importetree13 importpandas as pd141516 username_list, score_...
复制链接

扫一扫