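# QQ Music comments are scraped here for text analysis, so they were labelled with SnowNLP; the labelling step is commented out in the code below — uncomment it yourself if you need it.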
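# Requires Selenium WebDriver; the Chrome browser is used here, so install a matching ChromeDriver first (refer to the ChromeDriver installation instructions).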
import pandas as pd
from snownlp import SnowNLP
# Module-level accumulator: get_data() appends every non-empty comment string
# it scrapes to this list, and the script at the bottom of the file exports it.
comment_text = []
from selenium import webdriver
from icecream import ic
import time
import csv
# Starts a Chrome session as soon as the module is executed.
# NOTE(review): get_data() assigns its own local `driver`, so this module-level
# instance appears to be unused in the visible code and is never quit —
# confirm whether it can be removed.
driver = webdriver.Chrome()
def get_data(music_url, scroll_times=100, scroll_pause=1.5):
    """Scrape the comment texts shown on a QQ Music page.

    Opens ``music_url`` in a fresh Chrome session, jumps to the bottom of the
    page ``scroll_times`` times (presumably so the page keeps loading more
    comments), and then reads the text of every ``p.comment__text`` element.

    Args:
        music_url: Address of the page to load.
        scroll_times: Number of scroll-to-bottom steps to perform.
        scroll_pause: Seconds to sleep after each scroll step.

    Returns:
        The list of non-empty comment strings found on this page. The same
        strings are also appended to the module-level ``comment_text`` list,
        which is what the rest of the script reads.
    """
    # Imported locally so this function does not rely on a new file-level import.
    from selenium.webdriver.common.by import By

    driver = webdriver.Chrome()
    try:
        driver.get(music_url)
        driver.implicitly_wait(10)
        scroll_to_bottom = 'document.documentElement.scrollTop=10000000'
        for i in range(scroll_times):
            driver.execute_script(scroll_to_bottom)
            time.sleep(scroll_pause)
            print(i)  # progress indicator
        print("滚动条已经处于页面最下方!")
        # find_elements_by_css_selector() was removed in Selenium 4.3;
        # find_elements(By.CSS_SELECTOR, ...) works on old and new releases.
        elements = driver.find_elements(By.CSS_SELECTOR, 'p.comment__text')
        # Read .text once per element: every access is a call to the browser.
        texts = [element.text for element in elements]
    finally:
        # Always close the browser, even if loading or scraping failed.
        driver.quit()

    found = [text for text in texts if text != ""]
    comment_text.extend(found)
    return found
# QQ Music top-list pages that can be scraped, one URL per entry.
url_all = [
    'https://y.qq.com/n/ryqq/toplist/62',
    'https://y.qq.com/n/ryqq/toplist/26',
    'https://y.qq.com/n/ryqq/toplist/27',
    'https://y.qq.com/n/ryqq/toplist/4',
    'https://y.qq.com/n/ryqq/toplist/57',
    'https://y.qq.com/n/ryqq/toplist/5',
    'https://y.qq.com/n/ryqq/toplist/3',
    'https://y.qq.com/n/ryqq/toplist/16',
    'https://y.qq.com/n/ryqq/toplist/17',
    'https://y.qq.com/n/ryqq/toplist/28',
    'https://y.qq.com/n/ryqq/toplist/108',
    'https://y.qq.com/n/ryqq/toplist/129',
    'https://y.qq.com/n/ryqq/toplist/107',
    'https://y.qq.com/n/ryqq/toplist/105',
    'https://y.qq.com/n/ryqq/toplist/58',
]

# One list is scraped per run: index 12 is the 13th entry, which lines up with
# the "13" in the output file name below.
get_data(url_all[12])

# Export everything gathered in comment_text as a single "comment" column.
comment_text_tag = pd.DataFrame({"comment": comment_text})
comment_text_tag.to_csv("./comment_text_tag13.csv")