selenium爬取淘宝2024最新款上衣前两页数据进行可视化分析

本文链接：https://blog.csdn.net/m0_73639297/article/details/137999111

本文介绍了如何使用Python的Selenium库，并通过ChromeOptions配置规避反爬虫检测，抓取京东商品的前40页评论，将结果保存为CSV文件后再用词云进行可视化分析。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from selenium.webdriver import ActionChains
import gc
import csv
from matplotlib import rcParams  ## run command settings for plotting

# Matplotlib defaults for every figure drawn by this script. SimHei is listed
# as the serif typeface (presumably so Chinese text renders -- confirm the
# font is installed on the target machine).
config = {
    'mathtext.fontset': 'stix',
    'font.family': 'serif',
    'font.serif': ['SimHei'],
    'font.size': 10,  # base font size
    'axes.unicode_minus': False,  # minus-sign handling on the axes
}
rcParams.update(config)  # apply the plotting parameters
# Build the Chrome session with the automation hints and the save-password
# prompt switched off.
options = webdriver.ChromeOptions()
# Preferences that disable the credential service / password manager prompt.
prefs = {
    'credentials_enable_service': False,
    'profile.password_manager_enabled': False,
}
experimental_settings = {
    'excludeSwitches': ['enable-automation'],
    'prefs': prefs,
}
for setting_name, setting_value in experimental_settings.items():
    options.add_experimental_option(setting_name, setting_value)
# Anti-bot-detection tweak: disable the AutomationControlled Blink feature.
options.add_argument('--disable-blink-features=AutomationControlled')
web = webdriver.Chrome(options=options)

# Log in to JD, search for the book and open the product's comment page.
import os

# Credentials can be supplied through the environment so that real account
# details never have to be written into the script. The fallbacks are the
# original placeholder values, so behaviour is unchanged when the variables
# are not set.
jd_username = os.environ.get('JD_USERNAME', 'jd_FVzgxqKJDpIL')
jd_password = os.environ.get('JD_PASSWORD', 'pass-word')
search_keyword = '数据结构与算法：Python语言实现'

# The implicit wait is a session-wide setting: configure it once, before the
# first element lookup, instead of repeating the call between steps.
web.implicitly_wait(10)

web.get('https://passport.jd.com/new/login.aspx?ReturnUrl=https%3A%2F%2Fwww.jd.com%2F')
web.maximize_window()

# Fill in and submit the login form.
web.find_element('id', 'loginname').send_keys(jd_username)
web.find_element('css selector', '#nloginpwd').send_keys(jd_password)
web.find_element('id', 'loginsubmit').click()
time.sleep(2)  # fixed pause after submitting the login form

# Type the keyword into the search box and submit the search.
web.find_element('css selector', '#key').send_keys(search_keyword)
web.find_element('css selector', '#search > div > div.form > button').click()

# Follow the link held by the element with id "J_comment_12425597"
# (presumably the comment link of one specific product -- confirm the id).
comments = web.find_element('css selector', '#J_comment_12425597')
lianjie = comments.get_attribute('href')
print(lianjie)
web.get(lianjie)
# Scrape the comments of the first MAX_PAGES comment pages into data_list.
from selenium.common.exceptions import NoSuchElementException

MAX_PAGES = 40          # number of comment pages to scrape
COMMENTS_PER_PAGE = 10  # only the first 10 child <div>s of a page are read
NEXT_PAGE_SELECTOR = '#comment-0 > div.com-table-footer > div > div > a.ui-pager-next'

data_list = []
# Session-wide setting, so one call before the loop is enough.
web.implicitly_wait(10)

for page_number in range(1, MAX_PAGES + 1):
    page_rows = web.find_elements('css selector', '#comment-0 > div')
    # Slice instead of counting and skipping: rows past the limit are never
    # touched.
    for comment in page_rows[:COMMENTS_PER_PAGE]:
        commentor = comment.find_element('css selector', '.comment-item .user-info')
        information = comment.find_element('tag name', 'p')
        info = comment.find_element('css selector', '.comment-item .order-info')
        # One record per comment, keyed by the CSV column names used later.
        dit = {
            '评论者': commentor.text,
            '评论内容': information.text,
            '时间': info.text,
        }
        print(dit)
        data_list.append(dit)

    if page_number == MAX_PAGES:
        # Last requested page has been scraped; no need to page forward.
        break

    try:
        web.find_element('css selector', NEXT_PAGE_SELECTOR).click()
    except NoSuchElementException:
        # No "next page" link: stop paging but keep everything collected so
        # far, so the script can still save it instead of crashing here.
        print('没有下一页，停止翻页')
        break

    print('进入下一页')
    time.sleep(1)  # short pause before reading the next page
# Save the collected comment dictionaries to a CSV file.
# NOTE(review): the file name has no ".csv" extension; the word-cloud step
# reads the same name, so the two must be changed together if renamed.
csv_file = 'comments2_csv'
csv_headers = ['评论者', '评论内容', '时间']
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=csv_headers)
    writer.writeheader()
    # Write every collected record in one call.
    writer.writerows(data_list)
print('csv文件已保存')
# Visualise the saved comments as a word cloud.
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

# Read the CSV back, skip the header row and flatten every remaining row into
# one string: cells are joined with spaces and each row ends with a space.
with open('comments2_csv', 'r', encoding='utf-8') as file:
    reader = csv.reader(file)
    next(reader)  # skip the header row
    text = ''.join(' '.join(row) + ' ' for row in reader)

# Configure the cloud (font file, canvas size, largest font size), then
# generate it from the flattened text.
cloud_builder = WordCloud(
    font_path='simsun.ttc',
    background_color='white',
    width=800,
    height=800,
    max_font_size=100,
)
wordcloud = cloud_builder.generate(text)

# Show the word cloud on a 10x10 figure with the axes hidden.
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

gc.collect()

爬取的数据：