Scraping Amazon Product Reviews with Selenium

Amazon review pages have anti-scraping measures, so I scraped them with Selenium. A fast connection matters a lot; otherwise the crawl drags on forever. Multithreading speeds things up considerably; at the time I just manually launched N copies of the script instead, but a minimal threaded sketch is included after the main script below. One more point: the page language (Chinese vs. English) can be set through the review URL, as the example below shows. Note that the code targets the Selenium 3 API (find_element_by_* and executable_path); Selenium 4 removed these in favor of the By-based calls.
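For example, appending a `language` query parameter to the review URL switches the page language. Treat the parameter name and locale codes as an assumption about Amazon's URL scheme and verify them against the live site:

review_url_en = 'https://www.amazon.com/product-reviews/B07FMQKBSP?language=en_US'  # English reviews
review_url_zh = 'https://www.amazon.com/product-reviews/B07FMQKBSP?language=zh_CN'  # Chinese reviews (assumed locale code)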

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException
import csv


class AsinSpider():
    options = webdriver.ChromeOptions()
    # options.add_argument("--proxy-server=218.60.8.99:3129")
    # options.add_argument("--disable-web-security")
    options.add_argument("--lang=en-US")
    options.add_argument(
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75')
    # Speed up page loads by disabling images and CSS
    prefs = {"profile.managed_default_content_settings.images": 2, 'permissions.default.stylesheet': 2}
    options.add_experimental_option("prefs", prefs)
    driver = webdriver.Chrome(executable_path=r"C:\Users\zxzy\AppData\Local\Google\Chrome\Application\chromedriver.exe",
                              options=options)

    def __init__(self, re_url, re_asin):
        self.review_url = re_url
        self.review_asin = re_asin
        self.item = None
        self.reviews_content = []
        self.review_stars = []
        self.review_customer = []
        self.review_img = []
        self.next_page = None

    def close_pop(self):
        self.driver.get(self.review_url)
        # Dismiss the change-address popup if it shows up
        try:
            WebDriverWait(self.driver, 100).until(
                EC.element_to_be_clickable((By.CLASS_NAME, "a-button-input"))
            )
            pop = self.driver.find_element_by_class_name("a-button-input")
            pop.click()

        except (NoSuchElementException, TimeoutException, ElementClickInterceptedException):
            pass

    # Cap the page load time; refresh if it is exceeded
    def get_url(self):
        self.driver.set_page_load_timeout(20)
        try:
            self.driver.get(self.next_page)
        except TimeoutException:
            self.driver.refresh()

    def get_main_information(self):
        # Wait until the review list has loaded; refresh and retry on failure
        while True:
            try:
                WebDriverWait(self.driver, 30).until(
                    EC.presence_of_element_located((By.ID, 'cm_cr-review_list'))
                )
                break
            except Exception as e:
                print(e)
                self.driver.refresh()
        us = self.driver.find_element_by_id("cm_cr-review_list")

        # Reset the per-page containers
        self.reviews_content = []
        self.review_stars = []
        self.review_customer = []
        self.review_img = []

        # Grab every review block on the current page
        reviews = self.driver.find_elements_by_xpath('//div[@class = "a-section celwidget"]')
        # Keep each review's raw text as one string; it is split apart later
        self.reviews_content = [review.text for review in reviews]
        # For each review, pull the star rating, customer link and image links
        for review in reviews:
            # Star rating: the data-hook differs between local and cross-marketplace reviews
            try:
                self.review_stars.append(
                    review.find_element_by_xpath('.//i[@data-hook = "review-star-rating"]').get_attribute("innerHTML"))
            except NoSuchElementException:
                self.review_stars.append(
                    review.find_element_by_xpath('.//i[@data-hook = "cmps-review-star-rating"]').get_attribute(
                        "innerHTML"))
            # Customer profile link (may be absent)
            try:
                WebDriverWait(review, 5).until(
                    EC.presence_of_element_located((By.XPATH, ".//a[@class = 'a-profile']"))
                )
                self.review_customer.append(
                    review.find_element_by_xpath(".//a[@class = 'a-profile']").get_attribute("href"))
            except TimeoutException:
                self.review_customer.append(None)

            # Image links attached to the review, if any
            try:
                WebDriverWait(review, 1).until(
                    EC.presence_of_element_located((By.XPATH, ".//*[@class='a-declarative']/a/img"))
                )
                imgs = review.find_elements_by_xpath(".//*[@class='a-declarative']/a/img")
                self.review_img.append([img.get_attribute("src") for img in imgs])
            except TimeoutException:
                self.review_img.append(None)

        # Check whether there is a next page
        try:
            WebDriverWait(self.driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, '//li[@class = "a-last"]/a'))
            )
            self.next_page = us.find_element_by_xpath('.//li[@class = "a-last"]/a').get_attribute("href")

        except NoSuchElementException:
            self.next_page = None
            print("No next page")

        except TimeoutException:
            self.next_page = None
            print("Timed out waiting for the next-page link")
            self.driver.refresh()
        print("Page extracted")

    # Roughly split the review text here; a separate cleaning script below does the real parsing
    def parse(self):
        for cc, cu, sa, img in zip(self.reviews_content, self.review_customer, self.review_stars, self.review_img):
            cc = cc.split("\n")
            yield {
                "review_names": cc[0],
                "review_customer": cu,
                "review_stars": sa,
                "review_titles": cc[1],
                "review_date": cc[2],
                "review_sizes": cc[3] if len(cc) > 3 else None,
                "review_revs": cc[4] if len(cc) > 4 else None,
                "review_others1": cc[5] if len(cc) > 5 else None,
                "review_others2": cc[6] if len(cc) > 6 else None,
                "asin": self.review_asin,
                'review_images': img
            }

    # Append the parsed rows to a CSV file
    def save(self):
        with open(r'D:\python\amazon_review\test.csv', 'a', newline='', encoding='gb18030') as f:
            fieldnames = ["review_names", "review_customer", "review_stars", "review_titles", "review_date",
                          "review_sizes", "review_revs", "review_others1", "review_others2", "asin", "review_images"]
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            # Only write the header when the file is still empty
            if f.tell() == 0:
                writer.writeheader()
            for item in self.parse():
                writer.writerow(item)

    # Run the crawl
    def run(self):
        # Dismiss the change-address popup first (strictly optional)
        self.close_pop()
        # Scrape the first page of reviews
        self.get_main_information()
        # Save the results
        self.save()
        # Keep going while a next page exists
        while self.next_page:
            # Load the next page with a capped load time; the first page is loaded by close_pop()
            self.get_url()
            self.get_main_information()
            self.save()


def main():
    re_url = 'https://www.amazon.com/product-reviews/B07FMQKBSP'
    re_asin = 'B07FMQKBSP'
    spider = AsinSpider(re_url, re_asin)
    spider.run()
    print("%s爬取完毕" % re_asin)


if __name__ == '__main__':
    main()
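As mentioned above, a thread pool can replace the manually copied scripts. This is a minimal sketch, assuming the driver construction is moved from the class body into AsinSpider.__init__ so that each instance owns its own browser (as written, all instances share one class-level driver):

from concurrent.futures import ThreadPoolExecutor

def crawl_one(asin):
    # Hypothetical helper: build the review URL from the ASIN and run one spider
    url = 'https://www.amazon.com/product-reviews/%s' % asin
    spider = AsinSpider(url, asin)
    spider.run()
    print("Finished crawling %s" % asin)

asins = ['B07FMQKBSP', 'B07XXXXXXX']  # placeholder ASINs
with ThreadPoolExecutor(max_workers=2) as pool:
    pool.map(crawl_one, asins)

Since every spider appends to the same CSV, either give each one its own output file or wrap save() in a lock when raising max_workers.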

The saved data still needs cleaning; regular expressions do most of the work here.

import pandas as pd
import re

# Load the scraped data
data = pd.read_csv(r'D:\python\amazon_review\review.csv', encoding="gb18030")


# Strip the HTML tags from the star-rating markup
review_stars = [re.sub("<.+?>", '', i) for i in data['review_stars']]
data['review_stars'] = review_stars

# Fix rows where review_titles is shifted: pull them out, relabel the columns,
# drop the originals, then concat the fixed rows back
findex = data.loc[data['review_titles'].str.contains("前|VINE|vine|制造", na=False)].index
f_data = data.loc[findex, :].copy()
del f_data['review_titles']
f_data.columns = ['review_names', 'review_customer', 'review_stars', 'review_titles', 'review_date',
                  'review_sizes', 'review_revs', 'review_others1', 'asin']
data.drop(findex, inplace=True)
data = pd.concat([data, f_data], sort=False)



# Fix misaligned review_date rows: valid dates start with "20";
# only one row is affected, so it is simply dropped
data['review_date'] = data['review_date'].str.strip()
dindex = data.loc[~data['review_date'].str.startswith("20", na=False)].index
print(len(dindex))  # just one row
data.drop(dindex, inplace=True)



# Fix rows where the size/color information is shifted into the wrong column
data['review_sizes'] = data['review_sizes'].str.strip()
# Prefixes of well-formed values (the Chinese strings are Amazon badge texts)
strs = ('Color', 'COLOR', 'color', 'SIZE', 'Size', 'size', '已确认', '早期', '免费')
sindex = data.loc[~data['review_sizes'].str.startswith(strs, na=False)].index
sindex_data = data.loc[sindex, :].copy()

# Shift the column names one position
del sindex_data['review_others2']
sindex_data.rename(columns={'review_others1': 'review_others2',
                            'review_revs': 'review_others1',
                            'review_sizes': 'review_revs'}, inplace=True)

# Drop the originals and concat the fixed rows back
data.drop(sindex, inplace=True)
data = pd.concat([data, sindex_data], sort=False)



# Split out color and size
color = ('Color', 'COLOR', 'color')
size = ('SIZE', 'Size', 'size')
data['review_sizes'] = data['review_sizes'].str.replace("：", ":")  # normalize full-width colons
c_data = data.loc[data['review_sizes'].str.startswith(color, na=False)].copy()
s_data = data.loc[data['review_sizes'].str.startswith(size, na=False)].copy()
data.drop(c_data.index, inplace=True)
data.drop(s_data.index, inplace=True)

# Color and size can appear in either order, so the two variants get their own frames
c_data['review_colors'] = c_data['review_sizes'].str.split(":").str[1].str.replace("Size", "", flags=re.I, regex=True)
c_data['review_sizes1'] = c_data['review_sizes'].str.split(":").str[2].str.replace("color", "", flags=re.I, regex=True)
s_data['review_colors'] = s_data['review_sizes'].str.split(":").str[2].str.replace("color", "", flags=re.I, regex=True)
s_data['review_sizes1'] = s_data['review_sizes'].str.split(":").str[1].str.replace("color", "", flags=re.I, regex=True)
data = pd.concat([data, c_data, s_data], sort=False)

# Save the cleaned data
data.to_csv(r'D:\python\amazon_review\review.csv', index=False)
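If a numeric rating is needed downstream, the tag-stripped review_stars strings (on the US site these read like "4.0 out of 5 stars"; the exact wording is an assumption) can be reduced to a float with one more regex:

# Minimal sketch: pull the leading number out of strings like "4.0 out of 5 stars"
data['review_rating'] = data['review_stars'].str.extract(r'(\d+(?:\.\d+)?)', expand=False).astype(float)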

 
