爬取苏宁手机用户评论的图片

最新推荐文章于 2024-08-21 11:22:17 发布

鲲仔

最新推荐文章于 2024-08-21 11:22:17 发布

阅读量87

点赞数

文章标签：爬虫

本文链接：https://blog.csdn.net/weixin_51142241/article/details/114083070

版权

# -*- coding: utf-8 -*-
import json
import os
import random
import jsonpath
import requests
import lxml
from lxml import etree
from selenium import webdriver
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import time
import urllib
import re
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)


class SNProcess:
    """Crawl Suning's phone search page and download user-review images.

    Workflow: ``parse_html`` renders the search page in headless Chrome and
    collects item ids, then ``get_phone_info`` is called for each id to fetch
    the review-image listing pages and save every image to ``save_dir``.

    Args:
        save_dir: Directory the images are written to. It is created on
            demand. Defaults to the originally hard-coded ``'E:/suning/'``.
        max_pages: Number of review-image pages requested per item
            (pages ``1..max_pages``). Defaults to 9.
    """

    # Images requested per review page; also used for the progress counter.
    PAGE_SIZE = 10

    def __init__(self, save_dir='E:/suning/', max_pages=9):
        self.url = 'https://search.suning.com/手机/'
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
        self.save_dir = save_dir
        self.max_pages = max_pages

    def get_html(self, url):
        """Fetch ``url`` with the crawler's headers and return the page text.

        The body is decoded as UTF-8 with undecodable bytes dropped.
        Certificate verification is disabled and the request times out
        after 10 seconds.
        """
        response = requests.get(url, headers=self.header, verify=False, timeout=10)
        return response.content.decode('utf-8', 'ignore')

    def re_func(self, regex, html):
        """Return every match of ``regex`` in ``html`` (``.`` matches newlines)."""
        return re.compile(regex, re.S).findall(html)

    @staticmethod
    def _parse_jsonp(text):
        """Strip the ``callback(...)`` wrapper from a JSONP reply and decode it.

        Raises:
            ValueError: if no parenthesised payload is present or the payload
                is not valid JSON.
        """
        start = text.find('(')
        end = text.rfind(')')
        if start == -1 or end <= start:
            raise ValueError('response is not a JSONP payload')
        return json.loads(text[start + 1:end])

    def _download_image(self, image_url, product_id, index):
        """Download one review image into ``save_dir``; return True on success.

        Failures are reported and swallowed on purpose: one broken image must
        not abort the rest of the crawl.
        """
        # Imported here because the file only does ``import urllib``, which
        # does not guarantee that the ``urllib.request`` submodule is loaded.
        import http.client
        import urllib.request

        try:
            with urllib.request.urlopen('https:' + image_url + '.jpg', timeout=5) as resp:
                data = resp.read()
            os.makedirs(self.save_dir, exist_ok=True)
            file_name = product_id + image_url.split('/')[-1] + str(index) + '.jpg'
            with open(os.path.join(self.save_dir, file_name), 'wb') as f:
                f.write(data)
        except (OSError, ValueError, TypeError, AttributeError,
                http.client.HTTPException) as exc:
            print('Failed to download image ' + str(image_url) + ': ' + str(exc))
            return False
        return True

    def parse_html(self):
        """Crawl the search result page and process every listed item."""
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        driver = webdriver.Chrome(options=options)
        try:
            driver.get(url=self.url)
            time.sleep(1)
            # Scroll to the bottom before reading the DOM -- presumably so that
            # lazily loaded items are rendered (TODO confirm).
            driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            time.sleep(2)
            html_source = driver.page_source
        finally:
            # Always release the browser process, even if the page load fails.
            driver.quit()

        html = lxml.etree.HTML(html_source)
        phone_number = html.xpath('//li[@doctype="1"]/@id')
        for hr_i in phone_number:
            self.get_phone_info(hr_i)
            # Random pause between items to avoid hammering the site.
            time.sleep(random.randint(1, 2))

    def get_phone_info(self, number):
        """Download the review images of one item.

        Args:
            number: Element id taken from the search page. It must contain at
                least two hyphen-separated parts; they are used, in order, as
                the two path segments of the product page URL.

        Returns:
            None. Items whose id is malformed, or whose product page exposes
            no ``clusterId``, are skipped with a message instead of raising.
        """
        phone_id = number.split('-')
        if len(phone_id) < 2:
            print('Skipping malformed item id: ' + str(number))
            return
        phone_url = 'https://product.suning.com/' + str(phone_id[0]) + '/' + str(phone_id[1]) + '.html'

        phone_html = self.get_html(url=phone_url)
        # The review endpoint is keyed by the clusterId embedded in the page source.
        one_id = self.re_func(regex=r'"clusterId":"(.*?)"', html=phone_html)
        if not one_id:
            print('No clusterId found for item: ' + str(number))
            return

        for num in range(1, self.max_pages + 1):
            url = 'https://review.suning.com/ajax/getClusterReviewImages/cluster-' + str(one_id[0]) + '-0000000' + str(
                phone_id[1]) + '-' + str(phone_id[0]) + '-' + str(num) + '-' + str(
                self.PAGE_SIZE) + '-imgReviewList.htm?callback=imgReviewList'

            images = requests.post(url, headers=self.header, verify=False, timeout=10)
            # The endpoint answers with JSONP, not bare JSON, so the callback
            # wrapper has to be removed before decoding.
            try:
                payload = self._parse_jsonp(images.content.decode('utf-8'))
            except ValueError as exc:
                print('Unreadable review page ' + str(num) + ' for item ' + str(number) + ': ' + str(exc))
                break

            # jsonpath returns False (not an empty list) when nothing matches.
            image_urls = jsonpath.jsonpath(payload, '$..url') or []
            if not image_urls:
                # An empty page means there are no further review images.
                break

            for i, image_url in enumerate(image_urls, start=1):
                print('*' * 10 + "正在下载第" + str((num - 1) * self.PAGE_SIZE + i) + '张图片' + '*' * 10)
                self._download_image(image_url, phone_id[1], i)


if __name__ == "__main__":
    # Script entry point: build the crawler and start from the search page.
    SNProcess().parse_html()

这段代码质量还不够好，我还在学习中，求大佬指点迷津。

鲲仔

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
爬取苏宁手机用户评论的图片

# -*- coding: utf-8 -*- import json; import os; import random; import jsonpath; import requests; import lxml; from lxml import etree; from selenium import webdriver; from bs4 import BeautifulSoup; from fake_useragent import UserAgent; import time; import urllib; impor…
复制链接

扫一扫