# -*- coding: utf-8 -*-
import json
import os
import random
import jsonpath
import requests
import lxml
from lxml import etree
from selenium import webdriver
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import time
import urllib
import re
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
class SNProcess:
    """Download buyer-review images for phones found on a Suning search page.

    ``parse_html`` renders the search page in headless Chrome and collects the
    ``id`` attribute of every ``<li doctype="1">`` element. ``get_phone_info``
    then walks the review-image pages for one such id and writes every picture
    it can fetch into ``save_dir``.
    """

    # Pattern that extracts the cluster id from a product page's source.
    _CLUSTER_ID_REGEX = r'"clusterId":"(.*?)"'
    # Used only for the progress counter; presumably the page size that the
    # ``-10-`` segment of the review URL requests (TODO confirm).
    _IMAGES_PER_PAGE = 10

    def __init__(self, save_dir='E:/suning/'):
        """Set the search URL, the request headers and the output directory.

        Args:
            save_dir: Directory the images are written to. It is created on
                first use. Defaults to the location the script always used.
        """
        self.url = 'https://search.suning.com/手机/'
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
        self.save_dir = save_dir

    def get_html(self, url):
        """Fetch ``url`` and return the body decoded as UTF-8.

        Undecodable bytes are dropped. Certificate verification is disabled
        and the request times out after 10 seconds.
        """
        response = requests.get(url, headers=self.header, verify=False, timeout=10)
        return response.content.decode('utf-8', 'ignore')

    def re_func(self, regex, html):
        """Return every match of ``regex`` in ``html`` (``.`` also matches newlines)."""
        return re.compile(regex, re.S).findall(html)

    def parse_html(self):
        """Crawl the search page and download review images for every listed item."""
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        driver = webdriver.Chrome(options=options)
        try:
            driver.get(url=self.url)
            time.sleep(1)
            # Scroll to the bottom before reading the source; presumably this
            # makes the page load the rest of its result list (TODO confirm).
            driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            time.sleep(2)
            html_source = driver.page_source
        finally:
            # Always shut the browser down so no Chrome process is left behind.
            driver.quit()

        tree = etree.HTML(html_source)
        if tree is None:
            print('Search page could not be parsed: ' + self.url)
            return
        for item_id in tree.xpath('//li[@doctype="1"]/@id'):
            self.get_phone_info(item_id)
            # Random 1-2 second pause between products.
            time.sleep(random.randint(1, 2))

    def get_phone_info(self, number):
        """Download the review images of one product.

        Args:
            number: A list-item id made of at least two ``-``-separated parts.
                The first two parts are used to build the product and review
                URLs.

        Products whose id, product page or review pages cannot be used are
        reported with ``print`` and skipped; a single failing image never
        stops the remaining downloads.
        """
        id_parts = number.split('-')
        if len(id_parts) < 2:
            print('Skipping unexpected product id: ' + str(number))
            return

        product_url = 'https://product.suning.com/' + id_parts[0] + '/' + id_parts[1] + '.html'
        try:
            product_html = self.get_html(url=product_url)
        except requests.RequestException as err:
            print('Skipping ' + product_url + ': ' + str(err))
            return

        # The review endpoint is addressed by a cluster id that only appears
        # in the product page's source.
        cluster_ids = self.re_func(regex=self._CLUSTER_ID_REGEX, html=product_html)
        if not cluster_ids:
            print('No clusterId found in ' + product_url)
            return

        for page in range(1, 10):
            review_url = self._review_url(cluster_ids[0], id_parts, page)
            try:
                image_urls = self._fetch_image_urls(review_url)
            except (requests.RequestException, ValueError) as err:
                print('Skipping review page ' + str(page) + ': ' + str(err))
                continue

            for position, image_url in enumerate(image_urls, start=1):
                # Running count across pages, not just within the current one.
                count = (page - 1) * self._IMAGES_PER_PAGE + position
                print('*' * 10 + "正在下载第" + str(count) + '张图片' + '*' * 10)
                try:
                    file_name = id_parts[1] + image_url.split('/')[-1] + str(position) + '.jpg'
                    self._download_image(image_url, file_name)
                except Exception as err:
                    # Best effort: report the failed image and keep going.
                    print('Failed to download ' + str(image_url) + ': ' + str(err))

    @staticmethod
    def _review_url(cluster_id, id_parts, page):
        """Build the review-image URL for ``page`` of the given product."""
        return ('https://review.suning.com/ajax/getClusterReviewImages/cluster-'
                + str(cluster_id) + '-0000000' + str(id_parts[1]) + '-'
                + str(id_parts[0]) + '-' + str(page)
                + '-10-imgReviewList.htm?callback=imgReviewList')

    @staticmethod
    def _strip_jsonp(text):
        """Return the JSON text wrapped inside a ``callback(...)`` response.

        Raises:
            ValueError: If ``text`` has no ``(...)`` wrapper.
        """
        start = text.find('(')
        end = text.rfind(')')
        if start == -1 or end <= start:
            raise ValueError('response is not a JSONP payload')
        return text[start + 1:end]

    def _fetch_image_urls(self, review_url):
        """Return the image URLs listed on one review page (possibly empty)."""
        response = requests.post(review_url, headers=self.header, verify=False, timeout=10)
        # The body is ``imgReviewList(<json>)``, not plain JSON.
        payload = json.loads(self._strip_jsonp(str(response.content, encoding='utf-8')))
        # jsonpath returns False, not an empty list, when nothing matches.
        return jsonpath.jsonpath(payload, '$..url') or []

    def _download_image(self, image_url, file_name):
        """Fetch one protocol-relative image URL and store it as ``file_name``."""
        # ``import urllib`` alone does not guarantee the submodule is loaded.
        import urllib.request

        with urllib.request.urlopen('https:' + image_url + '.jpg', timeout=5) as response:
            data = response.read()
        if self.save_dir:
            os.makedirs(self.save_dir, exist_ok=True)
        with open(os.path.join(self.save_dir, file_name), 'wb') as f:
            f.write(data)
# Script entry point: build the crawler and run the full crawl.
if __name__ == "__main__":
    crawler = SNProcess()
    crawler.parse_html()
# 这个代码质量不好，还在学习中，求大佬指点迷津。