# -*- coding:utf-8 -*-
import os
import re
import time
import urllib
import uuid

import requests
from pyquery import PyQuery
# Default request headers: present a desktop Chrome user agent so the
# site serves its normal pages instead of rejecting a script client.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
    ),
}
def href_url_download():
    """Crawl m.sfbest.com's category tree and download all product images.

    Walks: category page -> sub-category listing URLs -> first 5 pages of
    each listing -> product detail pages -> image URLs, handing each image
    URL to image_download().  A connection error aborts the whole crawl
    (logged, not re-raised), matching the original best-effort behavior.
    """
    root_url = "https://m.sfbest.com/product/info/"
    url = "https://m.sfbest.com/category"
    try:
        txt = requests.get(url, headers=headers).text
        doc = PyQuery(txt)
        for category_list in doc(".category-list").items():
            class_name = category_list(".i-name").text()  # top-level category name
            print(class_name)
            titles = category_list(".i-title").text().split(" ")
            for index, a in enumerate(category_list(".i-list a").items()):
                # Defensive: the page may have more <a> links than titles;
                # the original code would raise IndexError here.
                if index >= len(titles):
                    break
                keyword = titles[index].replace("/", "_")  # sub-category name
                print(keyword)
                urls = a.attr("href")
                for pages in range(1, 6):
                    # BUG FIX: the original replaced "pageNo=<n-1>" with
                    # "pageNo=<n>" in the UNCHANGED base URL, which only
                    # matches when the base URL happens to carry exactly
                    # pageNo=<n-1>; set the page number explicitly instead.
                    href = re.sub(r"pageNo=\d+", "pageNo=" + str(pages), urls)
                    print(href)
                    txt2 = requests.get(href, headers=headers).text
                    doc2 = PyQuery(txt2)
                    for url2 in doc2(".p-img.fl").items():
                        href2 = root_url + url2.attr("productid")
                        txt3 = requests.get(href2, headers=headers).text
                        doc3 = PyQuery(txt3)
                        for url3 in doc3(".swiper-slide img").items():
                            href3 = url3.attr("src")
                            print(href3)
                            image_download(class_name, keyword, href3)  # save the image
    except requests.exceptions.ConnectionError:
        print("requests.exceptions.ConnectionError")
def image_download(words, keyword, image_url):
    """Download one image to ./image/<words>/<keyword>/<keyword>_<uuid>.jpg.

    Args:
        words: top-level category name (first path component).
        keyword: sub-category name, used for the directory and file prefix.
        image_url: direct URL of the image to fetch.

    HTTP and connection errors are logged and swallowed so a single bad
    image does not abort the crawl (same best-effort contract as before).
    """
    folder_path = os.path.join("./image", words, keyword)  # image storage path
    os.makedirs(folder_path, exist_ok=True)
    image_name = keyword + "_" + str(uuid.uuid1()) + ".jpg"  # unique file name
    print(image_name)
    try:
        # BUG FIX: the original caught urllib.error.HTTPError, which
        # requests never raises (and `import urllib` does not even import
        # the `error` submodule).  Use raise_for_status() plus the
        # requests exception hierarchy, and bound the request with a timeout.
        content = requests.get(image_url, headers=headers, timeout=30)
        content.raise_for_status()
        with open(os.path.join(folder_path, image_name), "wb") as f:
            f.write(content.content)
    except requests.exceptions.HTTPError:
        print("Internal Server Error")
    except requests.exceptions.ConnectionError:
        print("requests.exceptions.ConnectionError")
if __name__ == '__main__':
    href_url_download()  # fetch category / product URLs and download the product images
# 顺丰优选网爬虫 (SF Best online-store crawler) — stray blog title from the
# page this code was copied from; commented out so the file remains valid Python.
# 最新推荐文章于 2024-05-13 18:02:56 发布