# -*- coding:utf-8 -*-
import os
import re
import time
import urllib
import uuid

import requests
from pyquery import PyQuery
# Default request headers: present a desktop Chrome user agent so the
# site serves its normal pages instead of rejecting a script client.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
    ),
}
def href_url_download():
    """Crawl m.sfbest.com's category tree and download all product images.

    Walks: category page -> sub-category listing URLs -> first 5 pages of
    each listing -> product detail pages -> image URLs, handing each image
    URL to image_download().  A connection error aborts the whole crawl
    (logged, not re-raised), matching the original best-effort behavior.
    """
    root_url = "https://m.sfbest.com/product/info/"
    url = "https://m.sfbest.com/category"
    try:
        txt = requests.get(url, headers=headers).text
        doc = PyQuery(txt)
        for category_list in doc(".category-list").items():
            class_name = category_list(".i-name").text()  # top-level category name
            print(class_name)
            titles = category_list(".i-title").text().split(" ")
            for index, a in enumerate(category_list(".i-list a").items()):
                # Defensive: the page may have more <a> links than titles;
                # the original code would raise IndexError here.
                if index >= len(titles):
                    break
                keyword = titles[index].replace("/", "_")  # sub-category name
                print(keyword)
                urls = a.attr("href")
                for pages in range(1, 6):
                    # BUG FIX: the original replaced "pageNo=<n-1>" with
                    # "pageNo=<n>" in the UNCHANGED base URL, which only
                    # matches when the base URL happens to carry exactly
                    # pageNo=<n-1>; set the page number explicitly instead.
                    href = re.sub(r"pageNo=\d+", "pageNo=" + str(pages), urls)
                    print(href)
                    txt2 = requests.get(href, headers=headers).text
                    doc2 = PyQuery(txt2)
                    for url2 in doc2(".p-img.fl").items():
                        href2 = root_url + url2.attr("productid")
                        txt3 = requests.get(href2, headers=headers).text
                        doc3 = PyQuery(txt3)
                        for url3 in doc3(".swiper-slide img").items():
                            href3 = url3.attr("src")
                            print(href3)
                            image_download(class_name, keyword, href3)  # save the image
    except requests.exceptions.ConnectionError:
        print("requests.exceptions.ConnectionError")
def image_download(words, keyword, image_url):
    """Download one image to ./image/<words>/<keyword>/<keyword>_<uuid>.jpg.

    Args:
        words: top-level category name (first path component).
        keyword: sub-category name, used for the directory and file prefix.
        image_url: direct URL of the image to fetch.

    HTTP and connection errors are logged and swallowed so a single bad
    image does not abort the crawl (same best-effort contract as before).
    """
    folder_path = os.path.join("./image", words, keyword)  # image storage path
    os.makedirs(folder_path, exist_ok=True)
    image_name = keyword + "_" + str(uuid.uuid1()) + ".jpg"  # unique file name
    print(image_name)
    try:
        # BUG FIX: the original caught urllib.error.HTTPError, which
        # requests never raises (and `import urllib` does not even import
        # the `error` submodule).  Use raise_for_status() plus the
        # requests exception hierarchy, and bound the request with a timeout.
        content = requests.get(image_url, headers=headers, timeout=30)
        content.raise_for_status()
        with open(os.path.join(folder_path, image_name), "wb") as f:
            f.write(content.content)
    except requests.exceptions.HTTPError:
        print("Internal Server Error")
    except requests.exceptions.ConnectionError:
        print("requests.exceptions.ConnectionError")
if __name__ == '__main__':
    href_url_download()  # fetch category / product URLs and download the product images
# 顺丰优选网爬虫 (SF Best online-store crawler) — stray blog title from the
# page this code was copied from; commented out so the file remains valid Python.
# 最新推荐文章于 2024-05-13 18:02:56 发布