SF Best (顺丰优选) web crawler

The script below walks the category pages on m.sfbest.com, opens each product's detail page, and saves the product images under ./image/<category>/<sub-category>/.

# -*- coding:utf-8 -*-
import os
import re
import uuid

import requests
from pyquery import PyQuery

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/75.0.3770.142 Safari/537.36"
}


def href_url_download():
    root_url = "https://m.sfbest.com/product/info/"
    url = "https://m.sfbest.com/category"
    # print(url)
    try:
        txt = requests.get(url, headers=headers).text
        doc = PyQuery(txt)
        category_lists = doc(".category-list").items()
        for category_list in category_lists:
            class_name = category_list(".i-name").text()  # top-level category name
            print(class_name)
            class_titles = category_list(".i-title").text()
            titles = class_titles.split(" ")
            a_list = category_list(".i-list a").items()
            for index, a in enumerate(a_list):
                keyword = titles[index].replace("/", "_")  # sub-category name
                print(keyword)
                urls = a.attr("href")
                for pages in range(1, 6):
                    page = str(pages)
                    # rewrite the pageNo query parameter so pages 1-5 of the listing are fetched
                    href = re.sub(r"pageNo=\d+", "pageNo=" + page, urls)
                    print(href)
                    txt2 = requests.get(href, headers=headers).text  # fetch the listing page
                    doc2 = PyQuery(txt2)
                    urls2 = doc2(".p-img.fl").items()
                    for url2 in urls2:
                        href2 = root_url + url2.attr("productid")
                        # print(href2)
                        txt3 = requests.get(href2, headers=headers).text  # fetch the product detail page
                        doc3 = PyQuery(txt3)
                        urls3 = doc3(".swiper-slide img").items()
                        for url3 in urls3:
                            href3 = url3.attr("src")
                            print(href3)
                            image_download(class_name, keyword, href3)  # download the image
    except requests.exceptions.ConnectionError as e:
        print("requests.exceptions.ConnectionError")


def image_download(words, keyword, image_url):
    folder_path = "./image/" + words + "/" + keyword + "/"  # image storage path
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        # print(folder_path)
    image_name = keyword + "_" + str(uuid.uuid1()) + ".jpg"  # image file name
    print(image_name)
    try:
        response = requests.get(image_url, headers=headers)
        response.raise_for_status()  # surface 4xx/5xx responses instead of saving an error page
        with open(folder_path + image_name, "wb") as f:
            f.write(response.content)
    except requests.exceptions.HTTPError as e:
        print("requests.exceptions.HTTPError:", e)
    except requests.exceptions.ConnectionError as e:
        print("requests.exceptions.ConnectionError:", e)


if __name__ == '__main__':
    href_url_download()  # fetch the category URLs and each product's detail URL, then download the images
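
A possible refinement, kept separate from the listing above: the crawler fires its requests back to back and will wait indefinitely if the site stalls. Below is a minimal sketch of a fetch helper with a request timeout and a short delay between requests; the fetch_text name and the delay/timeout defaults are illustrative and not part of the original script.

import time

import requests


def fetch_text(url, headers, delay=1.0, timeout=10):
    """Sketch of a polite fetch: pause between requests and give up after `timeout` seconds."""
    time.sleep(delay)  # simple rate limiting between consecutive requests
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()  # raise on 4xx/5xx instead of parsing an error page
    return response.text

Each requests.get(...).text call in href_url_download could then be swapped for fetch_text(..., headers), passing through the same headers dictionary.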
