爬虫系列之爬取1688

门柚

于 2022-07-28 16:58:44 发布

阅读量2.3k

点赞数 1

分类专栏： python 文章标签：爬虫

本文链接：https://blog.csdn.net/weixin_46364913/article/details/126038921

版权

python 专栏收录该内容

12 篇文章 2 订阅

订阅专栏

项目地址：GitHub - Carmenliukang/1688_crawler-image_search_products: 通过 1688 PC 端网址，上传图片查询类似的商品

仅供学习，禁止商用

1688

具体实现见 lib/alibaba_lib.py。
简要流程如下：
1. 填入登录成功后的 cookie
2. 上传图片（本地文件路径或图片 URL）
3. 返回以图搜图结果页的链接

部分代码：

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from lib.alibaba_lib import Alibaba

if __name__ == '__main__':
    # Cookie string of a logged-in session; must be filled in before running.
    login_cookie = """请填写登入成功的cookie"""
    # Sample image used as the search query.
    image_path = 'data/下载.jpeg'

    # Upload the image and print whatever the client returns for it.
    client = Alibaba(login_cookie)
    print(client.run(image_path))

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import io
import os
import re
import json
import requests
from lib.func_txy import request_post
from lib.func_txy import request_get_content
from lib.func_txy import get_random_str
from urllib.parse import urlparse


class Alibaba(object):
    """Client for the 1688 PC-site "find similar products by image" endpoints.

    Typical use is ``Alibaba(cookie).run(path_or_url)``, which uploads an
    image and returns the URL of the corresponding image-search result page.
    """

    # Seconds to wait for a remote image download before giving up.
    DOWNLOAD_TIMEOUT = 30

    def __init__(self, cookie):
        """Store the endpoint URLs and build the request headers.

        :param cookie: cookie string of a logged-in session, sent verbatim
            in the ``cookie`` request header.
        """
        # Endpoint the image is uploaded to.
        self.upload_url = "https://stream-upload.taobao.com/api/upload.api?appkey=1688search&folderId=0&_input_charset=utf-8&useGtrSessionFilter=false"
        self.imageSearch_service_url = "https://open-s.1688.com/openservice/imageSearchOfferResultViewService"
        self._headers(cookie=cookie)
        self.search_page_size = 40

    def setSearchPageSize(self, pageSize):
        """Set the value stored in ``self.search_page_size``."""
        self.search_page_size = pageSize

    def _headers(self, cookie):
        """Build the headers used for every request and store them on ``self.headers``.

        :param cookie: cookie string placed in the ``cookie`` header.
        """
        self.headers = {
            "Origin": "https://www.1688.com",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:85.0) Gecko/20100101 Firefox/85.0",
            "Accept": "*/*",
            "Cache-Control": "no-cache",
            # The standard header name is "Referer"; the former "refer" key
            # is not a header that servers recognise.
            "Referer": "https://www.1688.com/",
            "cookie": cookie,
        }

    def upload_img(self, filename):
        """Upload an image and return the key that identifies it.

        :param filename: path of a local image file, or an http(s) URL of an
            image that is downloaded first.
        :return: ``(status, key)``. ``('fail', None)`` when the input is
            neither an existing file nor a downloadable http(s) URL.
            Otherwise the status reported by ``request_post`` plus the last
            path segment of the uploaded image URL; the key is ``""`` when
            the upload did not succeed or its response could not be parsed.
        """
        if os.path.isfile(filename):
            # Context manager so the file handle is always released.
            with open(filename, "rb") as image_file:
                payload = image_file.read()
        else:
            # urlparse() always returns a truthy result, so the URL has to be
            # validated through its components.
            parsed = urlparse(filename)
            if parsed.scheme not in ("http", "https") or not parsed.netloc:
                return 'fail', None
            try:
                response = requests.get(filename, timeout=self.DOWNLOAD_TIMEOUT)
                # Do not upload an HTTP error page as if it were an image.
                response.raise_for_status()
            except requests.RequestException:
                return 'fail', None
            payload = response.content

        # Random name for the multipart upload; generated only once the image
        # bytes are actually available.
        name = get_random_str(5) + ".jpeg"
        files = {
            "name": (None, name),
            "file": (name, payload),
        }

        status, res = request_post(self.upload_url, data=None, files=files, headers=self.headers)
        if status != "succ":
            return status, ""

        try:
            uploaded_url = json.loads(res)["object"]["url"]
            key = uploaded_url.split("/")[-1]
        except (ValueError, KeyError, TypeError, AttributeError):
            # Response was not the expected JSON shape; report a failure
            # instead of crashing the caller.
            return "fail", ""
        return status, key

    def img_search(self, url):
        """Fetch the image-search result for ``url``.

        NOTE(review): the original note says this endpoint was taken from the
        1688 image-search page (a JSONP interface) -- confirm the payload format.

        :param url: URL to request with the client's headers.
        :return: ``('succ', data)`` when ``request_get_content`` reports
            success, otherwise ``('fail', None)``.
        """
        status_desc, data = request_get_content(url, headers=self.headers)
        if status_desc == "succ":
            return 'succ', data
        return 'fail', None

    def check_goods(self, html):
        """Extract the embedded offer-result assignment(s) from a result page.

        TODO: the matching here is still unfinished.

        :param html: text of the result page.
        :return: list of matched
            ``window.data.offerresultData = successDataCheck(...)`` snippets
            (empty when nothing matches).
        """
        # Raw string with escaped dots so the literal JS expression is matched.
        return re.findall(r"window\.data\.offerresultData = successDataCheck\(.*?\)", html)

    def run(self, filename, need_products=False):
        """Upload an image and return its image-search URL or the search result.

        :param filename: local image path or http(s) image URL.
        :param need_products: when true, also fetch the result page.
        :return: ``""`` when the upload fails; the result-page URL when
            ``need_products`` is false; otherwise the fetched data, or
            ``None`` when fetching fails.
        """
        # Upload the image file first.
        status, key = self.upload_img(filename)
        if status != "succ":
            return ""

        # Upload succeeded: build the query URL from the returned key.
        url_res = f"https://s.1688.com/youyuan/index.htm?tab=imageSearch&imageAddress={key}&spm="
        if not need_products:
            return url_res

        status_desc, data = self.img_search(url_res)
        if status_desc == 'succ':
            return data
        return None