爬虫系列之爬取1688

项目地址:GitHub - Carmenliukang/1688_crawler-image_search_products: 通过 1688 PC 端网址,上传图片查询类似的商品

仅供学习,禁止商用


1688

具体实现见 lib/alibaba_lib 模块中的 Alibaba 类。

简要流程如下:
1. 填入登录 1688 成功后的 cookie
2. 上传图片(本地文件路径或图片 URL)
3. 返回相似商品搜索结果页的链接

部分代码: 

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from lib.alibaba_lib import Alibaba

if __name__ == '__main__':
    # Image to search with, and the cookie string of a logged-in session
    # (the placeholder below must be replaced before running).
    image_path = 'data/下载.jpeg'
    login_cookie = """请填写登入成功的cookie"""

    # Upload the image and print whatever Alibaba.run() returns for it.
    client = Alibaba(login_cookie)
    print(client.run(image_path))

 

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import io
import os
import re
import json
import requests
from lib.func_txy import request_post
from lib.func_txy import request_get_content
from lib.func_txy import get_random_str
from urllib.parse import urlparse


class Alibaba(object):
    """Client for the 1688 PC-site "search by image" flow.

    An image is uploaded to the upload endpoint; the key taken from the
    upload response is then used to build the URL of the image-search
    result page.
    """

    # Seconds to wait for a remote image download before giving up, so a
    # stalled server cannot hang the caller forever.
    _DOWNLOAD_TIMEOUT = 30

    def __init__(self, cookie):
        """Store the endpoint URLs and build the request headers.

        :param cookie: cookie string sent with every request (the demo
            script expects the cookie of a logged-in session).
        """
        # Endpoint the image is uploaded to.
        self.upload_url = "https://stream-upload.taobao.com/api/upload.api?appkey=1688search&folderId=0&_input_charset=utf-8&useGtrSessionFilter=false"
        self.imageSearch_service_url = "https://open-s.1688.com/openservice/imageSearchOfferResultViewService"
        self.search_page_size = 40
        self._headers(cookie=cookie)

    def setSearchPageSize(self, pageSize):
        """Set ``search_page_size`` (kept as a method for existing callers).

        :param pageSize: new value for ``self.search_page_size``.
        """
        self.search_page_size = pageSize

    def _headers(self, cookie):
        """Build the headers sent with every request and store them.

        :param cookie: cookie string placed in the ``cookie`` header.
        """
        self.headers = {
            'Origin': "https://www.1688.com",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:85.0) Gecko/20100101 Firefox/85.0",
            "Accept": "*/*",
            "Cache-Control": "no-cache",
            # The HTTP header is spelled "Referer"; the previous key "refer"
            # is not a header any server looks at.
            "Referer": "https://www.1688.com/",
            "cookie": cookie,
        }

    def upload_img(self, filename):
        """Upload an image and return the key of the uploaded file.

        :param filename: path of a local image file, or an http(s) URL of
            an image that is downloaded first.
        :return: ``(status, key)``. ``status`` is the status string returned
            by ``request_post`` (``"succ"`` is treated as success) and
            ``key`` is the last path segment of the URL found in the upload
            response, or ``""`` when the upload did not succeed or its
            response could not be parsed. ``("fail", None)`` is returned
            when the image itself cannot be loaded.
        """
        if os.path.isfile(filename):
            # Context manager so the file handle is always closed.
            with open(filename, "rb") as image_file:
                bytestream = image_file.read()
        else:
            # urlparse() never returns a falsy value, so validate the parts
            # that make the string a fetchable URL instead.
            parsed = urlparse(filename)
            if parsed.scheme not in ("http", "https") or not parsed.netloc:
                return 'fail', None
            try:
                response = requests.get(filename, timeout=self._DOWNLOAD_TIMEOUT)
                # Do not upload the body of an HTTP error page as an image.
                response.raise_for_status()
            except requests.RequestException:
                return 'fail', None
            bytestream = response.content

        # The upload is sent under a random name, not the original one.
        name = get_random_str(5) + ".jpeg"
        files = {
            "name": (None, name),
            "file": (name, bytestream),
        }

        status, res = request_post(self.upload_url, data=None, files=files, headers=self.headers)
        if status != "succ":
            return status, ""

        try:
            url = json.loads(res)["object"]["url"]
            key = url.split("/")[-1]
        except (ValueError, KeyError, TypeError, AttributeError):
            # The response was not the expected JSON document; report a
            # failure through the status instead of raising.
            return "fail", ""
        return status, key

    def img_search(self, url):
        """Fetch the content behind an image-search URL.

        :param url: URL to request with this client's headers.
        :return: ``("succ", data)`` when ``request_get_content`` reports
            ``"succ"``, otherwise ``("fail", None)``.
        """
        status_desc, data = request_get_content(url, headers=self.headers)
        if status_desc == "succ":
            return 'succ', data
        return 'fail', None

    def check_goods(self, html):
        """Find the ``successDataCheck(...)`` assignments in a page.

        TODO: the matched text still has to be parsed into product data.

        :param html: page source to scan.
        :return: list of the matched substrings (empty when none match).
        """
        # Raw string with escaped dots and parentheses so they match
        # literally; ``.*?`` stops at the first closing parenthesis.
        return re.findall(
            r"window\.data\.offerresultData = successDataCheck\(.*?\)", html
        )

    def run(self, filename, need_products=False):
        """Upload an image and return the search URL or the search result.

        :param filename: local path or http(s) URL of the image.
        :param need_products: when false, only the search-page URL is
            returned; when true, the URL is fetched as well.
        :return: ``""`` when the upload fails; otherwise the search-page URL
            (``need_products`` false), or the fetched data / ``None`` when
            fetching it fails (``need_products`` true).
        """
        # Upload the image file.
        status, key = self.upload_img(filename)
        if status != "succ":
            return ""

        # The upload succeeded: build the search URL from the returned key.
        url_res = f"https://s.1688.com/youyuan/index.htm?tab=imageSearch&imageAddress={key}&spm="
        if not need_products:
            return url_res

        status_desc, data = self.img_search(url_res)
        if status_desc == 'succ':
            return data
        return None

  • 1
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值