Python 实现京东单个商品信息抓取（标题、品牌（中/英文）、图片、型号、价格、详情）

需要安装的 Python 依赖包

pip install lxml
pip install requests
pip install urllib3

代码部分

#!/usr/bin/python3
## -*- coding: utf-8 -*-

import requests,re,urllib3,json,random,string
import lxml.etree as etree
# Silence urllib3's InsecureRequestWarning: the product-page and detail
# requests in this script are sent with verify=False, which would otherwise
# emit this warning on every call.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

class jdspider:
    """Scraper for a single JD.com product page.

    ``spider(sku)`` downloads the desktop product page, the description
    ("detail") JSONP endpoint and the mobile product page, and returns a plain
    dict with the title, brand (Chinese/English), model, gallery images,
    price, detail images and attribute table.

    All parsing is best effort: a field that cannot be located is returned as
    an empty string / list / dict instead of raising.  Network errors raised
    by ``requests`` are not caught.
    """

    # XPath template for the breadcrumb slot that may hold the brand.
    _BRAND_QUERY = "//*[@clstag='shangpin|keycount|product|mbNav-{}']//text()"
    # Breadcrumb locations tried, in order, for the model name.
    _MODEL_QUERIES = (
        "//*[@id='crumb-wrap']/div/div[1]/div[9]//text()",
        '//*[@id="crumb-wrap"]/div/div[1]/div[@class="item ellipsis"]//text()',
    )
    _MAIN_SKU = re.compile(r"mainSkuId:\s*['\"]?(\d+)")
    _PRICE_KEY = re.compile(r'"price":\s*')
    _DETAIL_IMG = re.compile(r'(//img.*?(\.jpg|\.png|\.jpeg|\.gif))')
    # ASCII and fullwidth colon both separate "name:value" in the parameter list.
    _COLON = re.compile(r"[:：]")
    # Characters removed from the model text (whitespace and both paren styles).
    _MODEL_NOISE = str.maketrans("", "", " \r\n()（）")

    def __init__(self, headers=None, proxies=None, timeout=10):
        """Create a spider.

        Args:
            headers: optional request headers; a browser-like default set is
                used when omitted or empty.
            proxies: optional ``requests``-style proxies mapping, e.g.
                ``{"http": "http://user:pwd@host:port/", "https": ...}``.
                Defaults to ``None`` (direct connection).
            timeout: per-request timeout in seconds passed to ``requests``.
        """
        # NOTE(review): the default cookie is a captured browser session and
        # has probably expired; pass your own ``headers`` for real use.
        self.heads = headers if headers else {
            'authority': 'item.jd.com',
            'method': 'GET',
            'scheme': 'https',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'cookie': '__jdu=1675665312520306609640; areaId=1; PCSYCityID=CN_110000_110100_0; shshshfpa=6f8ad375-c871-c406-2bc6-5357bb27518c-1675665315; shshshfpb=hkXyP4dFGBP_nNUpKxHJNTw; ipLoc-djd=1-2801-54766-0; mt_xid=V2_52007VwMVVFlfVV4dSRpYBmYBE1VaWVpdGkgpDFcyBUFbXg1OCB5LGkAAb1YaTlRcUQoDQBxaVmRUE1BcX1ZZL0oYXwd7AhFOXF5DWhtCGl4OZAUiUG1YYl4dShlfDWUBF1VtXVNTGQ==; unpl=JF8EAKlnNSttURxdARoFThUWQlwDW11aHEcDbGIGUg1RSlwGGQZJRhl7XlVdXxRKFB9uYRRUWlNLUQ4fACsSEXteXV5tC0oXBW5uBV1cWUtkNRgCKxsgS1pSWloPTxUDbGMGVV9ZTFMEEwUeEyBKbVNuXg9IEgRuYAZdWFp7ZAUfBRwTE3tcZF9tSh9LAWdvB1xeFUtTAx8FHBYSS15QXVwKShAEbm8CUVxoSmQG; __jdv=122270672|norefer|t_281_20170818001|cpc|_0_8f8506d6780f41cfa12427a809235ce8|1675757696357; TrackID=1RLmikjo_ekVYZD2O9X00Vuq2nsUtExey0-QYtYUAfXIF_BEx7wN3s4YSv8PpFiUrpofG1yKxVKBOm9kGx-cVVzRmfOQBtB8Pko22itVG6Ss; pinId=57kC0TMY_imlDxsL1hZtMbV9-x-f3wj7; pin=jd_6690f5fcff316; unick=jd_130011zjj; _tp=GddPg2iKimJO1ghABMLB+Fpac9UhEdJvtWgGtDwUWd4=; _pst=jd_6690f5fcff316; user-key=5728a71d-7d12-4f11-95a4-a624b0f67983; shshshfpx=6f8ad375-c871-c406-2bc6-5357bb27518c-1675665315; __jdc=122270672; shshshfp=a7f104aa8822ce82a1956a5a34c1bef3; ip_cityCode=2802; jsavif=1; jsavif=1; __jda=122270672.1675665312520306609640.1675665313.1675828511.1675834848.12; 3AB9D23F7A4B3C9B=GJNZ7FN5IDR24R66U6TOLDWZGASH24KNCSCVI3OH3JYU4JYIL3BUJYCUEZZLDSRWCOQPYRN4OZ2LWGE6FXIY2VEX3M',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36',
        }
        self.goods_url = "https://item.jd.com/{}.html"
        self.detail_url = "https://cd.jd.com/description/channel?skuId={}&mainSkuId={}&charset=utf-8&cdn=2&callback=showdesc"
        # Proxy tunnel address kept for reference; it is only used when the
        # caller builds a ``proxies`` mapping from it and passes that in.
        self.tunnel = "tps124.kdlapi.com:15818"
        self.proxies = proxies
        self.timeout = timeout

    def getprice(self, jd_sn):
        """Fetch the mobile product page and return its price block.

        Returns:
            ``{"m": ..., "p": ..., "op": ...}`` copied from the page's
            ``"price"`` JSON object, or ``{}`` when no such object is found.
        """
        res = requests.get(
            "https://item.m.jd.com/product/{}.html".format(jd_sn),
            headers=self.heads,
            proxies=self.proxies,
            timeout=self.timeout,
        )
        return self._parse_price(res.text)

    def downloadimg(self, img_url):
        """Normalise an image URL (nothing is actually downloaded).

        Adds an ``http:`` scheme when the URL has none (e.g. protocol-relative
        ``//img...``) and removes the ``.avif`` marker so the URL points at the
        original jpg/png resource.
        """
        if not img_url.startswith(("http://", "https://")):
            img_url = "http:" + img_url
        return img_url.replace(".avif", "")

    def spider(self, sku):
        """Scrape one product and return its data as a dict.

        Args:
            sku: JD product id, as used in ``https://item.jd.com/<sku>.html``.

        Returns:
            dict with keys ``goods_name``, ``ware``, ``p_link``, ``jd_sn``,
            ``en_brand``, ``zh_brand``, ``model``, ``image_url``, ``price``,
            ``goods_content`` and ``attrs``.
        """
        # verify=False mirrors the original behaviour (TLS verification off).
        page = requests.get(
            self.goods_url.format(sku),
            headers=self.heads,
            verify=False,
            proxies=self.proxies,
            timeout=self.timeout,
        )
        html = page.text
        # The description endpoint needs the main SKU; fall back to the SKU
        # itself when the page does not expose it (e.g. an anti-bot page).
        found = self._MAIN_SKU.search(html)
        main_sku = found.group(1) if found else sku
        detail = requests.get(
            self.detail_url.format(sku, main_sku),
            headers=self.heads,
            verify=False,
            proxies=self.proxies,
            timeout=self.timeout,
        )
        tree = etree.HTML(html)

        goods = dict()
        goods['goods_name'] = self._last_nonblank(
            tree.xpath("/html/body/div[6]/div/div[2]/div[1]//text()"))
        gallery = tree.xpath("//*[@id='spec-list']/ul/li/img/@src")
        # Package contents ("ware") text, empty string when absent.
        ware = tree.xpath("//*[@id='detail']/div[2]/div[2]/div[2]/p//text()")
        goods['ware'] = ware[0] if ware else ""

        zh_brand, en_brand = self._split_brand(self._find_brand(tree))
        goods["p_link"] = "https://item.jd.com/%s.html" % sku
        goods["jd_sn"] = sku
        goods['en_brand'] = en_brand
        goods['zh_brand'] = zh_brand
        goods["model"] = self._extract_model(tree, zh_brand, en_brand)

        attrs_dict = self._parse_parameters(tree.xpath(
            '//*[@id="detail"]/div[2]/div[1]/div[1]/ul[@class="parameter2 p-parameter-list"]/li//text()'))
        attrs_dict.update(self._parse_spec_tables(tree))

        goods['image_url'] = [self._gallery_url(src) for src in gallery]
        goods['price'] = self.getprice(sku)
        detail_text = detail.content.decode("utf-8", errors="replace")
        goods['goods_content'] = [
            self.downloadimg(url) for url in self._parse_detail_images(detail_text)
        ]
        goods['attrs'] = attrs_dict
        return goods

    @classmethod
    def _parse_price(cls, html):
        """Return the m/p/op prices from the first usable ``"price":{...}`` object."""
        decoder = json.JSONDecoder()
        for hit in cls._PRICE_KEY.finditer(html):
            try:
                # raw_decode reads exactly one JSON value, so trailing page
                # content cannot corrupt the capture.
                data, _ = decoder.raw_decode(html, hit.end())
                return {"m": data["m"], "p": data["p"], "op": data["op"]}
            except (ValueError, KeyError, TypeError, IndexError):
                continue
        return {}

    @staticmethod
    def _last_nonblank(texts):
        """Return the last non-blank text node, stripped, or ``""``."""
        for text in reversed(texts):
            if text.strip():
                return text.strip()
        return ""

    @classmethod
    def _find_brand(cls, tree):
        """Return the brand text from the deepest populated breadcrumb slot."""
        # The brand sits in mbNav-5 for fully categorised goods and in a
        # lower slot when the category path is shorter.
        for level in range(5, -1, -1):
            texts = [t.strip() for t in tree.xpath(cls._BRAND_QUERY.format(level)) if t.strip()]
            if texts:
                return texts[0].replace(" ", "")
        return ""

    @staticmethod
    def _split_brand(brand):
        """Split ``"中文(EN)"`` into ``(zh_brand, en_brand)``; missing parts are ``""``."""
        if not brand:
            return "", ""
        parts = brand.replace("（", "(").replace("）", ")").split("(")
        zh_brand = parts[0].replace(" ", "")
        en_brand = parts[1].replace(")", "").replace(" ", "") if len(parts) > 1 else ""
        return zh_brand, en_brand

    @classmethod
    def _extract_model(cls, tree, zh_brand, en_brand):
        """Return the model name from the breadcrumb with brand names removed."""
        for query in cls._MODEL_QUERIES:
            texts = tree.xpath(query)
            if not texts:
                continue
            model = texts[0]
            for brand in (zh_brand, en_brand):
                if brand:
                    model = model.replace(brand, "")
            model = model.translate(cls._MODEL_NOISE)
            if model:
                return model
        return ""

    @classmethod
    def _parse_parameters(cls, texts):
        """Turn ``"name:value"`` text nodes into a dict, skipping SKU/shop rows."""
        attrs = dict()
        for text in texts:
            pieces = cls._COLON.split(text, maxsplit=1)
            if len(pieces) < 2:
                continue
            name = pieces[0].strip()
            if "商品编号" in name or "店铺" in name:
                continue
            attrs[name] = pieces[1].strip()
        return attrs

    @staticmethod
    def _parse_spec_tables(tree):
        """Collect name/value pairs from the ``Ptable-item`` specification tables."""
        attrs = dict()
        for table in tree.xpath("//*[@id='detail']/div/div/div/div[@class='Ptable-item']"):
            for row in table.xpath('dl/dl[@class="clearfix"]'):
                names = row.xpath("dt//text()")
                values = row.xpath("dd//text()")
                # Skip malformed rows instead of aborting the whole scrape.
                if names and values:
                    attrs[names[0].strip()] = values[-1].strip()
        return attrs

    def _gallery_url(self, src):
        """Convert a 54x54 thumbnail ``src`` into the 400x400 gallery image URL."""
        url = src.replace("s54x54", "s400x400").replace("n5", "sku")
        if not url.startswith(("http://", "https://")):
            url = "https:" + url
        return self.downloadimg(url)

    @staticmethod
    def _parse_jsonp(text):
        """Decode a JSON or JSONP (``callback({...})``) payload; ``None`` on failure."""
        candidate = text.strip()
        if not candidate.startswith(("{", "[")):
            start = candidate.find("(")
            end = candidate.rfind(")")
            if 0 <= start < end:
                candidate = candidate[start + 1:end]
        try:
            return json.loads(candidate)
        except ValueError:
            return None

    @classmethod
    def _parse_detail_images(cls, text):
        """Return protocol-relative image URLs found in the description payload."""
        # SECURITY: the original code ran eval() on this remote response;
        # it is parsed as JSON here so the server cannot execute code.
        payload = cls._parse_jsonp(text)
        content = payload.get("content") if isinstance(payload, dict) else None
        # Fall back to scanning the raw response when it is not valid JSONP.
        haystack = content if isinstance(content, str) else text
        return [match[0] for match in cls._DETAIL_IMG.findall(haystack)]

if __name__ == '__main__':
    # Demo run: scrape one product and print the result as JSON.
    jd_spider = jdspider()
    goods = jd_spider.spider("8163617")
    # ensure_ascii=False keeps the Chinese text readable instead of
    # emitting \uXXXX escapes for every character.
    print(json.dumps(goods, ensure_ascii=False))

返回数据:

{'goods_name': '福临门  面粉 麦芯通用小麦粉 中筋粉 馒头、包子、烙饼等各类面食 中粮出品 十斤 5kg(新老包装随机发货)', 'ware': '面粉*1', 'p_link': 'https://item.jd.com/8163617.html', 'jd_sn': '8163617', 'en_brand': '', 'zh_brand': '福临门', 'model': '面粉', 'image_url': ['https://img12.360buyimg.com/sku/jfs/t1/130368/33/29760/77107/63367b89Edeba674b/3f394e7176f87f80.jpg', 'https://img12.360buyimg.com/sku/jfs/t1/184788/40/11977/83962/60dae86eE53d5f781/395d705d2a05a369.jpg', 'https://img12.360buyimg.com/sku/jfs/t1/133680/36/3344/133873/5efb2132E6f5f3907/da42bef991bb8137.jpg', 'https://img12.360buyimg.com/sku/jfs/t1/130722/39/3309/191088/5efb214dE798ecf85/c996260e88fbcf86.jpg', 'https://img12.360buyimg.com/sku/jfs/t1/30944/25/12069/158605/5cb6d866Eb8223b63/39014969736ab6ec.jpg', 'https://img12.360buyimg.com/sku/jfs/t1/30568/24/12098/206240/5cb6d866E52af5b76/0c87d72d3e6045d7.jpg', 'https://img12.360buyimg.com/sku/jfs/t1/36030/2/918/137565/5cb6d866E081adb5c/3d9b1327927033bd.jpg'], 'price': {'m': '40.00', 'p': '25.90', 'op': '29.90'}, 'goods_content': ['http://img30.360buyimg.com/sku/jfs/t1/124486/31/19486/271554/60b44bd8E3af704d2/779209bd54d08613.jpg', 'http://img30.360buyimg.com/sku/jfs/t1/120024/12/19494/190137/60b44bd8Eec546fe5/7ba5709ab0700d78.jpg', 'http://img30.360buyimg.com/sku/jfs/t1/119935/9/20140/284780/60b44bd8E0a0e3d04/897bdf1e780549b5.jpg', 'http://img30.360buyimg.com/sku/jfs/t1/173493/33/12253/280326/60b44bd8E41102deb/cbf47d1715258209.jpg', 'http://img30.360buyimg.com/sku/jfs/t1/183467/37/6629/284945/60b44bd8Ec3aca21b/ac2d45d6dbd3ef72.jpg', 'http://img30.360buyimg.com/sku/jfs/t1/172257/5/12271/280601/60b44bd8Ec214616c/aac314443a965381.jpg', 'http://img30.360buyimg.com/sku/jfs/t1/128236/26/19102/282816/60b44bd8E8509e542/167c154edc3599ab.jpg', 'http://img30.360buyimg.com/sku/jfs/t1/179134/19/6490/279496/60b44bd8E403a3561/fdb2e464c68f0d2f.jpg', 'http://img30.360buyimg.com/sku/jfs/t1/183555/35/6547/283235/60b44bd8Eaea13877/2e9729a6e29d53ef.jpg', 
'http://img30.360buyimg.com/sku/jfs/t1/195060/35/5592/286560/60b44bd8E5274b157/080d8e2295ff8eb7.jpg', 'http://img30.360buyimg.com/sku/jfs/t1/176982/32/6688/285309/60b44bd8Ed75a7b10/c2ba32a5d76575f8.jpg', 'http://img30.360buyimg.com/sku/jfs/t1/180719/6/6666/191040/60b44bd8E5744e5df/f687e8de0c20e1b0.jpg', 'http://img30.360buyimg.com/sku/jfs/t1/110975/11/15228/278363/60b44bd8E94621ebe/49dad409ccb0ff24.jpg'], 'attrs': {'商品名称': '福临门面粉', '商品毛重': '5.1kg', '商品产地': '河南濮阳', '包装形式': '袋装', '类别': '麦芯粉', '净含量': '5000g', '保质期': '12个月', '生产许可证号': 'SC10141090200018', '产品标准号': 'GB/T 1355'}}
评论 5
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值