简易的京东商品信息爬取_爬取京东的商品信息-CSDN博客

本文链接：https://blog.csdn.net/m0_73831440/article/details/138726237

最近做项目缺少数据，于是随手写了一个爬虫，不确定是否好用，拿出来分享一下，希望各位大佬多多指导。商品评论数量（QuantitySold）这个字段不知道为什么始终取到空值，所以我在代码里暂时用随机数代替。

import time
import random

import requests
from lxml import etree
import pymysql
from DBUtils.PooledDB import PooledDB

# Module-level MySQL connection pool shared by every insert in this script.
db_pool = PooledDB(
    creator=pymysql,  # use PyMySQL as the underlying database driver
    # --- connection settings ---
    host="localhost",
    user="root",
    password="",
    database="shopping",
    charset="utf8mb4",
    # --- pool sizing ---
    mincached=1,  # minimum number of idle connections
    maxcached=20,  # maximum number of idle connections
    maxconnections=20,  # maximum number of connections
)

def insert_data(data):
    """Insert one product row into the ``products`` table.

    Args:
        data: Sequence of seven values, in the column order
            ``(shopname, name, type, QuantitySold, price, description, images)``.

    Any exception raised while executing or committing the statement is
    printed and the transaction is rolled back; it is not re-raised.
    """
    conn = db_pool.connection()
    try:
        # Acquire the cursor inside the outer try so the connection is still
        # closed if cursor creation itself fails.
        cursor = conn.cursor()
        try:
            cursor.execute("INSERT INTO products (shopname, name, type, QuantitySold, price, description, images) VALUES (%s, %s, %s, %s, %s, %s, %s)", data)
            conn.commit()
        except Exception as e:
            print("Error inserting data:", e)
            # Undo the partial transaction.
            conn.rollback()
        finally:
            cursor.close()
    finally:
        conn.close()


# HTTP headers sent with every request made by this script. Both values are
# blank here; presumably they are meant to be filled in with a real browser
# Cookie and User-Agent before running.
head = {"Cookie": "", "User-Agent": ""}

def _with_scheme(url):
    """Return *url* with ``https:`` prepended when it is protocol-relative."""
    return "https:" + url if url.startswith("//") else url


def main(shoptitle, page):
    """Scrape one JD search-result page and hand each product to ``paqu``.

    Args:
        shoptitle: Search keyword; also passed on as the product type.
        page: Value sent as the ``page`` query parameter.

    Products for which any of shop name, title, image, price or detail link
    is missing from the listing are skipped.
    """
    # Let requests build and percent-encode the query string. The earlier
    # f-string used JavaScript-style ``${...}`` placeholders, which put a
    # literal "$" in front of the keyword and the page number.
    query = {
        "keyword": shoptitle,
        "wq": shoptitle,
        "pvid": "8858151673f941e9b1a4d2c7214b2b52",
        "isList": 0,
        "page": page,
    }
    resp = requests.get(
        "https://search.jd.com/Search",
        params=query,
        headers=head,
        timeout=10,  # do not hang forever on a stalled connection
    )
    resp.encoding = "utf-8"
    et = etree.HTML(resp.text)
    if et is None:
        # Nothing parseable came back, so there is nothing to crawl.
        return

    # Extract the fields per <li> so that values from different products can
    # never be paired together when one product lacks a field.
    wrap = "./div[@class='gl-i-wrap']"
    products = []
    for li in et.xpath("//*[@id='J_goodsList']/ul/li"):
        shop = li.xpath("./div/div/span/a/text()")
        title = li.xpath(wrap + "/div[@class='p-img']/a/@title")
        img = li.xpath(wrap + "/div[@class='p-img']/a/img/@data-lazy-img")
        price = li.xpath(wrap + "/div[@class='p-price']/strong/i/text()")
        link = li.xpath(wrap + "/div[@class='p-img']/a/@href")
        if not (shop and title and img and price and link):
            continue
        products.append(
            (_with_scheme(link[0]), shop[0], title[0], price[0], _with_scheme(img[0]))
        )

    time.sleep(1)
    for detail_url, shop_name, product_title, product_price, image_url in products:
        # The author reports the comment count always came back empty, so a
        # random number is stored as a placeholder for QuantitySold.
        random_number = random.randint(100, 500000)
        paqu(detail_url, shop_name, product_title, shoptitle, random_number,
             product_price, image_url)

def paqu(Url, shoppingName, title, shoptitle, random_number, price, img):
    """Fetch a product detail page and store the product in the database.

    Args:
        Url: Address of the product detail page.
        shoppingName: Shop name.
        title: Product title.
        shoptitle: Search keyword, stored as the product type.
        random_number: Value stored in the ``QuantitySold`` column.
        price: Product price text.
        img: Product image URL.
    """
    page = requests.get(Url, headers=head)
    page.encoding = "utf-8"
    tree = etree.HTML(page.text)
    attributes = tree.xpath("//*[@id='detail']/div[2]/div[1]/div[1]/ul[3]/li/text()")
    # Each attribute is followed by a comma, including the last one.
    description = "".join(f"{text}," for text in attributes)
    print(img)
    insert_data(
        (shoppingName, title, shoptitle, random_number, price, description, img)
    )


shoptitle = ""
if __name__ == "__main__":
    shoptitle = input("请输入你要爬取的商品类型")
    # Crawl 20 result pages, numbered 2 through 21; change the range to
    # crawl a different number of pages.
    for page_no in range(2, 22):
        main(shoptitle, page_no)