利用关键词爬取京东平台商品信息

import requests
from bs4 import BeautifulSoup
import csv
import time


# Seconds to wait for any single HTTP response before giving up; without a
# timeout, requests may block indefinitely on a stalled connection.
_REQUEST_TIMEOUT_SECONDS = 10


def _comment_count_as_int(raw_count):
    """Return *raw_count* as an int for sorting, or 0 if it is not a plain integer."""
    try:
        return int(raw_count)
    except (TypeError, ValueError):
        # Covers the "N/A" placeholder as well as any non-numeric scraped text.
        return 0


def _parse_product_details(details_soup):
    """Return (product_id, comment_count, store_name) read from a detail page.

    Any field whose markup is missing or not in the expected shape is
    reported as the string "N/A".
    """
    try:
        first_parameter = details_soup.find(
            "ul", {"class": "parameter2 p-parameter-list"}
        ).find("li")
        # The value is taken as the text after the first ":" of the first <li>.
        product_id = first_parameter.text.split(":")[1].strip()
    except (AttributeError, IndexError):
        # AttributeError: element not found; IndexError: no ":" in the text.
        product_id = "N/A"

    try:
        # The count is taken as the second whitespace-separated token.
        comment_count = details_soup.select_one(".comment-count").text.strip().split()[1]
    except (AttributeError, IndexError):
        comment_count = "N/A"

    try:
        store_name = details_soup.select_one(".name a").text.strip()
    except AttributeError:
        store_name = "N/A"

    return product_id, comment_count, store_name


def get_jd_products(keyword, max_page=1, top_n=50):
    """Search JD for *keyword* and return the most-commented products found.

    For each search result page from 1 to *max_page*, every ``li.gl-item``
    entry is read for its name, price and link, and the linked detail page is
    fetched to read the product id, comment count and store name.

    Args:
        keyword: Search term; it is URL-encoded by requests.
        max_page: Number of search result pages to crawl. Values below 1
            crawl nothing and yield an empty list.
        top_n: Maximum number of products to return (default 50). ``None``
            returns every product collected.

    Returns:
        A list of dicts with the keys ``name``, ``price``, ``product_id``,
        ``link``, ``comment_count`` and ``store_name``, sorted by comment
        count in descending order. Products whose comment count is missing or
        not a plain integer sort as 0.

    Raises:
        requests.RequestException: If a search result page cannot be fetched.
            A failing detail page does not raise; its detail fields become
            "N/A" instead.
    """
    products = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'}

    for page in range(1, max_page + 1):
        # Passing the query via ``params`` escapes characters such as "&",
        # "#" or spaces that would otherwise corrupt a hand-built URL.
        search_response = requests.get(
            "https://search.jd.com/Search",
            params={"keyword": keyword, "enc": "utf-8", "page": page},
            headers=headers,
            timeout=_REQUEST_TIMEOUT_SECONDS,
        )
        soup = BeautifulSoup(search_response.text, "html.parser")

        for item in soup.select("li.gl-item"):
            name_tag = item.select_one("div.p-name a em")
            price_tag = item.select_one("div.p-price i")
            link_tag = item.select_one("div.p-name a")
            href = link_tag.get("href") if link_tag is not None else None
            if name_tag is None or price_tag is None or not href:
                # Skip entries lacking the expected markup instead of
                # aborting the whole crawl on a single odd item.
                continue

            name = name_tag.text.strip()
            price = price_tag.text.strip()
            href = href.strip()
            # Only add a scheme when the href does not already carry one.
            if href.startswith(("http://", "https://")):
                link = href
            else:
                link = 'https:' + href

            try:
                detail_html = requests.get(
                    link, headers=headers, timeout=_REQUEST_TIMEOUT_SECONDS
                ).text
            except requests.RequestException:
                # Best effort: keep the listing data, mark details as "N/A".
                detail_html = ""
            details_soup = BeautifulSoup(detail_html, "html.parser")
            product_id, comment_count, store_name = _parse_product_details(details_soup)

            products.append({"name": name, "price": price, "product_id": product_id, "link": link, "comment_count": comment_count, "store_name": store_name})

        # Pause between result pages; no need to wait after the last one.
        if page < max_page:
            time.sleep(5)

    # Sort by comment count, highest first, and keep only the leading top_n.
    sorted_products = sorted(
        products,
        key=lambda p: _comment_count_as_int(p["comment_count"]),
        reverse=True,
    )
    return sorted_products[:top_n]


def save_products_to_csv(products, filename):
    """Write *products* to *filename* as a UTF-8 CSV file with a header row.

    Each product is a mapping with the keys ``name``, ``price``,
    ``product_id``, ``link``, ``comment_count`` and ``store_name``; one CSV
    row is written per product, in that column order. An existing file is
    overwritten, and a confirmation message is printed once writing is done.
    """
    header = ["名称", "价格", "货号", "链接", "评论数", "店铺名称"]
    columns = ("name", "price", "product_id", "link", "comment_count", "store_name")

    # newline='' leaves line-ending handling to the csv module.
    with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
        out = csv.writer(csv_file)
        out.writerow(header)
        out.writerows([entry[column] for column in columns] for entry in products)
    print(f"商品信息已保存至{filename}!")


if __name__ == "__main__":
    # Script entry point: crawl the first ten result pages for a fixed search
    # term and save the returned products to a CSV file named after the
    # search term and page count.
    keyword, max_page = "清风原木", 10
    filename = f"{keyword}_{max_page}页_评论数前50.csv"
    products = get_jd_products(keyword, max_page=max_page)
    save_products_to_csv(products, filename)

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值