import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
# Browser-like request headers for the JD homepage (www.jd.com).
# The Referer and User-Agent mimic a real Chrome browser to reduce the
# chance of the request being rejected as a bot.
headers_index = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
'HOST': 'www.jd.com',
"Accept-Language": "zh-CN,zh;q=0.9",
'Referer': 'https://www.jd.com/',
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
}
# Request headers for the product-listing host (list.jd.com).
# The Referer value is overwritten per page in on_health_product_page.
headers_product = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
'HOST': 'list.jd.com',
"Accept-Language": "zh-CN,zh;q=0.9",
'Referer': 'https://list.jd.com/list.html?cat=9192%2C12632%2C12634&s=1&click=0&page=1',
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
}
# CSS selector for the category links on the homepage menu
category_css = '.cate_menu_lk'
# Page that hosts the category menu (JD homepage)
category_url = 'https://www.jd.com/'
# Paginated health-product listing URL; the page number is appended at the end
health_product_url = 'https://list.jd.com/listNew.php?cat=9192%2C12632%2C12634&click=0&page='
# Accumulator for scraped product rows: [title, price, shop, comments, image]
products = []
# Fetch the JD homepage (entry point for category scraping)
def crawl_category(content_callback):
    """Fetch the JD homepage and pass its raw HTML to *content_callback*.

    Bug fix: the headers dict was previously passed positionally, which
    ``requests.get`` binds to the ``params`` (query-string) argument — so
    the custom headers were never actually sent with the request.
    """
    resp = requests.get(category_url, headers=headers_index)
    content_callback(resp.text)
# Save the link and name of each category found on the homepage
def save_category(text):
    """Parse homepage HTML and write category links to ./category_val.txt.

    Each output line is ``<href> ,<category name>`` terminated by CRLF.

    Fixes: the file is now opened with an explicit UTF-8 encoding (the
    category names are Chinese, and the platform default encoding may not
    be able to represent them), and ``newline=''`` disables text-mode
    newline translation so the literal ``\\r\\n`` is not expanded to
    ``\\r\\r\\n`` on Windows.
    """
    soup = BeautifulSoup(text, 'html.parser')
    body = soup.select(category_css)
    with open('./category_val.txt', 'w', encoding='utf-8', newline='') as f:
        for a in body:
            f.write(a.attrs['href'] + " ," + a.text + '\r\n')
# Crawl the paginated health-product listing and export the results
def on_health_product_page(save_product_content):
    """Crawl 20 pages of the health-product listing and export to Excel.

    Each page's HTML is handed to *save_product_content*, which appends
    rows to the module-level ``products`` list; the accumulated rows are
    then written to ./goodsList.xlsx.
    """
    # range(1, 21) covers pages 1..20 inclusive; the original
    # range(1, 20) stopped one page short of the intended 20.
    for page in range(1, 21):
        url = health_product_url + str(page)
        # Point the Referer at the page itself so requests look browser-like.
        headers_product['Referer'] = url
        resp = requests.get(url, headers=headers_product)
        save_product_content(resp.text)
    df = pd.DataFrame(columns=['title', 'price', '商店名字', '评论数', '首图'], data=products)
    # The ``encoding`` kwarg was deprecated in pandas 1.5 and removed in
    # 2.0; xlsx is Unicode-native, so it was never needed here.
    df.to_excel('./goodsList.xlsx', index=False)
# Parse and accumulate product info from one listing page
def save_product_content(txt):
    """Extract product fields from one listing page into ``products``.

    Each appended row is ``[title, price, shop name, comment count,
    first image URL]``, matching the DataFrame columns used by the caller.
    """
    soup = BeautifulSoup(txt, 'html.parser')
    product_list = soup.select('.gl-i-wrap')
    for product in product_list:
        p_price = product.select_one('.p-price').text.replace('¥', '').strip()
        # Listing images are lazy-loaded, so the real URL lives in the
        # data-lazy-img attribute rather than src.
        p_img = product.select_one('.p-img img').attrs['data-lazy-img']
        p_shop = product.select_one('.p-shop a')
        shop_name = '无'  # placeholder ("none") when no shop link is present
        if p_shop is not None:
            shop_name = p_shop.attrs['title']
        p_comment = product.select_one('div.p-commit').text.strip()
        p_title = product.select_one('.p-name em').text.strip()
        # Fixed a stray 's' that was prefixed to the shop-name label.
        print("title: {}, price {}, 商店名字{} 评论数{} 首图{}".format(p_title, p_price, shop_name, p_comment, p_img))
        products.append([p_title, p_price, shop_name, p_comment, p_img])
# print(df)
# crawl_category(save_category)  # uncomment to refresh the category list
# Script entry point: crawl the listing pages and export products to Excel.
on_health_product_page(save_product_content)