Python crawler: scraping JD.com product information

This post walks through a small Python web-scraping script: it sets request headers, fetches JD.com's category page, iterates over multiple pages of product listings, extracts each product's title, price, shop name, comment count, and lead image, and saves the results to an Excel file.

import requests
from bs4 import BeautifulSoup
import pandas as pd

headers_index = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    "Host": "www.jd.com",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Referer": "https://www.jd.com/",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
}

headers_product = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    "Host": "list.jd.com",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Referer": "https://list.jd.com/list.html?cat=9192%2C12632%2C12634&s=1&click=0&page=1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
}

# CSS selector for the category links on the home page
category_css = '.cate_menu_lk'

# Page that lists the product categories
category_url = 'https://www.jd.com/'

# Listing URL for health products; the page number is appended at the end
health_product_url = 'https://list.jd.com/listNew.php?cat=9192%2C12632%2C12634&click=0&page='

products = []

# Fetch the home page and hand the HTML to a callback
def crawl_category(content_callback):
    resp = requests.get(category_url, headers=headers_index)
    content_callback(resp.text)

# Save the category names and links found on the home page
def save_category(text):
    soup = BeautifulSoup(text, 'html.parser')
    body = soup.select(category_css)
    with open('./category_val.txt', 'w', encoding='utf-8') as f:
        for a in body:
            f.write(a.attrs['href'] + " ," + a.text + '\r\n')

# Crawl 20 pages of health-product listings, then export everything to Excel
def on_health_product_page(save_product_content):
    for i in range(1, 21):
        url = health_product_url + str(i)
        # Use the current listing page as the Referer for the next request
        headers_product['Referer'] = url
        resp = requests.get(url, headers=headers_product)
        save_product_content(resp.text)
    df = pd.DataFrame(columns=['title', 'price', 'shop', 'comments', 'image'], data=products)
    df.to_excel('./goodsList.xlsx', index=False)

# Parse one listing page and collect the product fields
def save_product_content(txt):
    soup = BeautifulSoup(txt, 'html.parser')
    product_list = soup.select('.gl-i-wrap')
    for product in product_list:
        p_price = product.select_one('.p-price').text.replace('¥', '').strip()
        p_img = product.select_one('.p-img img').attrs['data-lazy-img']
        p_shop = product.select_one('.p-shop a')
        shop_name = 'N/A'
        if p_shop is not None:
            shop_name = p_shop.attrs['title']
        p_comment = product.select_one('div.p-commit').text.strip()
        p_title = product.select_one('.p-name em').text.strip()
        print("title: {}, price: {}, shop: {}, comments: {}, image: {}".format(
            p_title, p_price, shop_name, p_comment, p_img))
        products.append([p_title, p_price, shop_name, p_comment, p_img])

# crawl_category(save_category)  # run once to export the home-page category list
on_health_product_page(save_product_content)
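
After the run it is worth sanity-checking the exported spreadsheet. The snippet below is a minimal sketch that only assumes the file name and column names used in the script above (reading .xlsx files with pandas requires the openpyxl package to be installed):

import pandas as pd

# Read the spreadsheet written by on_health_product_page()
df = pd.read_excel('./goodsList.xlsx')

# Rough check: 20 listing pages were crawled, so expect a few hundred rows
print(df.shape)
print(df.head())

# Prices were scraped as text; converting them to numbers flags rows
# where the selector picked up unexpected text
df['price'] = pd.to_numeric(df['price'], errors='coerce')
print(df['price'].describe())

Rows whose price comes back as NaN after the conversion are the ones to inspect first, since they usually mean the listing layout differed from what the CSS selectors expect.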
