import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
# Browser-like request headers for the JD homepage (www.jd.com).
# The Referer and User-Agent mimic a real Chrome browser to reduce the
# chance of the request being rejected as a bot.
headers_index = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
'HOST': 'www.jd.com',
"Accept-Language": "zh-CN,zh;q=0.9",
'Referer': 'https://www.jd.com/',
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
}
# Request headers for the product-listing host (list.jd.com).
# The Referer value is overwritten per page in on_health_product_page.
headers_product = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
'HOST': 'list.jd.com',
"Accept-Language": "zh-CN,zh;q=0.9",
'Referer': 'https://list.jd.com/list.html?cat=9192%2C12632%2C12634&s=1&click=0&page=1',
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
}
# CSS selector for the category links on the homepage menu
category_css = '.cate_menu_lk'
# Page that hosts the category menu (JD homepage)
category_url = 'https://www.jd.com/'
# Paginated health-product listing URL; the page number is appended at the end
health_product_url = 'https://list.jd.com/listNew.php?cat=9192%2C12632%2C12634&click=0&page='
# Accumulator for scraped product rows: [title, price, shop, comments, image]
products = []
# Fetch the JD homepage (entry point for category scraping)
def crawl_category(content_callback):
    """Fetch the JD homepage and pass its raw HTML to *content_callback*.

    Bug fix: the headers dict was previously passed positionally, which
    ``requests.get`` binds to the ``params`` (query-string) argument — so
    the custom headers were never actually sent with the request.
    """
    resp = requests.get(category_url, headers=headers_index)
    content_callback(resp.text)
# Save the link and name of each category found on the homepage
def save_category(text):
    """Parse homepage HTML and write category links to ./category_val.txt.

    Each output line is ``<href> ,<category name>`` terminated by CRLF.

    Fixes: the file is now opened with an explicit UTF-8 encoding (the
    category names are Chinese, and the platform default encoding may not
    be able to represent them), and ``newline=''`` disables text-mode
    newline translation so the literal ``\\r\\n`` is not expanded to
    ``\\r\\r\\n`` on Windows.
    """
    soup = BeautifulSoup(text, 'html.parser')
    body = soup.select(category_css)
    with open('./category_val.txt', 'w', encoding='utf-8', newline='') as f:
        for a in body:
            f.write(a.attrs['href'] + " ," + a.text + '\r\n')
# Crawl the paginated health-product listing and export the results
def on_health_product_page(save_product_content):
    """Crawl 20 pages of the health-product listing and export to Excel.

    Each page's HTML is handed to *save_product_content*, which appends
    rows to the module-level ``products`` list; the accumulated rows are
    then written to ./goodsList.xlsx.
    """
    # range(1, 21) covers pages 1..20 inclusive; the original
    # range(1, 20) stopped one page short of the intended 20.
    for page in range(1, 21):
        url = health_product_url + str(page)
        # Point the Referer at the page itself so requests look browser-like.
        headers_product['Referer'] = url
        resp = requests.get(url, headers=headers_product)
        save_product_content(resp.text)
    df = pd.DataFrame(columns=['title', 'price', '商店名字', '评论数', '首图'], data=products)
    # The ``encoding`` kwarg was deprecated in pandas 1.5 and removed in
    # 2.0; xlsx is Unicode-native, so it was never needed here.
    df.to_excel('./goodsList.xlsx', index=False)
# Parse and accumulate product info from one listing page
def save_product_content(txt):
    """Extract product fields from one listing page into ``products``.

    Each appended row is ``[title, price, shop name, comment count,
    first image URL]``, matching the DataFrame columns used by the caller.
    """
    soup = BeautifulSoup(txt, 'html.parser')
    product_list = soup.select('.gl-i-wrap')
    for product in product_list:
        p_price = product.select_one('.p-price').text.replace('¥', '').strip()
        # Listing images are lazy-loaded, so the real URL lives in the
        # data-lazy-img attribute rather than src.
        p_img = product.select_one('.p-img img').attrs['data-lazy-img']
        p_shop = product.select_one('.p-shop a')
        shop_name = '无'  # placeholder ("none") when no shop link is present
        if p_shop is not None:
            shop_name = p_shop.attrs['title']
        p_comment = product.select_one('div.p-commit').text.strip()
        p_title = product.select_one('.p-name em').text.strip()
        # Fixed a stray 's' that was prefixed to the shop-name label.
        print("title: {}, price {}, 商店名字{} 评论数{} 首图{}".format(p_title, p_price, shop_name, p_comment, p_img))
        products.append([p_title, p_price, shop_name, p_comment, p_img])
# print(df)
# crawl_category(save_category)  # uncomment to refresh the category list
# Script entry point: crawl the listing pages and export products to Excel.
on_health_product_page(save_product_content)