python爬虫数据分析项目双十一_双十一商品优惠打折，用Python爬取商品信息分析发现这里套路真深...

最新推荐文章于 2023-09-27 00:08:10 发布

weixin_39627751

最新推荐文章于 2023-09-27 00:08:10 发布

阅读量368

点赞数

文章标签： python爬虫数据分析项目双十一

咱们今天就以京东笔记本电脑为例分析一下

首先咱们针对京东商城笔记本的网页进行分析，这回只要在网页源码上分析，就可以获取笔记本价格、标题、评论数、商家名称、商家性质。

想要学习Python？关注小编，私信【学习资料】，即可免费领取一整套系统的Python学习教程！

爬取代码

from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.support.ui import WebDriverWait

from selenium.common.exceptions import TimeoutException

from selenium.webdriver.common.by import By

from selenium import webdriver

from bs4 import BeautifulSoup

import pymongo

import time

# Open the MongoDB database and the collection that receives scraped products.
client = pymongo.MongoClient('localhost', 27017)
db = client['JD_products']
collection = db['products']

# Launch Chrome and create the shared explicit wait (50-second timeout).
browser = webdriver.Chrome()
wait = WebDriverWait(browser, timeout=50)

def to_mongodb(data):
    """Store one product document in the module-level MongoDB collection.

    Failures are reported on stdout instead of being raised, so a single
    rejected document (for example one whose ``_id`` already exists) does
    not stop the crawl.

    Args:
        data: dict describing one product; it is inserted as-is.
    """
    try:
        # ``Collection.insert`` is deprecated and was removed in PyMongo 4;
        # ``insert_one`` is the supported call for a single document.
        collection.insert_one(data)
    except pymongo.errors.PyMongoError as exc:
        # Only database errors are swallowed; KeyboardInterrupt and
        # programming errors still propagate.
        print('Insert The Data Failed', exc)
    else:
        print("Insert The Data Successfully")

def search():
    """Search JD for laptops, sort the results and return the page count.

    Opens the JD home page, types the keyword into the search box, clicks
    the first category filter and the second sort tab, then reads the total
    number of result pages from the pager. The whole sequence is restarted
    from the home page whenever one of the waits times out.

    Returns:
        str: text of the pager's total-page element.
    """
    while True:
        browser.get('https://www.jd.com/')
        try:
            # Locate the search box and the search button, type the keyword
            # and submit.
            search_boxes = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#key")))
            submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#search > div > div.form > button")))
            search_boxes[0].send_keys('笔记本')
            submit.click()

            # Click the laptop category filter, then the sort-by-sales tab.
            category_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#J_selector > div:nth-child(2) > div > div.sl-value > div.sl-v-list > ul > li:nth-child(1) > a")))
            category_button.click()
            sales_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#J_filter > div.f-line.top > div.f-sort > a:nth-child(2)")))
            sales_button.click()

            # Read the total number of pages from the pager.
            pages = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#J_bottomPage > span.p-skip > em:nth-child(1) > b')))
            return pages[0].text
        except TimeoutException:
            # Retry from the home page. Looping (rather than calling search()
            # again and dropping its result) guarantees that the page count
            # found by a retry is actually returned to the caller.
            continue

def next_page(page_number):
    """Scrape the page currently displayed, then move on to ``page_number``.

    The current page is scrolled to the bottom, parsed and stored exactly
    once. Navigation to the next page is then retried until the pager
    confirms that ``page_number`` is the active page.

    Args:
        page_number: number of the results page to navigate to after the
            page currently on screen has been parsed.

    Raises:
        SystemExit: when ``page_number`` is 101, i.e. after page 100 has
            been parsed; the next-page button no longer works there.
    """
    # Scroll to the bottom so that all products get loaded, and give the
    # page time to render them before taking the HTML snapshot.
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(10)
    parse_html(browser.page_source)

    # On page 100 the next-page button is disabled, so the program ends here.
    if page_number == 101:
        raise SystemExit

    current_page_selector = "#J_bottomPage > span.p-num > a.curr"
    while True:
        try:
            # If an earlier attempt already clicked through but one of the
            # waits below timed out, clicking again would skip a page, so
            # only click while the pager is not yet on the target page.
            highlighted = browser.find_elements(By.CSS_SELECTOR, current_page_selector)
            already_there = bool(highlighted) and highlighted[0].text.strip() == str(page_number)
            if not already_there:
                button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#J_bottomPage > span.p-num > a.pn-next > em')))
                button.click()

            wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "#J_goodsList > ul > li:nth-child(60)")))

            # Confirm that the pager now highlights the requested page.
            wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, current_page_selector), str(page_number)))
            return
        except TimeoutException:
            # Retry the navigation only; the page was already parsed above.
            continue

def _parse_comment_count(text):
    """Convert a review-count label such as '1.2万+条评价' to an int."""
    text = text.replace('条评价', '').replace('+', '')
    if '万' in text:
        # '万' is a unit of ten thousand. round() keeps binary float error
        # (e.g. 4.35 * 10000 == 43499.999...) from truncating the count.
        return int(round(float(text.split('万')[0]) * 10000))
    return int(float(text))


def parse_html(html):
    """Parse one product-list page and store every product it contains.

    For each ``.gl-item`` element the title, price, review count, shop name
    and shop property are extracted, printed and passed to ``to_mongodb``.
    The title is used as the document ``_id``.

    Args:
        html: HTML source of a product-list page.
    """
    soup = BeautifulSoup(html, 'html.parser')
    goods_info = soup.select('.gl-item')

    # Print how many products the page holds, to spot pages that were not
    # fully loaded.
    quantity = 'item: ' + str(len(goods_info))
    print(quantity)

    for info in goods_info:
        title_nodes = info.select('.p-name.p-name-type-2 a em')
        price_nodes = info.select('.p-price i')
        commit_nodes = info.select('.p-commit strong')
        if not (title_nodes and price_nodes and commit_nodes):
            # An incomplete tile would otherwise abort the whole crawl with
            # an IndexError.
            print('Skip an item without title, price or comment count')
            continue

        # A fresh dict per product, so documents never share state.
        data = {}

        # Product title (also the primary key).
        title = title_nodes[0].text.strip()
        title = title.replace('爱心东东', '')
        print("title: ", title)
        data['_id'] = title

        # Product price, truncated to whole units.
        price = int(float(price_nodes[0].text.strip()))
        print("price: ", price)
        data['price'] = price

        # Number of reviews.
        commit = _parse_comment_count(commit_nodes[0].text.strip())
        print("commit: ", commit)
        data['commit'] = commit

        # Shop name; anything other than exactly one shop link is recorded
        # as '京东'.
        shop_links = info.select('.p-shop a')
        if len(shop_links) == 1:
            shop_name = shop_links[0].text.strip()
        else:
            shop_name = '京东'
        print("shop_name: ", shop_name)
        data['shop_name'] = shop_name

        # Shop property: '自营' only when the first icon says so.
        icons = info.select('.p-icons i')
        if icons and icons[0].text.strip() == '自营':
            shop_property = '自营'
        else:
            shop_property = '非自营'
        print("shop_property: ", shop_property)
        data['shop_property'] = shop_property

        to_mongodb(data)
        print(data)
        # NOTE(review): the original statement here was a truncated string
        # literal (`print("`); a blank separator is assumed - confirm.
        print("\n")

def main():
    """Run the whole crawl, then release the browser and the DB client.

    ``search()`` provides the total page count. ``next_page(i)`` is then
    called for i = 2 .. total + 1, with a 20-second pause before each call.
    """
    try:
        total = int(search())
        print(total)
        for i in range(2, total + 2):
            time.sleep(20)
            print("第", i-1, "页：")
            next_page(i)
    finally:
        # Runs on normal completion, on errors and on SystemExit, so no
        # Chrome/chromedriver process or MongoDB connection is left behind.
        browser.quit()
        client.close()


if __name__ == "__main__":
    main()

虽然一开始就是以笔记本这个关键词去搜索，但是这里还是需要再点击一次笔记本按钮，这是因为直接搜索笔记本会出现平常上课做笔记的那种笔记本，导致会获取无用信息。所以利用京东自身更加详细的归类，得到我们想要的信息。

其中每一个网页有60条商品数据，那么按道理应该有6000条的笔记本商品信息，但是最后却只获取了5992条。

估计有两个原因：

1、在MongoDB中商品的标题为主键，商品标题出现重复

2、网页未能加载完所有的商品信息

最后成功获取商品信息

读取MongoDB中数据进行可视化分析部分代码

from pyecharts import Bar

import pandas as pd

import pymongo

# Load every stored product document into a DataFrame.
client = pymongo.MongoClient(host='localhost', port=27017)
db = client['JD_products']
table = db['products']
df = pd.DataFrame(list(table.find()))

# Price-band edges and the label displayed for each band.
price_info = df['price']
bins = [
    0, 2000, 2500, 3000, 3500, 4000, 5000, 6000, 7000,
    8000, 9000, 10000, 12000, 14000, 16000, 19000, 200000,
]
level = [
    '0-2000', '2000-2500', '2500-3000', '3000-3500', '3500-4000',
    '4000-5000', '5000-6000', '6000-7000', '7000-8000', '8000-9000',
    '9000-10000', '10000-12000', '12000-14000', '14000-16000',
    '16000-19000', '19000以上',
]

# Assign each price to a band, then count the products per band.
price_bands = pd.cut(price_info, bins=bins, labels=level)
price_stage = price_bands.value_counts().sort_index()
attr = price_stage.index
v1 = price_stage.values

# Draw the bar chart and write it to an HTML file.
bar = Bar(
    '笔记本价格分布柱状图',
    title_pos='center',
    title_top='10',
    width=800,
    height=400,
)
bar.add(
    '',
    attr,
    v1,
    is_stack=True,
    xaxis_rotate=30,
    yaxis_min=0,
    xaxis_interval=0,
    is_splitline_show=False,
    is_label_show=True,
)
bar.render('笔记本价格分布柱状图.html')

和之前爬取的京东电脑价格数据做了一个对比，再加上京东双十一给的津贴等优惠后发现，价格方面还是稍微有一点点优惠的，但部分商家是先提高价格之后才进行优惠的。这里也给想要买笔记本电脑的小伙伴一个简单的参考，帮助大家选择一台适合自己的笔记本。一般的笔记本参数如下：

CPU：酷睿系列i3、i5、i7，标压M与低压U

硬盘：500G、1T、2T

显卡：AMD，NVIDIA

内存：4G，8G

weixin_39627751

关注

0
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
python爬虫数据分析项目双十一_双十一商品优惠打折，用Python爬取商品信息分析发现这里套路真深...

咱们今天就以京东笔记本电脑为例分析一下首先咱们针对京东商城笔记本的网页进行分析，这回只要在网页源码上分析，就可以获取笔记本价格、标题、评论数、商家名称、商家性质。想要学习Python。关注小编，私信【学习资料】，即可免费领取一整套系统的板Python学习教程！爬取代码from selenium.webdriver.support import expected_conditions as ECfr...
复制链接

扫一扫