Dangdang Book Scraper and Data Analysis

Part 1: The Scraper

'''
Function:
	Dangdang book scraper
'''
import time
import pickle
import random
import requests
from bs4 import BeautifulSoup


headers = {
	'Upgrade-Insecure-Requests': '1',
	'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
	'Cache-Control': 'no-cache',
	'Connection': 'keep-alive',
	'Host': 'search.dangdang.com'
}



'''Parse one result page and extract the fields we need'''
def parseHtml(html):
	data = {}
	soup = BeautifulSoup(html, 'lxml')
	conshoplist = soup.find_all('div', {'class': 'con shoplist'})[0]
	for each in conshoplist.find_all('li'):
		# Book title
		bookname = each.find_all('a')[0].get('title').strip(' ')
		# Cover image (lazy-loaded covers live in data-original, the rest in src)
		img_src = each.find_all('a')[0].img.get('data-original')
		if img_src is None:
			img_src = each.find_all('a')[0].img.get('src')
		img_src = img_src.strip(' ')
		# Price: drop the leading ¥ sign
		price = float(each.find_all('p', {'class': 'price'})[0].span.text[1:])
		# Blurb
		detail = each.find_all('p', {'class': 'detail'})[0].text
		# Rating: the star bar's CSS width is a percentage, so /20 maps it to 0-5 stars
		stars = float(each.find_all('p', {'class': 'search_star_line'})[0].span.span.get('style').split(': ')[-1].strip('%;')) / 20
		# Comment count: strip the trailing '条评论' ("comments") suffix
		num_comments = float(each.find_all('p', {'class': 'search_star_line'})[0].a.text[:-3])
		data[bookname] = [img_src, price, detail, stars, num_comments]
	return data


'''Crawl every result page for a keyword and pickle the results'''
def main(keyword):
	url = 'http://search.dangdang.com/?key={}&act=input&page_index={}'
	results = {}
	num_page = 0
	while True:
		num_page += 1
		print('[INFO]: Start to get the data of page %d...' % num_page)
		page_url = url.format(keyword, num_page)
		res = requests.get(page_url, headers=headers)
		# Dangdang's "no results" message (kept in Chinese to match the page) marks the end
		if '抱歉,没有找到与“%s”相关的商品,建议适当减少筛选条件' % keyword in res.text:
			break
		page_data = parseHtml(res.text)
		results.update(page_data)
		time.sleep(random.random() + 0.5)
	# num_page-1 is the last page that actually contained results
	with open('%s_%d.pkl' % (keyword, num_page-1), 'wb') as f:
		pickle.dump(results, f)
	return results


if __name__ == '__main__':
	main('python')
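The scraper pickles a dict that maps each book title to [img_src, price, detail, stars, num_comments]. A minimal sketch for sanity-checking a saved run (assuming the python_61.pkl file produced by the crawl used in the next part):

import pickle

# Load a saved crawl and peek at one record
with open('python_61.pkl', 'rb') as f:
    data = pickle.load(f)
print('%d books scraped' % len(data))
title, (img_src, price, detail, stars, num_comments) = next(iter(data.items()))
print(title, price, stars, num_comments)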


Building the Book Cover Wall

Approach:
1) Collect the cover image URLs from the scraped Dangdang data
2) Download the images in batch
3) Compose the picture wall

import os
import time
import math
import pickle
import requests
from PIL import Image


PICDIR = 'pictures'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
}


'''Download the cover images'''
def downloadPics(urls, savedir):
    if not os.path.exists(savedir):
        os.mkdir(savedir)
    for idx, url in enumerate(urls):
        res = requests.get(url, headers=headers)
        with open(os.path.join(savedir, '%d.jpg' % idx), 'wb') as f:
            f.write(res.content)
        time.sleep(0.5)


'''Compose the picture wall'''
def makePicturesWall(picdir):
    picslist = os.listdir(picdir)
    num_pics = len(picslist)
    print('Number of pictures:', num_pics)
    size = 64
    line_numpics = int(math.sqrt(num_pics))  # square grid (extra images past the grid are clipped)
    picwall = Image.new('RGBA', (line_numpics*size, line_numpics*size))
    x = 0
    y = 0
    for pic in picslist:
        img = Image.open(os.path.join(picdir, pic))
        img = img.resize((size, size), Image.LANCZOS)   # resize tile; ANTIALIAS was removed in Pillow 10
        picwall.paste(img, (x*size, y*size))    # paste the tile into the grid
        x += 1
        if x == line_numpics:
            x = 0
            y += 1
    print('[INFO]: Generate pictures wall successfully...')
    picwall.save("picwall.png")     # save the wall


if __name__ == '__main__':
    with open('python_61.pkl', 'rb') as f:
        data = pickle.load(f)
    urls = [j[0] for i, j in data.items()]  # cover image URLs saved by the scraper
    # downloadPics(urls, PICDIR)  # uncomment on the first run to fetch the covers
    makePicturesWall(PICDIR)
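Not every downloaded file is guaranteed to decode cleanly, and some covers arrive in palette or CMYK mode. A defensive variant of the tile-loading step (a sketch, not part of the original code) skips unreadable files and normalizes the mode before pasting:

from PIL import Image

def load_tile(path, size=64):
    # Return a resized RGBA tile, or None if the file cannot be decoded
    try:
        img = Image.open(path).convert('RGBA')
    except OSError:
        return None
    return img.resize((size, size), Image.LANCZOS)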

The resulting picture wall (picwall.png): [image]

Part 2: Data Analysis

'''
Function:
	Dangdang book data analysis
'''
import os
import jieba
import pickle
# Note: the chart code below uses the legacy pyecharts 0.5.x API
from pyecharts import Bar
from pyecharts import Pie
from pyecharts import Funnel
from wordcloud import WordCloud


'''Bar chart'''
def drawBar(title, data, savepath='./results'):
	if not os.path.exists(savepath):
		os.mkdir(savepath)
	bar = Bar(title, title_pos='center')
	#bar.use_theme('vintage')
	attrs = [i for i, j in data.items()]
	values = [j for i, j in data.items()]
	bar.add('', attrs, values, xaxis_rotate=15, yaxis_rotate=30)
	bar.render(os.path.join(savepath, '%s.html' % title))


'''Pie chart (rose style)'''
def drawPie(title, data, savepath='./results'):
	if not os.path.exists(savepath):
		os.mkdir(savepath)
	pie = Pie(title, title_pos='center')
	#pie.use_theme('westeros')
	attrs = [i for i, j in data.items()]
	values = [j for i, j in data.items()]
	pie.add('', attrs, values, is_label_show=True,
			legend_orient="vertical", # legend entries in a column
			legend_pos="left", # legend on the left
			radius=[30, 75],
			rosetype="area" # rose chart: sector radius scales with the value
	)
	pie.render(os.path.join(savepath, '%s.html' % title))


'''Funnel chart'''
def drawFunnel(title, data, savepath='./results'):
	if not os.path.exists(savepath):
		os.mkdir(savepath)
	funnel = Funnel(title, title_pos='center')
	#funnel.use_theme('chalk')
	attrs = [i for i, j in data.items()]
	values = [j for i, j in data.items()]
	funnel.add("", attrs, values, is_label_show=True,
			   label_pos="inside", # draw labels inside the funnel
			   label_text_color="#fff",
			   funnel_gap=5,
			   legend_pos="left",
			   legend_orient="vertical" # legend entries in a column
			   )
	funnel.render(os.path.join(savepath, '%s.html' % title))


'''Count word frequencies with jieba segmentation'''
def statistics(texts, stopwords):
	words_dict = {}
	for text in texts:
		temp = jieba.cut(text)
		for t in temp:
			# skip stopwords and the 'unknow' placeholder
			if t in stopwords or t == 'unknow':
				continue
			words_dict[t] = words_dict.get(t, 0) + 1
	return words_dict


'''Word cloud'''
def drawWordCloud(words, title, savepath='./results'):
	if not os.path.exists(savepath):
		os.mkdir(savepath)
	# Note: for Chinese text, pass a CJK-capable font via font_path, e.g. WordCloud(font_path='simhei.ttf', ...)
	wc = WordCloud(background_color='white', max_words=2000, width=1920, height=1080, margin=5)
	wc.generate_from_frequencies(words)
	wc.to_file(os.path.join(savepath, title+'.png'))



if __name__ == '__main__':
	with open('python_61.pkl', 'rb') as f:
		data = pickle.load(f)

	# Price distribution
	results = {}
	prices = []
	price_max = ['', 0]
	for key, value in data.items():
		price = value[1]
		if price_max[1] < price:
			price_max = [key, price]
		prices.append(price)
	results['under 50 yuan'] = sum(i < 50 for i in prices)
	results['50-100 yuan'] = sum(50 <= i < 100 for i in prices)
	results['100-200 yuan'] = sum(100 <= i < 200 for i in prices)
	results['200-300 yuan'] = sum(200 <= i < 300 for i in prices)
	results['300-400 yuan'] = sum(300 <= i < 400 for i in prices)
	results['400 yuan and above'] = sum(i >= 400 for i in prices)
	drawPie('Price distribution of Python books', results)
	print('Most expensive book: %s, current price: %f' % (price_max[0], price_max[1]))

	# Rating distribution
	results = {}
	stars = []
	for key, value in data.items():
		star = value[3] if value[3] > 0 else 'no rating'
		stars.append(str(star))
	for each in sorted(set(stars)):
		results[each] = stars.count(each)
	drawBar('Rating distribution of Python books', results)

	# Comment counts
	results = {}
	comments_num = []
	top6 = {}
	for key, value in data.items():
		num = int(value[-1])
		comments_num.append(num)
		# strip subtitles and bracketed suffixes so the bar labels stay short
		top6[key.split('【')[0].split('(')[0].split('(')[0].split(' ')[0].split(':')[0]] = num
	results['0 comments'] = sum(i == 0 for i in comments_num)
	results['1-100 comments'] = sum(0 < i <= 100 for i in comments_num)
	results['100-1000 comments'] = sum(100 < i <= 1000 for i in comments_num)
	results['1000-5000 comments'] = sum(1000 < i <= 5000 for i in comments_num)
	results['over 5000 comments'] = sum(i > 5000 for i in comments_num)
	drawFunnel('Comment-count distribution of Python books', results)
	top6 = dict(sorted(top6.items(), key=lambda item: item[1])[-6:])
	drawBar('Top 6 Python books by comment count', top6)

	# Word cloud
	with open('./stopwords.txt', 'r', encoding='utf-8') as f:
		stopwords = f.read().split('\n')[:-1]
	texts = [j[2] for i, j in data.items()]
	words_dict = statistics(texts, stopwords)
	drawWordCloud(words_dict, 'Word cloud of Python book blurbs', savepath='./results')
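The chart helpers above rely on the legacy pyecharts 0.5.x API; on pyecharts 1.x and later the imports and call style changed. A rough equivalent of drawBar for the newer API (a sketch, assuming pyecharts >= 1.0; draw_bar_v1 is an illustrative name):

import os
from pyecharts import options as opts
from pyecharts.charts import Bar

def draw_bar_v1(title, data, savepath='./results'):
    # Same chart as drawBar above, rewritten for the pyecharts 1.x fluent API
    os.makedirs(savepath, exist_ok=True)
    bar = (
        Bar()
        .add_xaxis(list(data.keys()))
        .add_yaxis('', list(data.values()))
        .set_global_opts(title_opts=opts.TitleOpts(title=title, pos_left='center'))
    )
    bar.render(os.path.join(savepath, '%s.html' % title))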

Generated charts (price pie, rating bar, comment funnel, top-6 bar): [images]
Blurb word cloud: [image]

The full code and data are on GitHub:
https://github.com/why19970628/Python_Crawler/tree/master/DangDang_Books

