import pandas as pd
import requests
import re
import matplotlib.pyplot as plt
import matplotlib
# Use the 'Microsoft YaHei' font family for all matplotlib text; presumably chosen
# so the Chinese chart title and axis labels plotted at the end of the script
# render correctly (assumes the font is installed on the machine).
matplotlib.rc("font", family='Microsoft YaHei')
def request_dandan(url, timeout=10):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawler forever.

    Returns:
        The response text when the server answers with status 200, otherwise
        None (request error, timeout, or any other status code).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Best-effort crawler: report the failure and let the caller decide.
        print(e)
        return None
    if response.status_code == 200:
        return response.text
    return None
# Reference sample of one <li> entry of the ranking page markup (presumably
# copied from the live page). The regular expression in crawlData() targets the
# same elements: list_num, img src, name/title, tuijian, publisher_info,
# biaosheng and price_n. This bare string literal is never used at runtime.
'''
</li>
<li>
<div class="list_num red">2.</div>
<div class="pic"><a href="http://product.dangdang.com/28541936.html" target="_blank"><img src="http://img3m6.ddimg.cn/38/25/28541936-1_l_9.jpg" alt="男孩的学习力" title="男孩的学习力"/></a></div>
<div class="name"><a href="http://product.dangdang.com/28541936.html" target="_blank" title="男孩的学习力">男孩的学习力</a></div>
<div class="star"><span class="level"><span style="width: 97.2%;"></span></span><a href="http://product.dangdang.com/28541936.html?point=comment_point" target="_blank">152376条评论</a><span class="tuijian">100%推荐</span></div>
<div class="publisher_info">[日]<a href="http://search.dangdang.com/?key=富永雄辅" title="[日]富永雄辅 著,吴一红 译,酷威文化 出品" target="_blank">富永雄辅</a> 著,<a href="http://search.dangdang.com/?key=吴一红" title="[日]富永雄辅 著,吴一红 译,酷威文化 出品" target="_blank">吴一红</a> 译,<a href="http://search.dangdang.com/?key=酷威文化" title="[日]富永雄辅 著,吴一红 译,酷威文化 出品" target="_blank">酷威文化</a> 出品</div>
<div class="publisher_info"><span>2020-06-01</span> <a href="http://search.dangdang.com/?key=四川文艺出版社" target="_blank">四川文艺出版社</a></div>
<div class="biaosheng">五星评分:<span>114574次</span></div>
<div class="price">
<p><span class="price_n">¥34.70</span>
<span class="price_r">¥39.80</span>(<span class="price_s">8.7折</span>)
</p>
<p class="price_e">电子书:<span class="price_n">¥7.99</span></p>
<div class="buy_button">
<a ddname="加入购物车" name="" href="javascript:AddToShoppingCart('28541936');" class="listbtn_buy">加入购物车</a>
<a name="" href="http://product.dangdang.com/1901212680.html" class="listbtn_buydz" target="_blank">购买电子书</a>
<a ddname="加入收藏" id="addto_favorlist_28541936" name="" href="javascript:showMsgBox('addto_favorlist_28541936',encodeURIComponent('28541936&platform=3'), 'http://myhome.dangdang.com/addFavoritepop');" class="listbtn_collect">收藏</a>
</div>
</div>
</li>
'''
# Column accumulators for the scraped books. crawlData() appends one value to
# each list per matched book, so index i across all seven lists is one record.
list_num, list_image, list_title = [], [], []      # rank, cover image URL, title
list_recommend, list_author = [], []               # recommendation text, author
list_times, list_price = [], []                    # five-star rating count, price
# Matches one <li> ranking entry. Capture groups, in order: rank number, cover
# image URL, title, recommendation text, text of the first target="_blank" link
# in the publisher_info block (stored as the author), five-star rating count,
# and the first price_n price. Compiled once at import time; raw strings keep
# the backslash escapes (\d, \.) intact for the regex engine.
BOOK_ITEM_PATTERN = re.compile(
    r'<li.*?list_num.*?(\d+)\.</div>'
    r'.*?<img src="(.*?)"'
    r'.*?class="name".*?title="(.*?)">'
    r'.*?class="star">.*?class="tuijian">(.*?)</span>'
    r'.*?class="publisher_info">.*?target="_blank">(.*?)</a>'
    r'.*?class="biaosheng">.*?<span>(.*?)次</span></div>'
    r'.*?<p><span class="price_n">¥(.*?)</span>'
    r'.*?</li>',
    re.S)


def crawlData(page):
    """Download one ranking page and append its books to the module-level lists.

    Args:
        page: Page number appended to the ranking URL.

    Side effects:
        Prints the raw HTML and one line per matched book, and appends to
        list_num, list_image, list_title, list_recommend, list_author,
        list_times and list_price. A page that could not be downloaded is
        skipped instead of crashing the whole crawl.
    """
    url = 'http://bang.dangdang.com/books/fivestars/01.00.00.00.00.00-recent30-0-0-1-' + str(page)
    html = request_dandan(url)
    if html is None:
        # request_dandan() returns None on request errors and non-200 answers;
        # feeding None to the regex would raise TypeError.
        print('第' + str(page) + '页抓取失败,跳过')
        return
    print(html)
    for num, image, title, recommend, author, times, price in BOOK_ITEM_PATTERN.findall(html):
        print('开始写入数据 ====> ' + ' num:' + num + ' image:' + image + ' title:' + title + ' recommend:' + recommend + ' author:' + author + ' times:' + times + ' price:' + price)
        list_num.append(int(num))
        list_image.append(image)
        list_title.append(title)
        list_recommend.append(recommend)
        list_author.append(author)
        list_times.append(int(times))
        # Prices may contain thousands-separator commas; strip them before
        # converting to float.
        list_price.append(float(price.replace(',', '')))
def count_books_by_price_range(prices):
    """Count how many prices fall into each of the four chart buckets.

    Args:
        prices: pandas Series of numeric book prices.

    Returns:
        A list of four ints, the number of prices in [0, 50), [50, 100),
        [100, 150) and [150, +inf), in that order.
    """
    return [
        int(((prices >= 0) & (prices < 50)).sum()),
        int(((prices >= 50) & (prices < 100)).sum()),
        int(((prices >= 100) & (prices < 150)).sum()),
        int((prices >= 150).sum()),
    ]


if __name__ == "__main__":
    # Crawl ranking pages 1 through 4.
    for i in range(1, 5):
        crawlData(i)
    print("数据爬取完毕,爬取数据如下:")
    print(list_num)
    print(list_image)
    print(list_title)
    print(list_recommend)
    print(list_author)
    print(list_times)
    print(list_price)
    df = pd.DataFrame(
        {'num': list_num, 'image': list_image, 'title': list_title, 'recommend': list_recommend, 'author': list_author,
         'times': list_times, 'price': list_price})
    df.to_csv('book.csv', index=False)
    print("数据写入csv完毕")
    data = pd.read_csv('book.csv')
    print("读取csv,清理含有空字段值的行")
    # Basic cleaning: drop every row that contains an empty field.
    newData = data.dropna(axis=0, how='any')
    print(newData)
    # Count the books in each price range. The counts are taken from the
    # cleaned frame itself: filtering the uncleaned df with masks built from
    # newData breaks as soon as dropna() removes a row, because the boolean
    # index no longer aligns with df's index.
    price = ['0-49元', '50-99元', '100-149元', '超过150元']
    count = count_books_by_price_range(newData['price'])
    priceAndCount = pd.DataFrame({'price': price, 'count': count})
    priceAndCount.plot(kind='bar', x='price', y='count', title='各个价格区间内书籍数量')
    plt.show()
# Simple web crawler, data cleaning, and visualization.
# (Source page note: latest recommended article published 2023-06-23 20:15:56.)