# 简单的爬虫、数据清理和可视化
#
# 最新推荐文章于 2023-06-23 20:15:56 发布
# songsantai
# 最新推荐文章于 2023-06-23 20:15:56 发布
# 阅读量226
# 点赞数
# 文章标签：爬虫 python 开发语言
# 本文链接：https://blog.csdn.net/songsantai/article/details/125631914
# 版权
import pandas as pd
import requests
import re
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rc("font", family='Microsoft YaHei')


def request_dandan(url, timeout=10):
    """Download *url* with an HTTP GET and return the body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block forever on a stalled
            connection.

    Returns:
        The response text when the server answers with status 200,
        otherwise ``None`` (non-200 status, or any ``requests`` error,
        which is printed before returning).
    """
    # Keep the try body limited to the one call that can raise.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(e)
        return None

    if response.status_code == 200:
        return response.text
    # Make the "not OK" outcome explicit instead of falling off the end.
    return None


'''
    </li>    
    
    <li>
    <div class="list_num red">2.</div>   
    <div class="pic"><a href="http://product.dangdang.com/28541936.html" target="_blank"><img src="http://img3m6.ddimg.cn/38/25/28541936-1_l_9.jpg" alt="男孩的学习力"  title="男孩的学习力"/></a></div>    
    <div class="name"><a href="http://product.dangdang.com/28541936.html" target="_blank" title="男孩的学习力">男孩的学习力</a></div>    
    <div class="star"><span class="level"><span style="width: 97.2%;"></span></span><a href="http://product.dangdang.com/28541936.html?point=comment_point" target="_blank">152376条评论</a><span class="tuijian">100%推荐</span></div>    
    <div class="publisher_info">[日]<a href="http://search.dangdang.com/?key=富永雄辅" title="[日]富永雄辅 著，吴一红 译，酷威文化 出品" target="_blank">富永雄辅</a> 著，<a href="http://search.dangdang.com/?key=吴一红" title="[日]富永雄辅 著，吴一红 译，酷威文化 出品" target="_blank">吴一红</a> 译，<a href="http://search.dangdang.com/?key=酷威文化" title="[日]富永雄辅 著，吴一红 译，酷威文化 出品" target="_blank">酷威文化</a> 出品</div>    
    <div class="publisher_info"><span>2020-06-01</span>&nbsp;<a href="http://search.dangdang.com/?key=四川文艺出版社" target="_blank">四川文艺出版社</a></div>    

            <div class="biaosheng">五星评分：<span>114574次</span></div>
                      
    
    <div class="price">        
        <p><span class="price_n">&yen;34.70</span>
                        <span class="price_r">&yen;39.80</span>(<span class="price_s">8.7折</span>)
                    </p>
                    <p class="price_e">电子书：<span class="price_n">&yen;7.99</span></p>
                <div class="buy_button">
                          <a ddname="加入购物车" name="" href="javascript:AddToShoppingCart('28541936');" class="listbtn_buy">加入购物车</a>
                        
                        <a name="" href="http://product.dangdang.com/1901212680.html" class="listbtn_buydz" target="_blank">购买电子书</a>
                        <a ddname="加入收藏" id="addto_favorlist_28541936" name="" href="javascript:showMsgBox('addto_favorlist_28541936',encodeURIComponent('28541936&platform=3'), 'http://myhome.dangdang.com/addFavoritepop');" class="listbtn_collect">收藏</a>
     
        </div>

    </div>
  
    </li>  

'''

# Module-level accumulators filled by crawlData(). The same position in each
# list refers to the same scraped book; they become DataFrame columns later.
list_num, list_image, list_title = [], [], []
list_recommend, list_author = [], []
list_times, list_price = [], []


# Compiled once at import time. A raw string keeps ``\d`` and ``\.`` as regex
# escapes rather than (invalid) Python string escapes. Capture groups, in
# order: rank, image URL, title, recommendation text, first publisher_info
# link text, five-star count, current price.
_BOOK_ITEM_PATTERN = re.compile(
    r'<li.*?list_num.*?(\d+)\.</div>.*?<img src="(.*?)".*?class="name".*?title="(.*?)">.*?class="star">.*?class="tuijian">(.*?)</span>.*?class="publisher_info">.*?target="_blank">(.*?)</a>.*?class="biaosheng">.*?<span>(.*?)次</span></div>.*?<p><span class="price_n">&yen;(.*?)</span>.*?</li>',
    re.S)


def crawlData(page):
    """Scrape one page of the five-star ranking and append it to the lists.

    Fetches the ranking page numbered *page* through ``request_dandan``,
    extracts every book entry with ``_BOOK_ITEM_PATTERN`` and appends the
    fields to the module-level ``list_*`` accumulators.

    Args:
        page: Page number appended to the ranking URL.

    Returns:
        None. If the page could not be downloaded nothing is appended.
    """
    url = 'http://bang.dangdang.com/books/fivestars/01.00.00.00.00.00-recent30-0-0-1-' + str(page)
    html = request_dandan(url)
    print(html)
    if not html:
        # request_dandan returns None on failure; findall(None) would raise
        # TypeError, so skip this page instead of crashing the whole crawl.
        return

    for num, image, title, recommend, author, times, price in _BOOK_ITEM_PATTERN.findall(html):
        print('开始写入数据 ====> ' + ' num:' + num + ' image:' + image + ' title:' + title + ' recommend:' + recommend + ' author:' + author + ' times:' + times + ' price:' + price)
        list_num.append(int(num))
        list_image.append(image)
        list_title.append(title)
        list_recommend.append(recommend)
        list_author.append(author)
        list_times.append(int(times))
        # Prices may contain a thousands separator; strip it before parsing.
        list_price.append(float(price.replace(',', '')))


def count_by_price_range(frame, edges=(0, 50, 100, 150)):
    """Count the rows of *frame* whose ``price`` falls in each price band.

    Args:
        frame: DataFrame with a numeric ``price`` column.
        edges: Ascending lower bounds of the bands. Each band is
            ``[edges[i], edges[i + 1])``; the last band is open-ended
            (``price >= edges[-1]``).

    Returns:
        A list of ints, one count per band, in the order of *edges*.
    """
    prices = frame['price']
    counts = []
    for lower, upper in zip(edges, edges[1:]):
        # Summing a boolean mask counts matches without indexing a frame,
        # so the result cannot depend on index alignment.
        counts.append(int(((prices >= lower) & (prices < upper)).sum()))
    counts.append(int((prices >= edges[-1]).sum()))
    return counts


if __name__ == "__main__":

    # Crawl ranking pages 1 through 4.
    for i in range(1, 5):
        crawlData(i)
    print("数据爬取完毕，爬取数据如下：")
    print(list_num)
    print(list_image)
    print(list_title)
    print(list_recommend)
    print(list_author)
    print(list_times)
    print(list_price)

    df = pd.DataFrame(
        {'num': list_num, 'image': list_image, 'title': list_title, 'recommend': list_recommend, 'author': list_author,
         'times': list_times, 'price': list_price})
    df.to_csv('book.csv', index=False)
    print("数据写入csv完毕")

    data = pd.read_csv('book.csv')

    print("读取csv，清理含有空字段值的行")
    # Simple cleaning step: drop every row that has at least one empty field.
    newData = data.dropna(axis=0, how='any')
    print(newData)

    # Count the books in each price band. The counts come from the cleaned
    # frame itself; filtering the uncleaned ``df`` with masks built from
    # ``newData`` breaks as soon as dropna removes a row, because the mask
    # index no longer matches.
    price = ['0-49元', '50-99元', '100-149元', '超过150元']
    count = count_by_price_range(newData)

    priceAndCount = pd.DataFrame({'price': price, 'count': count})
    priceAndCount.plot(kind='bar', x='price', y='count', title='各个价格区间内书籍数量')

    plt.show()