"""入门爬虫: 爬取豆瓣图书 Top 250, 并保存到 Excel (beginner crawler: scrape Douban Book Top 250 and save to Excel)."""
import pandas as pd
import requests
from bs4 import BeautifulSoup
def get_html(url):
    """Download *url* and return the ``requests.Response``.

    Douban rejects requests that do not look like they come from a real
    browser, so a desktop Chrome/Edge User-Agent header is sent.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.58'}
    # timeout keeps one unresponsive server from hanging the whole crawl
    resp = requests.get(url, headers=headers, timeout=10)
    return resp
def html_parse(resp):
    """Parse one Top-250 page into four parallel column lists.

    Returns a dict with keys ``name``/``author``/``score``/``sum``.  Each
    book on the page occupies its own ``<table>``, so fields are extracted
    per item; a missing rating or quote yields ``None`` in that slot.  (The
    original page-wide ``find_all`` produced shorter score/quote lists when
    a book lacked an ``inq`` span, misaligning every following row.)
    """
    soup = BeautifulSoup(resp.text, 'lxml')
    names, authors, scores, sums = [], [], [], []
    for item in soup.find_all('table'):
        title_div = item.find('div', class_='pl2')
        if title_div is None:
            # layout table, not a book entry
            continue
        names.append(title_div.find('a')['title'])
        authors.append(item.find('p', class_='pl').get_text())
        score = item.find('span', class_='rating_nums')
        scores.append(score.get_text() if score else None)
        quote = item.find('span', class_='inq')
        sums.append(quote.get_text() if quote else None)
    data = {
        'name': names,
        'author': authors,
        'score': scores,
        'sum': sums
    }
    return data
def all_page():
    """Return the URLs of all ten Top-250 list pages (25 books per page)."""
    base_url = 'http://book.douban.com/top250?start='
    # offsets 0, 25, ..., 225 select each page of 25 entries
    return [base_url + str(offset) for offset in range(0, 250, 25)]
def main():
    """Crawl every Top-250 page once and export the collected rows to Excel."""
    data = {'name': [], 'author': [], 'score': [], 'sum': []}
    for url in all_page():
        # Fetch and parse each page exactly once.  The original called
        # html_parse(get_html(url)) separately for every column, issuing
        # four HTTP requests per page (40 total instead of 10).
        page = html_parse(get_html(url))
        for key in data:
            data[key].extend(page[key])
    # Wrapping each column in pd.Series lets unequal lengths pad with NaN
    # instead of raising; na_rep writes those cells as a blank space.
    df = pd.DataFrame({key: pd.Series(values) for key, values in data.items()})
    df.to_excel('图书表.xlsx', sheet_name='2020收集', na_rep=" ")
    print("完成")


if __name__ == "__main__":
    main()