爬虫之爬取豆瓣图书排行榜

最新推荐文章于 2023-08-31 22:15:00 发布

年糕coder

最新推荐文章于 2023-08-31 22:15:00 发布

阅读量969

点赞数

本文链接：https://blog.csdn.net/jsq916/article/details/82849781

版权

from bs4 import BeautifulSoup
from lxml import etree
import requests
import time
import os
if __name__=='__main__':
    # download_url='https://book.douban.com/top250?start={}'
    head={}
    head['User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166  Safari/535.19'

    '''使用soup.select对html文件进行选择'''
    # rank=0
    # for j in range(0,250,25):
    #     download_url = 'https://book.douban.com/top250?start={}'.format(j)
    #     res=requests.get(url = download_url, headers = head)
    #     res.encoding='uft-8'
    #     soup=BeautifulSoup(res.text,'html.parser')
    #     book_table = soup.select('div.indent')
    #     for i in range(25):
    #         rank+=1
    #         title = soup.select('div.pl2 a')[i].text.strip().split()
    #         title = '{}{}{}'.format(title[0], title[1], title[2]) if len(title) == 3 else title[0]
    #         try:
    #             word=soup.select('span.inq')[i].text.strip()
    #         except:
    #             word=None
    #         publish = soup.select('td p.pl')[i].text.strip()
    #         score = soup.select('span.rating_nums')[i].text.strip()
    #         print("{}: {} / {} / {} / {}".format(rank, title, score,publish,word))

    '''使用xpath对html文件进行编辑'''
    rank=0
    for j in range(0, 250, 25):
        download_url = 'https://book.douban.com/top250?start={}'.format(j)
        res=requests.get(url = download_url, headers = head).text
        s=etree.HTML(res)
        file=s.xpath('// *[ @ id = "content"] / div / div[1] / div / table')
        time.sleep(2)
        for div in file:
            rank+=1
            title_master=div.xpath('./ tr / td[2] / div[1] / a / @title')
            try:
                title_slave=div.xpath('./ tr / td[2] / div[1] / a / span/text()')
                title=title_master[0]+title_slave[0]
            except:
                title=title_master[0]
            score=div.xpath('./tr/td[2]/div[2]/span[2]/text()')[0]
            evale=div.xpath('./tr/td[2]/div[2]/span[3]/text()')[0].strip('(').strip(')').strip()
            try:
                word = div.xpath('./tr / td[2] / p[2] / span/text()')[0]
            except:
                word=None
            publish=div.xpath('./ tr / td[2] / p[1]/text()')[0]
            # write_book_info={
            #     'rank':rank,
            #     'title':title,
            #     'score':score,
            #     'evale':evale,
            #     'publish':publish,
            #     'word':word
            # }
            # # print("{}: {} / {} / {} / {} / {}".format(rank, title, score, evale, publish, word))
            # print(write_book_info)
            with open('book_info.txt','a',encoding='utf-8') as f:
                f.write("{}: {} / {}/ {} / {} / {} \n".format(rank, title, score, evale, publish, word))
    print('finish saving!')

年糕coder

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
爬虫之爬取豆瓣图书排行榜

from bs4 import BeautifulSoup from lxml import etree import requests import time import os if __name__=='__main__': # download_url='https://book.douban.com/top250?start={}' head={} head['...
复制链接

扫一扫