# python 3爬取allitebook电子书 (Python 3 crawler for allitebooks e-books)

import requests
import pandas as pd
import unittest
import re
import os
from time import sleep
from lxml import etree
from unittest import  TestCase

class Spider(TestCase):

    book_name = []
    book_detail_link = []


    def down_load_file(page,keyword):
        book_name = []
        book_detail_link = []
        print("===================正在爬取第{}页===================".format(page))
        # allitebook首页
        itebookurl = 'http://www.allitebooks.org/page/{}/?s={}'.format(page, keyword)
        print("书籍链接为:" + itebookurl)
        r = requests.get(itebookurl).text
        s = etree.HTML(r)
        # print(r)
        name = s.xpath('//div[@class="entry-body"]/header/h2/a/text()')
        # 获取书籍详情页
        href = s.xpath('//div[@class="entry-body"]/header/h2/a/@href')
        book_detail_link.extend(href)
        # print(type(href))
        # print(href)

        book_name.extend(name)
        ds_book_name = pd.DataFrame.from_dict(book_name)
        ds_book_link = pd.DataFrame.from_dict(href)
        # 显示前5行数据
        # print(ds_book_name.head())
        # print(ds_book_link.head())

        # 高版本openpyxl会报错
        # ds.to_excel('itebook.xlsx')

        ds_book_name.to_csv('itebook_book_name.csv')
        ds_book_link.to_csv('itebook_book_link.csv')
        # print('---------------------------------------------')
        for j in href:
            bookdetail = requests.get(j).text
            # print(bookdetail)
            book_pdf_link = etree.HTML(bookdetail)
            # 获取文件下载链接
            download_links = book_pdf_link.xpath('//*[@id="main-content"]/div/article/footer/div/span[1]/a/@href')
            print(download_links)
            # 匹配下载链接中文件名称
            rex = "[a-zA-Z ,0-9-.:!&~@#$%^*?’]+(.pdf|.zip)"
            try:
                download_links = download_links[0]

                print(download_links)
                book = re.search(rex, download_links).group()
            # print(book)
            except:
                print("没有匹配成功"+download_links)
            else:

                Spider.get_file(download_links, book)
    def get_book_pdf (page,keyword):
        book_name = []
        book_detail_link=[]
        #实现翻页
        for i in range(page):
            print("===================正在爬取第{}页===================".format(i+1))
            #allitebook首页
            itebookurl = 'http://www.allitebooks.org/page/{}/?s={}'.format(i+1,keyword)
            print("书籍链接为:"+itebookurl)
            r = requests.get(itebookurl).text
            s = etree.HTML(r)
            # print(r)
            name = s.xpath('//div[@class="entry-body"]/header/h2/a/text()')
            #获取书籍详情页
            href=s.xpath('//div[@class="entry-body"]/header/h2/a/@href')
            book_detail_link.extend(href)
            # print(type(href))
            # print(href)

            book_name.extend(name)
            ds_book_name=pd.DataFrame.from_dict(book_name)
            ds_book_link=pd.DataFrame.from_dict(href)
            #显示前5行数据
            # print(ds_book_name.head())
            # print(ds_book_link.head())

            #高版本openpyxl会报错
            # ds.to_excel('itebook.xlsx')

            ds_book_name.to_csv('itebook_book_name.csv')
            ds_book_link.to_csv('itebook_book_link.csv')
            # print('---------------------------------------------')
            for j in href:
                bookdetail = requests.get(j).text
                # print(bookdetail)
                book_pdf_link = etree.HTML(bookdetail)
                #获取文件下载链接
                download_links=book_pdf_link.xpath('//*[@id="main-content"]/div/article/footer/div/span[1]/a/@href')
                # print(download_links)
                #匹配下载链接中文件名称
                rex = "[a-zA-Z ,0-9-.:!&~@#$%^*?’]+(.pdf|.zip)"
                try:
                    download_links=download_links[0]
                    book = re.search(rex, download_links).group()
                    # print(book)
                    # Spider.get_file(download_links,book)
                except:
                    print("没有匹配成功"+download_links)
                else:
                    # rex = "[a-zA-Z ,0-9-.:!&~@#$%^*?’]+.zip"
                    # download_links = download_links[0]
                    # book = re.search(rex, download_links).group()
                    # print(book)
                    Spider.get_file(download_links, book)

            print("===================爬取第{}页完毕===================".format(i+1))
            # sleep(1)
    #下载文件
    def get_file(pdfurl,file_name):
        r = requests.get(pdfurl)

        path="pdf"
        try:
            if not os.path.exists(path):
                os.mkdir(path)
            if not os.path.exists(path+"\\"+file_name):
                fo = open(path+"\\"+file_name,'wb') # 注意要用'wb',b表示二进制,不要用'w'
                fo.write(r.content)            # r.content -> requests中的二进制响应内容:以字节的方式访问请求响应体,对于非文本请求 fo.close()
                print('正在下载文件' + file_name + '...')
                fo.close()
                print('文件' + file_name + '下载完毕!')
            else:
                print('文件{}已存在!'.format(file_name))
        except:
            print('出现异常!')



    # def test_get_book_pdf(self):
    #     Spider.get_book_pdf(53,'python')

    def test_down_load_file(self):
            Spider.down_load_file(13, 'python')

if __name__ == '__main__':
    # Run the Spider TestCase entry points; verbosity=2 prints one line per test.
    unittest.main(verbosity=2)

    #     pass
    # get_book_name(1,'python')
    # ds=pd.DataFrame.from_dict(book_name)
    # print(ds.head())
    # #高版本openpyxl会报错
    # # ds.to_excel('itebook.xlsx')
    #
    # ds.to_csv('itebook.csv')

 

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

知识的宝藏

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值