import requests
import pandas as pd
import unittest
import re
import os
from time import sleep
from lxml import etree
from unittest import TestCase
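
# Crawl allitebooks.org search results for a keyword: save book titles and
# detail-page links to CSV, then download each book's PDF/ZIP file. Wrapped
# in a unittest.TestCase so each crawl step can be triggered as a test.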
class Spider(TestCase):
    book_name = []
    book_detail_link = []
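    # NOTE: these class-level lists are shadowed by the locals of the same
    # name inside each method, so they never actually accumulate anything.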
    @staticmethod
    def down_load_file(page, keyword):
        book_name = []
        book_detail_link = []
        print("=================== Crawling page {} ===================".format(page))
        # allitebooks search page for the keyword
        itebookurl = 'http://www.allitebooks.org/page/{}/?s={}'.format(page, keyword)
        print("Book list URL: " + itebookurl)
        r = requests.get(itebookurl).text
        s = etree.HTML(r)
        # Book titles on the results page
        name = s.xpath('//div[@class="entry-body"]/header/h2/a/text()')
        # Links to the book detail pages
        href = s.xpath('//div[@class="entry-body"]/header/h2/a/@href')
        book_detail_link.extend(href)
        book_name.extend(name)
        ds_book_name = pd.DataFrame(book_name)
        ds_book_link = pd.DataFrame(book_detail_link)
        # Newer versions of openpyxl error on to_excel, so save CSV instead of
        # ds.to_excel('itebook.xlsx')
        ds_book_name.to_csv('itebook_book_name.csv')
        ds_book_link.to_csv('itebook_book_link.csv')
        for j in href:
            bookdetail = requests.get(j).text
            book_pdf_link = etree.HTML(bookdetail)
            # Download link for the file on the detail page
            download_links = book_pdf_link.xpath('//*[@id="main-content"]/div/article/footer/div/span[1]/a/@href')
            print(download_links)
            # Match the file name (.pdf or .zip) inside the download link
            rex = r"[a-zA-Z ,0-9\-.:!&~@#$%^*?’]+(\.pdf|\.zip)"
            try:
                download_link = download_links[0]
                print(download_link)
                book = re.search(rex, download_link).group()
            except (IndexError, AttributeError):
                # IndexError: no link found; AttributeError: regex did not match
                print("No file name matched in " + str(download_links))
            else:
                Spider.get_file(download_link, book)
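
    # Same crawl as down_load_file above, but pages through `page` result
    # pages in sequence instead of fetching a single one.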
    @staticmethod
    def get_book_pdf(page, keyword):
        book_name = []
        book_detail_link = []
        # Page through the search results
        for i in range(page):
            print("=================== Crawling page {} ===================".format(i + 1))
            # allitebooks search page for the keyword
            itebookurl = 'http://www.allitebooks.org/page/{}/?s={}'.format(i + 1, keyword)
            print("Book list URL: " + itebookurl)
            r = requests.get(itebookurl).text
            s = etree.HTML(r)
            # Book titles on the results page
            name = s.xpath('//div[@class="entry-body"]/header/h2/a/text()')
            # Links to the book detail pages
            href = s.xpath('//div[@class="entry-body"]/header/h2/a/@href')
            book_detail_link.extend(href)
            book_name.extend(name)
            # Rewrite the CSVs with everything collected so far
            ds_book_name = pd.DataFrame(book_name)
            ds_book_link = pd.DataFrame(book_detail_link)
            # Newer versions of openpyxl error on to_excel, so save CSV instead of
            # ds.to_excel('itebook.xlsx')
            ds_book_name.to_csv('itebook_book_name.csv')
            ds_book_link.to_csv('itebook_book_link.csv')
            for j in href:
                bookdetail = requests.get(j).text
                book_pdf_link = etree.HTML(bookdetail)
                # Download link for the file on the detail page
                download_links = book_pdf_link.xpath('//*[@id="main-content"]/div/article/footer/div/span[1]/a/@href')
                # Match the file name (.pdf or .zip) inside the download link
                rex = r"[a-zA-Z ,0-9\-.:!&~@#$%^*?’]+(\.pdf|\.zip)"
                try:
                    download_link = download_links[0]
                    book = re.search(rex, download_link).group()
                except (IndexError, AttributeError):
                    # IndexError: no link found; AttributeError: regex did not match
                    print("No file name matched in " + str(download_links))
                else:
                    Spider.get_file(download_link, book)
            print("=================== Finished page {} ===================".format(i + 1))
            # sleep(1)  # uncomment to throttle requests between pages

    # Download a single file into the local "pdf" directory
    @staticmethod
    def get_file(pdfurl, file_name):
        r = requests.get(pdfurl)
        path = "pdf"
        try:
            if not os.path.exists(path):
                os.mkdir(path)
            file_path = os.path.join(path, file_name)
            if not os.path.exists(file_path):
                print('Downloading ' + file_name + '...')
                # Open in 'wb': r.content is the raw binary response body,
                # so the file must be written in binary mode, not text mode.
                with open(file_path, 'wb') as fo:
                    fo.write(r.content)
                print('File ' + file_name + ' downloaded!')
            else:
                print('File {} already exists!'.format(file_name))
        except OSError:
            print('An exception occurred!')
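
    # A possibly more robust alternative for extracting the file name (not
    # used above, to keep the original regex behavior) would be:
    #   os.path.basename(urllib.parse.urlparse(download_link).path)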
    # def test_get_book_pdf(self):
    #     Spider.get_book_pdf(53, 'python')

    def test_down_load_file(self):
        Spider.down_load_file(13, 'python')


if __name__ == '__main__':
    unittest.main(verbosity=2)
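# Run everything with `python <this file>.py`, or a single test with e.g.
# `python -m unittest <module>.Spider.test_down_load_file`
# (the module name is whatever this file is saved as).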