直接上代码,由于用的网络不稳定,爬一个页面保存到1个txt中。有兴趣的,可以稍微修改代码,将所有的下载链接保存到一个txt中,这样用迅雷下载更方便。
from urllib.error import HTTPError
from urllib.request import urlopen

from bs4 import BeautifulSoup
page1='http://www.allitebooks.com/?s=machine+learning'
pageN='http://www.allitebooks.com/page/#/?s=machine+learning'
#一共有21页
totalPage=21
def ParseListPage(lstPage):
print("Parsing "+lstPage+" ...")
try:
pgHtml = urlopen(lstPage)
except HTTPError as e:
print('http error happened!')
return None
try:
bsPage=BeautifulSoup(pgHtml.read())
except AttributeError as e:
return None
bkTitles=bsPage.find_all("h2","entry-title")
bookLinks=[];
for h in bkTitles:
lk=h.a.get('href')
if lk!=None:
bookLinks.append(lk)
return bookLinks
def GetPdfLink(bookPageUrl):
print("Fetching PDF:"+bookPageUrl+"...")
try:
bkPgHtml=urlopen(bookPageUrl)
except HTTPError as e:
print('http error happened!')
return None
try:
bsDetailPage=BeautifulSoup(bkPgHtml.read())
except AttributeError as e:
return None
linkSpans=bsDetailPage.find_all('span','download-links')
if linkSpans==None:
return None
pdfLink=linkSpans[0].a.get('href')
return pdfLink
if __name__=='__main__':
lks1=ParseListPage(page1)
f=open('1.txt','w')
for lk in lks1:
pdf=GetPdfLink(lk)
f.write(pdf+'\n')
f.close()
print('Page 1 is Done!')
for n in range(12,totalPage+1):
f=open(str(n)+'.txt','w')
print('parsing PAGE '+str(n)+'...')
lks=ParseListPage(pageN.replace('#',str(n)))
for lk in lks:
pdf=GetPdfLink(lk)
f.write(pdf+'\n')
f.close()
print('Page'+str(n)+' is Done!')
-------------------------------------------------------------------------------------
2018-10-05:
最近发现,该网站似乎增加了反爬功能,使用该代码可能会被拒绝,出现403错误。
-------------------------------------------------------------------------------------
欢迎关注我的公众号: