第一篇个人博客：python爬虫的实战——书籍下载链接查找

_CodeSleep

已于 2023-04-11 15:54:09 修改

阅读量1.1k

点赞数

文章标签： python 爬虫小说

于 2023-04-08 14:25:29 首次发布

本文链接：https://blog.csdn.net/2301_76907210/article/details/130027889

版权

本文介绍了使用Python的urllib和BeautifulSoup库进行网络请求和HTML解析，实现从书籍网站查找特定书籍并获取下载链接的过程。首先确定URL，通过用户输入书籍名替换空格为%20。接着模拟浏览器发送请求，保存网页内容到HTML文件。然后使用BeautifulSoup解析HTML，提取书籍名称和作者信息，并查找下一个页面的URL。最后，通过多步链接跳转找到最终的下载链接。

摘要由CSDN通过智能技术生成

爬虫的实战

#爬虫实战内容简介

此次爬虫实战是为了完成小组项目中的一小部分功能——书籍资料的查找。我们先访问书籍网站，查找到想要的书籍，再找到它的下载地址。这就需要熟悉HTML，才能锁定下载网址对应的URL（即标签的href属性），并最终找到下载的网址。

#项目实战

##1. 首先是我们需要的函数库##

(1) urllib.request
(2) bs4

##2. url的确定

https://annas-archive.org/search?q=Against%20intellectual%20monopoly

q= 后面的内容是我们需要自己找规律的部分。
我们查找的书名是 Against%20intellectual%20monopoly，
可以发现 %20 代表空格，这时候我们就可以写出第一步代码了。

from urllib.parse import quote

# Search endpoint of the book site; the query text goes right after "q=".
base_url = "https://annas-archive.org/search?q="
book_name = input("Entry Book Name!")
# Percent-encode the whole title instead of only replacing spaces: a space
# still becomes "%20", but characters such as "&", "#", "+" or non-ASCII
# text (e.g. a Chinese title) are now escaped too, so they can neither
# break the query string nor make the later urlopen() call fail.
book_name = quote(book_name, safe="")

# Full search URL used by the request step.
url = base_url + book_name

##3. 模拟浏览器进行请求以及保存该网页的HTML##

import urllib.request

# `url` is the search URL built in the previous step.
# The headers carry a browser User-Agent to get past User-Agent based
# anti-crawler checks.
headers = {
    "User-Agent" : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63'
}

# Build a customised request object that carries the headers.
request = urllib.request.Request(url=url,headers=headers)

# Send the request the way a browser would. The `with` block closes the
# HTTP response as soon as the body has been read.
with urllib.request.urlopen(request) as response:
    content = response.read().decode("utf-8")

# Save the downloaded HTML so the parsing step can read it from disk.
with open("pachong.html","w",encoding="utf-8")as fp:
    fp.write(content)

##4. 使用bs4对网页进行解析##

下面的代码对书籍名称和作者进行检索：

from bs4 import BeautifulSoup

# Parse the saved search page. The `with` block closes the file once
# BeautifulSoup has been constructed from it.
with open("pachong.html",encoding="utf-8") as fp:
    soup = BeautifulSoup(fp,"lxml")

# `re` is used to strip the HTML tags from the matched elements so that only
# their text content is left.
import re

# Matches any <...> tag; compiled once instead of once per result.
dr = re.compile(r'<[^>]+>',re.S)

Book_num = 1

# Look at every <div> whose class is "h-[125]"
# (presumably one search result each -- confirm against the page markup).
for x in soup.find_all("div",class_="h-[125]"):
    # Skip blocks that do not contain an <h3> title.
    if x.h3 is None:
        continue

    # Title: the <h3> element with its tags removed.
    BookName = dr.sub("",str(x.h3))

    # Author: every element with class "truncate italic", each stripped of
    # its tags. Stripping the elements one by one (rather than the string
    # form of the whole result list) keeps the list's "[...]" brackets out
    # of the printed text.
    Auther = ", ".join(
        dr.sub("",str(tag)) for tag in x.find_all(class_="truncate italic")
    )

    print(f"{Book_num}. Bookname: {BookName}   Auther: {Auther}")
    Book_num += 1

##5. 查找下一个页面的url

==这是上上一步通过模拟浏览器请求得到的网页。点击其中一本书会跳转到另一个网页，说明这个网页中有超链接（href），我们可以通过这个网页的HTML文件找到下一个网页的url。==

==这是上一步通过模拟浏览器请求得到的网页。点击其中一本书会跳转到另一个网页，说明这个网页中有超链接（href），我们可以通过这个网页的HTML文件找到下一个网页的url。==
此时我们就找到了下一个页面的url。
在这里插入图片描述

代码块展示


# Collect the href of the first <a> inside every <div class="h-[125]">.
# Same search as the title/author step, but this time the href is extracted.
url_list = []
for x in soup.find_all("div",class_="h-[125]"):
    link = x.a
    # Not every block has an <a> carrying an href; skip those explicitly
    # instead of hiding every possible error behind a bare `except`.
    if link is not None and "href" in link.attrs:
        url_list.append(link.attrs["href"])

# Keep a comma-separated copy on disk. The links are joined rather than each
# being followed by a comma, so there is no trailing separator (which used to
# leave an empty last entry), and the file is opened once in 'w' mode, which
# also discards whatever a previous run left behind.
data = ",".join(url_list)
with open("url.txt","w",encoding="utf-8") as fp:
    fp.write(data)

##6. 接下来的操作与上面的代码块基本相同：从搜索结果到下载链接需要经过两次超链接跳转，所以对第二个页面重复“请求—保存—解析”的步骤即可##

# Ask which entry of `url_list` to open; the answer is converted to a
# 0-based index.
# NOTE(review): the prompt text "123" looks like a placeholder -- consider a
# descriptive prompt such as "Enter the number of the book".
num = input("123")
num = int(num)-1

# Reject numbers outside the list (0 or a negative number would otherwise wrap
# around to the end of the list) as well as empty entries.
if not 0 <= num < len(url_list) or not url_list[num]:
    raise IndexError(f"no link available for book number {num + 1}")

import time

# Short pause before sending the second request.
time.sleep(0.1)

base_url1 = "https://annas-archive.org"

# The collected links are appended to the site root to form the detail-page
# URL. The selected link is used as-is; it is no longer overwritten by a
# hard-coded example URL, which made every run open the same book.
url1 = base_url1 + url_list[num]

headers = {
    "User-Agent" : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63'
}

request1 = urllib.request.Request(url=url1,headers=headers)

# Fetch the detail page; the `with` block closes the HTTP response.
with urllib.request.urlopen(request1) as response1:
    content1 = response1.read().decode("utf-8")

# Save the detail page so it can be parsed from disk.
with open("url.html","w",encoding="utf-8")as fp:
    fp.write(content1)

from bs4 import BeautifulSoup

with open("url.html",encoding="utf-8") as fp:
    soup1 = BeautifulSoup(fp,"lxml")

# Every <a class="js-download-link"> that carries an href is a download link.
# From here on `url_list` holds the download links instead of the
# search-result links.
url_list = [
    a.attrs["href"]
    for a in soup1.find_all("a",class_="js-download-link")
    if "href" in a.attrs
]

# Keep a comma-separated copy on disk; 'w' mode replaces any previous content.
data1 = ",".join(url_list)
with open("url1.txt","w",encoding="utf-8") as fp:
    fp.write(data1)

for x in url_list:
    print(x)

##完整代码展示##

# Look up a book on annas-archive.org and print its download links.
#
# Flow: read a title from the console, fetch and save the search page, list
# the results, read the number of the wanted result, fetch and save that
# result's page, then print every download link found on it.
import re
import time
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup

# ---- 1. Build the search URL ------------------------------------------------
base_url = "https://annas-archive.org/search?q="
book_name = input("Entry Book Name!")
# Percent-encode the whole title, not just the spaces, so that "&", "#", "+"
# or non-ASCII characters cannot break the query or the request.
book_name = urllib.parse.quote(book_name, safe="")

url = base_url + book_name

# Browser User-Agent sent with both requests.
headers = {
    "User-Agent" : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63'
}

# ---- 2. Fetch and save the search page --------------------------------------
request = urllib.request.Request(url=url,headers=headers)

# The `with` block closes the HTTP response once the body has been read.
with urllib.request.urlopen(request) as response:
    content = response.read().decode("utf-8")

with open("pachong.html","w",encoding="utf-8")as fp:
    fp.write(content)

# ---- 3. List the results and collect their links ----------------------------
with open("pachong.html",encoding="utf-8") as fp:
    soup = BeautifulSoup(fp,"lxml")

# Matches any <...> tag; used to reduce an element to its text.
dr = re.compile(r'<[^>]+>',re.S)

url_list = []
Book_num = 1

# Title, author and link are taken in a single pass so that the number printed
# next to a book is always the position of its link in `url_list`.
for x in soup.find_all("div",class_="h-[125]"):
    # A block without an <h3> title or without a linked <a> cannot be listed
    # and selected, so it is skipped.
    if x.h3 is None or x.a is None or "href" not in x.a.attrs:
        continue

    BookName = dr.sub("",str(x.h3))
    # Strip each author element separately; stringifying the whole result list
    # would leave its "[...]" brackets in the output.
    Auther = ", ".join(
        dr.sub("",str(tag)) for tag in x.find_all(class_="truncate italic")
    )

    print(f"{Book_num}. Bookname: {BookName}   Auther: {Auther}")
    Book_num += 1
    url_list.append(x.a.attrs["href"])

# Comma-separated copy of the links on disk; 'w' mode replaces the content of
# a previous run and joining avoids a trailing separator.
data = ",".join(url_list)
with open("url.txt","w",encoding="utf-8") as fp:
    fp.write(data)

# ---- 4. Pick a result -------------------------------------------------------
# NOTE(review): the prompt text "123" looks like a placeholder -- consider a
# descriptive prompt such as "Enter the number of the book".
num = input("123")
num = int(num)-1

# 0 or a negative number would otherwise wrap around to the end of the list.
if not 0 <= num < len(url_list):
    raise IndexError(f"no link available for book number {num + 1}")

# Short pause before sending the second request.
time.sleep(0.1)

# ---- 5. Fetch and save the selected book's page -----------------------------
base_url1 = "https://annas-archive.org"

# Use the link the user selected; it is no longer overwritten by a hard-coded
# example URL, which made every run open the same book.
url1 = base_url1 + url_list[num]

request1 = urllib.request.Request(url=url1,headers=headers)

with urllib.request.urlopen(request1) as response1:
    content1 = response1.read().decode("utf-8")

with open("url.html","w",encoding="utf-8")as fp:
    fp.write(content1)

# ---- 6. Extract and print the download links --------------------------------
with open("url.html",encoding="utf-8") as fp:
    soup1 = BeautifulSoup(fp,"lxml")

# From here on `url_list` holds the download links of the selected book.
url_list = [
    a.attrs["href"]
    for a in soup1.find_all("a",class_="js-download-link")
    if "href" in a.attrs
]

data1 = ",".join(url_list)
with open("url1.txt","w",encoding="utf-8") as fp:
    fp.write(data1)

for x in url_list:
    print(x)