Looking up records in the library catalog means flipping through result pages one at a time, and each page holds only a handful of entries, so I wrote this spider to make searching easier.
# -*- coding: utf-8 -*-
# @author: 、Edgar
# @version: 1.1
import requests
from bs4 import BeautifulSoup
import time
import threading
def get_html(url):
    """
    Fetch the HTML source of a page; return None if the request fails.
    """
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/76.0.3809.100 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=header)
        response.raise_for_status()  # turn 4xx/5xx responses into HTTPError
        response.encoding = response.apparent_encoding  # guess the real charset
    except requests.RequestException as e:  # covers HTTPError, timeouts, DNS errors, ...
        print(e)
    else:
        return response.text
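
# A minimal usage sketch (not part of the original script): fetch one page and
# hand it to BeautifulSoup. The URL is a hypothetical placeholder for the
# catalog's real search address.
def demo_get_html():
    html = get_html("http://opac.example.edu/search?kw=python")  # hypothetical URL
    if html is not None:  # get_html returns None when the request fails
        soup = BeautifulSoup(html, "html.parser")
        print(soup.title.get_text(strip=True) if soup.title else "no <title> found")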
def is_last_page(soup):
    """
    Check whether this page is the last one: return False if it is,
    otherwise return the href of the next page.
    """
    target = soup.find('a', {"title": "Next"})
    if target is None:
        return False
    else:
        return target["href"]
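
# Pagination sketch (an assumption about how is_last_page is meant to be
# used): keep following the "Next" link until it disappears. base_url is a
# hypothetical prefix, since the catalog's href values may be relative paths.
def walk_pages(start_url, base_url="http://opac.example.edu"):
    url = start_url
    while url:
        html = get_html(url)
        if html is None:  # the request failed; stop walking
            break
        soup = BeautifulSoup(html, "html.parser")
        next_href = is_last_page(soup)  # False on the last page, else the href
        url = base_url + next_href if next_href else None
        time.sleep(1)  # be polite to the server between requests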
def spider(soup):
    """
    Scrape a search-results page to get the book titles, remaining copies, etc.
    """
    tr_list = soup.find("table", {
        "cellspacing": "1"}).findAll("tr", {