由于在图书馆网站中查找数据时,每个网页都需要一页一页地翻,而每页能显示的内容又十分有限,故写此爬虫,方便查找之用。
# -*- coding=utf-8 -*-
#@author: 、Edgar
#@version: 1.1
import requests
import urllib.error
from bs4 import BeautifulSoup
import time
import threading
def get_html(url):
    """
    Fetch the page at *url* and return its HTML text.

    Returns None when the request fails; the error is printed rather
    than raised, so callers must handle a None result.
    """
    # Pretend to be a desktop Chrome browser so the OPAC does not reject us.
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                            "Chrome/76.0.3809.100 Safari/537.36"}
    try:
        # A timeout keeps one dead server from hanging the whole crawl.
        response = requests.get(url, headers=header, timeout=30)
        # Guess the charset from the body: more reliable than the HTTP
        # header for this site (pages are Chinese, often mislabelled).
        response.encoding = response.apparent_encoding
    except requests.RequestException as e:
        # RequestException is the base of every requests failure
        # (ConnectionError, Timeout, ...).  The original code caught
        # requests.HTTPError (only raised by raise_for_status(), never
        # called here) and urllib.error.URLError (never raised by
        # requests at all), so real network errors escaped uncaught.
        print(e)
        return None
    else:
        return response.text
def is_last_page(soup):
    """
    Check whether this results page is the final one.

    Returns False when the page has no "Next" link (i.e. it is the
    last page); otherwise returns the URL of the next page.
    """
    next_anchor = soup.find('a', {"title": "Next"})
    return False if next_anchor is None else next_anchor["href"]
def spider(soup):
    """
    Parse one page of search results and append each book's summary
    (title, call number, availability, ...) plus its per-copy details
    to the module-level output file.
    """
    table = soup.find("table", {"cellspacing": "1"})
    for row in table.findAll("tr", {"valign": "baseline"}):
        cells = row.findAll("td")
        # Result-table column layout (as observed): 0=index, 2=call number,
        # 3=title, 4=author, 5=year, 6=library (total/out), 7=category.
        fields = [
            "序号: " + cells[0].get_text().replace(" ", '').strip(),
            "索书号: " + cells[2].get_text().replace(" ", '').strip(),
            "书名: " + cells[3].get_text().replace("\n", '').strip(),
            "作者: " + cells[4].get_text().replace("\n", '').strip(),
            "年代: " + cells[5].get_text().replace(" ", '').strip(),
            "馆名(总/借出): " + cells[6].get_text().replace(" ", '').strip(),
            "类型: " + cells[7].get_text().replace(" ", '').strip(),
        ]
        # The library cell normally links to a per-copy detail page.
        try:
            detail_url = cells[6].a["href"]
        except Exception:
            detail_url = None
        if detail_url is None:
            details = "无详细信息 \n"
        else:
            details = spider_more(detail_url)
        file.write('\n'.join(fields) + '\n' + details)
        file.write("-" * 58 + '\n')
def spider_more(url):
    """
    Fetch a book's detail page and return a formatted report listing
    every physical copy: status, due date, shelf location and barcode.
    """
    soup = BeautifulSoup(get_html(url), "lxml")
    # The second cellspacing="2" table holds the copies; row 0 is the header.
    rows = soup.findAll("table", {"cellspacing": "2"})[1].find_all("tr")[1:]
    report = ''
    for idx, row in enumerate(rows, start=1):
        cells = row.findAll("td")
        report += "第{}本书具体信息:\n".format(idx)
        report += "单册状态: " + cells[2].get_text() + "\n"
        report += "应还时间: " + cells[3].get_text() + "\n"
        report += "馆藏位置: " + cells[5].get_text() + "\n"
        report += "条码: " + cells[8].get_text() + "\n\n"
    return "\n" + report
def main(url):
    """
    Crawl every results page starting at *url*, following the "Next"
    link until the last page has been scraped.

    The original fetched the first page out-of-line and then repeated
    the same fetch/parse/scrape statements inside the loop; this
    collapses the duplication.  It also sleeps only *between* requests,
    so the final page no longer costs an extra 6-second wait.
    """
    while True:
        html = get_html(url)
        soup = BeautifulSoup(html, 'lxml')
        spider(soup)
        url = is_last_page(soup)
        # is_last_page returns False on the last page, a URL otherwise.
        if not url:
            break
        # Throttle so we don't hammer the library server.
        time.sleep(6)
class promote(threading.Thread):
    """Background thread that prints a dot every 2 s as a progress hint."""

    def run(self):
        # Runs until the process exits (started as a daemon thread).
        print("正在下载数据中: ", end="")
        while True:
            print(".", end="", flush=True)
            time.sleep(2)
if __name__ == "__main__":
file = open("lib_data.txt", "a", encoding="utf-8")
url = input("请输入您在交大图书馆搜索后的网页链接(复制粘贴即可): ")
pro = promote()
pro.setDaemon(True)
pro.start()
start_time = time.time()
main(url)
end_time = time.time()
print("\n共用时 {} s".format(end_time-start_time))
程序执行之后直接生成 txt 文件,可直接查看
附:
交大图书馆官网:http://www.lib.sjtu.edu.cn/f/main/index.shtml
今日发现交大图书馆在首页搜索之后的样式已不是之前的样式了,推荐直接在
http://opac.lib.sjtu.edu.cn 中搜索,不会出现其他问题。[1]
2019年9月22日 ↩︎