环境
- beautifulsoup4
- xlsxwriter
- lxml(代码中 BeautifulSoup 使用 'lxml' 解析器,需要额外安装)
补上次爬取网页的分析代码:
from bs4 import BeautifulSoup
import os
import copy
from xlsxwriter.workbook import Workbook
def save_into_excel(persons_list, save_name):
    """Dump every borrowed-book record into ``<save_name>.xlsx``.

    Args:
        persons_list: Iterable of dicts, each with the keys ``'no'``,
            ``'name'`` and ``'books_list'``; every entry of ``'books_list'``
            is a dict with the keys ``'title'``, ``'author'``, ``'index'``
            and ``'time'``.
        save_name: Base name of the workbook (``'.xlsx'`` is appended). The
            same value is also written into the major/class column of
            every row.
    """
    header_row = ['学号', '姓名', '专业班级', '书名', '作者', '索引号', '借阅时间']

    # One spreadsheet row per (person, book) pair; a person without books
    # contributes no rows.
    rows = [
        [reader['no'], reader['name'], save_name,
         book['title'], book['author'], book['index'], book['time']]
        for reader in persons_list
        for book in reader['books_list']
    ]

    book_file = Workbook(save_name + '.xlsx')
    sheet = book_file.add_worksheet()
    sheet.write_row('A1', header_row)
    # Data starts on the second row (index 1), right below the headings.
    for excel_row, values in enumerate(rows, start=1):
        sheet.write_row(excel_row, 0, values)
    book_file.close()
def _extract_reader_name(soup):
    """Return the reader's name found in the ``navbar_info_zh`` div."""
    name_div = soup.find('div', 'navbar_info_zh')
    fragments = [str(text) for text in name_div.stripped_strings]
    # The name lives in the third text fragment; its first four characters
    # are dropped (presumably a label prefix -- confirm against a saved page).
    return fragments[2][4:]


def _extract_borrowed_books(soup):
    """Return one record dict per '借书' row of the ``contentTable`` table."""
    books = []
    table = soup.find('table', id='contentTable')
    for tr in table.find_all('tr'):
        tds = tr.find_all('td')
        if not tds:
            # Rows without <td> cells carry no record.
            continue
        states = [str(text) for text in tds[0].stripped_strings]
        # Keep the row only when its first cell has text and all of that
        # text is '借书'. An empty cell used to re-append the previous row's
        # data; it is now skipped.
        if not states or any(state != '借书' for state in states):
            continue
        # Convert to plain str so the record holds no reference into the
        # parse tree (the original author noted that storing the tag string
        # directly made deep copies hit the recursion limit).
        books.append({
            'title': str(tds[2].string),
            'author': str(tds[3].string),
            'index': str(tds[4].string),
            'time': str(tds[7].string),
        })
    return books


def getInfo(major):
    """Parse the saved pages in ``E:\\craw_lib`` and export them to Excel.

    Each file name must contain an underscore; the characters from index 2
    up to the first underscore are parsed as the integer student number.
    Consecutive files with the same number are merged into one person.
    The collected list is handed to ``save_into_excel`` with ``major`` as
    the workbook name.

    Args:
        major: Major/class label stored on every person and used as the
            output workbook's base name.

    Raises:
        ValueError: If a file name has no underscore or its number part is
            not an integer.
        AttributeError: If a page lacks the ``navbar_info_zh`` div or the
            ``contentTable`` table.
        IndexError: If the name div has fewer than three text fragments or
            a '借书' row has fewer than eight cells.
    """
    path = 'E:\\craw_lib'
    persons_list = []
    person = None  # No sentinel number, so a student numbered 0 is kept.

    # os.listdir returns entries in arbitrary order; sorting keeps all pages
    # of one student adjacent so they are grouped into a single person.
    for file_name in sorted(os.listdir(path)):
        file_path = os.path.join(path, file_name)
        no = int(file_name[2:file_name.index('_')])

        if person is None or no != person['no']:
            # First page of a new student: start a fresh record.
            person = {'no': no, 'major': major, 'name': '', 'books_list': []}
            persons_list.append(person)

        # BeautifulSoup consumes the file while constructing the tree, so
        # the handle can be closed right afterwards.
        with open(file_path, "r", encoding='utf-8') as file_handle:
            soup = BeautifulSoup(file_handle, 'lxml')

        name = _extract_reader_name(soup)
        person['name'] = name
        print(name)

        person['books_list'].extend(_extract_borrowed_books(soup))

    save_into_excel(persons_list, major)
if __name__ == '__main__':
    # Script entry point: ask for the major/class label on stdin and run the
    # export for it.
    getInfo(input('please input a major:\n'))
结果:
有关保存数据到xlsx,参考: