# 获取母校师资信息 (Scrape faculty information for the author's alma mater)
import os
import csv
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
def make_dir(path):
    """Create the directory ``path`` if it does not already exist.

    Missing parent directories are created as well, and calling this on a
    directory that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # os.path.exists() followed by os.mkdir(), and also handles nested paths.
    os.makedirs(path, exist_ok=True)
def get_resp(url, timeout=10):
    """Send a GET request to ``url`` with a desktop-browser User-Agent header.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The ``requests.Response`` for the request. The HTTP status code is
        not checked here; callers receive the response as-is.

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    headers = {
        'User-Agent': r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
    }
    return requests.get(url=url, headers=headers, timeout=timeout)
def get_all(text, selector):
    """Return every element of the HTML document ``text`` matching ``selector``.

    The document is parsed with BeautifulSoup using the ``lxml`` parser, and
    ``selector`` is passed to ``select`` as a CSS selector.
    """
    return BeautifulSoup(text, 'lxml').select(selector)
def main():
    """Download the teacher roster and portrait photos for every subject.

    For each subject listed in the filter drop-down of the faculty page, a
    sub-directory is created under ``石室中学师资`` containing a CSV-formatted
    ``<subject>.txt`` file (subject, teacher name, teacher title) and one
    ``<name>老师.jpg`` image per teacher.
    """
    root = '石室中学师资'
    make_dir(root)
    resp = get_resp('http://www.cdshishi.net/project.aspx?mid=131&xk=0')
    options = get_all(resp.text, 'body div.select-box.fr > select > option')
    # The first two <option> entries are skipped; presumably they are
    # placeholder / "all subjects" entries -- TODO confirm against the page.
    filters = {
        x.text: 'http://www.cdshishi.net/project.aspx' + x.attrs['value']
        for x in options[2:]
    }
    for subject, url in tqdm(filters.items()):
        subject_dir = f'{root}/{subject}'
        make_dir(subject_dir)
        resp = get_resp(url)
        teachers = get_all(resp.text, 'body ul.pTeam-list.clearfix > li')
        # Open with 'w' rather than 'a': the header row is written on every
        # run, so appending would duplicate the header and all rows on re-run.
        # The with-block guarantees the file is flushed and closed.
        with open(f'{subject_dir}/{subject}.txt', 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['学科', '教师姓名', '教师头衔'])
            for t in tqdm(teachers):
                # Strip surrounding whitespace so the name is safe to use in
                # a file name and the CSV cells are clean.
                name = t.select_one('h3').text.strip()
                honor = t.select_one('div.c').text.strip()
                writer.writerow([subject, name, honor])
                # The portrait path is read from the <li> element's 'imgs'
                # attribute and appended to the site root.
                img_link = 'http://www.cdshishi.net/' + t.attrs['imgs']
                img = get_resp(img_link).content
                with open(f'{subject_dir}/{name}老师.jpg', 'wb') as img_file:
                    img_file.write(img)


if __name__ == '__main__':
    main()