一、开始
- 方便查找自己需要的地区、院校、专业等信息
- 结果先保存为 CSV 格式,之后可转换为 Excel,方便查看
- 使用到的库:os,requests,lxml,csv,time
二、过程
- 首先需要进入其网址:https://yz.chsi.com.cn/zsml/queryAction.do
- 然后可以依次选择自己需要的信息
- 随后填写参数内容
- 代码如下
ssdm = '11'  # sent as the "ssdm" query field; presumably the province/city code - confirm
dwmc = ''  # sent as the "dwmc" query field; presumably the institution name, empty = no filter - confirm
mldm = '01'  # sent as the "mldm" query field; also the first part of the CSV file name
yjxkdm = '0101'  # sent as the "yjxkdm" query field; also the second part of the CSV file name
zymc = '哲学'  # sent as the "zymc" query field (major name); also the last part of the CSV file name
xxfs = 1  # sent as the "xxfs" query field; presumably the study mode - confirm
path_csv = r"C:\Users\14347\Desktop\招生信息"  # folder the CSV file is written to
csv_name = f'{mldm + "-" + yjxkdm + "-" + zymc}.csv'  # evaluates to "01-0101-哲学.csv" with the values above
三、保存与转换
切记:再次运行时会覆盖同名的旧文件,如需保留之前的结果,请先更改文件名(csv_name)。
如需转换为excel格式(依次进行):
- 数据
- 从文本/CSV
- 选择需要转换的文件
- 加载
- 另存为
- 选择保存类型为Excel工作簿
随后便可以根据自己需求选择所要查找的信息
四、源代码
import os.path
import requests
from lxml import etree
import csv
import time
def _last_page_number(pager_texts):
    """Return the last integer found in the pager link texts, or 1 if none."""
    for text in reversed(pager_texts):
        try:
            return int(text)
        except ValueError:
            continue
    return 1


def _enrollment_count(script_texts, index):
    """Return the head-count embedded in script_texts[index], or None.

    The count is the text between the first ":" and the following "(".
    None is returned when the entry is missing or has no ":" separator.
    """
    if index >= len(script_texts):
        return None
    parts = script_texts[index].split(":")
    if len(parts) < 2:
        return None
    return parts[1].split("(")[0]


def _join_subjects(cells):
    """Strip line breaks/spaces from each cell and join non-empty ones, each followed by ','."""
    cleaned = (cell.replace("\r\n", '').replace(' ', '') for cell in cells)
    return ''.join(subject + ',' for subject in cleaned if subject != '')


def find_data():
    """Scrape the admission catalogue on yz.chsi.com.cn into a CSV file.

    Reads the module-level settings ``ssdm``, ``dwmc``, ``mldm``, ``yjxkdm``,
    ``zymc`` and ``xxfs`` (sent as query fields of the same names) and
    ``path_csv`` / ``csv_name`` (output folder and file name).

    Side effects:
        * Creates the output folder if needed and truncates the CSV file.
        * For every school on every result page, appends a title row, a
          header row, a school row and one row per research direction.
        * Prints the same information, and the total elapsed time, to stdout.

    Network errors raised by ``requests`` are not caught and propagate to
    the caller.
    """
    title = "https://yz.chsi.com.cn"
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0"
    }
    url = 'https://yz.chsi.com.cn/zsml/queryAction.do'
    csv_path = os.path.join(path_csv, csv_name)
    if path_csv:
        # Avoid FileNotFoundError on the first run when the folder is missing.
        os.makedirs(path_csv, exist_ok=True)

    count = 0      # result page currently being processed
    page = 0       # running number of schools written so far
    page_all = 1   # replaced by the real page total after the first request
    time_start = time.time()

    # Start from an empty file; rows are appended school by school below so
    # that finished schools are already on disk if a later request fails.
    with open(csv_path, 'w', newline=''):
        pass

    # "<" instead of "!=" so a page total below the current page cannot
    # turn this into an endless loop.
    while count < page_all:
        count += 1
        params = {
            "ssdm": ssdm,
            "dwmc": dwmc,
            "mldm": mldm,
            "mlmc": "",
            "yjxkdm": yjxkdm,
            "zymc": zymc,
            "xxfs": xxfs,
            "pageno": count,
        }
        response = requests.post(url, headers=header, params=params)
        tree = etree.HTML(response.text)
        url_college_tree = tree.xpath('//*[@id="form3"]/a/@href')
        name_college_tree = tree.xpath('//*[@id="form3"]/a/text()')
        if count == 1:
            # The page total is taken from the pager links of the first page;
            # a missing pager is treated as a single page.
            page_all = _last_page_number(tree.xpath('//*[@href="#"]/text()'))

        for i, (college_href, college_name) in enumerate(zip(url_college_tree, name_college_tree)):
            page += 1
            # Evaluate the location cell once; an empty cell yields ''.
            location_cells = tree.xpath(f'.//table/tbody/tr[{i + 1}]/td[2]/text()')
            location = location_cells[0] if location_cells else ''

            with open(csv_path, 'a', newline='') as f:
                csv_write = csv.writer(f)
                csv_write.writerow([f'第{page}所学校'])
                csv_write.writerow(['招生单位', '所在地', '院系所', '研究方向', '科目', "招录人数"])
                print(f'----------------------------------第{page}所学校---------------------------------------------')
                csv_write.writerow([college_name, location])
                print("招生单位:", college_name)
                print("所在地:", location)

                response_college = requests.get(title + college_href, headers=header)
                tree_college = etree.HTML(response_college.text)
                search_college_tree_view = tree_college.xpath('.//table/tbody/tr/td[8]/a/@href')
                school_college_tree = tree_college.xpath('.//table/tbody/tr/td[2]/text()')
                direction_college_tree = tree_college.xpath('.//table/tbody/tr/td[4]/text()')
                enrollment_count_tree = tree_college.xpath('.//table/tbody/tr/td[7]/script/text()')

                for j, subject_href in enumerate(search_college_tree_view):
                    response_subject = requests.get(title + subject_href, headers=header)
                    tree_subject = etree.HTML(response_subject.text)
                    search_college_tree = tree_subject.xpath('//*[@class="zsml-result"]/table/tbody/tr/td/text()')
                    print("院系所:", school_college_tree[j])
                    print("研究方向:", direction_college_tree[j])

                    enrollment = _enrollment_count(enrollment_count_tree, j)
                    if enrollment is not None:
                        print("招录人数:", enrollment)

                    subject_text = _join_subjects(search_college_tree)
                    print("科目:", end='')
                    print(subject_text, end='')
                    print('\n')

                    # Write an empty head-count instead of crashing when the
                    # page carries no (or fewer) enrollment entries.
                    csv_write.writerow([college_name, location,
                                        school_college_tree[j], direction_college_tree[j], subject_text,
                                        '' if enrollment is None else enrollment])

    time_end = time.time()
    print(f"----------------------总耗时:{time_end - time_start}----------------------")
if __name__ == '__main__':
    # Search settings; find_data() reads them as module-level globals and
    # sends them as query fields of the same names.
    ssdm = '11'
    dwmc = ''
    mldm = '01'
    yjxkdm = '0101'
    zymc = '哲学'
    xxfs = 1

    # Output folder and file name ("<mldm>-<yjxkdm>-<zymc>.csv").
    path_csv = r"C:\Users\14347\Desktop\招生信息"
    csv_name = "-".join((mldm, yjxkdm, zymc)) + '.csv'

    find_data()