获取高校信息

说明

开发的系统中,高校信息的作用是给人选的教育经历打上标签,方便筛选优质人才。
传统一些的方式就是985/211,但是现在主流的方式是双一流。尽管如此,用985/211来筛选也存在很大程度上的合理性,所以从多个角度都进行了查询。

数据来源方面,使用了阳光高考网和中国教育在线两个网站。
阳光高考网:https://gaokao.chsi.com.cn/sch/search.do
中国教育在线:https://daxue.eol.cn/
查询流程:
1 通过阳光高考网选择民办大学,将所有民办大学筛选出来
2 通过中国教育在线筛选出985和211院校
3 查询阳光高考网数据列表,每个学校都判断是否是民办or985/211,打上标签

技术说明

开发语言选择的是Java,pom.xml中引入了jsoup,用来发起网络请求并解析返回的HTML

		<dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.12.1</version>
        </dependency>

高校表结构设计

直接将数据内容保存即可,单一表

import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.GeneratedValue;
import javax.persistence.Id;

/**
 * JPA entity holding one university (higher-education institution) record.
 *
 * <p>All data lives in this single table; the flag columns use {@code 1} for
 * "yes" and {@code 0} for "no".
 */
@Entity
public class University {

	/** Generated primary key. */
	@Id
	@GeneratedValue
	private Long zid;

	/** Creation time, stored as a string of at most 19 characters. */
	@Column(nullable = false, length = 19)
	private String createTime;

	/** Last update time, stored as a string of at most 19 characters. */
	@Column(length = 19)
	private String updateTime;

	/** Soft-delete flag: 1 = deleted, 0 = not deleted. */
	@Column(nullable = false)
	private Integer isDelete;

	/** Name of the institution. */
	@Column(nullable = false, length = 300)
	private String name;

	/** Location of the institution. */
	@Column(nullable = false, length = 300)
	private String city;

	/** Authority the institution is affiliated with. */
	@Column(nullable = false, length = 300)
	private String owner;

	/** Education level offered by the institution. */
	@Column(nullable = false, length = 100)
	private String level;

	/**
	 * Institution feature tag, e.g. "985|211", "211" or a marker for private
	 * (minban) universities; nullable.
	 */
	@Column(length = 100)
	private String feature;

	/** "First-class university" flag: 1 = yes, 0 = no. */
	@Column(nullable = false)
	private Integer leadingUniversity;

	/** "First-class discipline" flag: 1 = yes, 0 = no. */
	@Column(nullable = false)
	private Integer leadingDisciplines;

	/** Graduate-school flag: 1 = has one, 0 = does not. */
	@Column(nullable = false)
	private Integer institute;

	// Accessors, listed in field declaration order.

	public Long getZid() {
		return this.zid;
	}

	public void setZid(Long zid) {
		this.zid = zid;
	}

	public String getCreateTime() {
		return this.createTime;
	}

	public void setCreateTime(String createTime) {
		this.createTime = createTime;
	}

	public String getUpdateTime() {
		return this.updateTime;
	}

	public void setUpdateTime(String updateTime) {
		this.updateTime = updateTime;
	}

	public Integer getIsDelete() {
		return this.isDelete;
	}

	public void setIsDelete(Integer isDelete) {
		this.isDelete = isDelete;
	}

	public String getName() {
		return this.name;
	}

	public void setName(String name) {
		this.name = name;
	}

	public String getCity() {
		return this.city;
	}

	public void setCity(String city) {
		this.city = city;
	}

	public String getOwner() {
		return this.owner;
	}

	public void setOwner(String owner) {
		this.owner = owner;
	}

	public String getLevel() {
		return this.level;
	}

	public void setLevel(String level) {
		this.level = level;
	}

	public String getFeature() {
		return this.feature;
	}

	public void setFeature(String feature) {
		this.feature = feature;
	}

	public Integer getLeadingUniversity() {
		return this.leadingUniversity;
	}

	public void setLeadingUniversity(Integer leadingUniversity) {
		this.leadingUniversity = leadingUniversity;
	}

	public Integer getLeadingDisciplines() {
		return this.leadingDisciplines;
	}

	public void setLeadingDisciplines(Integer leadingDisciplines) {
		this.leadingDisciplines = leadingDisciplines;
	}

	public Integer getInstitute() {
		return this.institute;
	}

	public void setInstitute(Integer institute) {
		this.institute = institute;
	}

}

辅助方法

用来判断某个学校是否是985/211,之所以没直接对比学校名称,是考虑到如果某学校是985/211,那么分校也理应是985/211。

	/**
	 * Reports whether {@code name} contains at least one entry of {@code school}
	 * as a substring.
	 *
	 * <p>Substring matching (rather than equality) is used so that a name which
	 * merely embeds a listed name, such as a branch campus, also matches.
	 *
	 * @param name   the university name to test
	 * @param school the list of names to look for inside {@code name}
	 * @return {@code true} if any list entry occurs within {@code name};
	 *         {@code false} otherwise, including when the list is empty
	 */
	private boolean schoolContain(String name, List<String> school) {
		// A lambda (not name::contains) keeps name from being dereferenced
		// when the list is empty.
		return school.stream().anyMatch(item -> name.contains(item));
	}

代码流程

获取985院校列表

		// Universities on the 985 list, read from the eol.cn page.
		String url = "https://daxue.eol.cn/985.shtml";
		Document doc = Jsoup.connect(url).get();
		// Only the rows of the first <tbody> on the page are read.
		Elements eles = doc.getElementsByTag("tbody").first().getElementsByTag("tr");
		List<String> school985 = new ArrayList<>();
		for (int row = 0; row < eles.size(); row++) {
			Elements cells = eles.get(row).getElementsByTag("td");
			// The name is taken from the third-from-last cell of the row.
			int nameColumn = cells.size() - 3;
			school985.add(cells.get(nameColumn).text());
		}

获取211院校列表

		// Universities on the 211 list, read from the eol.cn page.
		url = "https://daxue.eol.cn/211.shtml";
		doc = Jsoup.connect(url).get();
		// Only the rows of the first <tbody> on the page are read.
		eles = doc.getElementsByTag("tbody").first().getElementsByTag("tr");
		List<String> school211 = new ArrayList<>();
		for (int row = 0; row < eles.size(); row++) {
			Elements cells = eles.get(row).getElementsByTag("td");
			// The name is taken from the third-from-last cell of the row.
			int nameColumn = cells.size() - 3;
			school211.add(cells.get(nameColumn).text());
		}

获取民办大学列表

		// Private (minban) universities, read from the filtered CHSI listing.
		url = "https://gaokao.chsi.com.cn/sch/search.do?searchType=1&yxmc=&ssdm=&yxls=&xlcc=&yxjbz=2";
		doc = Jsoup.connect(url).get();
		eles = doc.getElementById("PageForm").getElementsByTag("li");
		// The third-from-last <li> of the pager is parsed as the number of pages.
		Element ele = eles.get(eles.size() - 3);
		int count = Integer.parseInt(ele.text());
		List<String> minbanSchoolName = new ArrayList<>();
		for (int page = 0, start = 0; page < count; page++, start += 20) {
			url = "https://gaokao.chsi.com.cn/sch/search--searchType-1,yxjbz-2,start-" + start + ".dhtml";
			doc = Jsoup.connect(url).get();
			eles = doc.getElementsByClass("ch-table").first().getElementsByTag("tr");
			// Row 0 is skipped; the name comes from the first cell of every other row.
			int row = 1;
			while (row < eles.size()) {
				Elements cells = eles.get(row).getElementsByTag("td");
				minbanSchoolName.add(cells.get(0).text());
				row++;
			}
			// Pause two seconds between page requests.
			Thread.sleep(2000);
		}

查询高校列表并保存到数据库

		// Walk the full CHSI listing and persist one University per table row.
		url = "https://gaokao.chsi.com.cn/sch/search.do";
		doc = Jsoup.connect(url).get();
		eles = doc.getElementById("PageForm").getElementsByTag("li");
		// The third-from-last <li> of the pager is parsed as the number of pages.
		ele = eles.get(eles.size() - 3);
		count = Integer.parseInt(ele.text());
		for (int page = 0, start = 0; page < count; page++, start += 20) {
			url = "https://gaokao.chsi.com.cn/sch/search--start-" + start + ".dhtml";
			doc = Jsoup.connect(url).get();
			eles = doc.getElementsByClass("ch-table").first().getElementsByTag("tr");
			// Row 0 is skipped; every other row becomes one record.
			int row = 1;
			while (row < eles.size()) {
				Elements cells = eles.get(row).getElementsByTag("td");
				String schoolName = cells.get(0).text();

				University school = new University();
				school.setName(schoolName);
				school.setCity(cells.get(1).text());
				school.setOwner(cells.get(2).text());
				school.setLevel(cells.get(3).text());
				// Cells 4-6: a non-blank cell is stored as 1, a blank one as 0.
				school.setLeadingUniversity(StringUtil.isBlank(cells.get(4).text()) ? 0 : 1);
				school.setLeadingDisciplines(StringUtil.isBlank(cells.get(5).text()) ? 0 : 1);
				school.setInstitute(StringUtil.isBlank(cells.get(6).text()) ? 0 : 1);
				school.setCreateTime(Dates.now());
				school.setIsDelete(0);

				// An exact match in the private-university list wins; otherwise the
				// 985 list is checked before the 211 list. No match leaves feature unset.
				if (minbanSchoolName.contains(schoolName)) {
					school.setFeature("民办");
				} else if (schoolContain(schoolName, school985)) {
					school.setFeature("985|211");
				} else if (schoolContain(schoolName, school211)) {
					school.setFeature("211");
				}

				schoolRepository.save(school);
				row++;
			}
			// Pause two seconds between page requests.
			Thread.sleep(2000);
		}

完整测试类

import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;

import com.lootaa.xcl.basedata.dao.UniversityRepository;
import com.lootaa.xcl.basedata.db.University;
import com.lootaa.xcl.basedata.util.Dates;

/**
 * Spring Boot test used as a one-off loader: it scrapes university data from
 * two public websites and replaces the contents of the {@link University} table.
 *
 * <p>Steps performed by {@link #loadSchool()}:
 * <ol>
 *   <li>read the 985 and 211 name lists from daxue.eol.cn;</li>
 *   <li>read the names of private (minban) universities from the filtered
 *       gaokao.chsi.com.cn listing;</li>
 *   <li>read the full gaokao.chsi.com.cn listing and build one
 *       {@link University} per row, tagging it from the lists above;</li>
 *   <li>only then delete the existing rows and save the new ones.</li>
 * </ol>
 */
@SpringBootTest
class XclBasedataApplicationSchoolTests {

	/** Step between the {@code start-N} offsets of consecutive listing pages. */
	private static final int PAGE_SIZE = 20;

	/** Pause after each listing-page request, in milliseconds. */
	private static final long REQUEST_PAUSE_MILLIS = 2000L;

	@Autowired UniversityRepository schoolRepository;

	/**
	 * Scrapes all sources and then replaces the stored university records.
	 *
	 * <p>The table is cleared only after every page has been fetched and
	 * parsed, so a network or parsing failure leaves the existing data intact.
	 *
	 * @throws Exception if a page cannot be fetched or parsed, or the thread is
	 *                   interrupted while pausing between requests
	 */
	@Test
	void loadSchool() throws Exception {
		List<String> school985 = fetchEolSchoolNames("https://daxue.eol.cn/985.shtml");
		List<String> school211 = fetchEolSchoolNames("https://daxue.eol.cn/211.shtml");

		// Private (minban) universities: names from the filtered listing.
		List<String> minbanSchoolName = new ArrayList<>();
		int count = fetchPageCount(
				"https://gaokao.chsi.com.cn/sch/search.do?searchType=1&yxmc=&ssdm=&yxls=&xlcc=&yxjbz=2");
		for (int i = 0; i < count; i++) {
			String url = "https://gaokao.chsi.com.cn/sch/search--searchType-1,yxjbz-2,start-"
					+ (i * PAGE_SIZE) + ".dhtml";
			for (Elements items : fetchListingRows(url)) {
				minbanSchoolName.add(items.get(0).text());
			}
			Thread.sleep(REQUEST_PAUSE_MILLIS);
		}

		// Full listing: build every record in memory before touching the table.
		List<University> schools = new ArrayList<>();
		count = fetchPageCount("https://gaokao.chsi.com.cn/sch/search.do");
		for (int i = 0; i < count; i++) {
			String url = "https://gaokao.chsi.com.cn/sch/search--start-" + (i * PAGE_SIZE) + ".dhtml";
			for (Elements items : fetchListingRows(url)) {
				schools.add(toUniversity(items, minbanSchoolName, school985, school211));
			}
			Thread.sleep(REQUEST_PAUSE_MILLIS);
		}

		// Replace the stored data now that the whole scrape has succeeded.
		schoolRepository.deleteAll();
		for (University school : schools) {
			schoolRepository.save(school);
		}
	}

	/**
	 * Reads a list of university names from a daxue.eol.cn list page.
	 *
	 * <p>Only the rows of the first {@code <tbody>} are read, and the name is
	 * taken from the third-from-last {@code <td>} of each row.
	 *
	 * @param url address of the list page
	 * @return the names in page order
	 * @throws Exception if the page cannot be fetched, or has no {@code <tbody>}
	 */
	private List<String> fetchEolSchoolNames(String url) throws Exception {
		Document doc = Jsoup.connect(url).get();
		Element body = doc.getElementsByTag("tbody").first();
		if (body == null) {
			throw new IllegalStateException("No <tbody> element found at " + url);
		}
		List<String> names = new ArrayList<>();
		for (Element row : body.getElementsByTag("tr")) {
			Elements items = row.getElementsByTag("td");
			names.add(items.get(items.size() - 3).text());
		}
		return names;
	}

	/**
	 * Reads the number of result pages from a gaokao.chsi.com.cn search page.
	 *
	 * <p>The text of the third-from-last {@code <li>} inside the element with
	 * id {@code PageForm} is parsed as the page count.
	 *
	 * @param url address of the search page
	 * @return the parsed page count
	 * @throws Exception if the page cannot be fetched, has no {@code PageForm}
	 *                   element, or the pager text is not an integer
	 */
	private int fetchPageCount(String url) throws Exception {
		Document doc = Jsoup.connect(url).get();
		Element pager = doc.getElementById("PageForm");
		if (pager == null) {
			throw new IllegalStateException("No PageForm element found at " + url);
		}
		Elements eles = pager.getElementsByTag("li");
		return Integer.parseInt(eles.get(eles.size() - 3).text());
	}

	/**
	 * Fetches one gaokao.chsi.com.cn listing page and returns the cells of each
	 * data row of its first {@code ch-table} element.
	 *
	 * @param url address of the listing page
	 * @return one {@link Elements} of {@code <td>} cells per row; row 0 of the
	 *         table is skipped
	 * @throws Exception if the page cannot be fetched or has no {@code ch-table}
	 */
	private List<Elements> fetchListingRows(String url) throws Exception {
		Document doc = Jsoup.connect(url).get();
		Element table = doc.getElementsByClass("ch-table").first();
		if (table == null) {
			throw new IllegalStateException("No ch-table element found at " + url);
		}
		Elements rows = table.getElementsByTag("tr");
		List<Elements> cellsPerRow = new ArrayList<>();
		for (int j = 1, num = rows.size(); j < num; j++) {
			cellsPerRow.add(rows.get(j).getElementsByTag("td"));
		}
		return cellsPerRow;
	}

	/**
	 * Builds a {@link University} from the cells of one listing row.
	 *
	 * <p>Cells are read by position: 0 name, 1 city, 2 owner, 3 level,
	 * 4 first-class university, 5 first-class discipline, 6 graduate school.
	 * For cells 4-6 a non-blank cell is stored as 1 and a blank one as 0.
	 *
	 * <p>The feature tag is "民办" when the name is an exact entry of
	 * {@code minbanSchoolName}; otherwise "985|211" or "211" when the name
	 * contains an entry of the respective list (985 checked first); otherwise
	 * it is left unset.
	 *
	 * @param items            the {@code <td>} cells of the row
	 * @param minbanSchoolName names of private universities
	 * @param school985        names on the 985 list
	 * @param school211        names on the 211 list
	 * @return the populated, not yet saved entity
	 */
	private University toUniversity(Elements items, List<String> minbanSchoolName,
			List<String> school985, List<String> school211) {
		String name = items.get(0).text();

		University school = new University();
		school.setName(name);
		school.setCity(items.get(1).text());
		school.setOwner(items.get(2).text());
		school.setLevel(items.get(3).text());
		school.setLeadingUniversity(blankToFlag(items.get(4).text()));
		school.setLeadingDisciplines(blankToFlag(items.get(5).text()));
		school.setInstitute(blankToFlag(items.get(6).text()));
		school.setCreateTime(Dates.now());
		school.setIsDelete(0);

		if (minbanSchoolName.contains(name)) {
			school.setFeature("民办");
		} else if (schoolContain(name, school985)) {
			school.setFeature("985|211");
		} else if (schoolContain(name, school211)) {
			school.setFeature("211");
		}
		return school;
	}

	/** Maps a blank cell text to 0 and any other text to 1. */
	private int blankToFlag(String text) {
		return StringUtil.isBlank(text) ? 0 : 1;
	}

	/**
	 * Reports whether {@code name} contains at least one entry of {@code school}
	 * as a substring, so that names embedding a listed name (for example a
	 * branch campus) also match.
	 *
	 * @param name   the university name to test
	 * @param school the list of names to look for inside {@code name}
	 * @return {@code true} if any list entry occurs within {@code name}
	 */
	private boolean schoolContain(String name, List<String> school) {
		for (String item : school) {
			if (name.contains(item)) {
				return true;
			}
		}
		return false;
	}

}

保存的数据

在这里插入图片描述

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

lootaa

你的鼓励是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值