/*
 * 获取人人网的高校数据。
 * 人人网的院系接口,参数为高校id: http://www.renren.com/GetDep.do?id=
 * 人人网 (国家/省/高校 js文件): http://s.xnimg.cn/a13819/allunivlist.js
 * 仅供学习参考
 * 仅供学习参考
 * 仅供学习参考
 * 1.通过fastJson解析获取到的高校json
 * 2.通过正则表达式解析获取到的院系html
 */
package com.test;
import java.io.File;
import java.io.IOException;
import java.io.PrintStream;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.BasicResponseHandler;
import org.apache.http.impl.client.DefaultHttpClient;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
/**
 * Scrapes university data from Renren and turns it into SQL insert scripts.
 * <p>
 * The country/province/university list is downloaded as a JavaScript file whose
 * JSON part is parsed with fastjson; the departments of every university are
 * fetched as an HTML fragment and extracted with regular expressions. Three
 * files are produced: one for provinces, one for schools and one for departments.
 *
 * @author wjw
 * @date 2016-12-08 13:21:01
 */
public class GetSchoolSQL {
    // Target file for the province insert statements.
    public static File provinceFile = new File("D:/province.sql");
    // Target file for the school insert statements.
    public static File schoolFile = new File("D:/school.sql");
    // Target file for the department insert statements.
    public static File depFile = new File("D:/dep.sql");

    // Department endpoint; the school id is appended as the query value.
    private static final String DEP_URL = "http://www.renren.com/GetDep.do?id=";
    // JavaScript file holding the country/province/university list.
    private static final String ALL_UNIVS_URL = "http://s.xnimg.cn/a13819/allunivlist.js";

    // Matches one <option> whose value starts with a numeric character reference,
    // which skips the placeholder option with an empty value.
    private static final Pattern OPTION_PATTERN =
            Pattern.compile("<option value='&#(.*?)</option>", Pattern.MULTILINE);
    // Extracts the quoted value attribute from a matched <option> element.
    private static final Pattern VALUE_PATTERN =
            Pattern.compile("value='(.*?)'", Pattern.MULTILINE);

    /**
     * Downloads the province, school and department data and writes the three SQL files.
     * <p>
     * One HTTP request is issued per school to fetch its departments. All statements
     * are collected in memory and written only after every request has succeeded.
     *
     * @throws ClientProtocolException if an HTTP request fails at protocol level
     * @throws IOException if a request fails, the university list has an unexpected
     *         shape, or one of the output files cannot be written
     */
    public static void getDate() throws ClientProtocolException, IOException {
        StringBuilder provinceSql = new StringBuilder();
        StringBuilder schoolSql = new StringBuilder();
        StringBuilder depSql = new StringBuilder();

        HttpClient client = new DefaultHttpClient();
        try {
            // BasicResponseHandler keeps no per-request state, so one instance serves all calls.
            ResponseHandler<String> responseHandler = new BasicResponseHandler();
            JSONArray china = fetchChinaProvinces(client, responseHandler);
            System.out.println("中国高校JSON数据:\r\n" + china.toJSONString());
            System.out.println("=============================开始解析JSON==================================");
            for (int i = 0; i < china.size(); i++) {
                JSONObject province = china.getJSONObject(i);
                int provinceId = province.getIntValue("id");
                provinceSql.append("insert into province(pro_id,pro_name)values('" + provinceId + "','"
                        + escapeSql(convertFromHex(province.getString("name"))) + "');\n");

                JSONArray univs = province.getJSONArray("univs");
                for (int j = 0; univs != null && j < univs.size(); j++) {
                    JSONObject school = univs.getJSONObject(j);
                    int schoolId = school.getIntValue("id");
                    schoolSql.append("insert into school(pro_id,sch_id,sch_name)values('" + provinceId + "','"
                            + schoolId + "','" + escapeSql(convertFromHex(school.getString("name"))) + "');\n");

                    String depHtml = client.execute(new HttpGet(DEP_URL + schoolId), responseHandler);
                    appendDepartmentSql(depSql, schoolId, depHtml);
                }
            }
            System.out.println("=============================JSON解析完成==================================");
        } finally {
            // Release pooled connections even when a request or the parsing fails.
            client.getConnectionManager().shutdown();
        }

        writeSql(provinceFile, provinceSql);
        writeSql(schoolFile, schoolSql);
        writeSql(depFile, depSql);
        System.out.println("sql文件已经生成!");
        System.out.println("省sql文件" + provinceFile.getAbsolutePath());
        System.out.println("学校sql文件" + schoolFile.getAbsolutePath());
        System.out.println("院系sql文件" + depFile.getAbsolutePath());
    }

    public static void main(String[] args) {
        try {
            getDate();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Downloads the university list and returns the "provs" array of the country whose id is 0.
    private static JSONArray fetchChinaProvinces(HttpClient client, ResponseHandler<String> responseHandler)
            throws IOException {
        String script = client.execute(new HttpGet(ALL_UNIVS_URL), responseHandler);
        System.out.println("获取高校js文件完成,下一步解析js文件的JSON数据:");
        // The payload is a JavaScript file; the JSON array starts at the first "[{".
        int jsonStart = script == null ? -1 : script.indexOf("[{");
        if (jsonStart < 0) {
            throw new IOException("No JSON array found in " + ALL_UNIVS_URL);
        }
        JSONArray countries = JSONObject.parseArray(script.substring(jsonStart));

        JSONArray china = null;
        for (int i = 0; countries != null && i < countries.size(); i++) {
            JSONObject country = countries.getJSONObject(i);
            // The id is read as an int on purpose: the original author noted that the
            // JSON stores it as a string, and the int comparison is what worked.
            // No early exit, so the last matching entry wins exactly as before.
            if (country.getIntValue("id") == 0) {
                china = country.getJSONArray("provs");
            }
        }
        if (china == null) {
            throw new IOException("No province list for country id 0 in " + ALL_UNIVS_URL);
        }
        return china;
    }

    // Appends one insert statement per department option found in the HTML fragment.
    private static void appendDepartmentSql(StringBuilder depSql, int schoolId, String depHtml) {
        if (depHtml == null) {
            return; // empty response body: the school has no departments to record
        }
        Matcher options = OPTION_PATTERN.matcher(depHtml);
        while (options.find()) {
            // options.group() is a whole element such as <option value='...'>...</option>
            Matcher attribute = VALUE_PATTERN.matcher(options.group());
            String value = attribute.find() ? attribute.group(1) : "";
            String id = UUID.randomUUID().toString().replace("-", "").toUpperCase();
            depSql.append("insert into dep(id,sch_id,dep_name)values('" + id + "','" + schoolId + "','"
                    + escapeSql(convertFromDec(value)) + "');\n");
        }
    }

    // Writes the statements to the file and reports write failures that PrintStream would hide.
    // NOTE(review): the platform default charset is used, as before; confirm it suits the database import.
    private static void writeSql(File target, StringBuilder sql) throws IOException {
        PrintStream out = new PrintStream(target);
        try {
            out.print(sql.toString());
            if (out.checkError()) {
                throw new IOException("Failed to write " + target.getAbsolutePath());
            }
        } finally {
            out.close();
        }
    }

    // Doubles single quotes so the text can sit inside a single-quoted SQL string literal.
    private static String escapeSql(String text) {
        return text.replace("'", "''");
    }

    /**
     * Replaces numeric character references ({@code &#NNNNN;} or {@code &#xHHHH;})
     * with the characters they denote.
     * <p>
     * The input is scanned once from left to right, so decoded text is never decoded
     * again. Malformed references (no terminating {@code ;}, non-numeric content,
     * invalid code point) are copied through unchanged.
     *
     * @param code text that may contain numeric character references; may be null
     * @return the decoded text, or an empty string when {@code code} is null
     */
    private static String convertFromDec(String code) {
        if (code == null) {
            return "";
        }
        StringBuilder decoded = new StringBuilder(code.length());
        int pos = 0;
        while (pos < code.length()) {
            int start = code.indexOf("&#", pos);
            if (start < 0) {
                decoded.append(code, pos, code.length());
                break;
            }
            decoded.append(code, pos, start);
            // Look for the terminator after the marker, never before it.
            int end = code.indexOf(';', start + 2);
            int codePoint = end < 0 ? -1 : parseReference(code.substring(start + 2, end));
            if (codePoint < 0) {
                decoded.append("&#");
                pos = start + 2;
            } else {
                // appendCodePoint emits a surrogate pair for values above U+FFFF.
                decoded.appendCodePoint(codePoint);
                pos = end + 1;
            }
        }
        return decoded.toString();
    }

    // Parses the body of a numeric character reference; returns -1 when it is not a valid code point.
    private static int parseReference(String body) {
        boolean hex = body.startsWith("x") || body.startsWith("X");
        try {
            int value = hex ? Integer.parseInt(body.substring(1), 16) : Integer.parseInt(body, 10);
            return Character.isValidCodePoint(value) ? value : -1;
        } catch (NumberFormatException e) {
            return -1;
        }
    }

    /**
     * Replaces {@code \}{@code uXXXX} escapes (backslash, 'u', four hex digits) with
     * the UTF-16 code unit they denote.
     * <p>
     * The input is scanned once from left to right. A backslash-u that is not followed
     * by four hex digits is copied through unchanged.
     *
     * @param code text that may contain hexadecimal Unicode escapes; may be null
     * @return the decoded text, or an empty string when {@code code} is null
     */
    private static String convertFromHex(String code) {
        if (code == null) {
            return "";
        }
        StringBuilder decoded = new StringBuilder(code.length());
        int pos = 0;
        while (pos < code.length()) {
            int start = code.indexOf("\\u", pos);
            if (start < 0) {
                decoded.append(code, pos, code.length());
                break;
            }
            decoded.append(code, pos, start);
            int unit = start + 6 <= code.length() ? parseHexUnit(code, start + 2) : -1;
            if (unit < 0) {
                decoded.append("\\u");
                pos = start + 2;
            } else {
                // Each escape is one UTF-16 unit; surrogate pairs arrive as two escapes.
                decoded.append((char) unit);
                pos = start + 6;
            }
        }
        return decoded.toString();
    }

    // Reads four hex digits starting at offset; returns -1 when any of them is not a hex digit.
    private static int parseHexUnit(String text, int offset) {
        int value = 0;
        for (int i = offset; i < offset + 4; i++) {
            int digit = Character.digit(text.charAt(i), 16);
            if (digit < 0) {
                return -1;
            }
            value = value * 16 + digit;
        }
        return value;
    }
}