之前做了一个获取世界大学的小程序,其实之前那个程序并不是很复杂,之所以很久才整理出来,主要就是那个转码的地方遇到了瓶颈,但到后来解决了那个瓶颈之后程序就写得顺风顺水了。按照之前的思路,其实人人网上不仅有世界大学的信息,还有全国的中学信息,这个信息量相比世界大学的那个要庞大很多了,其实找寻这个接口还是花费了一段时间的,下面就简要介绍一下这个程序。
首先说明一下,这里的中学是指高中和初中,这和人人网上是一致的,全国的省市不包括澳门和台湾,其中香港作为一个直辖市来看待,这和人人网上面的处理也是一致的。
可能有的人会疑惑那些入口是怎么找到的,其实并不复杂,考虑到完整性,我也简要介绍一下在人人网上找寻我们需要索取数据的接口的方法。
这里是一个我们登录人人网之后看到的第一个界面:
然后点击左边列表的校友录即可看到下面的界面:
但是关键的是怎样知道一共有多少页面呢?通过分析该页面的js源码可以得到下面的一个文件
把上面介绍的两个文件下载下来之后,通过阅读第一个文件可以看出人人网是怎样组织这些文件的,如下图所示:
接着对cityArray.js文件处理一下,主要目的就是去除掉空链接的地址,得到的结果如下:
"1100:北京市" = ["110101:东城区","110102:西城区","110103:崇文区","110104:宣武区","110105:朝阳区","110106:丰台区","110107:石景山区","110108:海淀区","110109:门头沟区","110111:房山区","110112:通州区","110113:顺义区","110114:昌平区","110115:大兴区","110116:怀柔区","110117:平谷区","110228:密云县","110229:延庆县"];
"3100:上海市" = ["310101:黄浦区","310103:卢湾区","310104:徐汇区","310105:长宁区","310106:静安区","310107:普陀区","310108:闸北区","310109:虹口区","310110:杨浦区","310112:闵行区","310113:宝山区","310114:嘉定区","310115:浦东新区","310116:金山区","310117:松江区","310118:青浦区","310119:南汇区","310120:奉贤区","310230:崇明县"];
"1200:天津市" = ["120101:和平区","120102:河东区","120103:河西区","120104:南开区","120105:河北区","120106:红桥区","120107:塘沽区","120108:汉沽区","120109:大港区","120110:东丽区","120111:西青区","120112:津南区","120113:北辰区","120114:武清区","120115:宝坻区","120221:宁河县","120223:静海县","120225:蓟县"];
"5000:重庆市" = ["500101:万州区","500102:涪陵区","500103:渝中区","500104:大渡口区","500105:江北区","500106:沙坪坝区","500107:九龙坡区","500108:南岸区","500109:北碚区","500110:万盛区","500111:双桥区","500112:渝北区","500113:巴南区","500114:黔江区","500115:长寿区","500116:江津区","500117:合川区","500118:永川区","500119:南川区","500222:綦江县","500223:潼南县","500224:铜梁县","500225:大足县","500226:荣昌县","500227:璧山县","500228:梁平县","500229:城口县","500230:丰都县","500231:垫江县","500232:武隆县","500233:忠县","500234:开县","500235:云阳县","500236:奉节县","500237:巫山县","500238:巫溪县","500240:石柱土家族自治县","500241:秀山土家族苗族自治县","500242:酉阳土家族苗族自治县","500243:彭水苗族土家族自治县"];
"2300:黑龙江省" = ["2301:哈尔滨市","2302:齐齐哈尔市","2303:鸡西市","2304:鹤岗市","2305:双鸭山市","2306:大庆市","2307:伊春市","2308:佳木斯市","2309:七台河市","2310:牡丹江市","2311:黑河市","2312:绥化市","2327:大兴安岭地区"];
"2200:吉林省" = ["2201:长春市","2202:吉林市","2203:四平市","2204:辽源市","2205:通化市","2206:白山市","2207:松原市","2208:白城市","2224:延边朝鲜族自治州"];
"2100:辽宁省" = ["2101:沈阳市","2102:大连市","2103:鞍山市","2104:抚顺市","2105:本溪市","2106:丹东市","2107:锦州市","2108:营口市","2109:阜新市","2110:辽阳市","2111:盘锦市","2112:铁岭市","2113:朝阳市","2114:葫芦岛市"];
"3700:山东省" = ["3701:济南市","3702:青岛市","3703:淄博市","3704:枣庄市","3705:东营市","3706:烟台市","3707:潍坊市","3708:济宁市","3709:泰安市","3710:威海市","3711:日照市","3712:莱芜市","3713:临沂市","3714:德州市","3715:聊城市","3716:滨州市","3717:菏泽市"];
"1400:山西省" = ["1401:太原市","1402:大同市","1403:阳泉市","1404:长治市","1405:晋城市","1406:朔州市","1407:晋中市","1408:运城市","1409:忻州市","1410:临汾市","1411:吕梁市"];
"6100:陕西省" =["6101:西安市","6102:铜川市","6103:宝鸡市","6104:咸阳市","6105:渭南市","6106:延安市","6107:汉中市","6108:榆林市","6109:安康市","6110:商洛市"];
"1300:河北省" =["1301:石家庄市","1302:唐山市","1303:秦皇岛市","1304:邯郸市","1305:邢台市","1306:保定市","1307:张家口市","1308:承德市","1309:沧州市","1310:廊坊市","1311:衡水市"];
"4100:河南省" =["4101:郑州市","4102:开封市","4103:洛阳市","4104:平顶山市","4105:安阳市","4106:鹤壁市","4107:新乡市","4108:焦作市","4109:濮阳市","4110:许昌市","4111:漯河市","4112:三门峡市","4113:南阳市","4114:商丘市","4115:信阳市","4116:周口市","4117:驻马店市","4118:济源市"];
"4200:湖北省" =["4201:武汉市","4202:黄石市","4203:十堰市","4205:宜昌市","4206:襄樊市","4207:鄂州市","4208:荆门市","4209:孝感市","4210:荆州市","4211:黄冈市","4212:咸宁市","4213:随州市","4228:恩施土家族苗族自治州","429004:仙桃市","429005:潜江市","429006:天门市","429021:神农架林区"];
"4300:湖南省" =["4301:长沙市","4302:株洲市","4303:湘潭市","4304:衡阳市","4305:邵阳市","4306:岳阳市","4307:常德市","4308:张家界市","4309:益阳市","4310:郴州市","4311:永州市","4312:怀化市","4313:娄底市","4331:湘西土家族苗族自治州"];
"4600:海南省" =["4601:海口市","4602:三亚市","469001:五指山市","469002:琼海市","469003:儋州市","469005:文昌市","469006:万宁市","469007:东方市","469025:定安县","469026:屯昌县","469027:澄迈县","469028:临高县","469030:白沙黎族自治县","469031:昌江黎族自治县","469033:乐东黎族自治县","469034:陵水黎族自治县","469035:保亭黎族苗族自治县","469036:琼中黎族苗族自治县"];
"3200:江苏省" =["3201:南京市","3202:无锡市","3203:徐州市","3204:常州市","3205:苏州市","3206:南通市","3207:连云港市","3208:淮安市","3209:盐城市","3210:扬州市","3211:镇江市","3212:泰州市","3213:宿迁市"];
"3600:江西省" =["3601:南昌市","3602:景德镇市","3603:萍乡市","3604:九江市","3605:新余市","3606:鹰潭市","3607:赣州市","3608:吉安市","3609:宜春市","3610:抚州市","3611:上饶市"];
"4400:广东省" =["4401:广州市","4402:韶关市","4403:深圳市","4404:珠海市","4405:汕头市","4406:佛山市","4407:江门市","4408:湛江市","4409:茂名市","4412:肇庆市","4413:惠州市","4414:梅州市","4415:汕尾市","4416:河源市","4417:阳江市","4418:清远市","4419:东莞市","4420:中山市","4451:潮州市","4452:揭阳市","4453:云浮市"];
"4500:广西壮族自治区" =["4501:南宁市","4502:柳州市","4503:桂林市","4504:梧州市","4505:北海市","4506:防城港市","4507:钦州市","4508:贵港市","4509:玉林市","4510:百色市","4511:贺州市","4512:河池市","4513:来宾市","4514:崇左市"];
"5300:云南省" =["5301:昆明市","5303:曲靖市","5304:玉溪市","5305:保山市","5306:昭通市","5307:丽江市","5308:普洱市","5309:临沧市","5323:楚雄彝族自治州","5325:红河哈尼族彝族自治州","5326:文山壮族苗族自治州","5328:西双版纳傣族自治州","5329:大理白族自治州","5331:德宏傣族景颇族自治州","5333:怒江傈僳族自治州","5334:迪庆藏族自治州"];
"5200:贵州省" =["5201:贵阳市","5202:六盘水市","5203:遵义市","5204:安顺市","5222:铜仁地区","5223:黔西南布依族苗族自治州","5224:毕节地区","5226:黔东南苗族侗族自治州","5227:黔南布依族苗族自治州"];
"5100:四川省" =["5101:成都市","5103:自贡市","5104:攀枝花市","5105:泸州市","5106:德阳市","5107:绵阳市","5108:广元市","5109:遂宁市","5110:内江市","5111:乐山市","5113:南充市","5114:眉山市","5115:宜宾市","5116:广安市","5117:达州市","5118:雅安市","5119:巴中市","5120:资阳市","5132:阿坝藏族羌族自治州","5133:甘孜藏族自治州","5134:凉山彝族自治州"];
"1500:内蒙古自治区" =["1501:呼和浩特市","1502:包头市","1503:乌海市","1504:赤峰市","1505:通辽市","1506:鄂尔多斯市","1507:呼伦贝尔市","1508:巴彦淖尔市","1509:乌兰察布市","1522:兴安盟","1525:锡林郭勒盟","1529:阿拉善盟"];
"6400:宁夏回族自治区" =["6401:银川市","6402:石嘴山市","6403:吴忠市","6404:固原市","6405:中卫市"];
"6200:甘肃省" =["6201:兰州市","6202:嘉峪关市","6203:金昌市","6204:白银市","6205:天水市","6206:武威市","6207:张掖市","6208:平凉市","6209:酒泉市","6210:庆阳市","6211:定西市","6212:陇南市","6229:临夏回族自治州","6230:甘南藏族自治州"];
"6300:青海省" =["6301:西宁市","6321:海东地区","6322:海北藏族自治州","6323:黄南藏族自治州","6325:海南藏族自治州","6326:果洛藏族自治州","6327:玉树藏族自治州","6328:海西蒙古族藏族自治州"];
"5400:西藏自治区" =["5401:拉萨市","5421:昌都地区","5422:山南地区","5423:日喀则地区","5424:那曲地区","5425:阿里地区","5426:林芝地区"];
"6500:新疆维吾尔自治区" =["6501:乌鲁木齐市","6502:克拉玛依市","6521:吐鲁番地区","6522:哈密地区","6523:昌吉回族自治州","6527:博尔塔拉蒙古自治州","6528:巴音郭楞蒙古自治州","6529:阿克苏地区","6530:克孜勒苏柯尔克孜自治州","6531:喀什地区","6532:和田地区","6540:伊犁哈萨克自治州","6542:塔城地区","6543:阿勒泰地区","659001:石河子市","659002:阿拉尔市","659003:图木舒克市","659004:五家渠市"];
"3400:安徽省" =["3401:合肥市","3402:芜湖市","3403:蚌埠市","3404:淮南市","3405:马鞍山市","3406:淮北市","3407:铜陵市","3408:安庆市","3410:黄山市","3411:滁州市","3412:阜阳市","3413:宿州市","3414:巢湖市","3415:六安市","3416:亳州市","3417:池州市","3418:宣城市"];
"3300:浙江省" =["3301:杭州市","3302:宁波市","3303:温州市","3304:嘉兴市","3305:湖州市","3306:绍兴市","3307:金华市","3308:衢州市","3309:舟山市","3310:台州市","3311:丽水市"];
"3500:福建省" =["3501:福州市","3502:厦门市","3503:莆田市","3504:三明市","3505:泉州市","3506:漳州市","3507:南平市","3508:龙岩市","3509:宁德市"];
"8100:香港特别行政区" =["8101:中西区","8102:湾仔区","8103:东区","8104:南区","8105:油尖旺区","8106:深水埗区","8107:九龙城区","8108:黄大仙区","8109:观塘区","8110:荃湾区","8111:葵青区","8112:沙田区","8113:西贡区","8114:大埔区","8115:北区","8116:元朗区","8117:屯门区","8118:离岛区"];
通过人人网告诉我们的那种构造链接的方式,然后就可以获取这些页面并对其解析了,解析的源代码如下:
CityBean.java
package com.school;
/**
 * Value holder for one prefecture-level city: a numeric code plus a display name.
 * Both properties start out as {@code null} until a setter is called.
 */
public class CityBean {

    /** Numeric city code. */
    private Integer id;

    /** Display name of the city. */
    private String name;

    /** @return the city code, or {@code null} if none has been assigned */
    public Integer getId() {
        return this.id;
    }

    /** @param id the city code to store */
    public void setId(Integer id) {
        this.id = id;
    }

    /** @return the city name, or {@code null} if none has been assigned */
    public String getName() {
        return this.name;
    }

    /** @param name the city name to store */
    public void setName(String name) {
        this.name = name;
    }
}
CityLiBean.java
package com.school;
/**
 * Value holder for one entry of a page's district list: the key string that
 * identifies the district's school list, plus the district's display name.
 * Both properties start out as {@code null} until a setter is called.
 */
public class CityLiBean {

    /** Key string used to match this district against a school list. */
    private String li;

    /** Display name of the district. */
    private String name;

    /** @return the district key, or {@code null} if none has been assigned */
    public String getLi() {
        return this.li;
    }

    /** @param li the district key to store */
    public void setLi(String li) {
        this.li = li;
    }

    /** @return the district name, or {@code null} if none has been assigned */
    public String getName() {
        return this.name;
    }

    /** @param name the district name to store */
    public void setName(String name) {
        this.name = name;
    }
}
ProBean.java
package com.school;
/**
 * Value holder for one province-level region: a numeric code plus a display name.
 * Both properties start out as {@code null} until a setter is called.
 */
public class ProBean {

    /** Numeric province code. */
    private Integer id;

    /** Display name of the province. */
    private String name;

    /** @return the province code, or {@code null} if none has been assigned */
    public Integer getId() {
        return this.id;
    }

    /** @param id the province code to store */
    public void setId(Integer id) {
        this.id = id;
    }

    /** @return the province name, or {@code null} if none has been assigned */
    public String getName() {
        return this.name;
    }

    /** @param name the province name to store */
    public void setName(String name) {
        this.name = name;
    }
}
注:不要去纠结上面这三个类是怎么写的,他们只是我在处理数据的时候封装的一种手段,主要的意义就是将数据之间的关联慢慢建立起来。
School.java
package com.school;
/**
 * One school record: its name and link number together with the district,
 * city and province it belongs to. All properties start out as {@code null}.
 */
public class School {

    /** School number. */
    private Long id;

    /** Name of the school. */
    private String name;

    /** Name of the district or county the school is located in. */
    private String area;

    /** Link number of the school, i.e. the value of href="10000569" in the web page. */
    private Long href;

    /** City the school is located in, if there is one. */
    private String city;

    /** City code. */
    private Long cid;

    /** Province or municipality the school is located in. */
    private String province;

    /** Province or municipality code. */
    private Long pid;

    /** @return the school number */
    public Long getId() {
        return this.id;
    }

    /** @param id the school number */
    public void setId(Long id) {
        this.id = id;
    }

    /** @return the school name */
    public String getName() {
        return this.name;
    }

    /** @param name the school name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the district or county name */
    public String getArea() {
        return this.area;
    }

    /** @param area the district or county name */
    public void setArea(String area) {
        this.area = area;
    }

    /** @return the link number of the school */
    public Long getHref() {
        return this.href;
    }

    /** @param href the link number of the school */
    public void setHref(Long href) {
        this.href = href;
    }

    /** @return the city name */
    public String getCity() {
        return this.city;
    }

    /** @param city the city name */
    public void setCity(String city) {
        this.city = city;
    }

    /** @return the city code */
    public Long getCid() {
        return this.cid;
    }

    /** @param cid the city code */
    public void setCid(Long cid) {
        this.cid = cid;
    }

    /** @return the province or municipality name */
    public String getProvince() {
        return this.province;
    }

    /** @param province the province or municipality name */
    public void setProvince(String province) {
        this.province = province;
    }

    /** @return the province or municipality code */
    public Long getPid() {
        return this.pid;
    }

    /** @param pid the province or municipality code */
    public void setPid(Long pid) {
        this.pid = pid;
    }
}
SchoolDao.java
package com.school;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
/**
 * JDBC helper that writes {@link School} records into the MySQL database
 * "school" on localhost:9906. Every insert opens its own connection and
 * closes it again before returning.
 */
public class SchoolDao {

    /** Connection handed out by the most recent {@link #getConnection()} call. */
    private static Connection connection = null;

    /**
     * Loads the MySQL driver and opens a new connection.
     *
     * @return the new connection, or {@code null} if the driver class is missing
     *         or the connection attempt failed (the stack trace is printed)
     */
    public static Connection getConnection() {
        // Drop the handle from the previous call first, so that a failure below
        // yields null instead of a stale connection that was already closed.
        connection = null;
        try {
            Class.forName("com.mysql.jdbc.Driver");
            connection = DriverManager.getConnection(
                    "jdbc:mysql://localhost:9906/school", "root", "root");
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return connection;
    }

    /**
     * Inserts one record into the {@code highschool} table.
     *
     * @param school the record to store
     * @return {@code true} if a row was inserted, {@code false} otherwise
     */
    public static boolean insertHighschool(School school) {
        return insert("highschool", school);
    }

    /**
     * Inserts one record into the {@code juniorschool} table.
     *
     * @param school the record to store
     * @return {@code true} if a row was inserted, {@code false} otherwise
     */
    public static boolean insertJunoirschool(School school) {
        return insert("juniorschool", school);
    }

    /**
     * Shared implementation of both insert methods.
     *
     * @param table  target table; only ever one of the two literals passed by the
     *               public methods above, so concatenating it into the SQL is safe
     * @param school the record to store
     * @return {@code true} if a row was inserted; {@code false} if {@code school}
     *         is {@code null}, no connection could be opened, or the insert failed
     */
    private static boolean insert(String table, School school) {
        if (school == null) {
            return false;
        }
        Connection conn = getConnection();
        if (conn == null) {
            return false;
        }
        String sql = "insert into " + table
                + "(name,area,href,city,cid,province,pid) values(?,?,?,?,?,?,?)";
        PreparedStatement statement = null;
        try {
            statement = conn.prepareStatement(sql);
            statement.setString(1, school.getName());
            statement.setString(2, school.getArea());
            setNullableLong(statement, 3, school.getHref());
            statement.setString(4, school.getCity());
            setNullableLong(statement, 5, school.getCid());
            statement.setString(6, school.getProvince());
            setNullableLong(statement, 7, school.getPid());
            return statement.executeUpdate() > 0;
        } catch (SQLException e) {
            e.printStackTrace();
            return false;
        } finally {
            // Release both resources even when the insert failed.
            if (statement != null) {
                try {
                    statement.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
            try {
                conn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }

    /** Binds a {@code Long} parameter, storing SQL NULL instead of unboxing a null value. */
    private static void setNullableLong(PreparedStatement statement, int index, Long value)
            throws SQLException {
        if (value == null) {
            statement.setNull(index, java.sql.Types.BIGINT);
        } else {
            statement.setLong(index, value.longValue());
        }
    }
}
HtmlConverter.java
package com.school;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.BasicResponseHandler;
import org.apache.http.impl.client.DefaultHttpClient;
public class HtmlConverter {
private String content = null;
/**
 * Downloads the document at the given URL and runs it through {@link #escape(String)}.
 * The result is also stored in the {@code content} field.
 *
 * @param url address of the document to fetch
 * @return the escaped document text, or {@code null} if the request failed
 */
public String getContent(String url) {
    HttpClient httpClient = new DefaultHttpClient();
    try {
        HttpGet request = new HttpGet(url);
        ResponseHandler<String> bodyHandler = new BasicResponseHandler();
        String body = httpClient.execute(request, bodyHandler);
        content = escape(body);
    } catch (IOException e) {
        // ClientProtocolException is an IOException, so this single handler
        // covers both failure kinds, which are reported the same way.
        System.out.println("没有找到文件");
        content = null;
    } finally {
        httpClient.getConnectionManager().shutdown();
    }
    return content;
}
/**
 * Replaces numeric character references with the characters they denote, e.g.
 * {@code &#36213;&#24196;} becomes the two Chinese characters U+8D75 U+5E84.
 * Decimal ({@code &#NNN;}) and hexadecimal ({@code &#xHHH;}) references of any
 * length are decoded. Anything that is not a well-formed reference is copied
 * through unchanged, so one broken entity no longer discards the whole text.
 *
 * @param str text that may contain numeric character references
 * @return the decoded text, or {@code null} if {@code str} is {@code null}
 */
public String escape(String str) {
    if (str == null) {
        return null;
    }
    int length = str.length();
    StringBuilder decoded = new StringBuilder(length);
    int pos = 0;
    // Walk the text by index instead of repeatedly cutting off its head,
    // which copied the remaining text on every single character.
    while (pos < length) {
        int consumed = appendCharacterReference(str, pos, decoded);
        if (consumed > 0) {
            pos += consumed;
        } else {
            decoded.append(str.charAt(pos));
            pos++;
        }
    }
    return decoded.toString();
}

/**
 * Decodes the numeric character reference starting at {@code start}, if there is one.
 *
 * @return the number of characters consumed, or 0 if no well-formed reference
 *         starts at {@code start} (nothing is appended in that case)
 */
private int appendCharacterReference(String text, int start, StringBuilder out) {
    if (!text.startsWith("&#", start)) {
        return 0;
    }
    int end = text.indexOf(';', start + 2);
    if (end < 0) {
        return 0;
    }
    String digits = text.substring(start + 2, end);
    int radix = 10;
    if (digits.startsWith("x") || digits.startsWith("X")) {
        radix = 16;
        digits = digits.substring(1);
    }
    // Reject empty and signed values up front; parseInt would accept a sign.
    if (digits.length() == 0 || Character.digit(digits.charAt(0), radix) < 0) {
        return 0;
    }
    int codePoint;
    try {
        codePoint = Integer.parseInt(digits, radix);
    } catch (NumberFormatException e) {
        return 0;
    }
    if (!Character.isValidCodePoint(codePoint)) {
        return 0;
    }
    // appendCodePoint emits a surrogate pair for code points above U+FFFF.
    out.appendCodePoint(codePoint);
    return end - start + 1;
}
/**
 * Reads the city list file, downloads the school page of every region listed in
 * it, saves each page under {@code files/<schoolType>/} and passes the saved
 * file to {@link #parseHtml(File, ProBean, CityBean, String)}.
 *
 * <p>Each usable line has the form
 * {@code "1100:北京市" = ["110101:东城区",...,"110102:西城区"];}. When the codes in
 * the list are longer than four characters, or the region is Hong Kong, the
 * region has no prefecture-level cities and a single page is fetched whose
 * number is the region code plus one, e.g. for Beijing
 * {@code http://support.renren.com/highschool/1101.html} or
 * {@code http://support.renren.com/juniorschool/1101.html}. Otherwise one page
 * per listed city code is fetched. Lines that do not have this shape are skipped.
 *
 * @param fileName   the city list file
 * @param schoolType URL path segment and output sub-directory, e.g.
 *                   {@code "highschool"} or {@code "juniorschool"}
 */
public void ReadWrite(File fileName, String schoolType) {
    BufferedReader br = null;
    try {
        br = new BufferedReader(new InputStreamReader(new FileInputStream(fileName)));
        String line;
        while ((line = br.readLine()) != null) {
            String[] pros = line.split("=");
            if (pros.length != 2) {
                continue;
            }
            String[] pro = pros[0].trim().replaceAll("\"", "").split(":");
            String list = pros[1].trim();
            if (pro.length < 2 || list.length() < 3) {
                continue; // not a "code:name" = [...]; line
            }
            ProBean proBean = new ProBean();
            proBean.setId(Integer.parseInt(pro[0]));
            proBean.setName(pro[1]);
            // Drop the leading '[' and the trailing "];" before splitting the entries.
            String[] prolist = list.substring(1, list.length() - 2).replaceAll("\"", "").split(",");
            if (prolist[0].split(":")[0].length() > 4
                    || proBean.getName().equals("香港特别行政区")) {
                // Municipality or special region: no prefecture-level cities below it.
                int pageId = proBean.getId() + 1;
                String url = "http://support.renren.com/" + schoolType + "/" + pageId + ".html";
                String file = "files//" + schoolType + "//" + proBean.getName() + "-" + pageId + ".txt";
                fetchSaveAndParse(url, new File(file), proBean, null, schoolType);
            } else {
                // Province: fetch one page per prefecture-level city.
                String fileDir = "files//" + schoolType + "//" + (proBean.getName() + "-" + proBean.getId());
                for (int j = 0; j < prolist.length; j++) {
                    String[] city = prolist[j].split(":");
                    if (city.length < 2) {
                        continue; // malformed "code:name" entry
                    }
                    CityBean cityBean = new CityBean();
                    cityBean.setId(Integer.parseInt(city[0]));
                    cityBean.setName(city[1]);
                    String url = "http://support.renren.com/" + schoolType + "/" + cityBean.getId() + ".html";
                    String file = fileDir + "//" + (cityBean.getName() + "-" + cityBean.getId()) + ".txt";
                    fetchSaveAndParse(url, new File(file), proBean, cityBean, schoolType);
                }
            }
        }
    } catch (IOException e) {
        // Also covers FileNotFoundException for the input and output files.
        e.printStackTrace();
    } finally {
        if (br != null) {
            try {
                br.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

/**
 * Downloads one page, writes it to {@code outFile} (an empty file if the
 * download failed or returned only whitespace) and parses the saved file.
 */
private void fetchSaveAndParse(String url, File outFile, ProBean proBean,
        CityBean cityBean, String schoolType) throws IOException {
    String content = getContent(url);
    // Create the whole directory chain; a plain mkdir() fails when
    // "files/<schoolType>" does not exist yet.
    File parent = outFile.getParentFile();
    if (parent != null) {
        parent.mkdirs();
    }
    BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outFile)));
    try {
        if (content != null && !content.trim().equals("")) {
            bw.write(content);
        }
    } finally {
        bw.close();
    }
    parseHtml(outFile, proBean, cityBean, schoolType);
}
/** Finds the link text of one entry in the leading district list. */
private static final Pattern DISTRICT_NAME_PATTERN = Pattern.compile(">.*</a>");
/** Finds the single-quoted key of one entry in the leading district list. */
private static final Pattern DISTRICT_KEY_PATTERN = Pattern.compile("('.*')");
/** Finds the id attribute at the start of a school list. */
private static final Pattern LIST_ID_PATTERN = Pattern.compile("id=\".+style");
/** Finds the href attribute of a school entry. */
private static final Pattern SCHOOL_HREF_PATTERN = Pattern.compile("href=\".+\"");
/** Finds the link text of a school entry. */
private static final Pattern SCHOOL_NAME_PATTERN = Pattern.compile("\">.+</a>");

/**
 * Parses a saved page into {@link School} records and inserts each one into the
 * database through {@link SchoolDao}.
 *
 * <p>The page text is split on {@code </ul>}. The first part is read as the list
 * of districts (key plus name); every later part is read as one school list
 * whose id selects the district its schools belong to. Entries without a
 * numeric href are skipped instead of aborting the run.
 *
 * @param fileName   the saved page
 * @param proBean    province the page belongs to
 * @param cityBean   city the page belongs to, or {@code null} for a municipality
 *                   or Hong Kong (city is then stored as "" with code 0)
 * @param schoolType {@code "highschool"} selects the high-school table; any other
 *                   value selects the junior-school table
 */
public void parseHtml(File fileName, ProBean proBean, CityBean cityBean, String schoolType) {
    String src;
    try {
        src = readWithoutLineBreaks(fileName);
    } catch (IOException e) {
        // Also covers FileNotFoundException.
        e.printStackTrace();
        return;
    }
    String[] uls = src.split("</ul>");
    if (uls.length <= 1) {
        return;
    }
    List<CityLiBean> cityLiBeans = parseDistrictList(uls[0]);
    // Walk the remaining lists and attach each one to its district.
    for (int i = 1; i < uls.length; i++) {
        String[] schools = uls[i].split("</li>");
        // NOTE(review): a list that splits into a single fragment is ignored, as
        // before; if a district can contain exactly one school, that school is
        // dropped here. Confirm against the real page markup before changing.
        if (schools.length <= 1) {
            continue;
        }
        // The first fragment carries the list's id as well as the first school.
        String area = findArea(schools[0], cityLiBeans);
        for (int j = 0; j < schools.length; j++) {
            School school = buildSchool(schools[j], area, proBean, cityBean);
            if (school.getHref() == null) {
                continue; // no usable link number in this fragment, nothing to store
            }
            if (schoolType.equals("highschool")) {
                SchoolDao.insertHighschool(school);
            } else {
                SchoolDao.insertJunoirschool(school);
            }
            // The first entry of each list is logged without separators and the
            // rest with tabs; kept that way so the console output is unchanged.
            String sep = (j == 0) ? "" : "\t";
            System.out.println(school.getName() + sep + school.getArea() + sep
                    + school.getCity() + sep + school.getProvince());
        }
    }
}

/** Reads the whole file into one string, dropping the line terminators. */
private String readWithoutLineBreaks(File file) throws IOException {
    BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
    try {
        StringBuilder src = new StringBuilder();
        String line;
        while ((line = br.readLine()) != null) {
            src.append(line);
        }
        return src.toString();
    } finally {
        br.close();
    }
}

/** Extracts the key and name of every entry in the leading district list. */
private List<CityLiBean> parseDistrictList(String districtHtml) {
    List<CityLiBean> cityLiBeans = new ArrayList<CityLiBean>();
    String[] lis = districtHtml.split("</li>");
    for (int i = 0; i < lis.length; i++) {
        CityLiBean cityLiBean = new CityLiBean();
        Matcher matcherName = DISTRICT_NAME_PATTERN.matcher(lis[i]);
        if (matcherName.find()) {
            String matched = matcherName.group();
            cityLiBean.setName(matched.substring(matched.indexOf("\">") + 2, matched.indexOf("</")));
        }
        Matcher matcherLi = DISTRICT_KEY_PATTERN.matcher(lis[i]);
        if (matcherLi.find()) {
            String matched = matcherLi.group();
            cityLiBean.setLi(matched.substring(matched.indexOf("'") + 1, matched.lastIndexOf("'")));
        }
        cityLiBeans.add(cityLiBean);
    }
    return cityLiBeans;
}

/**
 * Returns the name of the district whose key equals the id of the given school
 * list, or {@code null} if the list has no id or no district matches.
 */
private String findArea(String listStart, List<CityLiBean> cityLiBeans) {
    String area = null;
    Matcher matcherId = LIST_ID_PATTERN.matcher(listStart);
    if (matcherId.find()) {
        String matched = matcherId.group();
        int first = matched.indexOf("\"");
        int last = matched.lastIndexOf("\"");
        if (last > first) {
            String id = matched.substring(first + 1, last);
            // No early exit: when several districts share a key, the last one wins.
            for (CityLiBean c : cityLiBeans) {
                if (id.equals(c.getLi())) {
                    area = c.getName();
                }
            }
        }
    }
    return area;
}

/**
 * Builds a school from one list fragment. The returned object has a
 * {@code null} href when the fragment holds no numeric link number.
 */
private School buildSchool(String fragment, String area, ProBean proBean, CityBean cityBean) {
    School school = new School();
    school.setArea(area);
    Matcher matcherHref = SCHOOL_HREF_PATTERN.matcher(fragment);
    if (matcherHref.find()) {
        String matched = matcherHref.group();
        matched = matched.substring(matched.indexOf("\"") + 1, matched.lastIndexOf("\""));
        try {
            school.setHref(Long.valueOf(Long.parseLong(matched)));
        } catch (NumberFormatException e) {
            // Not a link number; leave href unset so the caller skips the fragment.
        }
    }
    Matcher matcherSchool = SCHOOL_NAME_PATTERN.matcher(fragment);
    if (matcherSchool.find()) {
        String matched = matcherSchool.group();
        int start = matched.lastIndexOf("\"") + 2;
        int end = matched.lastIndexOf("<");
        if (start <= end) {
            school.setName(matched.substring(start, end));
        }
    }
    if (cityBean != null) {
        // Neither a municipality nor Hong Kong.
        school.setCity(cityBean.getName());
        school.setCid(cityBean.getId().longValue());
    } else {
        school.setCity("");
        school.setCid((long) 0);
    }
    school.setProvince(proBean.getName());
    school.setPid(proBean.getId().longValue());
    return school;
}
/**
 * Entry point: processes the city list once for high schools and then once
 * for junior schools, using a fresh converter for each pass.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    String[] schoolTypes = { "highschool", "juniorschool" };
    for (int i = 0; i < schoolTypes.length; i++) {
        HtmlConverter converter = new HtmlConverter();
        File cityList = new File("src//city.txt");
        converter.ReadWrite(cityList, schoolTypes[i]);
    }
}
}
-- Table for junior secondary schools; rows are written by SchoolDao.insertJunoirschool.
-- Columns: name = school name, area = district/county name, href = numeric link
-- id of the school page, city/cid = city name and code, province/pid = province
-- name and code. `id` is generated by the database; the insert does not supply it.
-- NOTE(review): AUTO_INCREMENT=91157 looks like it was carried over from a dump of
-- an already populated table; on a fresh table it makes ids start at 91157.
DROP TABLE IF EXISTS `juniorschool`;
CREATE TABLE `juniorschool` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`name` varchar(500) DEFAULT NULL,
`area` varchar(100) DEFAULT NULL,
`href` int(11) DEFAULT NULL,
`city` varchar(100) DEFAULT NULL,
`cid` int(11) DEFAULT NULL,
`province` varchar(100) DEFAULT NULL,
`pid` int(11) DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=91157 DEFAULT CHARSET=utf8;
-- Table for high schools; rows are written by SchoolDao.insertHighschool.
-- Columns: name = school name, area = district/county name, href = numeric link
-- id of the school page, city/cid = city name and code, province/pid = province
-- name and code. `id` is generated by the database; the insert does not supply it.
-- NOTE(review): AUTO_INCREMENT=21353 looks like it was carried over from a dump of
-- an already populated table; on a fresh table it makes ids start at 21353.
DROP TABLE IF EXISTS `highschool`;
CREATE TABLE `highschool` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`name` varchar(500) DEFAULT NULL,
`area` varchar(100) DEFAULT NULL,
`href` int(11) DEFAULT NULL,
`city` varchar(100) DEFAULT NULL,
`cid` int(11) DEFAULT NULL,
`province` varchar(100) DEFAULT NULL,
`pid` int(11) DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=21353 DEFAULT CHARSET=utf8;