1 packagecom.thunisoft.kms.java.lvl2.exam;2
3 importjava.io.BufferedReader;4 importjava.io.FileInputStream;5 importjava.io.FileOutputStream;6 importjava.io.IOException;7 importjava.io.InputStream;8 importjava.io.InputStreamReader;9 importjava.io.Reader;10 importjava.net.URL;11 importjava.net.URLConnection;12 importjava.util.Vector;13 importjava.util.regex.Matcher;14 importjava.util.regex.Pattern;15
16 importjavax.swing.text.MutableAttributeSet;17 importjavax.swing.text.html.HTML;18 importjavax.swing.text.html.HTMLEditorKit;19 importjavax.swing.text.html.HTMLEditorKit.ParserCallback;20 importjavax.swing.text.html.parser.ParserDelegator;21
/**
 * Scrapes a weather forecast page with the Swing HTML parser.
 *
 * <p>The class is a {@link ParserCallback}: the parser drives the
 * {@code handle*} methods, which track whether the parser is currently inside
 * the forecast container ({@code <div id="7d">}), one of its {@code <td>}
 * cells, or one of its {@code .php} links, and collect the whitespace-free
 * text of those elements into the shared {@link #element} buffer.
 * {@link #main(String[])} downloads the page to a local file, parses that
 * file and prints the collected text to standard output.
 *
 * <p>The collected text lives in static fields that are never cleared by this
 * class, so text from successive parses accumulates and the class is not safe
 * for concurrent use.
 *
 * @author keep at it
 * @version 1.0
 * @date 2013-12-4
 */
public class GrapWeatherInfo extends ParserCallback
{
    /** True while inside a {@code <table class="yuBaoTable">}. */
    protected boolean isTable = false;
    /** True while inside a qualifying {@code <a>} element (see {@link #handleStartTag}). */
    protected boolean isAlink = false;
    /** True while inside the {@code <div id="7d">} forecast container. */
    protected boolean isDiv = false;
    /** True while inside a {@code <td>} that was opened within the forecast container. */
    protected boolean isTd = false;
    /** Text fragments that matched the collection rules, in document order. */
    protected static Vector<String> element = new Vector<String>();
    /** Exposed through {@link #getParagraphText()}; never assigned by this class. */
    protected static String paragraphText = "";
    /** Default URL of the page to fetch. */
    private static final String FILE_URL = "http://www.weather.com.cn/weather/101010100.shtml";
    /** Default location of the local copy of the page. */
    private static final String FILE_LOCATION = "E:/url.html";

    /** Matches the whitespace that is stripped from collected text; compiled once. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s*|\t|\r|\n");
    /** Chinese word "xingqi" (weekday prefix), written as escapes to be encoding-independent. */
    private static final String WEEKDAY_MARKER = "\u661f\u671f";
    /** Chinese word "yejian" (night-time), written as escapes to be encoding-independent. */
    private static final String NIGHT_SUFFIX = "\u591c\u95f4";
    /** Element appended to {@link #element} whenever a table ends. */
    private static final String TABLE_SEPARATOR = ";";
    /** Size of the copy buffer used while downloading. */
    private static final int BUFFER_SIZE = 1024;

    /** Creates a callback with all state flags cleared. */
    public GrapWeatherInfo()
    {
    }

    /**
     * Parses the given HTML stream with a fresh callback and prints the
     * collected text to standard output.
     *
     * <p>Any exception raised while parsing or printing is reported with
     * {@code printStackTrace()} and not rethrown.
     *
     * @param r reader positioned at the start of the HTML document
     */
    private static void startParse(Reader r)
    {
        try
        {
            // ParserDelegator starts a new DocumentParser on every parse() call.
            ParserDelegator delegator = new ParserDelegator();
            HTMLEditorKit.ParserCallback callback = new GrapWeatherInfo();
            // The parse results drive the callback methods; charset directives are ignored.
            delegator.parse(r, callback, true);
            printCollected(element);
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Prints the collected fragments, starting a new output line at every
     * table separator.
     *
     * <p>Iteration starts at index 1 because each fragment is compared with
     * its predecessor; the fragment at index 0 is therefore never printed.
     *
     * @param fragments collected text in document order
     */
    private static void printCollected(Vector<String> fragments)
    {
        // Most recent fragment that contained the weekday marker.
        String lastWeekday = "";
        for (int i = 1; i < fragments.size(); i++)
        {
            String current = fragments.get(i);
            if (current.contains(WEEKDAY_MARKER))
            {
                lastWeekday = current;
            }
            if (current.equals(TABLE_SEPARATOR))
            {
                System.out.println();
            }
            else if (!current.equals(">"))
            {
                // A night entry that does not directly follow a weekday label
                // starts a new line and repeats the last weekday label first.
                if (current.endsWith(NIGHT_SUFFIX)
                        && !fragments.get(i - 1).contains(WEEKDAY_MARKER))
                {
                    System.out.println();
                    System.out.print(lastWeekday + " ");
                }
                System.out.print(current + " ");
            }
        }
    }

    /**
     * Collects text that appears inside a qualifying link or table cell,
     * with all whitespace removed.
     *
     * @param data the text reported by the parser
     * @param pos position of the text in the document (unused)
     */
    @Override
    public void handleText(char[] data, int pos)
    {
        if (!isAlink && !isTd)
        {
            return;
        }
        // Guard kept from the original code; it only matters if paragraphText
        // has been set to null from outside this class.
        if (paragraphText != null)
        {
            element.addElement(WHITESPACE.matcher(new String(data)).replaceAll(""));
        }
    }

    /**
     * Updates the state flags when a start tag is seen.
     *
     * @param t the tag that was opened
     * @param a the attributes of the tag
     * @param pos position of the tag in the document (unused)
     */
    @Override
    public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos)
    {
        // The forecast container is the <div> whose id attribute is "7d".
        if (t == HTML.Tag.DIV && "7d".equals(a.getAttribute(HTML.Attribute.ID)))
        {
            isDiv = true;
        }
        // The forecast table is the <table> whose class attribute is "yuBaoTable".
        if (t == HTML.Tag.TABLE && "yuBaoTable".equals(a.getAttribute(HTML.Attribute.CLASS)))
        {
            isTable = true;
        }
        // Inside the container, only links without an id whose href ends in ".php" qualify.
        if (t == HTML.Tag.A && isDiv && a.getAttribute(HTML.Attribute.ID) == null)
        {
            Object href = a.getAttribute(HTML.Attribute.HREF);
            if (href != null && href.toString().endsWith(".php"))
            {
                isAlink = true;
            }
        }
        if (t == HTML.Tag.TD && isDiv)
        {
            isTd = true;
        }
    }

    /**
     * Called when the parser reports a problem; errors are ignored.
     *
     * @param errorMsg description of the problem
     * @param pos position of the problem in the document
     */
    @Override
    public void handleError(String errorMsg, int pos)
    {
    }

    /**
     * Treats a simple (empty) tag exactly like a start tag.
     *
     * @param t the tag
     * @param a the attributes of the tag
     * @param pos position of the tag in the document
     */
    @Override
    public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos)
    {
        handleStartTag(t, a, pos);
    }

    /**
     * Returns the shared paragraph text.
     *
     * @return the current value of {@link #paragraphText}
     */
    public static String getParagraphText()
    {
        return paragraphText;
    }

    /**
     * Called for HTML comments; comments are ignored.
     *
     * @param data the comment text
     * @param pos position of the comment in the document
     */
    @Override
    public void handleComment(char[] data, int pos)
    {
    }

    /**
     * Updates the state flags when an end tag is seen.
     *
     * @param t the tag that was closed
     * @param pos position of the tag in the document (unused)
     */
    @Override
    public void handleEndTag(HTML.Tag t, int pos)
    {
        if (t == HTML.Tag.A)
        {
            isAlink = false;
        }
        else if (t == HTML.Tag.TABLE && !isAlink)
        {
            isTable = false;
            // A separator is appended after every table that ends outside a
            // link so the text of each table is printed on its own line.
            element.addElement(TABLE_SEPARATOR);
        }
        else if (t == HTML.Tag.DIV && !isTable)
        {
            // A </div> outside the forecast table ends the container.
            isDiv = false;
        }
        else if (t == HTML.Tag.TD)
        {
            isTd = false;
        }
    }

    /**
     * Program entry point: downloads the forecast page to a local file, then
     * parses that file and prints the extracted text.
     *
     * <p>Failures are reported with {@code printStackTrace()}.
     *
     * @param args optional overrides: {@code args[0]} is the page URL and
     *        {@code args[1]} the local file path; the built-in defaults are
     *        used for whichever is missing
     */
    public static void main(String args[])
    {
        String sourceUrl = (args != null && args.length > 0) ? args[0] : FILE_URL;
        String localPath = (args != null && args.length > 1) ? args[1] : FILE_LOCATION;
        try
        {
            // The output file is closed before it is opened again for reading.
            download(sourceUrl, localPath);
            parseFile(localPath);
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Copies the content served at {@code sourceUrl} into the file
     * {@code targetPath}, overwriting any existing file.
     *
     * @param sourceUrl URL to fetch
     * @param targetPath path of the file to write
     * @throws IOException if the connection, the read or the write fails
     */
    private static void download(String sourceUrl, String targetPath) throws IOException
    {
        InputStream in = null;
        FileOutputStream out = null;
        try
        {
            URLConnection conn = new URL(sourceUrl).openConnection();
            conn.connect();
            in = conn.getInputStream();
            out = new FileOutputStream(targetPath);
            byte[] buffer = new byte[BUFFER_SIZE];
            int count;
            while ((count = in.read(buffer)) != -1)
            {
                out.write(buffer, 0, count);
            }
        }
        finally
        {
            closeQuietly(out);
            closeQuietly(in);
        }
    }

    /**
     * Opens the file at {@code path} as UTF-8 text and runs the HTML parse on it.
     *
     * @param path path of the HTML file to parse
     * @throws IOException if the file cannot be opened
     */
    private static void parseFile(String path) throws IOException
    {
        FileInputStream fileIn = new FileInputStream(path);
        try
        {
            startParse(new BufferedReader(new InputStreamReader(fileIn, "UTF-8")));
        }
        finally
        {
            // Closing the underlying stream releases the file handle held by the readers.
            closeQuietly(fileIn);
        }
    }

    /**
     * Closes a resource, reporting (rather than propagating) a failure so
     * that it cannot mask an earlier exception.
     *
     * @param resource the resource to close; may be {@code null}
     */
    private static void closeQuietly(java.io.Closeable resource)
    {
        if (resource == null)
        {
            return;
        }
        try
        {
            resource.close();
        }
        catch (IOException e)
        {
            System.err.println("Failed to close resource: " + e);
        }
    }
}