java html类_用JAVA基本类库去解析HTML

最新推荐文章于 2024-02-21 09:11:09 发布

李惠玥

最新推荐文章于 2024-02-21 09:11:09 发布

阅读量356

点赞数

文章标签： java html类

本文链接：https://blog.csdn.net/weixin_42355744/article/details/114056572

版权

package com.thunisoft.kms.java.lvl2.exam;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Downloads a weather page and extracts selected text from it using only the JDK's
 * built-in Swing HTML parser ({@link ParserDelegator}).
 *
 * <p>The class is a {@link ParserCallback}: the parser drives the {@code handle*} methods,
 * which track whether the current position is inside the {@code div} with id {@code "7d"},
 * a {@code table} with class {@code "yuBaoTable"}, a matching {@code a} link or a
 * {@code td} cell, and collect whitespace-stripped text into the shared {@link #element}
 * list.
 *
 * <p>The collected text lives in static state, so results accumulate across parser runs
 * within one JVM and the class is not safe for concurrent use.
 *
 * <p>Copyright: Copyright (c) 2007
 *
 * @author keep at it
 * @version 1.0 (2013-12-4)
 */
public class GrapWeatherInfo extends ParserCallback
{
    /** True after a {@code table} with class "yuBaoTable" was opened and not yet closed. */
    protected boolean isTable = false;

    /** True while inside an {@code a} tag that matched the link filter. */
    protected boolean isAlink = false;

    /** True while inside the {@code div} whose id is "7d". */
    protected boolean isDiv = false;

    /** True while inside a {@code td} that was opened within the target div. */
    protected boolean isTd = false;

    /** Collected text fragments; a {@code ";"} entry is appended at the end of each table. */
    protected static Vector<String> element = new Vector<String>();

    /** Never reassigned by this class; exposed through {@link #getParagraphText()}. */
    protected static String paragraphText = "";

    /** URL of the page to download. */
    private static final String FILE_URL = "http://www.weather.com.cn/weather/101010100.shtml";

    /** Local path where the downloaded page is stored before parsing. */
    private static final String FILE_LOCATION = "E:/url.html";

    /** Whitespace matcher used to strip all whitespace from collected text. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s*|\t|\r|\n");

    /**
     * Marker contained in day labels: the Chinese word for "week" (U+661F U+671F).
     * Written as Unicode escapes so the source compiles under any source encoding.
     */
    private static final String WEEKDAY_MARKER = "\u661F\u671F";

    /** Suffix of night-time entries: the Chinese word for "night-time" (U+591C U+95F4). */
    private static final String NIGHT_SUFFIX = "\u591C\u95F4";

    /** Separator entry appended to {@link #element} when a table ends. */
    private static final String TABLE_SEPARATOR = ";";

    /** Creates a callback with all tracking flags cleared. */
    public GrapWeatherInfo()
    {
    }

    /**
     * Parses the given HTML stream with a fresh callback and prints the collected text.
     *
     * @param r reader positioned at the start of the HTML document
     */
    private static void startParse(Reader r)
    {
        try
        {
            ParserDelegator delegator = new ParserDelegator();
            HTMLEditorKit.ParserCallback callback = new GrapWeatherInfo();
            // The third argument tells the parser to ignore charset directives in the document.
            delegator.parse(r, callback, true);
            printElements(element);
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Prints the collected fragments to standard output, starting a new line at every
     * table separator and before night-time entries that do not directly follow a day label.
     *
     * @param items collected text fragments in document order
     */
    private static void printElements(Vector<String> items)
    {
        String lastDayLabel = "";
        // Iteration starts at index 1, as in the original implementation, so the first
        // collected fragment is never printed.
        // NOTE(review): confirm that skipping element 0 is intentional.
        for (int i = 1; i < items.size(); i++)
        {
            String current = items.get(i);
            if (current.contains(WEEKDAY_MARKER))
            {
                lastDayLabel = current;
            }

            if (current.equals(TABLE_SEPARATOR))
            {
                System.out.println();
            }
            else if (!current.equals(">"))
            {
                boolean nightWithoutLabel = current.endsWith(NIGHT_SUFFIX)
                        && !items.get(i - 1).contains(WEEKDAY_MARKER);
                if (nightWithoutLabel)
                {
                    // Repeat the most recent day label in front of the night-time entry.
                    System.out.println();
                    System.out.print(lastDayLabel + " ");
                }
                System.out.print(current + " ");
            }
        }
    }

    /**
     * Collects text found inside a matching link or a tracked table cell, with all
     * whitespace removed.
     *
     * @param data the text characters reported by the parser
     * @param pos  position of the text in the document (unused)
     */
    @Override
    public void handleText(char[] data, int pos)
    {
        if ((isAlink || isTd) && paragraphText != null)
        {
            String text = new String(data);
            element.addElement(WHITESPACE.matcher(text).replaceAll(""));
        }
    }

    /**
     * Updates the tracking flags when a start tag is encountered.
     *
     * @param t   the tag being opened
     * @param a   the attributes of the tag
     * @param pos position of the tag in the document (unused)
     */
    @Override
    public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos)
    {
        // "7d" is the id that distinguishes the target div from all other divs.
        if (t == HTML.Tag.DIV && "7d".equals(a.getAttribute(HTML.Attribute.ID)))
        {
            isDiv = true;
        }

        // "yuBaoTable" is the class that distinguishes the target tables.
        if (t == HTML.Tag.TABLE && "yuBaoTable".equals(a.getAttribute(HTML.Attribute.CLASS)))
        {
            isTable = true;
        }

        // Only links inside the target div, without an id, whose href ends in ".php" qualify.
        if (t == HTML.Tag.A && isDiv && a.getAttribute(HTML.Attribute.ID) == null)
        {
            Object href = a.getAttribute(HTML.Attribute.HREF);
            if (href != null && href.toString().endsWith(".php"))
            {
                isAlink = true;
            }
        }

        if (t == HTML.Tag.TD && isDiv)
        {
            isTd = true;
        }
    }

    /**
     * Called by the parser when it reports an error; errors are ignored.
     *
     * @param errorMsg the parser's error message
     * @param pos      position of the error in the document
     */
    @Override
    public void handleError(String errorMsg, int pos)
    {
    }

    /**
     * Treats a simple tag exactly like a start tag.
     *
     * @param t   the tag
     * @param a   the attributes of the tag
     * @param pos position of the tag in the document
     */
    @Override
    public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos)
    {
        handleStartTag(t, a, pos);
    }

    /**
     * Returns the shared paragraph text.
     *
     * @return the current value of {@link #paragraphText}
     */
    public static String getParagraphText()
    {
        return paragraphText;
    }

    /**
     * Called by the parser for comments; comments are ignored.
     *
     * @param data the comment characters
     * @param pos  position of the comment in the document
     */
    @Override
    public void handleComment(char[] data, int pos)
    {
    }

    /**
     * Clears the tracking flags when an end tag is encountered and appends a separator
     * entry to {@link #element} at the end of every table that is not inside a matching link.
     *
     * @param t   the tag being closed
     * @param pos position of the tag in the document (unused)
     */
    @Override
    public void handleEndTag(HTML.Tag t, int pos)
    {
        if (t == HTML.Tag.A)
        {
            isAlink = false;
        }
        else if (t == HTML.Tag.TABLE && !isAlink)
        {
            isTable = false;
            // The separator is appended for every closed table, not only "yuBaoTable" ones,
            // which matches the original behaviour.
            // NOTE(review): confirm whether it was meant to be limited to target tables.
            element.addElement(TABLE_SEPARATOR);
        }
        else if (t == HTML.Tag.DIV && !isTable)
        {
            isDiv = false;
        }
        else if (t == HTML.Tag.TD)
        {
            isTd = false;
        }
    }

    /**
     * Program entry point: downloads the page to {@link #FILE_LOCATION}, then parses the
     * saved file as UTF-8 and prints the extracted text.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String args[])
    {
        try
        {
            download(FILE_URL, FILE_LOCATION);
            // The output file is fully written and closed before it is read back.
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                    new FileInputStream(FILE_LOCATION), StandardCharsets.UTF_8)))
            {
                startParse(reader);
            }
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Copies the content served at the given URL into a local file.
     *
     * @param sourceUrl  address to download from
     * @param targetPath path of the file to create or overwrite
     * @throws IOException if the connection, the read or the write fails
     */
    private static void download(String sourceUrl, String targetPath) throws IOException
    {
        URLConnection connection = new URL(sourceUrl).openConnection();
        connection.connect();
        try (InputStream in = connection.getInputStream();
             FileOutputStream out = new FileOutputStream(targetPath))
        {
            byte[] buffer = new byte[1024];
            int read;
            while ((read = in.read(buffer)) != -1)
            {
                out.write(buffer, 0, read);
            }
        }
    }
}

李惠玥

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
java html类_用JAVA基本类库去解析HTML

package com.thunisoft.kms.java.lvl2.exam; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; imp...
复制链接

扫一扫