java利用正则表达式截取网页数据

首先,FetchWebMsg 类用来读取动态网页,并把网页源码以字符串形式返回:

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;

/**
 * Downloads the text behind a URL and returns it as one string.
 *
 * <p>The response is read line by line and the lines are concatenated directly, so
 * line terminators are NOT present in the returned text.
 *
 * <p>Instances keep no state between calls; every call works on local variables only.
 *
 * <p>Usage example:
 * <pre>
 *   String html = new FetchWebMsg().fetchWeb("http://www.weather.com.cn/html/weather/101120101.shtml");
 * </pre>
 */
public class FetchWebMsg
{
  /** Charset used by {@link #fetchWeb(String)} when the caller does not name one. */
  private static final String DEFAULT_CHARSET = "utf-8";

  /**
   * Fetches the resource at {@code url}, decoding it as UTF-8.
   *
   * @param url address to open, in a form accepted by {@link URL#URL(String)}
   * @return the response text with all line terminators removed
   * @throws Exception if the URL is malformed or the resource cannot be opened or read
   */
  public String fetchWeb(String url)
    throws Exception
  {
    return fetchWeb(url, DEFAULT_CHARSET);
  }

  /**
   * Fetches the resource at {@code url}, decoding it with the given charset.
   *
   * @param url address to open, in a form accepted by {@link URL#URL(String)}
   * @param charset name of the charset used to decode the response bytes
   * @return the response text with all line terminators removed
   * @throws Exception if the URL is malformed, the charset name is not supported,
   *         or the resource cannot be opened or read
   */
  public String fetchWeb(String url, String charset) throws Exception
  {
    URL newUrl = new URL(url);
    URLConnection urlConnection = newUrl.openConnection();

    StringBuilder content = new StringBuilder();
    // The raw stream is declared as its own resource so it is closed even when
    // building the reader fails (for example on an unsupported charset name).
    try (java.io.InputStream in = urlConnection.getInputStream();
         BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset)))
    {
      String line;
      while ((line = reader.readLine()) != null)
      {
        // readLine() strips the terminator and none is re-added here.
        content.append(line);
      }
    }
    return content.toString();
  }
}

以上只是从网页上动态读取数据,下面利用 JDK 自带的 Pattern 类和 Matcher 类来截取数据:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class DouBanBookWebMsg
{
  public String getMsg()
    throws Exception
  {
    String url = "http://book.douban.com/chart?";
    String regx = null;

    FetchWebMsg fetchMsg = new FetchWebMsg();
   
    /**
     * <p>description:\是转义字符,\s是匹配空白符+表示匹配空白符一次或者多次[^>]匹配除了>意外的字符.*?可以匹配任意字符串其中.是匹配除了换行意外的任何字符<p>
     * <p>description:*是重复零次或更多次,?是重复零次或一次,.+?匹配除了换行意外的字符零次或者更多次,\\s+.*?匹配任意的空白符零次或者多次</p>
     *
     *
     *
     */

    regx = "<ul\\s*class=\"chart-dashed-list\">.*?<li.+?>.+?<img\\s+.*?\\s+src=\"(.+?)\"/>.+?<h2.+?>.+?<a.+?>(.+?)</a>.+?</h2>.+?<p.+?>(.+?)</p>.+?</li>.+?<li.+?>.+?<img\\s+.*?\\s+src=\"(.+?)\"/>.+?<h2.+?>.+?<a.+?>(.+?)</a>.+?</h2>.+?<p.+?>(.+?)</p>.+?</li>.+?<li.+?>.+?<img\\s+.*?\\s+src=\"(.+?)\"/>.+?<h2.+?>.+?<a.+?>(.+?)</a>.+?</h2>.+?<p.+?>(.+?)</p>.+?</li>.+?<li.+?>.+?<img\\s+.*?\\s+src=\"(.+?)\"/>.+?<h2.+?>.+?<a.+?>(.+?)</a>.+?</h2>.+?<p.+?>(.+?)</p>.+?</li>.+?<li.+?>.+?<img\\s+.*?\\s+src=\"(.+?)\"/>.+?<h2.+?>.+?<a.+?>(.+?)</a>.+?</h2>.+?<p.+?>(.+?)</p>.+?</li>.+?<li.+?>.+?<img\\s+.*?\\s+src=\"(.+?)\"/>.+?<h2.+?>.+?<a.+?>(.+?)</a>.+?</h2>.+?<p.+?>(.+?)</p>.+?</li>.+?<li.+?>.+?<img\\s+.*?\\s+src=\"(.+?)\"/>.+?<h2.+?>.+?<a.+?>(.+?)</a>.+?</h2>.+?<p.+?>(.+?)</p>.+?</li>.+?<li.+?>.+?<img\\s+.*?\\s+src=\"(.+?)\"/>.+?<h2.+?>.+?<a.+?>(.+?)</a>.+?</h2>.+?<p.+?>(.+?)</p>.+?</li>.+?<li.+?>.+?<img\\s+.*?\\s+src=\"(.+?)\"/>.+?<h2.+?>.+?<a.+?>(.+?)</a>.+?</h2>.+?<p.+?>(.+?)</p>.+?</li>.+?<li.+?>.+?<img\\s+.*?\\s+src=\"(.+?)\"/>.+?<h2.+?>.+?<a.+?>(.+?)</a>.+?</h2>.+?<p.+?>(.+?)</p>(.+?)</li>(.+?)</ul>";

    String webHtmlText = fetchMsg.fetchWeb("http://book.douban.com/chart?");

    String result = geMsgByRegx(regx, webHtmlText);

    return result;
  }

  public String geMsgByRegx(String regx, String webHtmlText)
  {
    StringBuffer result = new StringBuffer();
    Pattern p = Pattern.compile(regx);
    Matcher macher = p.matcher(webHtmlText);
    while (macher.find())
    {
      result.append("[{\"photoAddr\":\"" + macher.group(1).trim().replace("spic", "lpic") + "\"");
      result.append(",\"bookName\":\"" + macher.group(2).trim() + "\"");
      String[] authMsg = macher.group(3).trim().split("/");
      result.append(",\"auth\":\"" + authMsg[0] + "\"");
      result.append(",\"publicationDate\":\"" + authMsg[1] + "\"");
      result.append(",\"press\":\"" + authMsg[2] + "\"}");

      result.append(",{\"photoAddr\":\"" + macher.group(4).trim().replace("spic", "lpic") + "\"");
      result.append(",\"bookName\":\"" + macher.group(5).trim() + "\"");
      String[] authMsg1 = macher.group(6).trim().split("/");
      result.append(",\"auth\":\"" + authMsg1[0] + "\"");
      result.append(",\"publicationDate\":\"" + authMsg1[1] + "\"");
      result.append(",\"press\":\"" + authMsg1[2] + "\"}");

      result.append(",{\"photoAddr\":\"" + macher.group(7).trim().replace("spic", "lpic") + "\"");
      result.append(",\"bookName\":\"" + macher.group(8).trim() + "\"");
      String[] authMsg2 = macher.group(9).trim().split("/");
      result.append(",\"auth\":\"" + authMsg2[0] + "\"");
      result.append(",\"publicationDate\":\"" + authMsg1[1] + "\"");
      result.append(",\"press\":\"" + authMsg2[2] + "\"}");

      result.append(",{\"photoAddr\":\"" + macher.group(10).trim().replace("spic", "lpic") + "\"");
      result.append(",\"bookName\":\"" + macher.group(11).trim() + "\"");
      String[] authMsg3 = macher.group(12).trim().split("/");
      result.append(",\"auth\":\"" + authMsg3[0] + "\"");
      result.append(",\"publicationDate\":\"" + authMsg1[1] + "\"");
      result.append(",\"press\":\"" + authMsg3[2] + "\"}");

      result.append(",{\"photoAddr\":\"" + macher.group(13).trim().replace("spic", "lpic") + "\"");
      result.append(",\"bookName\":\"" + macher.group(14).trim() + "\"");
      String[] authMsg4 = macher.group(15).trim().split("/");
      result.append(",\"auth\":\"" + authMsg4[0] + "\"");
      result.append(",\"publicationDate\":\"" + authMsg1[1] + "\"");
      result.append(",\"press\":\"" + authMsg4[2] + "\"}");

      result.append(",{\"photoAddr\":\"" + macher.group(16).trim().replace("spic", "lpic") + "\"");
      result.append(",\"bookName\":\"" + macher.group(17).trim() + "\"");
      String[] authMsg5 = macher.group(18).trim().split("/");
      result.append(",\"auth\":\"" + authMsg5[0] + "\"");
      result.append(",\"publicationDate\":\"" + authMsg1[1] + "\"");
      result.append(",\"press\":\"" + authMsg5[2] + "\"}");

      result.append(",{\"photoAddr\":\"" + macher.group(19).trim().replace("spic", "lpic") + "\"");
      result.append(",\"bookName\":\"" + macher.group(20).trim() + "\"");
      String[] authMsg6 = macher.group(21).trim().split("/");
      result.append(",\"auth\":\"" + authMsg6[0] + "\"");
      result.append(",\"publicationDate\":\"" + authMsg1[1] + "\"");
      result.append(",\"press\":\"" + authMsg6[2] + "\"}");

      result.append(",{\"photoAddr\":\"" + macher.group(22).trim().replace("spic", "lpic") + "\"");
      result.append(",\"bookName\":\"" + macher.group(23).trim() + "\"");
      String[] authMsg7 = macher.group(24).trim().split("/");
      result.append(",\"auth\":\"" + authMsg7[0] + "\"");
      result.append(",\"publicationDate\":\"" + authMsg1[1] + "\"");
      result.append(",\"press\":\"" + authMsg7[2] + "\"}");

      result.append(",{\"photoAddr\":\"" + macher.group(25).trim().replace("spic", "lpic") + "\"");
      result.append(",\"bookName\":\"" + macher.group(26).trim() + "\"");
      String[] authMsg8 = macher.group(27).trim().split("/");
      result.append(",\"auth\":\"" + authMsg8[0] + "\"");
      result.append(",\"publicationDate\":\"" + authMsg1[1] + "\"");
      result.append(",\"press\":\"" + authMsg8[2] + "\"}");

      result.append(",{\"photoAddr\":\"" + macher.group(28).trim().replace("spic", "lpic") + "\"");
      result.append(",\"bookName\":\"" + macher.group(29).trim() + "\"");
      String[] authMsg9 = macher.group(30).trim().split("/");
      result.append(",\"auth\":\"" + authMsg9[0] + "\"");
      result.append(",\"publicationDate\":\"" + authMsg1[1] + "\"");
      result.append(",\"press\":\"" + authMsg9[2] + "\"}]");
    }

    return result.toString();
  }
}

转载于:https://my.oschina.net/u/1034481/blog/337625

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值