Java HTML 正则表达式:Java 正则表达式解析 HTML 示例分享

package work;

import java.io.BufferedReader;

import java.io.IOException;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;

import org.apache.commons.httpclient.HttpClient;

import org.apache.commons.httpclient.HttpException;

import org.apache.commons.httpclient.HttpStatus;

import org.apache.commons.httpclient.methods.GetMethod;

import org.apache.commons.httpclient.params.HttpMethodParams;

public class chuanboyi {

public static void main(String[] args){

// TODO Auto-generated method stub

StringBuffer html = new StringBuffer();

HttpClient httpclient = new HttpClient();

//创建GET方法实例

GetMethod getMethod = new GetMethod("https://www.jb51.net");

//使用系统提供的默认恢复策略

getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler());

try{

//执行GET方法

int statusCode = httpclient.executeMethod(getMethod);

if(statusCode != HttpStatus.SC_OK){

System.out.println("Method is wrong " + getMethod.getStatusLine());

}

InputStream responseBody = getMethod.getResponseBodyAsStream();

BufferedReader reader = new BufferedReader(new InputStreamReader(responseBody,"utf-8"));

String line = reader.readLine();

while(line != null){

html.append(line).append("\n");

line = reader.readLine();

}

reader.close();

//正则表达式

String regex = "

[\\s\\S]+.*";

String regexa ="(?<=

)[\\s\\S]+?(?=)";

Pattern pattern = Pattern.compile(regex);

Matcher m = pattern.matcher(html);

StringBuffer str = new StringBuffer();

int i = 0;

while(m.find()){

str.append(m.group());

}

pattern = Pattern.compile(regexa);

m = pattern.matcher(str);

while(m.find()){

attrs(m.group());

i++;

}

System.out.println("共有"+i+"条数据!");

}catch (HttpException e) {

// TODO: handle exception

System.out.println("Please check your provided http address!");

e.printStackTrace();

}catch (IOException e) {

// TODO: handle exception

System.out.println("the line is wrong!");

e.printStackTrace();

}finally{

getMethod.releaseConnection();//释放链接

}

}

public static void attrs(String str){

//获取url的正则表达式

String regexURL = "[a-z]+-[0-9]+\\.html";

//获取Name的正则表达式

String regexName = "(?<=title=\")[[\\w-\\s][^x00-xff]]+(?=\")";

//获取图片的正则表达式

String regexPicture = "images.*\\.jpg";

Pattern patternURL = Pattern.compile(regexURL);

Pattern patternName = Pattern.compile(regexName);

Pattern patternPicture = Pattern.compile(regexPicture);

Matcher mURL = patternURL.matcher(str);

Matcher mName = patternName.matcher(str);

Matcher mPicture = patternPicture.matcher(str);

if(mName.find()){

System.out.println("名字:"+mName.group());

}

if(mURL.find()){

System.out.println("链接:"+mURL.group());

}

if(mPicture.find()){

System.out.println("图片:"+mPicture.group());

}

}

}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值