Java 获取 URL 内容及正则匹配链接、图片地址

import java.net.*;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.*;

/**
 * Fetches a web page from {@code host}, saves its text to {@code filePath} and collects the
 * addresses of every hyperlink ({@code <a href="...">}) and image ({@code <img src="...">})
 * found in it.
 *
 * <p>Two ways of fetching are offered: {@link #getHttpUrlContent()} goes through
 * {@link URL#openStream()}, {@link #getHttpSocketContent()} speaks HTTP over a plain socket.
 * Both read and write text with the platform default charset.
 *
 * <p>Matching is done line by line, so a tag whose attributes span several lines is not found.
 */
public class QuestionResult {
    /** Captures the {@code href} value of an anchor tag; compiled once, applied to every line. */
    private static final Pattern LINK_PATTERN = Pattern.compile("<a.*?href=\"(.*?)\"");
    /** Captures the {@code src} value of an image tag; compiled once, applied to every line. */
    private static final Pattern IMAGE_PATTERN = Pattern.compile("<img.*?src=\"(.*?)\"");

    private Socket socket;
    private String host = "www.01hr.com";
    private int port = 80;
    private String filePath = "E:\\index.txt";
    // The lists live as long as the object so matches from every line accumulate; they used
    // to be re-created on each match call, which discarded everything but the last line.
    private final ArrayList<String> urlList = new ArrayList<String>();
    private final ArrayList<String> imgList = new ArrayList<String>();

    /**
     * Opens the socket connection to {@code host}:{@code port}, closing a previously opened
     * one first so it is not leaked.
     *
     * @throws Exception if the connection cannot be established
     */
    public void initSocket() throws Exception {
        if (socket != null && !socket.isClosed()) {
            socket.close();
        }
        socket = new Socket(host, port);
    }

    /**
     * Approach 1: fetches the home page of {@code host} through {@link URL#openStream()},
     * writes it to {@code filePath} and records the link and image addresses it contains.
     * Results of an earlier fetch are discarded.
     *
     * @throws Exception if the page cannot be read or the file cannot be written
     */
    public void getHttpUrlContent() throws Exception {
        URL url = new URL("http://" + host);
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream()))) {
            savePage(reader, false);
        }
    }

    /**
     * Approach 2: requests {@code /index.html} with a hand-written HTTP request over the
     * socket, writes the response body to {@code filePath} and records the link and image
     * addresses it contains. Results of an earlier fetch are discarded.
     *
     * <p>The socket is opened on demand when {@link #initSocket()} has not been called (or the
     * previous connection was already closed), and it is always closed before returning.
     *
     * <p>Limitation: a chunked response body is not decoded, so chunk-size lines would end up
     * in the saved file.
     *
     * @throws Exception if the connection, the request or the file write fails
     */
    public void getHttpSocketContent() throws Exception {
        if (socket == null || socket.isClosed()) {
            initSocket();
        }

        StringBuilder request = new StringBuilder();
        request.append("GET /index.html HTTP/1.1\r\n");
        request.append("Host: ").append(host).append("\r\n");
        request.append("Accept: */*\r\n");
        request.append("Accept-Language: zh-cn\r\n");
        // No Accept-Encoding header: the body is read as plain text and never decompressed,
        // so the server must not be invited to send gzip/deflate.
        request.append("User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)\r\n");
        // "close" makes the server end the connection after the response; with Keep-Alive the
        // read loop would block until the server's idle timeout because it waits for EOF.
        request.append("Connection: close\r\n\r\n");

        try (Socket connection = socket;
                OutputStream out = connection.getOutputStream();
                BufferedReader reader =
                        new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
            out.write(request.toString().getBytes("US-ASCII"));
            out.flush();
            savePage(reader, true);
        }
    }

    /**
     * Copies the page text from {@code reader} into {@code filePath}, feeding every copied
     * line to the link and image matchers.
     *
     * @param reader      source of the page text; the caller closes it
     * @param skipHeaders when true, everything up to and including the first empty line (the
     *                    HTTP response head) is dropped instead of being copied
     * @throws IOException if reading the page or writing the file fails
     */
    private void savePage(BufferedReader reader, boolean skipHeaders) throws IOException {
        urlList.clear();
        imgList.clear();
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(filePath))) {
            boolean inHeaders = skipHeaders;
            String line;
            while ((line = reader.readLine()) != null) {
                if (inHeaders) {
                    // The response head ends at the first empty line, however many header
                    // lines the server happened to send.
                    if (line.isEmpty()) {
                        inHeaders = false;
                    }
                    continue;
                }
                matchUrl(line);
                matchImage(line);
                writer.write(line);
                writer.newLine();
            }
        }
    }

    /**
     * Appends the {@code href} value of every anchor tag in {@code str} to the link list.
     *
     * @param str one line of HTML; {@code null} is ignored
     */
    public void matchUrl(String str) {
        collectMatches(LINK_PATTERN, str, urlList);
    }

    /**
     * Appends the {@code src} value of every image tag in {@code str} to the image list.
     *
     * @param str one line of HTML; {@code null} is ignored
     */
    public void matchImage(String str) {
        collectMatches(IMAGE_PATTERN, str, imgList);
    }

    /** Adds capture group 1 of every match of {@code pattern} in {@code text} to {@code sink}. */
    private static void collectMatches(Pattern pattern, String text, ArrayList<String> sink) {
        if (text == null) {
            return;
        }
        Matcher matcher = pattern.matcher(text);
        while (matcher.find()) {
            sink.add(matcher.group(1));
        }
    }

    /**
     * Prints {@code contentName} as a heading followed by one list element per line.
     *
     * @param contentName heading printed before the elements
     * @param al          elements to print; {@code null} is treated as an empty list
     */
    public void printContent(String contentName, ArrayList<?> al) {
        System.out.println(contentName);
        if (al == null) {
            return;
        }
        for (Object item : al) {
            System.out.println(item);
        }
    }

    /**
     * Fetches the page with approach 1 and prints the collected link and image addresses.
     * To try approach 2 instead, call {@link #getHttpSocketContent()} in place of
     * {@link #getHttpUrlContent()}.
     *
     * @param args unused
     * @throws Exception if fetching or saving the page fails
     */
    public static void main(String args[]) throws Exception {
        QuestionResult client = new QuestionResult();
        client.getHttpUrlContent(); // approach 1
        client.printContent("---------------链接地址打印---------------", client.urlList);
        client.printContent("---------------图片地址打印---------------", client.imgList);
    }

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值