正则表达式抓取文件内容中的http链接地址



import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts addresses from text with regular expressions.
 *
 * <p>{@link #getWebTextContent(String)} downloads a page and writes every HTTP(S) link it
 * finds to a fixed output file; {@link #getLocalTextContent(String, String)} scans a local
 * file and writes every e-mail address it finds to a caller-chosen file. Each match is
 * written on its own line, terminated by {@code "\r\n"}.
 */
public class HtmlAddressCatch {

    /**
     * Matches {@code http://} or {@code https://} followed by three dot-separated word
     * segments, e.g. {@code https://www.example.com}. Paths and query strings are not part
     * of the match.
     */
    private static final Pattern HTTP_LINK = Pattern.compile("https?://\\w+\\.\\w+\\.\\w+");

    /** Matches simple e-mail addresses of the form {@code word@word.word}. */
    private static final Pattern EMAIL = Pattern.compile("\\w+@\\w+\\.\\w+");

    /** File that receives the links found by {@link #getWebTextContent(String)}. */
    private static final String WEB_OUTPUT_PATH = "D:\\text.txt";

    /**
     * Demo entry point: extracts the HTTP(S) links from a sample web page.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String webaddress = "https://www.zhihu.com/people/Akira_Dunn";
        HtmlAddressCatch.getWebTextContent(webaddress);
        // Local-file variant:
        // String localaddress = "D:\\test\\test.html";
        // String targetaddress = "D:\\test\\http.txt";
        // HtmlAddressCatch.getLocalTextContent(localaddress, targetaddress);
    }

    /**
     * Downloads the page at {@code webaddress} and writes every HTTP(S) link found in it to
     * {@code D:\text.txt}, one per line.
     *
     * <p>I/O failures (bad URL, unreachable host, unwritable output file) are reported on
     * standard error and otherwise ignored, so this method never throws a checked exception.
     *
     * @param webaddress absolute http/https URL of the page to scan
     */
    public static void getWebTextContent(String webaddress) {
        HttpURLConnection con = null;
        try {
            URL url = new URL(webaddress);
            con = (HttpURLConnection) url.openConnection();
            // try-with-resources guarantees both streams are closed, even on failure.
            try (FileOutputStream out = new FileOutputStream(WEB_OUTPUT_PATH);
                 BufferedReader reader = new BufferedReader(
                         new InputStreamReader(con.getInputStream(), StandardCharsets.UTF_8))) {
                writeMatches(reader, out, HTTP_LINK);
            }
        } catch (IOException e) {
            // Best-effort tool: report and continue. Covers MalformedURLException and
            // FileNotFoundException, which are both IOException subclasses.
            e.printStackTrace();
        } finally {
            if (con != null) {
                con.disconnect();
            }
        }
    }

    /**
     * Scans the local file {@code localaddress} and writes every e-mail address found in it
     * to {@code targetaddress}, one per line. The target file is created or truncated.
     *
     * <p>The input is read line by line rather than in fixed-size byte chunks, so an address
     * is never split across two reads and no stale buffer content is re-scanned.
     *
     * <p>I/O failures are reported on standard error and otherwise ignored.
     *
     * @param localaddress  path of the file to scan
     * @param targetaddress path of the file that receives the matches
     */
    public static void getLocalTextContent(String localaddress, String targetaddress) {
        try (BufferedReader reader = new BufferedReader(
                     new InputStreamReader(new FileInputStream(localaddress), StandardCharsets.UTF_8));
             FileOutputStream out = new FileOutputStream(targetaddress)) {
            writeMatches(reader, out, EMAIL);
        } catch (IOException e) {
            // Best-effort tool: report and continue.
            e.printStackTrace();
        }
    }

    /**
     * Writes every match of {@code pattern} found in {@code source} to {@code sink}, each
     * followed by {@code "\r\n"}.
     *
     * @param source  text to scan; consumed until end of stream
     * @param sink    destination for the matches
     * @param pattern expression to search for; must not be able to match across line breaks
     * @throws IOException if reading or writing fails
     */
    private static void writeMatches(BufferedReader source, OutputStream sink, Pattern pattern)
            throws IOException {
        String line;
        // readLine() alone drives the loop: a separate read() call would swallow the first
        // character of every line.
        while ((line = source.readLine()) != null) {
            Matcher m = pattern.matcher(line);
            while (m.find()) {
                sink.write((m.group() + "\r\n").getBytes(StandardCharsets.UTF_8));
            }
        }
    }
}

转载于:https://www.cnblogs.com/akiradunn/p/5855073.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值