正则表达式抓取文件内容中的http链接地址

最新推荐文章于 2022-04-07 21:40:31 发布

weixin_30326741

最新推荐文章于 2022-04-07 21:40:31 发布

阅读量528

点赞数

文章标签： java 前端 python ViewUI

原文链接：http://www.cnblogs.com/akiradunn/p/5855073.html

版权



import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts HTTP links and e-mail addresses from text with regular expressions.
 *
 * <p>{@link #getWebTextContent(String)} downloads a web page and writes every HTTP(S) link found
 * in it to a fixed output file; {@link #getLocalTextContent(String, String)} scans a local file
 * for e-mail addresses and writes them to a caller-supplied target file. In both cases each match
 * is written on its own CRLF-terminated line.
 */
public class HtmlAddressCatch {

  /**
   * Matches {@code http://} or {@code https://} followed by three dot-separated word labels,
   * e.g. {@code https://www.zhihu.com}. Only scheme and host are captured; any path is ignored.
   */
  private static final Pattern HTTP_LINK = Pattern.compile("https?://\\w+\\.\\w+\\.\\w+");

  /** Matches simple e-mail addresses of the form {@code name@host.tld}. */
  private static final Pattern EMAIL = Pattern.compile("\\w+@\\w+\\.\\w+");

  /** File that receives the links found by {@link #getWebTextContent(String)}. */
  private static final String WEB_OUTPUT_FILE = "D:\\text.txt";

  /** Terminator appended after every match written to an output file. */
  private static final String LINE_END = "\r\n";

  /**
   * Demo entry point: grabs the links of a sample page.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    String webaddress = "https://www.zhihu.com/people/Akira_Dunn";
    HtmlAddressCatch.getWebTextContent(webaddress);
    // Alternative demo, scanning a local file instead of a web page:
    //   String localaddress = "D:\\test\\test.html";
    //   String targetaddress = "D:\\test\\http.txt";
    //   HtmlAddressCatch.getLocalTextContent(localaddress, targetaddress);
  }

  /**
   * Downloads the page at {@code webaddress} and writes every HTTP(S) link it contains to
   * the fixed output file, one link per line.
   *
   * <p>I/O failures (malformed URL, unreachable host, unwritable output file) are reported on
   * {@code System.err} and otherwise swallowed, so this method never throws a checked exception.
   *
   * @param webaddress absolute http/https URL of the page to scan
   */
  public static void getWebTextContent(String webaddress) {
    HttpURLConnection con = null;
    try {
      con = (HttpURLConnection) new URL(webaddress).openConnection();
      // try-with-resources guarantees the output is flushed/closed and the response stream is
      // released even when reading fails half-way.
      try (FileOutputStream file = new FileOutputStream(WEB_OUTPUT_FILE);
          BufferedReader reader = new BufferedReader(
              new InputStreamReader(con.getInputStream(), StandardCharsets.UTF_8))) {
        writeMatches(reader, HTTP_LINK, file);
      }
    } catch (IOException e) {
      // MalformedURLException and FileNotFoundException are IOExceptions and land here too.
      System.err.println("Failed to extract links from " + webaddress);
      e.printStackTrace();
    } finally {
      if (con != null) {
        con.disconnect();
      }
    }
  }

  /**
   * Scans the local file {@code localaddress} for e-mail addresses and writes each one to
   * {@code targetaddress}, one address per line. The target file is created or truncated.
   *
   * <p>I/O failures (missing input, unwritable target) are reported on {@code System.err} and
   * otherwise swallowed, so this method never throws a checked exception.
   *
   * @param localaddress path of the file to scan
   * @param targetaddress path of the file that receives the matches
   */
  public static void getLocalTextContent(String localaddress, String targetaddress) {
    try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(new FileInputStream(localaddress), StandardCharsets.UTF_8));
        FileOutputStream writer = new FileOutputStream(targetaddress)) {
      // To grab links instead of e-mail addresses, pass HTTP_LINK here.
      writeMatches(reader, EMAIL, writer);
    } catch (IOException e) {
      System.err.println("Failed to extract addresses from " + localaddress);
      e.printStackTrace();
    }
  }

  /**
   * Writes every match of {@code pattern} found in {@code reader} to {@code out}.
   *
   * <p>Input is processed line by line: neither pattern can match across a line break, so no
   * match is lost at a buffer boundary and no stale buffer content is ever re-scanned.
   */
  private static void writeMatches(BufferedReader reader, Pattern pattern, FileOutputStream out)
      throws IOException {
    String line;
    while ((line = reader.readLine()) != null) {
      Matcher matcher = pattern.matcher(line);
      while (matcher.find()) {
        out.write((matcher.group() + LINE_END).getBytes(StandardCharsets.UTF_8));
      }
    }
  }
}

转载于:https://www.cnblogs.com/akiradunn/p/5855073.html

weixin_30326741

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
正则表达式抓取文件内容中的http链接地址

import java.io.BufferedReader;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStreamReader;im...
复制链接

扫一扫