package com.regex;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/*
 * Web crawler: a program that retrieves data matching a given rule from the Internet.
 *
 * Feature: crawl e-mail addresses from a page.
 */
public class Demo1 {

    /**
     * E-mail regex, compiled once and reused ({@link Pattern} is immutable and
     * thread-safe; the original recompiled it on every call).
     */
    private static final Pattern MAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

    public static void main(String[] args) throws IOException {
        List<String> list = getMailsByWeb();
        for (String str : list) {
            System.out.println(str + " ");
        }
    }

    /**
     * Fetches the demo page over HTTP and returns every e-mail address found in it.
     *
     * @return matched addresses in document order; empty list if none
     * @throws IOException if the URL cannot be opened or read
     */
    public static List<String> getMailsByWeb() throws IOException {
        URL url = new URL("http://127.0.0.1:8088/Test/index.html");
        // try-with-resources: the original never closed the reader (resource leak).
        // UTF-8 is specified explicitly; pre-JDK-18 the platform default charset
        // would be used otherwise.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(url.openStream(), StandardCharsets.UTF_8))) {
            return extractMails(br);
        }
    }

    /**
     * Reads a local HTML file and returns every e-mail address found in it.
     *
     * @return matched addresses in document order; empty list if none
     * @throws IOException if the file cannot be read
     */
    public static List<String> getMails() throws IOException {
        // try-with-resources fixes the reader leak in the original.
        // NOTE(review): FileReader uses the platform default charset; if the file
        // is known to be UTF-8, switch to new FileReader(path, StandardCharsets.UTF_8)
        // (Java 11+) — confirm target JDK first.
        try (BufferedReader br = new BufferedReader(new FileReader("D:\\index.html"))) {
            return extractMails(br);
        }
    }

    /**
     * Scans the reader line by line and collects every substring matching
     * {@link #MAIL_PATTERN}. Shared by both public entry points (the original
     * duplicated this loop verbatim). Package-private for testability.
     *
     * @param br source of text lines; not closed by this method
     * @return matched addresses in encounter order; empty list if none
     * @throws IOException if reading fails
     */
    static List<String> extractMails(BufferedReader br) throws IOException {
        List<String> list = new ArrayList<String>();
        String line;
        while ((line = br.readLine()) != null) {
            Matcher m = MAIL_PATTERN.matcher(line);
            while (m.find()) {
                list.add(m.group());
            }
        }
        return list;
    }
}
// Blog trailer (extraction residue, kept as a comment so the file compiles):
// "Regular expressions (web crawler)" — latest recommended article published 2022-11-20 15:15:26.