import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
//爬取电影天堂的最新电影
/**
 * Minimal crawler: downloads the home page of http://www.dytt8.net/, decodes it
 * as GBK, and prints every anchor whose href starts with {@code /html/gndy/}.
 */
public class pp {

    /** Address of the page that is downloaded. */
    private static final String PAGE_URL = "http://www.dytt8.net/";

    /** Charset used to decode the downloaded bytes. */
    private static final String PAGE_CHARSET = "gbk";

    /**
     * Matches links of the form {@code <a href='/html/gndy/PATH'>TITLE</a><br/>}.
     * Group 1 is the path after {@code /html/gndy/}, group 2 is the link text.
     * Compiled once because the expression never changes.
     */
    static final Pattern LINK_PATTERN =
            Pattern.compile("<a href=\'/html/gndy/(.*?)\'>(.*?)</a><br/>");

    /**
     * Fetches the page, then prints for every matching link: the group count,
     * the whole match, the captured path and the captured link text.
     *
     * @param args ignored
     * @throws IOException if the page cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        URL url = new URL(PAGE_URL);
        String html;
        // try-with-resources closes the connection stream even if reading fails.
        try (InputStream in = url.openStream()) {
            html = readAll(in, Charset.forName(PAGE_CHARSET));
        }

        Matcher matcher = LINK_PATTERN.matcher(html);
        while (matcher.find()) {
            System.out.println(matcher.groupCount());
            System.out.println(matcher.group(0));
            System.out.println(matcher.group(1));
            System.out.println(matcher.group(2));
        }
    }

    /**
     * Reads {@code in} until end of stream and decodes the bytes with {@code charset}.
     *
     * <p>Decoding goes through a single {@link InputStreamReader} so that a
     * multi-byte character split across two reads is still decoded correctly;
     * converting each byte chunk to a {@code String} separately would corrupt
     * such characters.
     *
     * <p>This method does not close {@code in}; the caller owns the stream.
     *
     * @param in      stream to drain
     * @param charset charset used to decode the bytes
     * @return the decoded text, empty if the stream has no data
     * @throws IOException if reading from {@code in} fails
     */
    static String readAll(InputStream in, Charset charset) throws IOException {
        Reader reader = new InputStreamReader(new BufferedInputStream(in), charset);
        StringBuilder text = new StringBuilder();
        char[] chunk = new char[1024];
        int count;
        while ((count = reader.read(chunk)) != -1) {
            text.append(chunk, 0, count);
        }
        return text.toString();
    }
}
Java爬虫--step1--简单网页获取
最新推荐文章于 2024-11-15 18:38:47 发布