// Crawl href hyperlinks and use a regular expression to pick out the target result.



import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Small crawler: reads a previously saved listing page from {@code content.txt},
 * follows every {@code /plus/view.php} link found in it, and prints the first
 * "main building" room label found on each linked page.
 */
public class Test2 {
	/** File written by {@link #download(String)} and read by {@link #test()}. */
	private static final String CONTENT_FILE = "content.txt";

	/** Prefix that the site-relative links from the listing page are appended to. */
	private static final String SITE_ROOT = "http://ste.xidian.edu.cn";

	/**
	 * Matches anchors whose href starts with {@code /plus/view.php}.
	 * Group 1 is the href value, group 2 is the text between the tags.
	 * Compiled once instead of on every {@link #test()} call.
	 */
	private static final Pattern LINK_PATTERN =
			Pattern.compile("<a\\s.*?href=\"(/plus/view.php[^\"]+)\"[^>]*>(.*?)</a>");

	/** Matches a span whose text starts with the room-label prefix; group 1 is the span text. */
	private final Pattern pattern2;

	/** Creates a crawler with the room-label pattern compiled. */
	public Test2() {
		// The two escaped characters (U+4E3B U+697C) are the Chinese word for "main building".
		// They are written as Unicode escapes so the compiled pattern does not depend on the
		// encoding the compiler assumes for this source file.
		// Example of a span that matches:      <span style="...">[U+4E3B U+697C]...137</span>
		// Example of a span that does not:     <span style="...">(any other text)</span>
		pattern2 = Pattern.compile("<span[^>]+>(\u4e3b\u697c[^<]*)</span>");
	}

	/**
	 * Fetches the page at the given address and saves it to {@code content.txt} as UTF-8.
	 * The page is fetched completely before the file is opened, so a failed download
	 * does not truncate an existing file.
	 *
	 * @param string absolute HTTP URL of the page to save
	 * @throws IOException if the page cannot be fetched or the file cannot be written
	 */
	public void download(String string) throws IOException {
		String page = fetch(string);
		try (BufferedWriter writer = new BufferedWriter(
				new OutputStreamWriter(new FileOutputStream(CONTENT_FILE), StandardCharsets.UTF_8))) {
			writer.write(page);
		}
	}

	/**
	 * Fetches the page at the given address and, if it contains a room-label span,
	 * prints {@code name}, a tab, and the label to standard output. Only the first
	 * matching span is reported; nothing is printed when there is no match.
	 *
	 * @param string absolute HTTP URL of the page to inspect
	 * @param name   label printed in front of the extracted text
	 * @throws IOException if the page cannot be fetched
	 */
	public void visit(String string, String name) throws IOException {
		Matcher room = pattern2.matcher(fetch(string));
		if (room.find()) {
			System.out.println(name + "\t" + room.group(1));
		}
	}

	/**
	 * Reads {@code content.txt} (as UTF-8, matching what {@link #download(String)} writes)
	 * and calls {@link #visit(String, String)} for every matching link, passing the
	 * link text as the name.
	 *
	 * @throws IOException if the file cannot be read or any linked page cannot be fetched
	 */
	public void test() throws IOException {
		String listing;
		try (BufferedReader reader = new BufferedReader(
				new InputStreamReader(new FileInputStream(CONTENT_FILE), StandardCharsets.UTF_8))) {
			listing = readAll(reader);
		}
		Matcher link = LINK_PATTERN.matcher(listing);
		while (link.find()) {
			visit(SITE_ROOT + link.group(1), link.group(2));
		}
	}

	/** Downloads the body at {@code address} as UTF-8 text, always releasing the connection. */
	private static String fetch(String address) throws IOException {
		HttpURLConnection connection = (HttpURLConnection) new URL(address).openConnection();
		try (BufferedReader reader = new BufferedReader(
				new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
			return readAll(reader);
		} finally {
			connection.disconnect();
		}
	}

	/** Concatenates all lines from {@code reader}; line terminators are dropped, as the patterns expect. */
	private static String readAll(BufferedReader reader) throws IOException {
		StringBuilder text = new StringBuilder();
		String line;
		while ((line = reader.readLine()) != null) {
			text.append(line);
		}
		return text.toString();
	}

	/**
	 * Runs the crawl over the saved listing page and prints the stack trace
	 * of any I/O failure.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		try {
			new Test2().test();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值