1. Web Crawler
In my view, a so-called web crawler simply filters the content you need out of a web page and saves it to your local machine. Let's implement one in code: the example below uses Apache HttpClient to download the Jianshu homepage, pulls out every href link with a regular expression, and writes the links to a text file.
package oyz.cn.Httpclent;

import java.io.FileWriter;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class Pc {
    public static void main(String[] args) throws ClientProtocolException, IOException {
        pwy();
    }

    public static void pwy() throws ClientProtocolException, IOException {
        // Create a default HttpClient and issue a GET request for the page
        CloseableHttpClient client = HttpClients.createDefault();
        HttpGet get = new HttpGet("https://www.jianshu.com/");
        CloseableHttpResponse response = client.execute(get);

        // Read the response body as a UTF-8 string
        HttpEntity entity = response.getEntity();
        String result = EntityUtils.toString(entity, "UTF-8");

        // Regular expression that matches href attributes: double-quoted,
        // single-quoted, or unquoted link values
        String regex = "\\s*(?i)href\\s*=\\s*(\"([^\"]*\")|'[^']*'|([^'\">\\s]+))";
        Pattern pattern = Pattern.compile(regex);
        Matcher matcher = pattern.matcher(result);

        // Print each matched link and write it to a local text file
        int count = 1;
        FileWriter writer = new FileWriter("C:\\Users\\Administrator\\Desktop\\links.txt");
        while (matcher.find()) {
            System.out.println("Crawled link " + count + " is: " + matcher.group());
            writer.write("Crawled link " + count + " is: " + matcher.group());
            writer.write("\r\n\r\n");
            count++;
        }

        // Release the file handle and the HTTP resources
        writer.close();
        response.close();
        client.close();
    }
}
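Note that matcher.group() returns the entire attribute, quotes included, e.g. href="https://www.jianshu.com/". Here is a minimal sketch of how the match could be trimmed down to the bare URL; the helper extractUrl is my own name for it and not part of the original program:

    // Hypothetical helper: reduce a raw match such as href="https://..."
    // to the bare URL by dropping the "href=" prefix and any quotes.
    public static String extractUrl(String rawMatch) {
        // Everything after '=' is the value; trim() removes the optional
        // whitespace the regex allows around '='.
        String value = rawMatch.substring(rawMatch.indexOf('=') + 1).trim();
        // Strip a single leading and trailing quote, if present.
        return value.replaceAll("^[\"']|[\"']$", "");
    }

Alternatively, matcher.group(1) already omits the href= prefix and yields only the (possibly still quoted) attribute value.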
Result screenshot: