JAVA日记之javaJDK原生简单爬虫

java原生爬虫

① 指定一个种子url放入到队列中
② 从队列中获取某个URL
③ 使用HTTP协议发起网络请求
④ 在发起网络请求的过程中,需要将域名转化成IP地址,也就是域名解析
⑤ 得到服务器的响应,此时是二进制的输入流
⑥ 将二进制的输入流转换成HTML文档,并解析内容(我们要抓取的内容,比如标题)。
⑦ 将解析出来的内容保存到数据库
⑧ 记录当前URL,并标记为已爬取,避免下次重复爬取。
⑨ 从当前的HTML文档中,解析出页面中包含的其它URL,以供下次爬取
⑩ 判断解析出来的URL是否已经爬取过了,如果已经爬取就丢弃掉
⑪ 将还没爬取过的URL,存放到等待爬取的URL队列中。
⑫ 重复以上的步骤,直到等待爬取的URL队列中没有数据

1.Get请求

package code.demo;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;

import java.net.URL;
import java.net.URLConnection;

/**
 * Minimal illustration of what a browser does: send an HTTP GET request and
 * read back the server's response, using only JDK classes.
 */
public class GET {

    /**
     * Requests {@code https://www.baidu.com} and prints the response body to
     * standard output, one line at a time.
     *
     * @param args unused
     * @throws IOException if the connection cannot be opened or the response cannot be read
     */
    public static void main(String[] args) throws IOException {

        // 1. Address of the resource to fetch.
        String zy = "https://www.baidu.com";

        // 2. Open a client-to-server connection through the JDK URL API.
        URL url = new URL(zy);
        URLConnection urlConnection = url.openConnection();

        // 3./4. Asking for the input stream issues the GET request and exposes
        // the response as a byte stream. try-with-resources guarantees the
        // stream and reader are closed even if reading fails part-way.
        // The bytes are decoded explicitly as UTF-8 instead of the platform
        // default charset, so the output does not depend on the machine the
        // program runs on. The charset is fully qualified so that no extra
        // import line is required.
        try (InputStream is = urlConnection.getInputStream();
             BufferedReader bufferedReader = new BufferedReader(
                     new InputStreamReader(is, java.nio.charset.StandardCharsets.UTF_8))) {

            // 5. Print the response line by line.
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }

}

2.Post请求

package code.demo;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;

/**
 * Minimal illustration of what a browser does when submitting a form: send an
 * HTTP POST request with a body and read back the server's response, using
 * only JDK classes.
 */
public class POST {

    /**
     * Posts a small form body to {@code https://www.baidu.com} and prints the
     * response body to standard output, one line at a time.
     *
     * @param args unused
     * @throws IOException if the connection cannot be opened, the body cannot
     *                     be written, or the response cannot be read
     */
    public static void main(String[] args) throws IOException {

        // 1. Address of the resource to post to.
        String zy = "https://www.baidu.com";

        // 2. Open a client-to-server connection through the JDK URL API.
        URL url = new URL(zy);
        HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();

        try {
            // Request header: present ourselves with a browser User-Agent.
            urlConnection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36");
            // Output must be enabled before a request body can be written;
            // a plain GET works with the default setting.
            urlConnection.setDoOutput(true);
            // Select the HTTP method.
            urlConnection.setRequestMethod("POST");

            // 3. Write the request body. try-with-resources closes the stream
            // even if the write fails; the bytes are encoded explicitly as
            // UTF-8 rather than with the platform default charset.
            try (OutputStream outputStream = urlConnection.getOutputStream()) {
                outputStream.write("username=zhangsan&password=lisi"
                        .getBytes(java.nio.charset.StandardCharsets.UTF_8));
                outputStream.flush();
            }

            // 4. Read the response as a byte stream, decoded explicitly as
            // UTF-8, and make sure the stream and reader are closed.
            try (InputStream is = urlConnection.getInputStream();
                 BufferedReader bufferedReader = new BufferedReader(
                         new InputStreamReader(is, java.nio.charset.StandardCharsets.UTF_8))) {

                // 5. Print the response line by line.
                String line;
                while ((line = bufferedReader.readLine()) != null) {
                    System.out.println(line);
                }
            }
        } finally {
            // Release the underlying connection whether or not the exchange succeeded.
            urlConnection.disconnect();
        }
    }

}

使用阿帕奇的HttpClient包框架
1.get请求

导入pom
		<dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.3</version>
        </dependency>
package code.demo;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.net.URL;
import java.nio.charset.Charset;

/**
 * Sends an HTTP GET request with Apache HttpClient and prints the response body.
 */
public class HttpClienGet {

    /**
     * Requests {@code http://www.baidu.com} and prints the response body to
     * standard output.
     *
     * @param args unused
     * @throws IOException if the request fails or the response cannot be read
     */
    public static void main(String[] args) throws IOException {

        // 1. Target URL.
        String url = "http://www.baidu.com";

        // 2. Build the request object.
        HttpGet httpGet = new HttpGet(url);

        // 3./4. Create the client, execute the request and obtain the response.
        // Both the client and the response hold network resources, so they are
        // opened in try-with-resources and closed even when reading fails.
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
             CloseableHttpResponse execute = httpclient.execute(httpGet)) {

            HttpEntity entity = execute.getEntity();

            // 5. Read the whole body as text and print it. The charset passed
            // here is the one EntityUtils applies when the response itself
            // does not declare a charset.
            String s = EntityUtils.toString(entity, Charset.forName("utf-8"));
            System.out.println(s);
        }
    }
}

2.post请求

package code.demo;

import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;

/**
 * Sends an HTTP POST request carrying URL-encoded form fields with Apache
 * HttpClient and prints the response body.
 */
public class HttpClienPost {

    /**
     * Posts the form fields {@code username} and {@code password} to
     * {@code http://www.baidu.com} and prints the response body to standard
     * output.
     *
     * @param args unused
     * @throws IOException if the request fails or the response cannot be read
     */
    public static void main(String[] args) throws IOException {

        // 1. Target URL.
        String url = "http://www.baidu.com";

        // 2. Build the request object.
        HttpPost httpPost = new HttpPost(url);

        // 3. Attach the form fields as a URL-encoded request body.
        List<NameValuePair> paramerter = new ArrayList<NameValuePair>();
        paramerter.add(new BasicNameValuePair("username", "zhangsan"));
        paramerter.add(new BasicNameValuePair("password", "lisi"));
        httpPost.setEntity(new UrlEncodedFormEntity(paramerter));

        // 4. Create the client, execute the request and obtain the response.
        // Both the client and the response hold network resources, so they are
        // opened in try-with-resources and closed even when reading fails.
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
             CloseableHttpResponse execute = httpclient.execute(httpPost)) {

            HttpEntity entity = execute.getEntity();

            // 5. Read the whole body as text and print it. The charset passed
            // here is the one EntityUtils applies when the response itself
            // does not declare a charset.
            String s = EntityUtils.toString(entity, Charset.forName("utf-8"));
            System.out.println(s);
        }
    }
}


使用阿帕奇fluent更加简洁

导入坐标
 		<dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>fluent-hc</artifactId>
            <version>4.5.3</version>
        </dependency>

1.get and post请求

package code.demo;

import org.apache.http.client.fluent.Form;
import org.apache.http.client.fluent.Request;

import java.io.IOException;
import java.nio.charset.Charset;

/**
 * Demonstrates the Apache HttpClient fluent API, which expresses a whole
 * request/response exchange as one short call chain.
 */
public class HttpClientFluent {

    /**
     * Issues one GET and one form POST against {@code http://www.baidu.com}
     * and prints each response body to standard output.
     *
     * @param args unused
     * @throws IOException if either request fails or its response cannot be read
     */
    public static void main(String[] args) throws IOException {
        // Charset used to decode both response bodies.
        final Charset utf8 = Charset.forName("utf-8");

        // GET request.
        final String getBody = Request.Get("http://www.baidu.com")
                .execute()
                .returnContent()
                .asString(utf8);
        System.out.println(getBody);

        // POST request. Author's note: supply a correct address and form data,
        // otherwise the call reports an error.
        final Request formPost = Request.Post("http://www.baidu.com")
                .bodyForm(Form.form()
                        .add("username", "vip")
                        .add("password", "123")
                        .build());
        final String postBody = formPost.execute().returnContent().asString(utf8);
        System.out.println(postBody);
    }
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值