java原生爬虫
① 指定一个种子url放入到队列中
② 从队列中获取某个URL
③ 使用HTTP协议发起网络请求
④ 在发起网络请求的过程中,需要将域名转化成IP地址,也就是域名解析
⑤ 得到服务器的响应,此时是二进制的输入流
⑥ 将二进制的输入流转换成HTML文档,并解析内容(我们要抓取的内容,比如标题)。
⑦ 将解析出来的内容保存到数据库
⑧ 记录当前URL,并标记为已爬取,避免下次重复爬取。
⑨ 从当前的HTML文档中,解析出页面中包含的其它URL,以供下次爬取
⑩ 判断解析出来的URL是否已经爬取过了,如果已经爬取就丢弃掉
⑪将还没爬取过的URL,存放到等待爬取的URL队列中。
⑫ 重复以上的步骤,直到等待爬取的URL队列中没有数据
1.Get请求
package code.demo;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
/**
 * Mimics the basic way a browser works: send a request, receive a response.
 *
 * <p>Fetches a fixed URL with the JDK's {@link URLConnection} and prints the
 * response body to standard output line by line.
 */
public class GET {
    /**
     * Requests the hard-coded URL and prints every line of the response.
     *
     * @param args unused
     * @throws IOException if the connection cannot be opened or the response cannot be read
     */
    public static void main(String[] args) throws IOException {
        // 1. Address of the resource to fetch.
        String zy = "https://www.baidu.com";
        // 2. Prepare a client-to-server (web container) connection with the JDK URL API.
        URL url = new URL(zy);
        URLConnection urlConnection = url.openConnection();
        // 3/4. Obtain the response as an input stream (the HTML document as raw bytes).
        // try-with-resources closes the reader and the underlying stream even when
        // reading fails; the charset is explicit so decoding does not depend on the
        // platform default.
        try (InputStream is = urlConnection.getInputStream();
             BufferedReader bufferedReader = new BufferedReader(
                     new InputStreamReader(is, java.nio.charset.StandardCharsets.UTF_8))) {
            // 5. Print the result.
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}
2.Post请求
package code.demo;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
/**
 * Mimics the basic way a browser works: send a request, receive a response.
 *
 * <p>Submits a small form body to a fixed URL with {@link HttpURLConnection}
 * and prints the response body to standard output line by line.
 */
public class POST {
    /**
     * Sends a POST request with a hard-coded form body and prints the response.
     *
     * @param args unused
     * @throws IOException if the connection cannot be opened, the body cannot be
     *     written, or the response cannot be read
     */
    public static void main(String[] args) throws IOException {
        // 1. Address of the resource to post to.
        String zy = "https://www.baidu.com";
        // 2. Prepare a client-to-server (web container) connection with the JDK URL API.
        URL url = new URL(zy);
        HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
        try {
            // Set a request header so the request carries a browser User-Agent.
            urlConnection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36");
            // Output must be enabled before a request body can be written (required for POST).
            urlConnection.setDoOutput(true);
            // Select the request method.
            urlConnection.setRequestMethod("POST");
            // 3. Write the POST body; the stream is closed even if the write fails.
            try (OutputStream outputStream = urlConnection.getOutputStream()) {
                outputStream.write("username=zhangsan&password=lisi"
                        .getBytes(java.nio.charset.StandardCharsets.UTF_8));
                outputStream.flush();
            }
            // 4. Obtain the response as an input stream (the HTML document as raw bytes),
            // decoded with an explicit charset rather than the platform default.
            try (InputStream is = urlConnection.getInputStream();
                 BufferedReader bufferedReader = new BufferedReader(
                         new InputStreamReader(is, java.nio.charset.StandardCharsets.UTF_8))) {
                // 5. Print the result.
                String line;
                while ((line = bufferedReader.readLine()) != null) {
                    System.out.println(line);
                }
            }
        } finally {
            // Release the connection once the exchange is finished or has failed.
            urlConnection.disconnect();
        }
    }
}
使用阿帕奇的HttpClient包框架
1.get请求
导入pom
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.3</version>
</dependency>
package code.demo;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.net.URL;
import java.nio.charset.Charset;
/**
 * Fetches a fixed URL with Apache HttpClient and prints the response body.
 */
public class HttpClienGet {
    /**
     * Executes a GET request against the hard-coded URL and prints the body as text.
     *
     * @param args unused
     * @throws IOException if the request fails or the response body cannot be read
     */
    public static void main(String[] args) throws IOException {
        // 1. Target URL.
        String url = "http://www.baidu.com";
        // 2. Build the GET request.
        HttpGet httpGet = new HttpGet(url);
        // 3/4. Execute the request. Both the client and the response are closeable;
        // try-with-resources releases them (response first, then client) even when
        // reading the entity throws.
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
             CloseableHttpResponse execute = httpclient.execute(httpGet)) {
            HttpEntity entity = execute.getEntity();
            // 5. Convert the entity to a string, passing utf-8 as the charset, and print it.
            String s = EntityUtils.toString(entity, Charset.forName("utf-8"));
            System.out.println(s);
        }
    }
}
2.post请求
package code.demo;
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
/**
 * Posts a small URL-encoded form to a fixed URL with Apache HttpClient and
 * prints the response body.
 */
public class HttpClienPost {
    /**
     * Executes a POST request carrying two form fields and prints the body as text.
     *
     * @param args unused
     * @throws IOException if the request fails or the response body cannot be read
     */
    public static void main(String[] args) throws IOException {
        // One charset for both encoding the form and decoding the response.
        Charset utf8 = Charset.forName("utf-8");
        // 1. Target URL.
        String url = "http://www.baidu.com";
        // 2. Build the POST request.
        HttpPost httpPost = new HttpPost(url);
        // 3. Attach the form fields as the request entity; the charset is explicit
        // so the encoding does not fall back to the library default.
        List<NameValuePair> paramerter = new ArrayList<NameValuePair>();
        paramerter.add(new BasicNameValuePair("username", "zhangsan"));
        paramerter.add(new BasicNameValuePair("password", "lisi"));
        httpPost.setEntity(new UrlEncodedFormEntity(paramerter, utf8));
        // 4. Execute the request. Both the client and the response are closeable;
        // try-with-resources releases them even when reading the entity throws.
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
             CloseableHttpResponse execute = httpclient.execute(httpPost)) {
            HttpEntity entity = execute.getEntity();
            // 5. Convert the entity to a string and print it.
            String s = EntityUtils.toString(entity, utf8);
            System.out.println(s);
        }
    }
}
使用阿帕奇fluent更加简洁
导入坐标
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>fluent-hc</artifactId>
<version>4.5.3</version>
</dependency>
1.get and post请求
package code.demo;
import org.apache.http.client.fluent.Form;
import org.apache.http.client.fluent.Request;
import java.io.IOException;
import java.nio.charset.Charset;
/**
 * Demonstrates the Apache HttpClient fluent API, a more concise way to issue
 * the same GET and POST requests.
 */
public class HttpClientFluent {
    /**
     * Issues one GET and one POST request and prints each response body.
     *
     * @param args unused
     * @throws IOException if either request fails or its content cannot be read
     */
    public static void main(String[] args) throws IOException {
        // GET request: execute, take the content, and decode it as text.
        String getBody = Request.Get("http://www.baidu.com")
                .execute()
                .returnContent()
                .asString(Charset.forName("utf-8"));
        System.out.println(getBody);

        // POST request: use a correct address and form data, otherwise it reports an error.
        Form credentials = Form.form()
                .add("username", "vip")
                .add("password", "123");
        String postBody = Request.Post("http://www.baidu.com")
                .bodyForm(credentials.build())
                .execute()
                .returnContent()
                .asString(Charset.forName("utf-8"));
        System.out.println(postBody);
    }
}