When a project needs to collect or analyze data from another website and put that content to use, the work usually breaks down into three steps:
1. Crawl the target page and fetch its raw content;
2. Analyze the fetched content for patterns, then parse and extract it to get the data in the content and format you want;
3. Do the follow-up analysis and other work based on that extracted content and format.
1. Java code ----- crawl the target page and fetch its raw content
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.HttpClientUtils;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;

public class getContentbyUrl {
    public static void main(String[] args) {
        // 1. Create an HttpClient (comparable to opening a browser)
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // 2. Create a GET request (comparable to typing the address into the browser;
        //    note the URL needs a scheme, "www.baidu.com" alone is not a valid URI)
        HttpGet request = new HttpGet("http://www.baidu.com");
        CloseableHttpResponse response = null;
        try {
            // 3. Execute the GET request (comparable to pressing Enter after typing the address)
            response = httpClient.execute(request);
            // 4. Read the response body as a string; this is the raw page content
            String content = EntityUtils.toString(response.getEntity(), "UTF-8");
            System.out.println(content);
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // 5. Release the response and the client
            HttpClientUtils.closeQuietly(response);
            HttpClientUtils.closeQuietly(httpClient);
        }
    }
}
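With the raw HTML in hand, step 2 from the list above is to pick out the pieces you actually want. Below is a minimal sketch of that step, assuming the goal is just the text inside the page's <title> tag; the class name ExtractTitleDemo, the sample HTML string, and the regular expression are illustrative assumptions, not part of the original code.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class ExtractTitleDemo {
    public static void main(String[] args) {
        // Assume "html" is the raw page source fetched in step 1 (hard-coded sample here).
        String html = "<html><head><title>百度一下,你就知道</title></head><body></body></html>";
        // Match the <title> element; the pattern only illustrates the extraction idea.
        Pattern pattern = Pattern.compile("<title>(.*?)</title>", Pattern.CASE_INSENSITIVE);
        Matcher matcher = pattern.matcher(html);
        if (matcher.find()) {
            // group(1) holds the text between the opening and closing title tags.
            System.out.println("Page title: " + matcher.group(1));
        }
    }
}

In real use you would feed the content string returned by step 1 into the matcher instead of the hard-coded sample, and for anything more complex than a single field an HTML parser such as Jsoup is usually a safer choice than regular expressions.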