Jsoup爬虫获取网站内容简单案例
jsoup【官网中文文档】 是一款 Java 的 HTML 解析器。本案例先通过创建 HttpClient 对象向指定的 URI 网址发送请求并获取响应内容,之后即可对响应内容做进一步解析,提取有价值的主要信息。
-
开发工具: IDEA,Maven,JDK1.8;
-
pom依赖:
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.2</version>
</dependency>
代码展示:
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
public class CrawlerFirst {
    /**
     * Entry point: fetches the HTML of {@code http://www.kuwo.cn} with Apache
     * HttpClient and prints the response body when the server answers 200 OK.
     *
     * <p>Fix over the original: the {@link CloseableHttpClient} and
     * {@link CloseableHttpResponse} were never closed, leaking the connection
     * pool and the underlying socket. try-with-resources guarantees both are
     * released even if reading the body throws.
     *
     * @param args unused command-line arguments
     * @throws IOException if the request fails or the response cannot be read
     */
    public static void main(String[] args) throws IOException {
        // 1. Build a GET request for the target page.
        String uri = "http://www.kuwo.cn";
        HttpGet httpGet = new HttpGet(uri);

        // 2. Create the client and execute the request; both resources are
        //    closed automatically when the block exits.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // 3. Only consume the body on HTTP 200 (success).
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity entity = response.getEntity();
                // Decode with an explicit Charset constant rather than the
                // ad-hoc "utf8" string (avoids charset-name lookup pitfalls).
                String content = EntityUtils.toString(entity, StandardCharsets.UTF_8);
                System.out.println(content);
            }
        }
    }
}