项目托管平台: 码云地址:
https://gitee.com/HDMBS/JavaSpiderDemo.git
本程序依赖以下 Maven 依赖(需在 pom.xml 中引入对应的 Jar 包):
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.3</version>
</dependency>
<dependency>
<groupId>net.sourceforge.htmlcleaner</groupId>
<artifactId>htmlcleaner</artifactId>
<version>2.9</version>
</dependency>
/**
 * Minimal HttpClient crawler demo: sends a GET request to the target URL,
 * decodes the response body as a UTF-8 string and prints it to stdout.
 *
 * Fixes over the original snippet: added the missing semicolons after
 * {@code createDefault()} and {@code new HttpGet(URL)}, declared the
 * previously-undeclared {@code respond}/{@code entitys} variables, and
 * switched to try-with-resources so the client and response are always
 * closed — even when {@code execute} throws (the original would also have
 * hit an NPE on {@code respond.getEntity()} in that case).
 */
public static void main(String[] args) {
    // Target URL to crawl.
    final String URL = "https://www.tuicool.com/";
    // try-with-resources: the client is closed on every exit path.
    try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
        // GET is the idiomatic verb for a read-only crawl; request headers
        // (User-Agent, cookies, ...) could be added on this object if needed.
        HttpGet httpGet = new HttpGet(URL);
        // Execute the request; the response is likewise closed automatically.
        try (CloseableHttpResponse respond = httpclient.execute(httpGet)) {
            // Response body entity, decoded with an explicit charset so the
            // result does not depend on the platform default encoding.
            HttpEntity entity = respond.getEntity();
            String entitys = EntityUtils.toString(entity, "utf-8");
            System.out.println(entitys);
        }
    } catch (ClientProtocolException e) {
        // HTTP protocol violation (must precede its supertype IOException).
        System.out.println("1HTTP协议异常!!!!!!!!!!!!!!!!!!!!!!!!!!");
        e.printStackTrace();
    } catch (ParseException e) {
        // Entity could not be parsed (unchecked in HttpClient 4.x; kept to
        // preserve the original snippet's handling).
        e.printStackTrace();
    } catch (IOException e) {
        // Network / stream failure.
        System.out.println("2IO异常!!!!!!!!!!!!!!!!!!!!!!!!!!");
        e.printStackTrace();
    }