1 使用JSoup对虎扑新闻进行爬取
目标网站:虎扑体育网
爬取内容:链接+标题
工具:IDEA + Jsoup
效果:
开始:
引入Jsoup依赖:
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.12.1</version>
</dependency>
编写Jsoup工具类:
package Jsoup;
import com.sun.xml.internal.ws.developer.MemberSubmissionEndpointReference;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
public class JsoupTest {
/**
 * Fetches the Hupu news list page with jsoup and prints each article's
 * detail-page link and title.
 *
 * @param url the news list page URL (e.g. https://voice.hupu.com/nba)
 */
public void jsoupList(String url){
try {
//拿到网页内容 — explicit timeout and User-Agent make the request more
//robust against slow responses and naive anti-crawler filtering
Document document= org.jsoup.Jsoup.connect(url)
.userAgent("Mozilla/5.0")
.timeout(10000)
.get();
//获取想拿的值,通过css选择器 — one <a> per headline in the news list
Elements elements=document.select("div.news-list>ul>li>div.list-hd>h4>a");
//循环输出即可
for (Element element:
elements) {
String d_url=element.attr("href");
String title=element.ownText();
System.out.println("详情页链接:"+d_url+",详情页标题:"+title);
}
} catch (IOException e) {
// network failure or non-2xx status (jsoup throws HttpStatusException)
e.printStackTrace();
}
}
}
测试运行:
public static void main(String[] args){
// Entry point: crawl the Hupu NBA news list page and print each headline.
final String listPageUrl = "https://voice.hupu.com/nba";
new JsoupTest().jsoupList(listPageUrl);
}
成功!!!
原理分析
目标网站分析:对目标网站进行分析,了解抓取的位置(通过Css选择器)
想要拿到对应的东西 就需要通过css:div.news-list>ul>li>div.list-hd>h4>a
2 使用httpclient+正则表达式爬取虎扑
目标网站:虎扑体育网
爬取内容:链接+标题
工具:IDEA + HttpClient + 正则表达式
效果:
开始:
导入相关的包:
<!--Httpclient相关的包 -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.10</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.4.10</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpmime</artifactId>
<version>4.5.10</version>
</dependency>
如下片段是每一个标题的通用结构。正则必须严格匹配这一结构,因为页面中还存在很多其他的 a 标签:
<div class="news-list">
<ul>
<li>
<div class="list-hd">
<h4> <a href="https://voice.hupu.com/nba/2491729.html" target="_blank">初生牛犊不怕虎!PJ-华盛顿半场三分8中5独砍17分</a> </h4>
</div>
</li>
</ul>
</div>
代码:
package HttpClient;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class HttpClientPc {
public static void main(String[] args){
String url="https://voice.hupu.com/nba";
HttpClientPc httpClientPc=new HttpClientPc();
httpClientPc.httpClientList(url);
}
/**
 * httpclient + 正则表达式 获取虎扑新闻列表页
 * <p>Fetches the list page, strips layout whitespace, then extracts each
 * article's link (group 1) and title (group 2) with a regular expression.
 * @param url 虎扑新闻列表页url
 */
public void httpClientList(String url){
// try-with-resources closes both the client and the response, fixing the
// connection leak in the original version (neither was ever closed).
try (CloseableHttpClient httpclient = HttpClients.createDefault();
     CloseableHttpResponse response = httpclient.execute(new HttpGet(url))) {
//对返回数据进行处理
if (response.getStatusLine().getStatusCode() == 200) {
HttpEntity entity = response.getEntity();
String body = EntityUtils.toString(entity,"utf-8");
if (body!=null) {
/*
 * 替换掉换行符、制表符、回车符,去掉这些符号,正则表示写起来更简单一些
 * 只有空格符号和其他正常字体
 */
body = body.replaceAll("\t|\r|\n", "");
/*
 * 提取列表页的正则表达式
 * 去除换行符之后的 li
 * <div class="list-hd"> <h4> <a href="https://voice.hupu.com/nba/2485167.html" target="_blank">与球迷亲切互动!凯尔特人官方晒球队开放训练日照片</a> </h4> </div>
 */
Pattern pattern = Pattern
.compile("<div class=\"list-hd\">\\s* <h4>\\s* <a href=\"(.*?)\"\\s* target=\"_blank\">(.*?)</a>\\s* </h4>\\s* </div>" );
Matcher matcher = pattern.matcher(body);
// 匹配出所有符合正则表达式的数据
while (matcher.find()){
// 提取出链接和标题
System.out.println("详情页链接:"+matcher.group(1)+" ,详情页标题:"+matcher.group(2));
}
}else {
System.out.println("处理失败!!!获取正文内容为空");
}
} else {
System.out.println("处理失败!!!返回状态码:" + response.getStatusLine().getStatusCode());
}
}catch (Exception e){
// network or parsing failure; a production crawler would log and retry
e.printStackTrace();
}
}
}