导入HttpComponents的包
下载二进制发行包(文件名带 -bin)并解压,在其中存放 jar 的目录(通常是 lib 目录)下找到 httpclient、httpcore、commons-logging 这三个 jar 包,导入项目即可
或者使用 Maven,在 pom.xml 中添加如下依赖:
<dependency>
<!-- Apache HttpComponents HttpClient; the Java code in this file uses its HttpClients, HttpGet and EntityUtils classes. -->
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.3</version>
</dependency>
抓取代码
新建一个实体类,保存抓取的数据
/**
 * Serializable value holder for one item extracted from a fetched page.
 *
 * NOTE(review): other code in this file calls setId/setContent/getContent/getUrl
 * on this type; those accessors are not visible here and are presumably elided
 * from this excerpt - confirm against the full class.
 */
public class WebEntity implements Serializable{
// Identifier of this entry.
public String id;
// URL associated with this entry.
public String url;
// Text captured for this entry.
public String content;
/**
 * Fetches a page with an HTTP GET request and collects the entries that
 * {@link #regxContent(String, List)} extracts from its body.
 *
 * <p>This is a best-effort operation: any I/O or runtime failure raised while
 * executing the request or parsing the body is printed to standard error and
 * whatever has been collected so far (possibly nothing) is returned. A response
 * whose status code is not 200, or that carries no entity, yields an empty list.
 *
 * @param path URL of the page to fetch
 * @return the extracted entries; never {@code null}
 */
public static List<WebEntity> catchWebContent(String path) {
    List<WebEntity> list = new ArrayList<>();
    HttpGet httpGet = new HttpGet(path);
    // try-with-resources closes the response first and then the client, so the
    // client's connection manager is always released (the client used to leak).
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
         CloseableHttpResponse response = httpClient.execute(httpGet)) {
        // Only a 200 response with a body is parsed.
        if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {
            // "UTF-8" is only the fallback charset: it applies when the response
            // does not declare one itself. A wrong fallback produces garbled text.
            String content = EntityUtils.toString(response.getEntity(), "UTF-8");
            regxContent(content, list);
        }
    } catch (IOException | RuntimeException e) {
        // Deliberately best-effort: report the failure and return what we have.
        e.printStackTrace();
    }
    return list;
}
/** Matches anchor elements whose inner text contains no nested tags. */
private static final Pattern ANCHOR_PATTERN = Pattern.compile("<a[^>]+>[^<]*</a>");

/** Captures an href attribute value: double-quoted, single-quoted, or unquoted. */
private static final Pattern HREF_PATTERN = Pattern.compile(
        "\\shref\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s>]+))", Pattern.CASE_INSENSITIVE);

/**
 * Scans HTML text for anchor tags and appends one {@code WebEntity} per match.
 *
 * <p>Each entity receives a fresh id from {@link #genUUID()}, the complete
 * matched tag as its content, and the tag's {@code href} value as its url
 * ({@code null} when the tag has no {@code href}).
 *
 * @param content HTML text to scan; {@code null} or empty text adds nothing
 * @param list    destination list that the new entities are appended to
 */
public static void regxContent(String content, List<WebEntity> list) {
    if (content == null || content.isEmpty()) {
        return;
    }
    Matcher matcher = ANCHOR_PATTERN.matcher(content);
    while (matcher.find()) {
        String anchor = matcher.group();
        WebEntity webEntity = new WebEntity();
        webEntity.setId(genUUID());
        webEntity.setContent(anchor);
        // The url used to be left unset even though callers read it back.
        webEntity.setUrl(extractHref(anchor));
        list.add(webEntity);
    }
}

/**
 * Returns the href value of a matched anchor tag, or {@code null} if absent.
 *
 * @param anchor a complete anchor element as matched by {@code ANCHOR_PATTERN}
 * @return the href value without surrounding quotes, or {@code null}
 */
private static String extractHref(String anchor) {
    // Look only inside the opening tag so link text mentioning "href=" is ignored.
    // A '>' is always present because the anchor pattern requires one.
    String openingTag = anchor.substring(0, anchor.indexOf('>'));
    Matcher hrefMatcher = HREF_PATTERN.matcher(openingTag);
    if (!hrefMatcher.find()) {
        return null;
    }
    // Exactly one of the three alternatives captured the value.
    for (int group = 1; group <= hrefMatcher.groupCount(); group++) {
        String value = hrefMatcher.group(group);
        if (value != null) {
            return value;
        }
    }
    return null;
}
/**
 * Produces a random UUID rendered without its hyphen separators.
 *
 * @return the UUID's string form with every "-" removed
 */
public static String genUUID() {
    final String canonical = UUID.randomUUID().toString();
    final String compact = canonical.replace("-", "");
    return compact;
}
测试
// Fetch the Sina News front page and print every extracted entry.
String url = "http://news.sina.com.cn/";
// catchWebContent is static, so it is called on the class; no HttpUtil instance is needed.
List<WebEntity> webEntities = HttpUtil.catchWebContent(url);
for (WebEntity webEntity : webEntities) {
    System.out.println(webEntity.getContent());
    System.out.println(webEntity.getUrl());
    System.out.println("===================================================");
}