Java爬虫历险记 – (1)爬取百度首页的logo
在这篇文章里,介绍两种方式来获取百度网页的logo: (1)Httpclient (2) jsoup + Httpclient ,详细的运行结果可以参看文章末的参考资料。代码使用的.jar包,如下图:
第一种:只使用Httpclient
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
public class TestOne {

    /**
     * Fetches the page at {@code url} via a plain {@link URLConnection} and
     * returns its body as one string (line breaks stripped by readLine).
     *
     * @param url absolute URL to fetch, e.g. "https://www.baidu.com/"
     * @return the page content, or "" if the request failed
     */
    static String sendGet(String url) {
        // StringBuilder avoids the O(n^2) cost of String += in a loop.
        StringBuilder result = new StringBuilder();
        try {
            URL realUrl = new URL(url);
            URLConnection connection = realUrl.openConnection();
            connection.connect();
            // Decode explicitly as UTF-8 (Baidu's encoding) instead of the
            // platform default; try-with-resources closes the reader on every path.
            try (BufferedReader in = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = in.readLine()) != null) {
                    result.append(line);
                }
            }
        } catch (Exception e) {
            System.out.println("发送GET请求出现异常!" + e);
            e.printStackTrace();
        }
        return result.toString();
    }

    /**
     * Returns the first capture group of {@code patternStr} found in
     * {@code targetStr}, or the literal string "Nothing" when no match exists.
     * (Name kept as-is for compatibility with existing callers.)
     */
    static String RegexString(String targetStr, String patternStr) {
        Matcher matcher = Pattern.compile(patternStr).matcher(targetStr);
        return matcher.find() ? matcher.group(1) : "Nothing";
    }

    /**
     * Downloads the image at {@code url} (scheme-less host/path, "http://" is
     * prepended) and stores it on disk via {@link #download(HttpEntity)}.
     *
     * @param url host/path portion of the image URL, without scheme
     * @return the saved file path, or "" on failure
     */
    public static String get(String url) {
        String filename = "";
        // The regex in main() strips the scheme, so restore it here.
        String targetUrl = "http://" + url;
        // try-with-resources closes response before client (reverse declaration
        // order) — the original closed the client first and leaked on exceptions.
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            HttpGet httpGet = new HttpGet(targetUrl);
            try (CloseableHttpResponse response = httpclient.execute(httpGet)) {
                if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                    System.out.println(response.getStatusLine());
                    filename = download(response.getEntity());
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return filename;
    }

    /**
     * Streams the response entity to d:\pic\b_logo.png, creating the directory
     * if needed.
     *
     * @param resEntity HTTP response body; may be null
     * @return the path the file was written to, or null when resEntity is null
     */
    private static String download(HttpEntity resEntity) {
        if (resEntity == null) {
            return null;
        }
        // Target directory and file name; mkdirs also creates missing parents
        // (the original's mkdir silently fails when the parent is absent).
        String dirPath = "d:\\pic\\";
        String fileName = "b_logo.png";
        File dir = new File(dirPath);
        if (!dir.exists()) {
            dir.mkdirs();
        }
        File filePath = new File(dirPath.concat(fileName));
        // FileOutputStream creates the file itself, so no createNewFile needed.
        // try-with-resources closes both streams even when copying throws —
        // the original leaked the output stream on exceptions.
        try (InputStream in = resEntity.getContent();
             BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(filePath))) {
            byte[] bytes = new byte[1024];
            int len;
            while ((len = in.read(bytes)) != -1) {
                out.write(bytes, 0, len);
            }
            out.flush();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return filePath.toString();
    }

    public static void main(String[] args) {
        // The page to scrape.
        String url = "https://www.baidu.com/";
        // Fetch the raw HTML.
        String result = sendGet(url);
        // Extract the logo's protocol-relative src; note the trailing space in
        // the pattern is required to terminate the non-greedy match.
        String imgSrc = RegexString(result, "src=//(.+?) ");
        System.out.println(imgSrc);
        // Download the logo to the local disk.
        get(imgSrc);
    }
}
注意点:
(1)正则表达式匹配:"src=//(.+?) " 在 ) 后面有一个空格,用来终止非贪婪匹配,否则匹配不成功。
(2)要导入 Httpclient 的 .jar 包
(3)在get图片时候,要将其路径补充完整: String tergetUrl = “http://” + url;
第二种:jsoup + Httpclient
import java.io.*;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class TestTwo {

    /** Entry point: locate the Baidu logo URL with jsoup, then download it. */
    public static void main(String args[]) {
        String url = "http://www.baidu.com";
        String result = getBaiduPic(url);
        // The page returns a protocol-relative src ("//www.baidu.com/..."),
        // so prepend the scheme before requesting it.
        String picUrl = "https:" + result;
        downPicture(picUrl);
    }

    /**
     * Loads the page with jsoup and returns the src attribute of the img
     * element inside the element whose id is "lg" (Baidu's logo container).
     *
     * @param url page to parse
     * @return the (protocol-relative) image URL, or "" when not found
     */
    public static String getBaiduPic(String url) {
        String result = "";
        try {
            // Fetch and parse the document in one step.
            Document doc = Jsoup.connect(url).get();
            // Select every element with id="lg" (the logo wrapper).
            Elements logoDivs = doc.getElementsByAttributeValue("id", "lg");
            for (Element element : logoDivs) {
                // Within it, take each <img> and read its src attribute.
                for (Element link : element.getElementsByTag("img")) {
                    result = link.attr("src");
                    System.out.println(result);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Downloads the resource at {@code url} to D://pic//LOGO.jpg,
     * creating the directory if necessary.
     *
     * @param url absolute URL of the image
     */
    public static void downPicture(String url) {
        File dir = new File("D://pic");
        // try-with-resources closes the client on all paths (the original
        // never closed it and relied on httpGet.abort(), which is meant for
        // cancelling requests, not for normal cleanup).
        try (CloseableHttpClient httpClient = HttpClientBuilder.create().build()) {
            HttpGet httpGet = new HttpGet(url);
            HttpResponse response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == 200) {
                // mkdirs also creates missing parents.
                dir.mkdirs();
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    // Closing the content stream also releases the connection.
                    // Copy in 4 KiB chunks — the original read one byte per
                    // call, which is needlessly slow.
                    try (InputStream in = entity.getContent();
                         OutputStream out = new FileOutputStream(new File("D://pic//LOGO.jpg"))) {
                        byte[] buf = new byte[4096];
                        int len;
                        while ((len = in.read(buf)) != -1) {
                            out.write(buf, 0, len);
                        }
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
注意点:
(1)在downPicture()中,java.io.InputStream inputStream = null; 要在 try 块之外声明并初始化为 null,这样 finally 块才能统一关闭由 response.getEntity().getContent() 得到的流。
参考资料:
(1)行走江湖的少侠哥 – 第2节—小任务,爬取百度LOGO链接并下载图片 : http://blog.csdn.net/sinat_32588261/article/details/72287108
(2)Mr_river – Java简单爬虫系列 : https://my.oschina.net/u/2519530/blog/597359
(3)汪海的实验室 – [Java]知乎下巴第1集:爬虫世界百度不仅仅可以拿来测网速 : http://blog.csdn.net/pleasecallmewhy/article/details/17594303
(这个博客可以看下讨论区)