这里只解析一下代码,所需工具jsoup、HttpClient
httpCLient获取html后,用jsoup解析html,再用java来获取所需要的信息。
之前写的有点问题,今天改了一下。因为通过hash值拼接的地址是个临时地址,存在数据库后一天就失效了,所以我改了一下。先把爬到的歌曲下载到本地,然后上传到七牛云的对象存储空间,再返回这个地址。最后把七牛云上的地址存到数据库中,这样就是永久的了。
HTTPManage:
package com.after.demo.spider;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* @author www.xyjz123.xyz
* @description
* @date 2019/4/19 19:06
*/
public class HtmlManage {
public Document manage(String html){
Document doc = Jsoup.parse(html);
return doc;
}
}
HttpGetConnect:
package com.after.demo.spider;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.BasicHttpClientConnectionManager;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
/**
* @author www.xyjz123.xyz
* @description
* @date 2019/4/19 19:05
*/
public class HttpGetConnect {
/**
* 获取html内容
* @param url
* @param charsetName UTF-8、GB2312
* @return
* @throws IOException
*/
public static String connect(String url,String charsetName) throws IOException{
BasicHttpClientConnectionManager connManager = new BasicHttpClientConnectionManager();
CloseableHttpClient httpclient = HttpClients.custom()
.setConnectionManager(connManager)
.build();
String content = "";
try{
HttpGet httpget = new HttpGet(url);
RequestConfig requestConfig = RequestConfig.custom()
.setSocketTimeout(5000)
.setConnectTimeout(50000)
.setConnectionRequestTimeout(50000)
.build();
httpget.setConfig(requestConfig);
httpget.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
httpget.setHeader("Accept-Encoding", "gzip,deflate,sdch");
httpget.setHeader("Accept-Language", "zh-CN,zh;q=0.8");
httpget.setHeader("Connection", "keep-alive");
httpget.setHeader("Upgrade-Insecure-Requests", "1");
httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36");
httpget.setHeader("cache-control", "max-age=0");
CloseableHttpRespon