先找到视频的请求 URL,发现 URL 中的 vid(猜测是视频 ID 参数)需要作为参数传入。
在页面源代码(浏览器"检查")中搜索 vid,找到该参数的取值。
提取参数 vid,拼接出请求 URL,然后模拟请求。
从响应中取出视频地址后,即可进行视频下载。
package com.example.shares.utils;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import org.apache.commons.io.FileUtils;
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Demo crawler for a JD.com product page.
 *
 * <p>The flow implemented by {@link #main(String[])} is: fetch the product page, save it to
 * disk and read it back, print the product title, extract the two video ids embedded in the
 * page, query the video endpoints for their {@code playUrl} values and download every URL
 * found as {@code <index>.mp4}.
 *
 * <p>NOTE(review): {@link #writerIntoFile} and {@link #readFile} use the platform default
 * charset (as {@code FileWriter}/{@code FileReader} do). That is kept for compatibility with
 * files already written; consider switching both to an explicit charset together.
 */
public class CrawlingJinDong {

    /** Product title container; DOTALL lets the captured title span multiple lines. */
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<div class=\"sku-name\">(.+?)</div>", Pattern.DOTALL);

    /** Matches e.g. {@code "infoVideoId":"126016285","mainVideoId":"99177800"} in the page. */
    private static final Pattern VIDEO_ID_PATTERN =
            Pattern.compile("infoVideoId\":\"(\\d+?)\",\"mainVideoId\":\"(\\d+?)\"");

    /** Matches a {@code "playUrl":"..."} entry in a video endpoint response. */
    private static final Pattern PLAY_URL_PATTERN = Pattern.compile("\"playUrl\":\"(.+?)\"");

    private static final int CONNECT_TIMEOUT_MS = 10 * 1000;

    /** Upper bound on a single blocking read, so a stalled download cannot hang forever. */
    private static final int READ_TIMEOUT_MS = 60 * 1000;

    /**
     * Fetches a page with HtmlUnit and returns it serialized as XML text.
     *
     * <p>CSS and JavaScript processing are disabled. NOTE(review): insecure SSL is enabled,
     * i.e. server certificates are not validated; acceptable for a demo only.
     *
     * @param url address of the page to fetch
     * @return the page markup as produced by {@code HtmlPage.asXml()}
     * @throws IOException if the page cannot be retrieved
     */
    public static String getHtmlStr(String url) throws IOException {
        // try-with-resources: the client is closed even when getPage/asXml throws.
        try (WebClient webClient = new WebClient()) {
            webClient.getOptions().setCssEnabled(false);
            webClient.getOptions().setJavaScriptEnabled(false);
            webClient.getOptions().setUseInsecureSSL(true);
            HtmlPage htmlPage = webClient.getPage(url);
            return htmlPage.asXml();
        }
    }

    /**
     * Writes text to a file, replacing any existing content.
     *
     * <p>Missing parent directories are created first.
     *
     * @param html text to write
     * @param path destination file path
     * @throws IOException if the directory cannot be created or the file cannot be written
     */
    public static void writerIntoFile(String html, String path) throws IOException {
        File file = new File(path);
        File parent = file.getAbsoluteFile().getParentFile();
        // mkdirs() returns false when the directory already exists, hence the second check.
        if (parent != null && !parent.mkdirs() && !parent.isDirectory()) {
            throw new IOException("Cannot create directory: " + parent);
        }
        try (FileWriter fileWriter = new FileWriter(file)) {
            fileWriter.write(html);
        }
    }

    /**
     * Reads a whole text file into a string.
     *
     * <p>Every line is terminated with {@code "\r\n"} in the result, including the last one,
     * regardless of the terminators used in the file. An empty file yields an empty string.
     *
     * @param html unused; kept so existing callers keep compiling
     * @param path file to read
     * @return the file content with normalized line terminators
     * @throws IOException if the file cannot be opened or read
     */
    public static String readFile(String html, String path) throws IOException {
        StringBuilder content = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append("\r\n");
            }
        }
        return content.toString();
    }

    /**
     * Runs the demo against a fixed product URL and fixed output directory.
     *
     * @param args ignored
     * @throws IOException if fetching the page, file access or a download fails
     */
    public static void main(String[] args) throws IOException {
        String url = "https://item.jd.com/100003311437.html";
        String htmlFilePath = "D:/picture/jidong/taideng.html";
        String resourcesFile = "D:/picture/jidong/";

        String html = getHtmlStr(url);
        writerIntoFile(html, htmlFilePath);
        String html1 = readFile(html, htmlFilePath);
        System.out.println("====================== 开始爬取 ========================");

        // Product title: drop the first embedded <img> tag and surrounding whitespace.
        Matcher titleMatcher = TITLE_PATTERN.matcher(html1);
        while (titleMatcher.find()) {
            System.out.println(titleMatcher.group(1).replaceFirst("<img .+?>", "").trim());
        }

        // Video ids embedded in the page; when several matches exist the last one wins.
        String infoVideoId = null;
        String mainVideoId = null;
        Matcher idMatcher = VIDEO_ID_PATTERN.matcher(html1);
        while (idMatcher.find()) {
            infoVideoId = idMatcher.group(1);
            mainVideoId = idMatcher.group(2);
        }
        System.out.println("infoVideoId:" + infoVideoId + " \nmainVideoId:" + mainVideoId);
        if (infoVideoId == null || mainVideoId == null) {
            // Without ids the endpoint would be queried with "vid=null"; stop instead.
            System.out.println("未找到视频ID,跳过视频下载");
            return;
        }

        // Query the video endpoints; the response bodies carry the playUrl entries.
        String infoStr = HttpClientUtil.get("https://cd.jd.com/tencent/video_v2?vid=" + infoVideoId, "utf-8");
        String mainStr = HttpClientUtil.get("https://cd.jd.com/tencent/video_v3?vid=" + mainVideoId, "utf-8");
        System.out.println("请求响应:" + "infoStr" + infoStr + "\n" + mainStr);

        List<String> list = new ArrayList<>();
        collectPlayUrls(infoStr, "matcher2", list);
        collectPlayUrls(mainStr, "matcher3", list);

        System.out.println("------------------------开始准备下载----------------------");
        for (int i = 0; i < list.size(); i++) {
            String playUrl = list.get(i);
            System.out.println("正在下载 =====" + playUrl);
            downloadTo(playUrl, new File(resourcesFile + i + ".mp4"));
        }
    }

    /** Appends every playUrl found in {@code response} to {@code sink}, echoing each one. */
    private static void collectPlayUrls(String response, String label, List<String> sink) {
        if (response == null) {
            // The helper's contract is not visible here; guard against a missing body.
            return;
        }
        Matcher urlMatcher = PLAY_URL_PATTERN.matcher(response);
        while (urlMatcher.find()) {
            sink.add(urlMatcher.group(1));
            System.out.println(label + urlMatcher.group(1));
        }
    }

    /** Downloads {@code address} into {@code target} with connect and read timeouts applied. */
    private static void downloadTo(String address, File target) throws IOException {
        URLConnection con = new URL(address).openConnection();
        con.setConnectTimeout(CONNECT_TIMEOUT_MS);
        con.setReadTimeout(READ_TIMEOUT_MS);
        // try-with-resources guarantees the stream is released even if the copy fails.
        try (InputStream inputStream = con.getInputStream()) {
            FileUtils.copyInputStreamToFile(inputStream, target);
        }
    }
}