京东简单爬虫

最新推荐文章于 2024-04-24 11:13:10 发布

假装得大佬

最新推荐文章于 2024-04-24 11:13:10 发布

阅读量5k

点赞数

分类专栏： java爬虫

本文链接：https://blog.csdn.net/qq_44895567/article/details/117980251

版权

java爬虫专栏收录该内容

2 篇文章 0 订阅

订阅专栏

先在浏览器开发者工具中找到视频的请求 URL，发现 URL 中的 vid（猜测是视频 ID）需要作为参数传入；
在页面源码中搜索该值，可以找到对应的 infoVideoId 和 mainVideoId。
提取这两个 ID，拼接到请求 URL 中，然后模拟请求，
再从响应中取出 playUrl，即可下载视频。

package com.example.shares.utils;

import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import org.apache.commons.io.FileUtils;

import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal JD product-page crawler: fetches an item page, prints its title, extracts the
 * embedded video IDs, asks the video endpoint for the play URLs and downloads each video.
 *
 * <p>All methods are static; the class holds no state.
 */
public class CrawlingJinDong {

    /** Captures the content of the product title element. */
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<div class=\"sku-name\">(.+?)</div>", Pattern.DOTALL);

    /** Captures the two video IDs, e.g. {@code "infoVideoId":"126016285","mainVideoId":"99177800"}. */
    private static final Pattern VIDEO_ID_PATTERN =
            Pattern.compile("infoVideoId\":\"(\\d+?)\",\"mainVideoId\":\"(\\d+?)\"");

    /** Captures the value of every {@code "playUrl"} field in a video endpoint response. */
    private static final Pattern PLAY_URL_PATTERN = Pattern.compile("\"playUrl\":\"(.+?)\"");

    private static final int CONNECT_TIMEOUT_MS = 10 * 1000;

    /** Per-read timeout so a stalled download fails instead of blocking forever. */
    private static final int READ_TIMEOUT_MS = 60 * 1000;

    /**
     * Fetches a page with HtmlUnit and returns it serialized as XML.
     *
     * <p>CSS and JavaScript processing are disabled and insecure SSL is allowed.
     *
     * @param url address of the page to fetch
     * @return the page markup as produced by {@code HtmlPage.asXml()}
     * @throws IOException if the page cannot be retrieved
     */
    public static String getHtmlStr(String url) throws IOException {
        // try-with-resources closes the client even when getPage/asXml throws.
        try (WebClient webClient = new WebClient()) {
            webClient.getOptions().setCssEnabled(false);
            webClient.getOptions().setJavaScriptEnabled(false);
            webClient.getOptions().setUseInsecureSSL(true);
            HtmlPage htmlPage = webClient.getPage(url);
            return htmlPage.asXml();
        }
    }

    /**
     * Writes {@code html} to the file at {@code path} as UTF-8, replacing any existing content.
     * Missing parent directories are created first.
     *
     * @param html text to write
     * @param path destination file path
     * @throws IOException if the directory cannot be created or the file cannot be written
     */
    public static void writerIntoFile(String html, String path) throws IOException {
        File file = new File(path);
        File parent = file.getParentFile();
        if (parent != null && !parent.isDirectory() && !parent.mkdirs()) {
            throw new IOException("Cannot create directory " + parent);
        }
        // Explicit UTF-8: the platform default charset may not be able to encode the page text.
        try (Writer writer =
                new OutputStreamWriter(new FileOutputStream(file), StandardCharsets.UTF_8)) {
            writer.write(html);
        }
    }

    /**
     * Reads the file at {@code path} as UTF-8 text.
     *
     * @param html unused; kept only so existing callers keep compiling
     * @param path file to read
     * @return the file content with every line terminated by {@code "\r\n"}
     * @throws IOException if the file cannot be read
     */
    public static String readFile(String html, String path) throws IOException {
        StringBuilder content = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(path), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                content.append(line).append("\r\n");
            }
        }
        return content.toString();
    }

    /**
     * Crawls one hard-coded JD item page: saves the HTML, prints the title and downloads the
     * videos referenced by the page into the hard-coded resource directory.
     *
     * @param args ignored
     * @throws IOException if fetching the page, file access or a download fails
     */
    public static void main(String[] args) throws IOException {
        String url = "https://item.jd.com/100003311437.html";
        String htmlFilePath = "D:/picture/jidong/taideng.html";
        String resourcesFile = "D:/picture/jidong/";
        String html = getHtmlStr(url);
        writerIntoFile(html, htmlFilePath);
        String html1 = readFile(html, htmlFilePath);

        System.out.println("======================  开始爬取  ========================");

        printTitles(html1);

        // If the ID pair occurs more than once, the last occurrence wins.
        String infoVideoId = null;
        String mainVideoId = null;
        Matcher idMatcher = VIDEO_ID_PATTERN.matcher(html1);
        while (idMatcher.find()) {
            infoVideoId = idMatcher.group(1);
            mainVideoId = idMatcher.group(2);
        }
        System.out.println("infoVideoId:" + infoVideoId + "  \nmainVideoId:" + mainVideoId);
        if (infoVideoId == null || mainVideoId == null) {
            // Without IDs the endpoint would be queried with "vid=null"; stop instead.
            System.out.println("未找到视频ID，跳过视频下载");
            return;
        }

        String infoStr = HttpClientUtil.get("https://cd.jd.com/tencent/video_v2?vid=" + infoVideoId, "utf-8");
        String mainStr = HttpClientUtil.get("https://cd.jd.com/tencent/video_v3?vid=" + mainVideoId, "utf-8");
        System.out.println("请求响应：" + "infoStr" + infoStr + "\n" + mainStr);

        List<String> playUrls = new ArrayList<>();
        collectPlayUrls(infoStr, "matcher2", playUrls);
        collectPlayUrls(mainStr, "matcher3", playUrls);

        System.out.println("------------------------开始准备下载----------------------");
        for (int i = 0; i < playUrls.size(); i++) {
            String playUrl = playUrls.get(i);
            // println (not print) so each download is reported on its own line.
            System.out.println("正在下载  =====" + playUrl);
            download(playUrl, new File(resourcesFile + i + ".mp4"));
        }
    }

    /** Prints every product title found in {@code html}, with a leading image tag removed. */
    private static void printTitles(String html) {
        Matcher titleMatcher = TITLE_PATTERN.matcher(html);
        while (titleMatcher.find()) {
            System.out.println(titleMatcher.group(1).replaceFirst("<img .+?>", "").trim());
        }
    }

    /** Appends every play URL found in {@code response} to {@code sink}, echoing each with {@code label}. */
    private static void collectPlayUrls(String response, String label, List<String> sink) {
        if (response == null) {
            // NOTE(review): HttpClientUtil.get is not visible here; guard in case it returns null.
            return;
        }
        Matcher urlMatcher = PLAY_URL_PATTERN.matcher(response);
        while (urlMatcher.find()) {
            sink.add(urlMatcher.group(1));
            System.out.println(label + urlMatcher.group(1));
        }
    }

    /** Downloads {@code videoUrl} into {@code target}, releasing the connection stream on every path. */
    private static void download(String videoUrl, File target) throws IOException {
        URLConnection connection = new URL(videoUrl).openConnection();
        connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
        connection.setReadTimeout(READ_TIMEOUT_MS);
        try (InputStream inputStream = connection.getInputStream()) {
            FileUtils.copyInputStreamToFile(inputStream, target);
        }
    }
}

假装得大佬

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
京东简单爬虫

先找到视频的请求url，发现url中vid（猜测是视频参数）是要进行传参的，在代码检查中查找vid，找到参数vid。提取参数vid，拼接url，然后模拟请求。即可进行视频下载package com.example.shares.utils;import com.gargoylesoftware.htmlunit.WebClient;import com.gargoylesoftware.htmlunit.html.HtmlPage;import org.apache.commons.io.F
复制链接

扫一扫