搜索式抓取豆瓣影片播放来源

最新推荐文章于 2021-06-09 10:42:35 发布

写代码的辉哥哥

最新推荐文章于 2021-06-09 10:42:35 发布

阅读量986

点赞数 1

分类专栏： http请求 文章标签： Java http协议 url 豆瓣搜索

本文链接：https://blog.csdn.net/qq_36537809/article/details/77944692

版权

http请求专栏收录该内容

4 篇文章 0 订阅

订阅专栏

package com.jitv.tv.test;

import java.io.IOException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.Test;

import com.aspire.commons.util.JsonUtil;

/**
 * Looks up the playback sources of a film or TV series by searching Douban.
 * <p>
 * The flow is: request the Douban search page for a keyword, pick the first
 * search result that passes the director / actor / year / type / name filters,
 * read the subject id from that result's {@code onclick} attribute, then
 * request the mobile "vendors" page of that subject and collect every source
 * link found there.
 * </p>
 *
 * @author xiaominghui@9ikandian.com
 * @date 2017-9-11 12:13:22 PM
 */
public class GradTest {

	/**
	 * Search URL prefix; the (URL-encoded) keyword is appended to it.
	 */
	private static final String douban = "https://www.douban.com/search?q=";

	/**
	 * CSS query selecting each result entry on the search page.
	 */
	private static final String cssQuery = ".result .content .title";

	/**
	 * CSS query, relative to a result entry, selecting the video-type label.
	 */
	private static final String cssQuery2 = "h3 span:first-child";

	/**
	 * Video-type label used for TV series.
	 */
	private static final String vType1 = "[电视剧]";

	/**
	 * Video-type label used for movies.
	 */
	private static final String vType2 = "[电影]";

	/**
	 * CSS query, relative to a result entry, selecting the summary line.
	 */
	private static final String infoCssPath = ".rating-info .subject-cast";

	/**
	 * CSS query, relative to a result entry, selecting the anchor whose
	 * {@code onclick} attribute carries the video id.
	 */
	private static final String urlCss = "h3 a";

	/**
	 * Name of the click-handler attribute.
	 */
	private static final String onclick = "onclick";

	/**
	 * CSS query selecting the source anchors on the vendors page.
	 */
	private static final String aCssPath = ".page .card section ul li a";

	/**
	 * Key under which the video id is stored in the click-handler JSON.
	 */
	private static final String idKey = "sid";

	/**
	 * Name of the request header that is overridden for the vendors page.
	 */
	private static final String headerName = "User-Agent";

	/**
	 * User-Agent value sent with the vendors-page request (a mobile Safari
	 * identification string).
	 */
	private static final String headerValue = "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1";

	/**
	 * Name of the anchor attribute holding the link target.
	 */
	private static final String href = "href";

	/**
	 * CSS query, relative to a source anchor, selecting the element whose
	 * HTML is used as the source name.
	 */
	private static final String souUrl = "span:first-child";

	/**
	 * Result-map key for the source address.
	 */
	private static final String sourceUrl = "url";

	/**
	 * Result-map key for the source name.
	 */
	private static final String sourceName = "name";

	/**
	 * Prefix of the vendors-page URL; the video id follows it.
	 */
	private static final String url1 = "https://m.douban.com/movie/subject/";

	/**
	 * Suffix of the vendors-page URL; it follows the video id.
	 */
	private static final String url2 = "/vendors?from=subject";

	/**
	 * Value of the {@code type} argument that restricts results to movies.
	 */
	private static final String movie = "movie";

	/**
	 * Value of the {@code type} argument that restricts results to TV series.
	 */
	private static final String TV = "tv";

	/**
	 * Manual smoke test: performs a live lookup and prints the sources found.
	 *
	 * @throws IOException if one of the HTTP requests fails
	 */
	@Test
	public void testOne() throws IOException {

		List<Map<String, Object>> videoList = getVideoList(
				"人民的名义",
				"李路",
				"陆毅,张丰毅,吴刚,许亚军,张志坚,柯蓝,胡静,张凯丽,赵子琪,白志迪,李建义,高亚麟,丁海峰,冯雷,李光复,张晞临,徐光宇,陶慧敏,黄俊鹏,阚犇犇,唐菀,岳秀清,许文广,李威,施大生,侯勇,王丽云",
				"2017",
				TV);

		System.out.println(videoList);
	}

	/**
	 * Searches Douban for {@code seek} and returns the playback sources of the
	 * first search result that passes all filters.
	 * <p>
	 * Results whose click handler carries no usable video id are skipped
	 * instead of aborting the whole lookup.
	 * </p>
	 *
	 * @param seek     search keyword; it must also equal the result's title.
	 *                 A blank keyword yields an empty list without any request
	 * @param director accepted director names; blank disables this filter
	 * @param actor    accepted actor names; blank disables this filter
	 * @param time     accepted year(s); blank disables this filter
	 * @param type     {@code "movie"} or {@code "tv"} to restrict the result
	 *                 type; any other value accepts both types
	 * @return one map per source with the keys {@code "url"} and
	 *         {@code "name"}; an empty list when no result matches
	 * @author XiaoMingHui
	 * @throws IOException if one of the HTTP requests fails
	 * @date 2017-9-12 9:55:19 AM
	 */
	public List<Map<String, Object>> getVideoList(String seek, String director,
			String actor, String time, String type) throws IOException {

		// Nothing can match a blank title, so avoid the pointless request
		// (and the "q=null" URL a null keyword used to produce).
		if (StringUtils.isBlank(seek)) {
			return new ArrayList<>();
		}

		// Encode the keyword so reserved characters (&, #, +, %, spaces)
		// cannot corrupt the query string.
		String url = douban + URLEncoder.encode(seek, "UTF-8");

		// HTML of the search-result page
		Document document = Jsoup.connect(url).get();

		for (Element title : document.select(cssQuery)) {

			// Skip results that do not satisfy the filters
			if (checkFiltration(title, type, director, actor, time, seek)) {
				continue;
			}

			// The click handler embeds a JSON object that contains the id
			String videoIdJson = extractJsonObject(title.select(urlCss).attr(onclick));
			if (videoIdJson == null) {
				continue;
			}

			@SuppressWarnings("unchecked")
			Map<String, Object> map = JsonUtil.toBean(videoIdJson, Map.class);
			Object videoId = (map == null) ? null : map.get(idKey);
			if (videoId == null) {
				// Without an id the vendors URL would point at ".../null/..."
				continue;
			}

			// Only the first matching result is used
			return fetchSources(videoId);
		}
		return new ArrayList<>();
	}

	/**
	 * Requests the vendors page of one video and collects its source links.
	 *
	 * @param videoId id inserted between the vendors-URL prefix and suffix
	 * @return one map per source anchor, holding its link and its name
	 * @throws IOException if the HTTP request fails
	 */
	private List<Map<String, Object>> fetchSources(Object videoId) throws IOException {
		Connection con = Jsoup.connect(url1 + videoId + url2);
		con.header(headerName, headerValue);

		List<Map<String, Object>> maps = new ArrayList<>();

		for (Element alabel : con.get().select(aCssPath)) {
			Map<String, Object> videoMap = new HashMap<>();
			// Source playback address
			videoMap.put(sourceUrl, alabel.attr(href));
			// Source name
			videoMap.put(sourceName, alabel.select(souUrl).html());

			maps.add(videoMap);
		}
		return maps;
	}

	/**
	 * Cuts the JSON object out of a click-handler string, i.e. the text from
	 * the first <code>{</code> to the last <code>}</code> inclusive.
	 *
	 * @param onclickValue raw value of the {@code onclick} attribute
	 * @return the JSON text, or {@code null} when no brace pair is present
	 */
	private static String extractJsonObject(String onclickValue) {
		if (onclickValue == null) {
			return null;
		}
		int start = onclickValue.indexOf('{');
		int end = onclickValue.lastIndexOf('}');
		if (start < 0 || end <= start) {
			return null;
		}
		return onclickValue.substring(start, end + 1);
	}

	/**
	 * Decides whether a search result must be skipped. Returns {@code false}
	 * only when every active condition matches; otherwise returns
	 * {@code true}.
	 * <p>
	 * The summary line is split on {@code "/"}; field 1 is compared with the
	 * director filter, field 2 with the actor filter and field 3 with the
	 * year filter. A result whose summary lacks a field required by an active
	 * filter is skipped.
	 * </p>
	 *
	 * @param title    search-result entry to inspect
	 * @param type     {@code "movie"} or {@code "tv"} to restrict the result
	 *                 type; any other value accepts both types
	 * @param director accepted director names; blank disables this filter
	 * @param actor    accepted actor names; blank disables this filter
	 * @param time     accepted year(s); blank disables this filter
	 * @param seek     title the result must have exactly (after trimming)
	 * @return {@code true} to skip the result, {@code false} to accept it
	 * @author XiaoMingHui
	 * @date 2017-9-12 1:10:19 PM
	 */
	private boolean checkFiltration(Element title, String type,
			String director, String actor, String time, String seek) {

		String videoType = title.select(cssQuery2).html();

		// Only TV series and movies are ever accepted
		if (!vType1.equals(videoType) && !vType2.equals(videoType)) {
			return true;
		}

		// Restrict to the requested kind when one was requested
		if (movie.equals(type) && !vType2.equals(videoType)) {
			return true;
		}
		if (TV.equals(type) && !vType1.equals(videoType)) {
			return true;
		}

		String[] infos = title.select(infoCssPath).html().split("/");

		// Director filter
		if (StringUtils.isNotBlank(director) && !matchesInfo(director, infos, 1)) {
			return true;
		}

		// Leading-actor filter
		if (StringUtils.isNotBlank(actor) && !matchesInfo(actor, infos, 2)) {
			return true;
		}

		// Year filter
		if (StringUtils.isNotBlank(time) && !matchesInfo(time, infos, 3)) {
			return true;
		}

		// Title filter: the result's title must equal the keyword
		return !title.select(urlCss).html().trim().equals(seek);
	}

	/**
	 * Checks whether the summary field at {@code index} occurs in
	 * {@code accepted}.
	 *
	 * @param accepted text listing the accepted values
	 * @param infos    summary line split into fields
	 * @param index    position of the field to check
	 * @return {@code false} when the field is missing or empty, otherwise
	 *         whether {@code accepted} contains the trimmed field
	 */
	private static boolean matchesInfo(String accepted, String[] infos, int index) {
		if (index >= infos.length) {
			return false;
		}
		String field = infos[index].trim();
		// An empty field would match trivially via contains(""); treat it as
		// "cannot be confirmed" instead.
		return !field.isEmpty() && accepted.contains(field);
	}

}

写代码的辉哥哥

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
搜索式抓取豆瓣影片播放来源

package com.jitv.tv.test;import java.io.IOException;import java.util.ArrayList;import java.util.HashMap;import java.util.Iterator;import java.util.List;import java.util.Map;import org.apache.
复制链接

扫一扫