转载【Java爬取某姐的小视频】

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
 
/**
 * Demo web spider: scans the video listing pages of budejie.com for {@code .mp4}
 * links and downloads every video it finds into a local directory.
 *
 * <p>Pages are parsed line by line with simple text heuristics (no HTML parser):
 * a line containing {@code data-text} is remembered as the current title line, and
 * the first {@code href="....mp4} on a line is taken as a video link that belongs
 * to the most recently seen title.
 *
 * @author cxd
 */
public class WebSpiderDemo1 {

	/** Captures, in group 1, the target of the first {@code href} on a line that ends in {@code .mp4}. */
	private static final Pattern VIDEO_URL_PATTERN = Pattern.compile("href=\"(.*?\\.mp4)");

	/** Characters that must not appear in a (Windows) file name, plus control characters. */
	private static final Pattern ILLEGAL_FILE_NAME_CHARS = Pattern.compile("[\\\\/:*?\"<>|\\p{Cntrl}]");

	/** Substring that marks a line as carrying a video title. */
	private static final String TITLE_MARKER = "data-text";

	/** First listing page to crawl (page numbers are appended to the source URL). */
	private static final int FIRST_PAGE = 1;

	/** Last listing page to crawl; the original author notes the site offers up to 50 pages. */
	private static final int LAST_PAGE = 1;

	/**
	 * Number of leading characters dropped from a trimmed title line.
	 * NOTE(review): 11 equals the length of {@code data-text="}, so the title line is
	 * presumably of the form {@code data-text="TITLE">} -- confirm against the live page.
	 */
	private static final int TITLE_PREFIX_LENGTH = 11;

	/** Number of trailing characters dropped from a trimmed title line (presumably {@code ">}). */
	private static final int TITLE_SUFFIX_LENGTH = 2;

	/**
	 * Crawls the configured listing pages and downloads every discovered video to
	 * {@code D:/rob/<title>.mp4}.
	 *
	 * @param args ignored
	 * @throws Exception if a page cannot be read or a download fails; the first failure aborts the run
	 */
	public static void main(String[] args) throws Exception {

		String source = "http://www.budejie.com/video/";
		String destDir = "D:/rob/";

		Map<String, String> urlMap = getUrlInSource(source);

		for (Map.Entry<String, String> entry : urlMap.entrySet()) {
			String title = entry.getKey(); // video title
			String url = entry.getValue(); // video URL
			// Titles are scraped text: strip characters that are illegal in file names or
			// that would redirect the path (e.g. '/' or '\').
			File destFile = new File(destDir + toSafeFileName(title) + ".mp4");
			download(url, destFile);
		}
	}

	/**
	 * Downloads the resource behind {@code url} and stores it in {@code destFile},
	 * creating missing parent directories first.
	 *
	 * @param url      URL of the video
	 * @param destFile file the video is written to; an existing file is overwritten
	 * @throws IOException if the URL is malformed, the parent directory cannot be created,
	 *                     or reading/writing fails
	 */
	public static void download(String url, File destFile) throws IOException {
		URL videoUrl = new URL(url);

		File parent = destFile.getParentFile();
		if (parent != null && !parent.mkdirs() && !parent.isDirectory()) {
			throw new IOException("Cannot create directory " + parent);
		}

		// The input stream is opened first so that a failed connection does not leave
		// an empty destination file behind; both streams are closed even on failure.
		try (InputStream is = videoUrl.openStream();
				FileOutputStream fos = new FileOutputStream(destFile)) {
			byte[] buffer = new byte[8192];
			int len;
			while ((len = is.read(buffer)) != -1) {
				fos.write(buffer, 0, len);
			}
		}
	}

	/**
	 * Collects the title and URL of every video linked from the listing pages.
	 *
	 * <p>For each page number between {@code FIRST_PAGE} and {@code LAST_PAGE} the page
	 * {@code source + pageNumber} is read as UTF-8 text. If the site rejects the plain
	 * request, open an {@code HttpURLConnection} instead and set a browser-like
	 * {@code user-agent} request property before reading.
	 *
	 * @param source base URL of the listing; the page number is appended to it
	 * @return map from video title to video URL; a later video with the same title
	 *         replaces an earlier one
	 * @throws IOException if a page URL is malformed or a page cannot be read
	 */
	public static Map<String, String> getUrlInSource(String source) throws IOException {

		Map<String, String> hashMap = new HashMap<>();

		for (int index = FIRST_PAGE; index <= LAST_PAGE; index++) {
			URL url = new URL(source + index);
			try (InputStream is = url.openStream();
					BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"))) {
				collectVideoLinks(br, hashMap);
			}
		}
		return hashMap;
	}

	/**
	 * Reads {@code reader} to the end and adds every video link found to {@code links}.
	 *
	 * @param reader page content, read line by line until end of stream
	 * @param links  receives title-to-URL entries
	 * @throws IOException if reading fails
	 */
	private static void collectVideoLinks(BufferedReader reader, Map<String, String> links) throws IOException {
		String titleLine = null;
		String line;
		// readLine() returns null only at end of stream (blank lines are ""), so this
		// processes the whole page regardless of its length.
		while ((line = reader.readLine()) != null) {
			if (line.contains(TITLE_MARKER)) {
				titleLine = line;
			}

			Matcher matcher = VIDEO_URL_PATTERN.matcher(line);
			if (matcher.find()) {
				String videoUrl = matcher.group(1);
				String videoTitle = (titleLine == null) ? "" : getTitle(titleLine.trim());
				if (videoTitle.isEmpty()) {
					// No usable title seen yet: name the video after its file instead of failing.
					videoTitle = fileNameOf(videoUrl);
				}
				links.put(videoTitle, videoUrl);
			}
		}
	}

	/**
	 * Extracts the title text from a trimmed title line by dropping the first
	 * {@code TITLE_PREFIX_LENGTH} and the last {@code TITLE_SUFFIX_LENGTH} characters.
	 *
	 * @param info trimmed line that contains the title marker
	 * @return the title, or an empty string if the line is too short to contain one
	 */
	private static String getTitle(String info) {

		int len = info.length();
		if (len < TITLE_PREFIX_LENGTH + TITLE_SUFFIX_LENGTH) {
			return "";
		}
		return info.substring(TITLE_PREFIX_LENGTH, len - TITLE_SUFFIX_LENGTH);
	}

	/**
	 * Returns the last path segment of {@code videoUrl} without its {@code .mp4} suffix.
	 *
	 * @param videoUrl URL of a video
	 * @return file name part of the URL, used as a fallback title
	 */
	private static String fileNameOf(String videoUrl) {
		String name = videoUrl.substring(videoUrl.lastIndexOf('/') + 1);
		String suffix = ".mp4";
		return name.endsWith(suffix) ? name.substring(0, name.length() - suffix.length()) : name;
	}

	/**
	 * Turns a scraped title into a string that is safe to use as a file name.
	 *
	 * @param title scraped video title
	 * @return the title with illegal characters replaced by {@code _}, or
	 *         {@code "untitled"} if nothing usable remains
	 */
	private static String toSafeFileName(String title) {
		String safe = ILLEGAL_FILE_NAME_CHARS.matcher(title).replaceAll("_").trim();
		return safe.isEmpty() ? "untitled" : safe;
	}
}

6b45eaad5f7c21280b9b1e467e2c1c5aea7.jpg

转载于:https://my.oschina.net/u/3267498/blog/3009530

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值