获取页面中a标签的地址并写入文件中

最新推荐文章于 2023-03-31 20:49:13 发布

置顶樱梦雪苹宝

最新推荐文章于 2023-03-31 20:49:13 发布

阅读量387

点赞数

分类专栏：页面解析 Document Elements 文章标签：页面解析 Document Elements

本文链接：https://blog.csdn.net/yingmengxuepingbao/article/details/104170680

版权

页面解析同时被 3 个专栏收录

2 篇文章 0 订阅

订阅专栏

Document

2 篇文章 0 订阅

订阅专栏

Elements

2 篇文章 0 订阅

订阅专栏

这两天研究了一下优酷（Youku）的视频路径，顺便学习了 jsoup 中 Document 和 Elements 的用法，在此做个备份。

package com.zx.com.cn.dao;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
 
/**
 * Small jsoup-based crawler. It can download the raw source of a page, or
 * extract the links found under {@code div.p-thumb} (from a live page or from
 * a saved HTML file), turn each one into a player URL of the form
 * {@code http://player.youku.com/embed/<id>} and write the list to a text file.
 */
public class Dss {

	/** Prefix of the player URL; the extracted video id is appended to it. */
	private static final String EMBED_URL_PREFIX = "http://player.youku.com/embed/";

	/** Marker that immediately precedes the video id inside a link's href. */
	private static final String ID_START_MARKER = "id_";

	/** Marker that immediately follows the video id inside a link's href. */
	private static final String ID_END_MARKER = ".html";

	/**
	 * Downloads the source of a page, echoing every line to standard output
	 * and saving it to a file. Only the page at the given URL is fetched;
	 * data that sits behind pagination is not retrieved.
	 *
	 * @param urlyuan    URL of the page, e.g. "http://list.youku.com/albumlist/show/id_27558795.html?spm=a2h0k.8191403.0.0&sf=10101"
	 * @param toFilePath path of the file to generate, e.g. "G:\\indext.html";
	 *                   it is created when missing and overwritten otherwise
	 */
	@SuppressWarnings("unused")
	private static void getUrlYuan(String urlyuan, String toFilePath) {
		// try-with-resources closes both streams even when the copy fails part-way.
		// The page is decoded as UTF-8, so it is written back as UTF-8 as well
		// instead of the platform default charset.
		try (BufferedReader reader = new BufferedReader(
					new InputStreamReader(new URL(urlyuan).openStream(), StandardCharsets.UTF_8));
				BufferedWriter writer = newUtf8Writer(toFilePath)) {
			String line;
			while ((line = reader.readLine()) != null) {
				System.out.println(line);
				writer.write(line);
				writer.newLine();
			}
		} catch (IOException e) {
			// Also covers MalformedURLException, which is a subclass of IOException.
			e.printStackTrace();
		}
	}

	/**
	 * Returns the text located between two markers.
	 * <p>
	 * The result starts right after the first occurrence of {@code strStart}
	 * and ends right before the first occurrence of {@code strEnd} that
	 * follows it.
	 *
	 * @param str      text to search
	 * @param strStart marker preceding the wanted text (any length)
	 * @param strEnd   marker following the wanted text
	 * @return the text between the two markers; empty when they are adjacent
	 * @throws StringIndexOutOfBoundsException if either marker cannot be found
	 *         (the same exception type an out-of-range {@code substring} raises)
	 */
	public static String subString(String str, String strStart, String strEnd) {
		String result = findBetween(str, strStart, strEnd);
		if (result == null) {
			throw new StringIndexOutOfBoundsException(
					"no text delimited by \"" + strStart + "\" and \"" + strEnd + "\" in: " + str);
		}
		return result;
	}

	/**
	 * Looks up the text between two markers.
	 *
	 * @return the delimited text, or {@code null} when a marker is missing
	 */
	private static String findBetween(String str, String strStart, String strEnd) {
		int markerAt = str.indexOf(strStart);
		if (markerAt < 0) {
			return null;
		}
		// Skip the whole start marker, whatever its length.
		int from = markerAt + strStart.length();
		// Only an end marker located after the start marker can close the span.
		int to = str.indexOf(strEnd, from);
		return to < 0 ? null : str.substring(from, to);
	}

	/**
	 * Opens a buffered UTF-8 writer on a file; the file is created when
	 * missing and its previous content is discarded.
	 */
	private static BufferedWriter newUtf8Writer(String path) throws IOException {
		return new BufferedWriter(
				new OutputStreamWriter(new FileOutputStream(path), StandardCharsets.UTF_8));
	}

	/**
	 * Writes a string, followed by a line separator, to a file. The text is
	 * also echoed to standard output. I/O failures are reported on the error
	 * stream and otherwise ignored.
	 *
	 * @param line       text to write
	 * @param toFilePath path of the file; created when missing, overwritten otherwise
	 */
	private static void toFile(String line, String toFilePath) {
		// Opening the stream creates the file, so no separate createNewFile() is needed;
		// try-with-resources guarantees the writer is closed if write() throws.
		try (BufferedWriter writer = newUtf8Writer(toFilePath)) {
			System.out.println("line = " + line);
			writer.write(line);
			writer.newLine();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/** Selects the {@code a} elements carrying an href under {@code div.p-thumb}. */
	private static Elements selectThumbLinks(Document doc) {
		return doc.select("div.p-thumb").select("a[href]");
	}

	/**
	 * Converts one link target into its player URL, prints it and appends it
	 * (terminated by CRLF) to {@code out}. A target that does not contain an
	 * id between {@code "id_"} and {@code ".html"} is reported on the error
	 * stream and skipped, so one unexpected link does not abort the whole run.
	 */
	private static void appendEmbedUrl(StringBuilder out, String href) {
		String id = findBetween(href, ID_START_MARKER, ID_END_MARKER);
		if (id == null) {
			System.err.println("skipped link without video id: " + href);
			return;
		}
		String realURL = EMBED_URL_PREFIX + id;
		System.out.println("realURL = " + realURL);
		out.append(realURL);
		out.append("\r\n");
	}

	/**
	 * Fetches an HTML document from a web site, extracts the video links and
	 * writes the resulting player URLs to "G:\\test.txt". Only the data of
	 * the first page is parsed.
	 *
	 * @param url page to fetch, e.g. http://list.youku.com/albumlist/show/id_27558795.html?spm=a2h0k.8191403.0.0&sf=10101
	 */
	@SuppressWarnings("unused")
	private static void getHTML(String url) {
		StringBuilder linkSB = new StringBuilder();
		try {
			Document doc = Jsoup.connect(url).get();
			for (Element link : selectThumbLinks(doc)) {
				String linkHref = link.attr("href");
				System.out.println("linkHref = " + linkHref);
				appendEmbedUrl(linkSB, linkHref);
			}
			toFile(linkSB.toString(), "G:\\test.txt");
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Parses an HTML file stored on the local file system, extracts the video
	 * links and writes the resulting player URLs to "G:\\url.txt".
	 * <p>
	 * The file is parsed with {@code Jsoup.parse(File in, String charsetName,
	 * String baseUri)}; {@code baseUri} is used to resolve relative URLs found
	 * in the file and is left empty here because it is not needed.
	 * <p>
	 * Nothing is written when the path is null/empty or the file cannot be parsed.
	 *
	 * @param localFilePath path of the local HTML file
	 */
	private static void getLocalFile(String localFilePath) {
		// isEmpty() compares content; "!=" on strings only compares references.
		if (localFilePath == null || localFilePath.isEmpty()) {
			System.err.println("getLocalFile: no input file path given");
			return;
		}
		Document doc;
		try {
			doc = Jsoup.parse(new File(localFilePath), "UTF-8", "");
		} catch (IOException e) {
			e.printStackTrace();
			// Without a parsed document there is nothing to extract.
			return;
		}
		StringBuilder linkSB = new StringBuilder();
		for (Element link : selectThumbLinks(doc)) {
			appendEmbedUrl(linkSB, link.attr("href"));
		}
		toFile(linkSB.toString(), "G:\\url.txt");
	}

	public static void main(String[] args) {
		// Alternative 1: extract the links under the selected tag straight from the live page.
		//getHTML("http://list.youku.com/albumlist/show/id_27558795.html?spm=a2h0k.8191403.0.0&sf=10101");

		// Alternative 2: download the source of the whole page.
		// String urlyuan = "http://list.youku.com/albumlist/show/id_27558795.html?spm=a2h0k.8191403.0.0&sf=10101";
		// String toFilePath = "G:\\indext.html";
		// getUrlYuan(urlyuan, toFilePath);

		getLocalFile("G:\\1.html");
	}

}