用jsoup爬取uuu9的漫画

最新推荐文章于 2022-06-11 12:03:17 发布

fuckcdn

最新推荐文章于 2022-06-11 12:03:17 发布

阅读量1.2k

点赞数

分类专栏： util 文章标签： Java OS F# Firebug .net

util 专栏收录该内容

53 篇文章 0 订阅

订阅专栏

这两天打算把魔兽世界的官方漫画《王者归来》看看，体会一下刀疤男的复仇历程。Google 之后发现 uuu9 上有中文版，但只能在线看，而且每次都要点击图片放大才能看清，很烦，于是想下载下来看。用 Firebug 查看后发现页面写得还算规范，所以做法很简单：用 jsoup 解析出图片地址，再通过 URL 逐张下载。也不想用多线程了，就这样一张一张来吧，毕竟我家里 4M 的网速不是盖的。

package pic;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Single-threaded crawler that downloads a multi-chapter comic from a
 * wow.uuu9.com index page.
 * <p>
 * The work is done in two passes:
 * <ol>
 * <li>{@link #catalog(File, String)} reads the index page, creates one folder
 * per chapter and writes a {@code catalog.txt} into each folder listing the
 * image URLs of that chapter, one {@code number(url)} entry per line.</li>
 * <li>{@link #prey(File)} walks the chapter folders, downloads every image
 * listed in {@code catalog.txt} and then renames the list to {@code catalog}.</li>
 * </ol>
 * Usage: {@code java pic.Spider [outputDir [indexUrl]]}. Both arguments are
 * optional and fall back to the built-in defaults.
 */
public class Spider {

	/** Output directory used when none is given on the command line. */
	private static final String DEFAULT_DIR = "F:/国王归来";

	/** Index page used when none is given on the command line. */
	private static final String DEFAULT_INDEX_URL = "http://wow.uuu9.com/2008/200812/187521.shtml";

	/** Name of the list file written into the root and into each chapter folder. */
	private static final String CATALOG_NAME = "catalog.txt";

	/** Charset used for every catalog file, for writing and reading alike. */
	private static final String ENCODING = "UTF-8";

	/** Timeout for fetching the index page, in milliseconds. */
	private static final int INDEX_TIMEOUT_MS = 1000 * 3;

	/** Timeout for fetching one chapter page, in milliseconds. */
	private static final int SECTION_TIMEOUT_MS = 1000 * 10;

	/** Connect and read timeout for one image download, in milliseconds. */
	private static final int DOWNLOAD_TIMEOUT_MS = 1000 * 30;

	/**
	 * Entry point.
	 * 
	 * @param args
	 *            optional: {@code args[0]} is the output directory,
	 *            {@code args[1]} the URL of the index page
	 */
	public static void main(String[] args) {
		String dirPath = args != null && args.length > 0 ? args[0] : DEFAULT_DIR;
		String indexUrl = args != null && args.length > 1 ? args[1] : DEFAULT_INDEX_URL;

		File dir = new File(dirPath);
		if (!dir.isDirectory() && !dir.mkdirs()) {
			System.err.println("Cannot create output directory " + dir);
			return;
		}
		Spider spider = new Spider();
		spider.catalog(dir, indexUrl);
		spider.prey(dir);
	}

	/**
	 * Analyses the index page: creates the chapter folders, writes the chapter
	 * list to {@code dir/catalog.txt} and an image list into every chapter
	 * folder.
	 * 
	 * @param dir
	 *            root output directory
	 * @param address
	 *            URL of the index page
	 */
	private void catalog(File dir, String address) {
		Map<File, String> chapters;
		try {
			chapters = this.readChapterIndex(dir, address);
		} catch (IOException e) {
			// Covers MalformedURLException as well; nothing to crawl without an index.
			System.err.println("Cannot read index page " + address);
			e.printStackTrace();
			return;
		}
		for (Map.Entry<File, String> entry : chapters.entrySet()) {
			this.writeChapterCatalog(entry.getKey(), entry.getValue());
		}
	}

	/**
	 * Parses the index page and writes {@code dir/catalog.txt}.
	 * 
	 * @return chapter folder mapped to chapter URL, in page order
	 * @throws IOException
	 *             if the page cannot be fetched or the catalog cannot be written
	 */
	private Map<File, String> readChapterIndex(File dir, String address) throws IOException {
		Map<File, String> chapters = new LinkedHashMap<File, String>();

		Document doc = Jsoup.parse(new URL(address), INDEX_TIMEOUT_MS);
		Element textworld = doc.body().getElementsByClass("textworld").first();
		Element table = textworld == null ? null : textworld.getElementsByTag("table").first();
		if (table == null) {
			System.err.println("No chapter table found at " + address);
			return chapters;
		}
		Elements hrefs = table.getElementsByTag("a");

		BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
				new FileOutputStream(new File(dir, CATALOG_NAME)), ENCODING));
		try {
			for (Element a : hrefs) {
				// Chapter links are the anchors whose first child is a <strong> title.
				if (a.children().size() == 0) {
					continue;
				}
				Element strong = a.child(0);
				if (!strong.tagName().equalsIgnoreCase("strong")) {
					continue;
				}
				String title = sanitize(strong.text());
				if (title.length() == 0) {
					// An empty name would resolve to dir itself and clobber its catalog.
					continue;
				}
				File f = new File(dir, title);
				if (!f.isDirectory() && !f.mkdirs()) {
					System.err.println("Cannot create chapter directory " + f);
					continue;
				}
				String href = resolve(a, "href");
				bw.write(title + "(" + href + ")\r\n");
				chapters.put(f, href);
			}
			// Flush inside the try so that a write failure is reported to the caller.
			bw.flush();
		} finally {
			closeAndReport(bw);
		}
		return chapters;
	}

	/**
	 * Collects the image URLs of one chapter and writes them to
	 * {@code chapterDir/catalog.txt}. A failure is reported and affects this
	 * chapter only.
	 */
	private void writeChapterCatalog(File chapterDir, String chapterUrl) {
		Set<String> images = new LinkedHashSet<String>();
		this.section(images, chapterUrl);

		OutputStreamWriter osw = null;
		try {
			osw = new OutputStreamWriter(new FileOutputStream(new File(
					chapterDir, CATALOG_NAME)), ENCODING);
			int i = 1;
			for (String src : images) {
				osw.write(i++ + "(" + src + ")\r\n");
			}
			osw.flush();
		} catch (IOException e) {
			System.err.println("Cannot write catalog for " + chapterDir);
			e.printStackTrace();
		} finally {
			closeAndReport(osw);
		}
	}

	/**
	 * Analyses a chapter: follows the pagination links starting at
	 * {@code address} and adds the image URL of every page to {@code set}.
	 * 
	 * @param set
	 *            receives the image URLs in page order
	 * @param address
	 *            URL of the first page of the chapter
	 */
	private void section(Set<String> set, String address) {
		// Remember visited pages so that a pagination cycle cannot loop forever.
		Set<String> visited = new LinkedHashSet<String>();
		String current = address;
		while (current != null && current.length() > 0 && visited.add(current)) {
			current = this.collectPage(set, current);
		}
	}

	/**
	 * Reads one chapter page and records its image.
	 * 
	 * @return absolute URL of the next page, or {@code null} when this is the
	 *         last page or the page could not be processed
	 */
	private String collectPage(Set<String> set, String address) {
		try {
			Document doc = Jsoup.parse(new URL(address), SECTION_TIMEOUT_MS);
			Element div = doc.body().getElementsByClass("textworld").first();
			Element img = div == null ? null : div.getElementsByTag("img").first();
			if (img == null) {
				System.err.println("No image found at " + address);
				return null;
			}
			String src = resolve(img, "src");
			if (src.length() > 0) {
				System.out.println(src);
				set.add(src);
			}

			// The pager is the element just before #pagecount; inside it the
			// <font> marks the current page and its next sibling, when its
			// text looks like "[n]", links to the following page.
			Element pageCount = div.getElementById("pagecount");
			Element links = pageCount == null ? null : pageCount.previousElementSibling();
			Element font = links == null ? null : links.getElementsByTag("font").first();
			Element next = font == null ? null : font.nextElementSibling();
			if (next != null && next.text().matches("\\[\\d+\\]")) {
				return next.absUrl("href");
			}
		} catch (IOException e) {
			System.err.println("Cannot read chapter page " + address);
			e.printStackTrace();
		}
		return null;
	}

	/**
	 * Downloads the images listed in the {@code catalog.txt} of every chapter
	 * folder below {@code dir}.
	 * 
	 * @param dir
	 *            root output directory
	 */
	private void prey(File dir) {
		File[] children = dir.listFiles();
		if (children == null) {
			// listFiles() returns null when dir is not a readable directory.
			System.err.println("Cannot list " + dir);
			return;
		}
		for (File f : children) {
			if (!f.isDirectory()) {
				continue;
			}
			File catalog = new File(f, CATALOG_NAME);
			if (!catalog.exists()) {
				continue;
			}
			boolean listRead = false;
			BufferedReader reader = null;
			try {
				reader = new BufferedReader(new InputStreamReader(
						new FileInputStream(catalog), ENCODING));
				String line;
				while ((line = reader.readLine()) != null) {
					String path = extractUrl(line);
					if (path != null) {
						this.download(f, path);
					}
				}
				listRead = true;
			} catch (IOException e) {
				System.err.println("Cannot read " + catalog);
				e.printStackTrace();
			} finally {
				closeAndReport(reader);
			}
			// Once the list has been read through, rename it so that the folder
			// no longer contains a catalog.txt. Individual download failures
			// do not prevent the rename.
			if (listRead && !catalog.renameTo(new File(f, "catalog"))) {
				System.err.println("Cannot rename " + catalog);
			}
		}
	}

	/**
	 * Downloads one image into {@code targetDir}. Failures are reported and do
	 * not stop the remaining downloads.
	 */
	private void download(File targetDir, String path) {
		String name = fileNameOf(path);
		if (name.length() == 0) {
			System.err.println("No file name in " + path);
			return;
		}
		HttpURLConnection con = null;
		InputStream is = null;
		OutputStream os = null;
		try {
			java.net.URLConnection raw = new URL(path).openConnection();
			if (!(raw instanceof HttpURLConnection)) {
				System.err.println("Not an HTTP URL: " + path);
				return;
			}
			con = (HttpURLConnection) raw;
			// Without timeouts a stalled server would block the whole run.
			con.setConnectTimeout(DOWNLOAD_TIMEOUT_MS);
			con.setReadTimeout(DOWNLOAD_TIMEOUT_MS);
			is = con.getInputStream();
			os = new FileOutputStream(new File(targetDir, name));
			byte[] b = new byte[1024 * 4];
			int l;
			while ((l = is.read(b)) != -1) {
				os.write(b, 0, l);
			}
			os.flush();
			System.out.println(path + " download to "
					+ targetDir.getCanonicalPath() + " complete.");
		} catch (IOException e) {
			System.err.println("Download failed: " + path);
			e.printStackTrace();
		} finally {
			closeAndReport(os);
			closeAndReport(is);
			if (con != null) {
				con.disconnect();
			}
		}
	}

	/**
	 * Extracts the URL from a {@code number(url)} catalog line.
	 * 
	 * @return the URL, or {@code null} for a blank or malformed line
	 */
	private static String extractUrl(String line) {
		int open = line.indexOf("(");
		int close = line.length() - 1;
		if (open < 0 || !line.endsWith(")") || open + 1 >= close) {
			return null;
		}
		return line.substring(open + 1, close);
	}

	/**
	 * Returns the last path segment of {@code path}, ignoring any query string
	 * or fragment.
	 */
	private static String fileNameOf(String path) {
		String clean = path;
		int cut = clean.indexOf('?');
		if (cut >= 0) {
			clean = clean.substring(0, cut);
		}
		cut = clean.indexOf('#');
		if (cut >= 0) {
			clean = clean.substring(0, cut);
		}
		return clean.substring(clean.lastIndexOf("/") + 1);
	}

	/**
	 * Turns a chapter title into a folder name: punctuation becomes
	 * {@code _}, whitespace is dropped, and characters that are not allowed in
	 * Windows file names are replaced by {@code _}.
	 */
	private static String sanitize(String raw) {
		return raw.replaceAll("[\\.\\-\\:：]", "_")
				.replaceAll("\\s", "")
				.replaceAll("[\\\\/*?\"<>|]", "_");
	}

	/**
	 * Returns the attribute as an absolute URL, falling back to the raw
	 * attribute value when it cannot be resolved against the page's base URI.
	 */
	private static String resolve(Element element, String attribute) {
		String absolute = element.absUrl(attribute);
		return absolute.length() > 0 ? absolute : element.attr(attribute);
	}

	/** Closes {@code resource} if it is non-null and reports a failure instead of throwing. */
	private static void closeAndReport(java.io.Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}

fuckcdn

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
用jsoup爬取uuu9的漫画

这两天打算把魔兽世界的官方漫画《王者归来》看看,体会下刀疤男的复仇历程.google后发现uuu9上有中文版,但是只能在线看,每次都要点击图片最大化看,很烦.于是想下载下来看.用firebug查看,发现页面写的还算规范.就是用jsoup解析图片地址,然后用url下载下来.很简单,也不想用多线程了.就这样直接一个一个来吧.毕竟我家里4M的网速不是盖的. package pic;...
复制链接

扫一扫