16年年底,发现了一个叫半次元的Coser网站,看图片一个一个看太麻烦,直接写个爬虫把所有图片记录下来,重新构成一个只有图片的html...
第一部分:界面
package easyspider.menu;
import easyspider.menu.logging.LoggingWriter;
public class MenuSystem {
public static void init(){
System.err.println("EasySprider [版本\t1.2]");
LoggingWriter.printMessage("生成的HTML默认存放在C盘根目录下...");
LoggingWriter.printMessage("如果需要手动设置保存盘符路径,请直接输入盘符:etc(C:)");
changeSavePath();
}
private static void changeSavePath(){
}
}
第二部分:实际体
/**
* @author o.kEnnponN
* 重构时间:2016年11月26日00:26:31
* 1. 在1.0的基础上友好了界面
* 2.
*
*
*
* */
package easyspider;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import easyspider.menu.MenuSystem;
/**
 * Downloads one listing page, extracts the image URLs found in its
 * {@code <img>} tags and writes them into a stand-alone HTML gallery file.
 *
 * <p>Flow: {@link #main()} fetches the page and parses it with jsoup,
 * {@link #analysis(Elements)} filters the image URLs and builds the gallery
 * markup, and {@link #createHtml(String)} stores that markup on disk.
 */
public class CrawSystem {

    /** Page that is requested and scanned for images. */
    private static final String START_URL = "http://bcy.net/coser";

    /** Only {@code <img>} fragments containing this text are kept. */
    private static final String IMAGE_HOST_MARKER = "img9.bcyimg.com/coser";

    /** Encoding used for the generated file; also declared inside the file. */
    private static final String OUTPUT_CHARSET = "UTF-8";

    /**
     * Standard JVM entry point; delegates to {@link #main()} so the class can
     * be launched directly with {@code java easyspider.CrawSystem}.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        main();
    }

    /**
     * Runs one crawl: prints the start-up menu, downloads the start page and
     * generates the gallery. Returns without generating anything when the
     * page could not be downloaded.
     */
    public static void main() {
        MenuSystem.init(); // program initialisation, prints version info
        String html = fetchPage(START_URL);
        if (html == null) {
            // Jsoup.parse rejects null input, so stop here instead of crashing.
            System.err.println("日志:请求失败,未获取到页面内容");
            return;
        }
        Document document = Jsoup.parse(html);
        // Example of a wanted image URL:
        // http://img5.bcyimg.com/editor/flag/1789q/15469140839911e6b4b9bbaf8b79bd63.jpg
        Elements imgs = document.select("img");
        analysis(imgs);
    }

    /**
     * Extracts the image URLs from the given elements, writes a gallery page
     * containing them via {@link #createHtml(String)} and returns the URLs.
     *
     * <p>The gallery is written even when no URL matched; it then contains
     * only the heading.
     *
     * @param tags the {@code <img>} elements of the crawled page; may be
     *             {@code null}, which is treated as "no images"
     * @return the extracted URLs in document order, never {@code null}
     */
    public static List<String> analysis(Elements tags) {
        List<String> newTags = new ArrayList<String>();
        if (tags != null) {
            // Work on the serialised markup, one fragment per <img> tag.
            String[] fragments = tags.toString().split("<img");
            for (int i = 0; i < fragments.length; i++) {
                String url = extractImageUrl(fragments[i]);
                if (url != null) {
                    System.out.println(url);
                    newTags.add(url);
                }
            }
        }

        StringBuilder sb = new StringBuilder();
        // Declare the encoding so the Chinese heading renders on any system.
        sb.append("<meta charset='utf-8'>");
        sb.append("<link href='http://cdn.bootcss.com/bootstrap/2.3.2/css/bootstrap.min.css' rel='stylesheet'>");
        sb.append("<div class='container'>");
        sb.append("<h1>本页面由o.kEnnponN开发的爬虫软件生成 - 软件版本:V1.0 Bata</h1>");
        for (int i = 0; i < newTags.size(); i++) {
            sb.append("<img src='");
            // The URL comes from a remote page: keep it from closing the attribute.
            sb.append(newTags.get(i).replace("'", "&#39;"));
            sb.append("'>");
        }
        sb.append("</div>");
        System.out.println(newTags.size() + "****************************");
        createHtml(sb.toString());
        return newTags;
    }

    /**
     * Writes the given markup to a new, timestamp-named {@code .html} file
     * under {@code C://htmls/}, creating that directory when it is missing.
     * I/O failures are reported on the console and not rethrown.
     *
     * @param imgs the complete HTML text to store
     */
    public static void createHtml(String imgs) {
        File file = new File("C://htmls/" + new SimpleDateFormat("yyyy年MM月dd日HH时mm分ss秒").format(new Date()) + ".html");
        File directory = file.getParentFile();
        if (directory != null && !directory.isDirectory() && !directory.mkdirs()) {
            System.err.println("日志:无法创建目录 ---- " + directory);
            return;
        }
        FileOutputStream output = null;
        BufferedOutputStream buffer = null;
        try {
            byte[] bytes = imgs.getBytes(OUTPUT_CHARSET);
            System.out.println(bytes.length + "--------------------------");
            output = new FileOutputStream(file);
            buffer = new BufferedOutputStream(output);
            buffer.write(bytes);
            buffer.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Closing the buffer also closes the underlying file stream.
            closeQuietly(buffer != null ? buffer : output);
        }
    }

    /**
     * Downloads {@code url} and returns the response body as text.
     *
     * @return the body, or {@code null} when the request failed or the
     *         response carried no entity
     */
    private static String fetchPage(String url) {
        CloseableHttpClient httpclient = HttpClients.createDefault();
        CloseableHttpResponse resp = null;
        try {
            HttpGet get = new HttpGet(url);
            System.out.println("日志:请求 ---- " + get.getURI());
            resp = httpclient.execute(get);
            HttpEntity entity = resp.getEntity();
            System.out.println("---------------------------------------");
            if (entity == null) {
                return null;
            }
            System.out.println("日志:响应长度:---- " + entity.getContentLength());
            System.out.println("日志:响应文档:---- ");
            String html = EntityUtils.toString(entity);
            if (html != null) {
                System.out.println(html.length());
            }
            return html;
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            closeQuietly(resp);
            closeQuietly(httpclient);
        }
    }

    /**
     * Pulls the image URL out of one {@code <img>} fragment.
     *
     * @return the text from the first "http" up to and including the first
     *         following "jpg", or {@code null} when the fragment does not
     *         mention the image host or lacks either of those markers
     */
    private static String extractImageUrl(String fragment) {
        if (fragment.indexOf(IMAGE_HOST_MARKER) == -1) {
            return null;
        }
        // NOTE(review): "2X3" presumably marks a thumbnail variant and "~"
        // the full-size one; kept as in the original, confirm against the site.
        String candidate = fragment.replace("2X3", "~");
        int start = candidate.indexOf("http");
        if (start < 0) {
            return null;
        }
        candidate = candidate.substring(start);
        int extension = candidate.indexOf("jpg");
        if (extension < 0) {
            return null;
        }
        return candidate.substring(0, extension + 3);
    }

    /** Closes {@code resource} if it is non-null, reporting any failure on the console. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
PS:当年Jsoup玩得还没有现在6,所以代码看起来比较2B...