用java抓取一本小说

 自学Python 来写一个爬虫吧 ----> JAVA实现


1.HttpOpener.java 模拟浏览器行为,获取页面信息。

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;

import org.apache.http.Header;
import org.apache.http.HttpHeaders;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.CookieStore;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.cookie.Cookie;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.message.BasicHeader;
import org.apache.http.message.BasicNameValuePair;

import com.alibaba.fastjson.JSONObject;

/**
 * Process-wide HTTP helper that sends browser-like requests and persists
 * cookies to a local file so that later runs can reuse them.
 *
 * <p>The class is a singleton: obtain the instance with {@link #getOpener()}.
 * Every request goes through one shared {@link HttpClient} and one shared
 * {@link HttpClientContext}, which is how cookies are shared between calls.
 *
 * <p>NOTE(review): the shared {@link HttpClientContext} is used by every
 * calling thread at once; confirm against the HttpClient documentation that
 * this is acceptable for the way callers use this class.
 */
public class HttpOpener {
	private static HttpOpener opener = null;
	private static HttpClient httpClient = null;
	private static HttpClientContext httpClientContext = null;
	/** File the cookies are written to and read from (relative to the working directory). */
	private static String cookiefile = "cookie";

	/**
	 * @return the context shared by every request made through this opener
	 */
	public HttpClientContext getHttpClientContext() {
		return httpClientContext;
	}

	/**
	 * Builds the shared client with browser-like default headers and the
	 * cookies saved by a previous run. Private: use {@link #getOpener()}.
	 */
	private HttpOpener() {
		// Default headers copied from a desktop Chrome request.
		List<Header> headers = new ArrayList<Header>();
		headers.add(new BasicHeader(HttpHeaders.ACCEPT, "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"));
		headers.add(new BasicHeader(HttpHeaders.USER_AGENT, "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36"));
		headers.add(new BasicHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate, sdch"));
		headers.add(new BasicHeader(HttpHeaders.CACHE_CONTROL, "max-age=0"));
		headers.add(new BasicHeader(HttpHeaders.CONNECTION, "keep-alive"));
		headers.add(new BasicHeader(HttpHeaders.ACCEPT_LANGUAGE, "zh-CN,zh;q=0.8"));

		// Load the saved cookies exactly once. A missing or unreadable cookie
		// file (e.g. on the first run) simply means starting with no cookies.
		CookieStore cookieStore;
		try {
			cookieStore = readCookie();
		} catch (Exception e) {
			e.printStackTrace();
			cookieStore = new BasicCookieStore();
		}

		// The client and the context share the same store, so cookies set by
		// responses are the ones saveCookie() later writes out.
		httpClientContext = HttpClientContext.create();
		httpClientContext.setCookieStore(cookieStore);
		httpClient = HttpClientBuilder.create()
				.setDefaultCookieStore(cookieStore)
				.setDefaultHeaders(headers)
				.build();
	}

	/**
	 * Returns the single shared instance, creating it on first use.
	 * Synchronized so that concurrent first calls cannot create two instances.
	 *
	 * @return the shared opener
	 */
	public static synchronized HttpOpener getOpener() {
		if(opener==null) {
			opener = new HttpOpener();
		}
		return opener;
	}

	/**
	 * Sends a GET request and saves the cookie store to the cookie file afterwards.
	 *
	 * @param url  target URL
	 * @param data query parameters appended as {@code ?a=b&c=d}; may be {@code null}
	 * @return the raw response; the caller is responsible for consuming its entity
	 * @throws URISyntaxException      if {@code url} is not a valid URI
	 * @throws ClientProtocolException on an HTTP protocol error
	 * @throws IOException             on a connection problem or if the cookie file cannot be written
	 */
	public HttpResponse doGet(String url,Map<String,String> data) throws URISyntaxException, ClientProtocolException, IOException {
		List<NameValuePair> dataList = new ArrayList<NameValuePair>();
		if (data!=null) {
			for (Entry<String, String> entry : data.entrySet()) {
				dataList.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
			}
		}
		URI uri = new URIBuilder(url).addParameters(dataList).build();
		HttpUriRequest httpUriRequest = RequestBuilder.get().setUri(uri).build();
		HttpResponse httpResponse = httpClient.execute(httpUriRequest,httpClientContext);
		saveCookie();
		return httpResponse;
	}

	/**
	 * Sends a POST request and saves the cookie store to the cookie file afterwards.
	 *
	 * <p>For {@code http:} URLs {@code data} is sent as a JSON body; for
	 * {@code https:} URLs it is sent as a URL-encoded form. In both cases the
	 * entity supplies the Content-Type header.
	 *
	 * @param url  target URL, must start with {@code http:} or {@code https:}
	 * @param data values to send; may be {@code null}
	 * @return the raw response; the caller is responsible for consuming its entity
	 * @throws Exception if the URL has another scheme, the request fails, or
	 *                   the cookie file cannot be written
	 */
	public HttpResponse doPost(String url,Map<String,String> data) throws Exception{
		HttpPost httpPost = new HttpPost(url);
		// No explicit content-type header here: each entity below carries its
		// own, and a fixed "application/json" would be wrong for the form body.
		httpPost.setHeader("Connection", "Close");
		if(url.startsWith("http:")) {
			// JSON body
			String jsonObj = JSONObject.toJSONString(data);
			StringEntity entity = new StringEntity(jsonObj, Charset.forName("UTF-8"));
			entity.setContentEncoding("UTF-8");
			entity.setContentType("application/json");
			httpPost.setEntity(entity);
		}else if(url.startsWith("https:")) {
			// URL-encoded form body. The request is executed by the shared
			// client below, i.e. with the JVM's normal certificate checks.
			List<NameValuePair> datalist = new ArrayList<NameValuePair>();
			if(data!=null){
				for (Entry<String, String> entry : data.entrySet()) {
					datalist.add(new BasicNameValuePair(entry.getKey(), entry.getValue()));
				}
			}
			httpPost.setEntity(new UrlEncodedFormEntity(datalist,"UTF-8"));
		}else {
			throw new Exception("Error Url,it should been started with 'http' or 'https'.");
		}
		HttpResponse httpResponse = httpClient.execute(httpPost,httpClientContext);
		// Persist cookies here as well, the same way doGet() does.
		saveCookie();
		return httpResponse;
	}

	/**
	 * Writes every cookie of the shared store to the cookie file, one
	 * {@code Cookie.toString()} line per cookie, replacing the previous file.
	 * Synchronized because requests are issued from several threads and all
	 * of them write the same file.
	 *
	 * @throws IOException if the file cannot be written
	 */
	private synchronized void saveCookie() throws IOException{
		List<Cookie> cookies = httpClientContext.getCookieStore().getCookies();
		StringBuilder sb = new StringBuilder();
		for(Cookie cookie:cookies) {
			sb.append(cookie.toString());
			sb.append("\n");
		}
		// try-with-resources closes the file even when the write fails.
		try (OutputStream writer = new FileOutputStream(new File(cookiefile))) {
			writer.write(sb.toString().getBytes());
		}
	}

	/**
	 * Reads the cookie file written by {@link #saveCookie()} into a new store.
	 *
	 * <p>Each line is expected to look like {@code [key: value][key: value]...};
	 * the keys {@code name}, {@code value}, {@code domain}, {@code path},
	 * {@code version} and {@code expiryDate} are used. Blank lines and lines
	 * without a {@code name} are skipped.
	 *
	 * @return a store holding the saved cookies; empty when there is no cookie file
	 * @throws IOException if the file exists but cannot be read
	 */
	private CookieStore readCookie() throws IOException{
		CookieStore cookieStore = new BasicCookieStore();
		File file = new File(cookiefile);
		if(!file.isFile()) {
			return cookieStore;
		}
		// Cookies that carry an expiry are given a fresh one Integer.MAX_VALUE
		// milliseconds from now.
		Long time = new Date().getTime()+Integer.MAX_VALUE;
		try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
			String text;
			while((text = reader.readLine())!=null) {
				if(text.isEmpty()) {
					continue;
				}
				Map<String,String> ckmap = new HashMap<String,String>();
				for (String str:text.replace("[","").split("]")) {
					String[] pair = str.split(": ", 2);
					if(str.length()>5 && pair.length==2) {
						ckmap.put(pair[0],pair[1]);
					}
				}
				if(ckmap.get("name")==null) {
					continue;// not a cookie line
				}
				BasicClientCookie cookie = new BasicClientCookie(ckmap.get("name"), ckmap.get("value"));
				cookie.setDomain(ckmap.get("domain"));
				cookie.setPath(ckmap.get("path"));
				String version = ckmap.get("version");
				if(version!=null) {
					try {
						cookie.setVersion(Integer.parseInt(version));
					} catch (NumberFormatException ignored) {
						// unparsable version: keep the cookie's default version
					}
				}
				// NOTE(review): confirm that "expiryDate" is the label
				// Cookie.toString() actually writes; if the label differs,
				// restored cookies never get an expiry date.
				cookie.setExpiryDate((ckmap.get("expiryDate")==null)?null:(new Date(time)));
				cookieStore.addCookie(cookie);
			}
		}
		return cookieStore;
	}

	/**
	 * Creates an {@link SSLContext} whose trust manager accepts every
	 * certificate, i.e. one that disables HTTPS certificate verification.
	 *
	 * <p>Not called anywhere in this class at the moment. Wiring it into a
	 * client removes protection against man-in-the-middle attacks, so do that
	 * only for hosts you deliberately choose not to verify.
	 *
	 * @return a context that trusts all certificates
	 * @throws NoSuchAlgorithmException if the JVM offers no TLS implementation
	 * @throws KeyManagementException   if the context cannot be initialised
	 */
	private static SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
		// "TLS" rather than "SSLv3": SSLv3 is obsolete and disabled by default in current JVMs.
		SSLContext sc = SSLContext.getInstance("TLS");

		// Trust manager whose checks never throw, so every chain is accepted.
		X509TrustManager trustManager = new X509TrustManager() {
			public void checkClientTrusted(
					java.security.cert.X509Certificate[] paramArrayOfX509Certificate,
					String paramString) throws CertificateException {
			}

			public void checkServerTrusted(
					java.security.cert.X509Certificate[] paramArrayOfX509Certificate,
					String paramString) throws CertificateException {
			}

			public java.security.cert.X509Certificate[] getAcceptedIssuers() {
				// The interface contract asks for a non-null (possibly empty) array.
				return new java.security.cert.X509Certificate[0];
			}
		};

		sc.init(null, new TrustManager[] { trustManager }, null);
		return sc;
	}
}

    这里需要的就是doGet方法,获取页面。构造方法可以再添加代理……(日后再说)

    接下来就是对页面获取的内容进行处理,得到需要的部分,这里我创建了一个工具类来完成部分操作:

package Utils;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.http.conn.HttpHostConnectException;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.jsoup.Jsoup;
import org.jsoup.select.Elements;

/**
 * Static helpers for the crawler: writing bytes/streams to files, reading
 * streams into strings, converting XML to nested maps, normalising URLs and
 * extracting text or attribute values from a fetched page.
 */
public class Utils {
	// Characters that are common in the scraped (Chinese) text and that do not
	// appear when the page was decoded with the wrong charset. More entries
	// make the check more reliable; likelier ones first make it faster.
	private static final String[] strs= {"章","了","。","一","二","三","四","五","六","七","八","九","十","百","千","我"};
	/** How many times a failed connection is retried before giving up. */
	private static final int MAX_CONNECT_RETRIES = 5;

	/**
	 * Writes a byte array to a file, replacing any existing content.
	 *
	 * @param bs   bytes to write
	 * @param path destination path
	 * @return {@code path}
	 * @throws IOException if the file cannot be written
	 */
	public static String WriteInFile(byte[] bs,String path) throws IOException {
		try (FileOutputStream writer = new FileOutputStream(new File(path))) {
			writer.write(bs);
		}
		return path;
	}

	/**
	 * Copies an input stream to a file, replacing any existing content, and
	 * closes the stream.
	 *
	 * @param is   stream to copy; closed when this method returns
	 * @param path destination path
	 * @return {@code path}
	 * @throws IOException if reading or writing fails
	 */
	public static String WriteInFile(InputStream is,String path) throws IOException {
		try (InputStream in = is;
				FileOutputStream writer = new FileOutputStream(new File(path))) {
			// Copy in chunks; a single read() is not guaranteed to return everything.
			byte[] buffer = new byte[8192];
			int len;
			while ((len = in.read(buffer)) != -1) {
				writer.write(buffer, 0, len);
			}
		}
		return path;
	}

	/**
	 * Reads a stream into a string using the platform default charset.
	 * Line terminators are dropped: the lines are concatenated directly.
	 *
	 * @param in stream to read; closed when this method returns
	 * @return the concatenated lines
	 * @throws IOException if reading fails
	 */
	public static String Stream2String(InputStream in) throws IOException {
		StringBuilder sb = new StringBuilder();
		try (InputStream source = in;
				BufferedReader reader = new BufferedReader(new InputStreamReader(source))) {
			String line;
			while((line=reader.readLine())!=null) {
				sb.append(line);
			}
		}
		return sb.toString();
	}

	/**
	 * Reads a stream into a string using the given charset. Lines starting
	 * with {@code <!} (such as {@code <!DOCTYPE html>}) are skipped and line
	 * terminators are dropped.
	 *
	 * @param in   stream to read; closed when this method returns
	 * @param code charset name
	 * @return the concatenated lines
	 * @throws IOException if reading fails or the charset is unsupported
	 */
	public static String Stream2String(InputStream in,String code) throws IOException {
		StringBuilder sb = new StringBuilder();
		try (InputStream source = in;
				BufferedReader reader = new BufferedReader(new InputStreamReader(source,code))) {
			String line;
			while((line=reader.readLine())!=null) {
				if(line.startsWith("<!")){
					continue;
				}
				sb.append(line);
			}
		}
		return sb.toString();
	}

	/**
	 * Parses an XML string into nested maps keyed by element name.
	 *
	 * <p>A leaf element maps to its trimmed text, an element with children
	 * maps to a map of those children, and children that share a name are
	 * collected into a {@link List}.
	 *
	 * <p>NOTE(review): the text is parsed with dom4j's default settings; if
	 * the XML can come from an untrusted source, check whether DTDs/external
	 * entities need to be disabled.
	 *
	 * @param str XML text
	 * @return a map with the root element as its single entry, or {@code null}
	 *         if the text cannot be parsed
	 */
	public static Map<String,Object> XML2Map(String str){
		Document doc = null;
		try {
			doc = DocumentHelper.parseText(str);
		} catch (DocumentException e) {
			e.printStackTrace();
		}
		if (doc == null){
			return null;
		}
		Map<String, Object> map = new HashMap<String, Object>();
		element2map(doc.getRootElement(),map);
		return map;
	}

	/**
	 * Recursively stores {@code outele} in {@code outmap} under its element name.
	 *
	 * @param outele element to convert
	 * @param outmap map that receives the entry for {@code outele}
	 * @return {@code outmap}
	 */
	@SuppressWarnings("unchecked")
	private static Map<String, Object> element2map(Element outele, Map<String, Object> outmap) {
		List<Element> children = outele.elements();
		if(children.isEmpty()){
			// Leaf element: store its text.
			outmap.put(outele.getName(), outele.getTextTrim());
			return outmap;
		}
		Map<String, Object> innermap = new HashMap<String, Object>();
		for(Element childEle : children){
			String eleName = childEle.getName();
			Object existing = innermap.get(eleName);
			if(existing == null){
				// First child with this name: store it directly.
				element2map(childEle,innermap);
				continue;
			}
			// Repeated name: convert the child into a scratch map so the
			// values collected so far are not overwritten.
			Map<String, Object> scratch = new HashMap<String, Object>();
			element2map(childEle,scratch);
			Object value = scratch.get(eleName);
			if(existing instanceof List){
				((List<Object>) existing).add(value);
			}else{
				// Second occurrence (leaf text or nested map): start a list.
				List<Object> values = new ArrayList<Object>();
				values.add(existing);
				values.add(value);
				innermap.put(eleName, values);
			}
		}
		outmap.put(outele.getName(), innermap);
		return outmap;
	}

	/**
	 * Turns a link found on a page into a complete URL.
	 *
	 * <ul>
	 * <li>{@code http://...} / {@code https://...} is returned unchanged;</li>
	 * <li>{@code /en/index.html} is appended to {@code Baseurl} without its
	 *     leading slash, so {@code Baseurl} is expected to end with {@code /};</li>
	 * <li>anything else ({@code page.html}, {@code ./page.html},
	 *     {@code sub/page.html}, {@code ../../index.html}) is resolved against
	 *     the directory of {@code Lasturl}, going up one directory per
	 *     leading {@code ../}.</li>
	 * </ul>
	 *
	 * @param Baseurl site root
	 * @param Lasturl URL of the page the link was found on
	 * @param url     link to convert
	 * @return the complete URL
	 * @throws Exception if {@code Lasturl} has too few {@code /} separators
	 *                   to resolve the link against
	 */
	public static String Url2Url(String Baseurl,String Lasturl,String url) throws Exception{
		if(url.startsWith("http://")||url.startsWith("https://")){
			return url;
		}
		if(url.startsWith("/")){
			return Baseurl + url.substring(1);
		}
		// Count literal "../" prefixes. (String.split("../") would treat the
		// argument as a regex in which '.' matches any character.)
		String rest = url;
		int levelsUp = 0;
		while(true){
			if(rest.startsWith("../")){
				rest = rest.substring(3);
				levelsUp++;
			}else if(rest.startsWith("./")){
				rest = rest.substring(2);
			}else{
				break;
			}
		}
		// Drop the file name of the current page, then one directory per "../".
		String dir = Lasturl;
		for(int i=0;i<=levelsUp;i++){
			int slash = dir.lastIndexOf("/");
			if(slash<0){
				throw new Exception("Error Url");
			}
			dir = dir.substring(0,slash);
		}
		return dir +"/"+ rest;
	}

	/**
	 * Converts every entry of {@code urls} in place with
	 * {@link #Url2Url(String, String, String)}. Entries that cannot be
	 * converted are left unchanged.
	 *
	 * @param Baseurl site root
	 * @param Lasturl URL of the page the links were found on
	 * @param urls    links to convert; modified in place
	 * @return {@code urls}
	 */
	public static List<String> Url2Url(String Baseurl,String Lasturl,List<String> urls){
		for(int i=0;i<urls.size();i++){
			try{
				urls.set(i, Url2Url(Baseurl, Lasturl, urls.get(i)));
			}
			catch(Exception ignored){
				// best effort: keep the original entry
			}
		}
		return urls;
	}

	/**
	 * Fetches {@code url} and collects one string per element matching {@code css}.
	 *
	 * <p>The page bytes are decoded with several charsets in turn
	 * ({@code code} first when given) until at least one value is collected,
	 * because decoding with a wrong charset does not fail by itself. Unless
	 * {@code notest} is set, a value is only accepted if it contains one of
	 * the characters in {@code strs}.
	 *
	 * @param opener opener used to fetch the page
	 * @param url    page URL
	 * @param code   charset to try first, or {@code null}
	 * @param css    CSS selector of the elements to visit
	 * @param child  selector evaluated inside each match (first hit is used);
	 *               {@code null} or empty to use the match itself
	 * @param value  attribute to read, or {@code null} to read the element text
	 * @param notest {@code true} to accept values without the charset plausibility check
	 * @return the collected values; a list holding two empty strings if the
	 *         page could not be fetched or parsed
	 */
	public static List<String> Url2Get(HttpOpener opener,String url,String code,String css,String child,String value,boolean notest) {
		return Url2Get(opener, url, code, css, child, value, notest, MAX_CONNECT_RETRIES);
	}

	/** Implementation of the public single-selector Url2Get with a bounded number of connection retries. */
	private static List<String> Url2Get(HttpOpener opener,String url,String code,String css,String child,String value,boolean notest,int retriesLeft) {
		List<String> list = new ArrayList<>();
		try {
			// Keep the raw bytes: a stream can be read only once, but every
			// candidate charset needs the whole page.
			byte[] page = readAll(opener.doGet(url, null).getEntity().getContent());
			// Common charsets first for speed; narrower ones first to reduce false hits.
			String[] basecodes = {"UTF-8","GBK","ASCII","ISO-8859-1","GB2312","Unicode"};
			String[] codes;
			if(code==null){
				codes = basecodes;
			}else{
				codes = new String[basecodes.length+1];
				System.arraycopy(basecodes, 0, codes, 1, basecodes.length);
				codes[0] = code;
			}
			// Base URI for jsoup; empty rather than null when the URL has no usable '/'.
			int slash = url.lastIndexOf('/');
			String baseUri = slash>0 ? url.substring(0,slash) : "";
			for(String icode:codes){
				org.jsoup.nodes.Document doc = Jsoup.parse(new ByteArrayInputStream(page), icode, baseUri);
				Elements es = doc.select(css);
				for(org.jsoup.nodes.Element e : es){
					org.jsoup.nodes.Element one;
					if(child!=null&&!child.equals("")){
						one = e.selectFirst(child);
					}else{
						one = e;
					}
					if(one==null){
						continue;
					}
					String found = (value == null) ? one.text() : one.attr(value);
					if(notest||testlist(found)){
						list.add(found);
					}
				}
				if(!list.isEmpty()){
					break;// this charset produced something usable
				}
			}
		} catch (HttpHostConnectException e) {
			// Connection failed: wait a random time (up to 20 s) and retry,
			// but only a limited number of times.
			if(retriesLeft<=0){
				e.printStackTrace();
				list.add("");
				list.add("");
				return list;
			}
			try {
				Thread.sleep((int)(20000*Math.random()));
			} catch (InterruptedException e1) {
				e1.printStackTrace();
				Thread.currentThread().interrupt();
				list.add("");
				list.add("");
				return list;
			}
			return Url2Get(opener, url, code, css, child, value, notest, retriesLeft-1);
		}catch(Exception e){
			e.printStackTrace();
			// Placeholders so that callers indexing into the result do not fail.
			list.add("");
			list.add("");
		}
		return list;
	}

	/** Reads a stream to its end and closes it. */
	private static byte[] readAll(InputStream in) throws IOException {
		try {
			ByteArrayOutputStream copy = new ByteArrayOutputStream();
			byte[] buffer = new byte[1024];
			int len;
			while ((len = in.read(buffer)) > -1 ) {
				copy.write(buffer, 0, len);
			}
			return copy.toByteArray();
		} finally {
			in.close();
		}
	}

	/**
	 * Runs the single-selector {@code Url2Get} once per index of the given
	 * arrays, so several things can be extracted from one page. Note that the
	 * page is fetched again for every index.
	 *
	 * @param opener opener used to fetch the page
	 * @param url    page URL
	 * @param code   charset to try first, or {@code null}
	 * @param css    selectors, one per extraction
	 * @param child  child selectors, parallel to {@code css}
	 * @param value  attribute names, parallel to {@code css}
	 * @param notest plausibility-check switches, parallel to {@code css}
	 * @return one result list per index, or {@code null} if the arrays differ in length
	 */
	public static List<List<String>> Url2Get(HttpOpener opener, String url,String code, String[] css, String[] child, String[] value,boolean[] notest) {
		if(css.length!=value.length||css.length!=child.length||css.length!=notest.length){
			return null;
		}
		List<List<String>> list = new ArrayList<>();
		for(int i=0;i<css.length;i++){
			list.add(Url2Get(opener, url, code, css[i], child[i], value[i],notest[i]));
		}
		return list;
	}

	/**
	 * Plausibility check for decoded text.
	 *
	 * @param string decoded text
	 * @return {@code true} if the text contains at least one entry of {@code strs}
	 */
	private static boolean testlist(String string){
		for(String str:strs){
			if(string.contains(str)){// a hit at index 0 counts too
				return true;
			}
		}
		return false;
	}
}

第三步,编写多线程部分:

    连接目标站点获取页面使用多个线程获取,内容存储到SQLite数据库时使用单个线程写入(使用lock)

import java.sql.Connection;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.List;
import java.util.concurrent.locks.ReentrantLock;

import xywi.util.HttpOpener;
import xywi.util.Utils;

/**
 * One download task: fetches a single chapter page, extracts its title and
 * text and inserts them as a row into the table {@code txt<bookno>}.
 *
 * <p>Fetching runs concurrently across tasks; the database insert is
 * serialized through a lock shared by all tasks.
 */
public class ThreadWork extends Thread {
	// One lock for ALL instances. A lock created per instance would never be
	// contended and therefore would not serialize anything.
	private static final ReentrantLock lock = new ReentrantLock();
	private Connection con = null;
	private HttpOpener opener = null;
	private String url = null;
	private String code = null;
	private String[] css = null;
	private String[] value = null;
	private String bookno = null;
	private String[] child = null;
	private boolean[] notest = null;

	/**
	 * @param con    open database connection shared by all tasks
	 * @param opener opener used to fetch the page
	 * @param url    chapter URL; the part between the last {@code /} and the
	 *               last {@code .} must be an integer (used as the row's url column)
	 * @param code   charset to try first, or {@code null}
	 * @param css    selectors: index 0 locates the title, index 1 the text
	 * @param child  child selectors, parallel to {@code css}
	 * @param value  attribute names, parallel to {@code css}
	 * @param notest plausibility-check switches, parallel to {@code css}
	 * @param bookno book number, used as suffix of the table name
	 */
	public ThreadWork(Connection con, HttpOpener opener, String url,
			String code, String[] css, String[]child, String[] value, boolean[] notest,String bookno) {
		super();
		this.con = con;
		this.opener = opener;
		this.url = url;
		this.code = code;
		this.css = css;
		this.child  = child;
		this.value = value;
		this.notest = notest;
		this.bookno = bookno;
	}

	/**
	 * Downloads the chapter and stores it. Everything up to the insert may run
	 * in parallel with other tasks.
	 */
	@Override
	public void run(){
		Integer urlnum = Integer.parseInt(url.substring(url.lastIndexOf("/")+1, url.lastIndexOf(".")));
		System.out.println("Downloading "+url+" ...");
		List<List<String>> list = Utils.Url2Get(opener, url, code, css, child, value, notest);
		// Nothing matched (or the selector arrays were inconsistent): there is
		// no title/text to store, so skip instead of failing on list.get(...).
		if(list==null||list.size()<2||list.get(0).isEmpty()||list.get(1).isEmpty()){
			System.err.println("Nothing extracted from "+url+", skipped.");
			return;
		}
		System.out.println("One Get!");
		// The quote replacement is kept so stored rows look the same as rows
		// written before the insert was parameterized.
		String title = list.get(0).get(0).replace("\"", "“");
		String text = list.get(1).get(0).replace("\"", "“").replace("比如广告之类的东西", "");
		// Values are bound as parameters; only the table name is concatenated,
		// because an identifier cannot be a bind parameter.
		String sql = "insert into txt"+bookno+" values(?,?,?);";
		lock.lock();// from here on only one task at a time
		try (java.sql.PreparedStatement statement = this.con.prepareStatement(sql)) {
			statement.setInt(1, urlnum);
			statement.setString(2, title);
			statement.setString(3, text);
			statement.executeUpdate();
		}catch(SQLException e){
			e.printStackTrace();
		}finally{
			lock.unlock();
		}
	}
}

对于单核CPU运行计算密集型多线程,性能会下降,但对于IO密集型多线程(本例),性能会上升(限制你的是网速和对方网站的响应速度,当然你要是开9999个线程当我没说……)。

最后就是程序的入口和逻辑了

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.List;


import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.sqlite.SQLiteException;

import xywi.util.HttpOpener;
import xywi.util.Utils;

/**
 * Program entry point: downloads every chapter of one book into a SQLite
 * table and then writes the chapters, ordered by chapter number, to
 * {@code <bookno>.txt}.
 */
public class Main {
	static String jdbc = "org.sqlite.JDBC";// SQLite JDBC driver class
	static String url = "jdbc:sqlite:local.db";// SQLite database file
	/**
	 * @param args {@code args[0]} is the book number
	 * @throws Exception on any network, database or file error
	 */
	public static void main(String[] args) throws Exception {
		if(args.length<1){
			System.err.println("Usage: java -jar FileName.jar bookNo");
			return;
		}
		String bookno = args[0];
		HttpOpener opener = HttpOpener.getOpener();
		final String Baseurl = "";// site home page / root path
		String indexurl = Baseurl +"Book/"+bookno+"/Index.aspx";// the book's introduction page
		// Link to the chapter-list page: opener, page, charset, element to
		// locate, child to take, attribute to read, no plausibility check.
		List<String> list = Utils.Url2Get(opener, indexurl, "GBK", "div#CrbsButton", "a", "href",true);
		if(list.isEmpty()){
			System.err.println("No chapter list link found on "+indexurl);
			return;
		}
		String listurl = Utils.Url2Url(Baseurl, indexurl, list.get(0));// only the first hit is used
		// One URL per chapter.
		list = Utils.Url2Url(Baseurl,listurl,Utils.Url2Get(opener, listurl, "GBK", "div#BookText ul li", "a", "href",true));
		Class.forName(jdbc);
		try (Connection con = DriverManager.getConnection(url);
				Statement statement = con.createStatement()) {
			// Download only if the table holds fewer rows than there are chapters.
			boolean restart = false;
			try (ResultSet rs = statement.executeQuery("select count(1) as sum from txt"+bookno+" ;")) {
				rs.next();
				restart = Integer.parseInt(rs.getString("sum"))<list.size();
			} catch (SQLiteException e) {// the table does not exist yet
				restart = true;
			}
			if(restart){
				statement.executeUpdate("drop table if exists txt"+bookno+";");
				statement.executeUpdate("create table txt"+bookno+"(url Integer not null,title text not null,txt text not null);");
				ExecutorService server = Executors.newFixedThreadPool(64);
				for(String chapterUrl:list){
					server.execute(new ThreadWork(con, opener, chapterUrl, null, new String[]{"div#TextTitle","div#BookTextt"},new String[]{"span",null},new String[]{null,null},new boolean[]{false,false},bookno));
				}
				server.shutdown();// accept no further tasks
				// Block until every task has finished instead of spinning on isTerminated().
				server.awaitTermination(Long.MAX_VALUE, java.util.concurrent.TimeUnit.NANOSECONDS);
				System.out.println("All Download Over!");
			}
			// NOTE(review): the output file is opened in append mode, so a
			// second run for the same book appends the whole book again;
			// confirm whether that is intended.
			try (ResultSet rs = statement.executeQuery("select title,txt,url from txt"+bookno+" order by url ASC;");
					BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(new File(bookno+".txt"),true))) {
				while(rs.next()){
					String title = rs.getString("title") + "\n";
					bos.write(title.getBytes());
					String text = rs.getString("txt") + "\n\n";
					bos.write(text.getBytes());
				}
			}
		}
		System.out.println("All Over!");
	}

}

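    这里使用了 java.util.concurrent 下的线程池来执行多线程任务。Executors.newFixedThreadPool(64) 创建的是固定 64 个工作线程的线程池,它限定了同时运行的线程数量,并且这些工作线程会被重复利用:提交进去的 ThreadWork 只是被当作任务(Runnable)交给工作线程执行,并不会各自启动成一个独立的线程。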

    在python版本里,我是将章节连接分给了n个线程,然后所有线程互不干扰运行(操作数据库那儿除外),然后等到全部结束后,主线程继续;

    在java版本里,程序为每一章节创建了一个 ThreadWork 任务(即每一章节一个任务)并提交给线程池,但这些任务受线程池的限制,同一时间只有线程池大小那么多个任务在运行(数据库操作除外),其他的任务都在队列里等待,每当有一个任务运行结束,空出来的工作线程就会从队列里取出下一个任务开始运行。而在这个过程中,主线程提交完全部任务后会一直等到线程池里的任务全部结束,然后才继续往下执行。

将项目打包Jar 后 java -jar FileName.jar bookNo 即可。(路人:别人写了半年,你十分钟down下来了,好么?!  我:表打我……)

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值