Crawling Biquge Novels with Java Multithreading

This code implements a crawler that scrapes the chapters of a given novel from xbiquge.la. It first looks up the novel's ID for any chapters already stored, then sends an HTTP request to the chapter index page to collect all chapter links. A thread pool processes the chapters in parallel, fetching each one's content and inserting it into the database. The loop repeats until every chapter has been saved, at which point the program exits. Along the way the crawler handles character encoding, HTML parsing, and the database operations.
package com.framework.libInteresting.spider;

import java.sql.Connection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import com.framework.util.db.DbUtil;
import com.framework.util.html.HtmlUtil;
import com.framework.util.map.MapUtil;
import com.framework.util.string.StringUtil;

public class Biquge {
	public static void main(String[] args) throws Exception {
		Connection conn = DbUtil.getBestConn();
		try {
			DbUtil db = new DbUtil(conn);

			String baseUrl = "http://www.xbiquge.la";
			String name = "道君";
			String url = "http://www.xbiquge.la/15/15003/";
			
			// Look up the novel's ID from a previous run; register a new record on the first crawl
			int maxid = NovelUtil.getExistsId(db, url);
			if (maxid <= 0) {
				maxid = NovelUtil.newId(db, url, name);
			}
			
			// Fetch the chapter index page and collect every chapter link
			Map<String, String> heads = SpiderUtil.getParams();
			String category = SpiderUtil.sendGet(url, heads, "utf-8");
			Map<String, String> urlMap = new HashMap<String, String>();
			category = HtmlUtil.find(category, "div", "class=box_con").get(0);
			List<String> dds = HtmlUtil.find(category, "dd");
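			// Each <dd> holds one chapter link: its inner text is the title, its href the relative URL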
			for (String dd : dds) {
				dd = HtmlUtil.innerHTML(dd).trim();
				String title = HtmlUtil.getValue(dd);
				String chaturl = HtmlUtil.propertyValue(dd, "href");
				urlMap.put(chaturl, title);
			}
				
			while (true) {
				ExecutorService pool = Executors.newFixedThreadPool(20);
				// Fetch each chapter concurrently
				for (String purl : urlMap.keySet()) {
					pool.execute(new GetChapter(maxid, baseUrl + purl, urlMap.get(purl), heads));
				}
				pool.shutdown();
				// Poll until every chapter task has finished
				while (!pool.isTerminated()) {
					Thread.sleep(1000);
				}
				
				// Drop every chapter that made it into the database; whatever is left in
				// urlMap failed and is retried on the next pass. MapUtil.getKeys is assumed
				// to return a copy of the key set, so removing entries inside the loop is safe.
				List<String> records = db.getListStr("select title from SPIDER_NOVEL_chapter where id=?", new Object[]{maxid});
				if (urlMap.size() > 0) {
					for (String key : MapUtil.getKeys(urlMap)) {
						String v = urlMap.get(key);
						if (records.contains(v)) {
							urlMap.remove(key);
						}
					}
				}

				if (urlMap.size() == 0) {
					System.out.println(db.queryForInt("select count(1) from spider_novel_chapter where id=?", new Object[]{maxid}));
					break;
				}
			}

		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			DbUtil.close(conn);
		}

	}
}

// Worker task: fetch one chapter and store it, skipping chapters already in the database
class GetChapter implements Runnable {
	int maxid;
	String purl;
	String title;
	Map<String, String> heads = new HashMap<String, String>();

	public GetChapter(int maxid, String purl, String title, Map<String, String> heads) {
		this.maxid = maxid;
		this.purl = purl;
		this.title = title;
		this.heads.putAll(heads);
	}

	@Override
	public void run() {
		Connection conn = DbUtil.getBestConn();
		String content = "";
		try {
			DbUtil db = new DbUtil(conn);
			if (!NovelUtil.chapterExists(db, maxid, title)) {
				// Fetch the chapter page through a proxy, pull out the content div,
				// decode HTML entities, then strip the remaining tags
				content = SpiderUtil.getContentByProxy("biquge", purl, heads, "utf-8");
				content = HtmlUtil.find(content, "div", "id=content").get(0);
				content = HtmlUtil.changeHtmlSymbol(content);
				content = StringUtil.kill(content, "<", ">").trim();
				content = StringUtil.trim(content);
				if (content != null && !"".equals(content)) {
					NovelUtil.insertChapter(db, maxid, title, content);
				}
			}
		} catch (Exception e) {
			// Swallow the failure: the chapter stays in urlMap and is retried on the next pass
		} finally {
			DbUtil.close(conn);
		}
	}
}
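
The main loop above waits for the pool with a manual sleep-and-poll loop. The standard library's ExecutorService.awaitTermination expresses the same wait more directly; here is a minimal, self-contained sketch of that pattern, with a trivial lambda standing in for the GetChapter task:

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class PoolWaitDemo {
	public static void main(String[] args) throws InterruptedException {
		ExecutorService pool = Executors.newFixedThreadPool(20);
		for (int i = 1; i <= 100; i++) {
			final int chapter = i;
			// Stand-in for a GetChapter task
			pool.execute(() -> System.out.println("fetched chapter " + chapter));
		}
		pool.shutdown(); // stop accepting new tasks; queued ones still run
		// Block up to one second at a time until every task has finished
		while (!pool.awaitTermination(1, TimeUnit.SECONDS)) {
			// not done yet; keep waiting
		}
		System.out.println("all tasks finished");
	}
}

awaitTermination blocks for up to the given timeout and returns true once every task submitted before shutdown() has completed, so the loop exits as soon as the work is done instead of a fixed time later.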
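
The cleanup pass relies on the framework's MapUtil.getKeys to copy the key set before removing entries. With only the standard library, an entry-set Iterator achieves the same safe removal without the copy. A sketch, where records plays the role of the titles already saved and the URLs and titles are made-up placeholders:

import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

public class RemoveSavedDemo {
	// Drop every urlMap entry whose title already appears in records
	static void removeSaved(Map<String, String> urlMap, List<String> records) {
		Iterator<Map.Entry<String, String>> it = urlMap.entrySet().iterator();
		while (it.hasNext()) {
			if (records.contains(it.next().getValue())) {
				it.remove(); // structural removal is safe through the iterator
			}
		}
	}

	public static void main(String[] args) {
		Map<String, String> urlMap = new HashMap<>();
		urlMap.put("/15/15003/1.html", "Chapter 1");
		urlMap.put("/15/15003/2.html", "Chapter 2");
		removeSaved(urlMap, Arrays.asList("Chapter 1"));
		System.out.println(urlMap); // only the unsaved chapter remains
	}
}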
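
The post never shows the DDL for SPIDER_NOVEL_chapter, but the three queries imply at least an id column shared by all chapters of a novel, a title used for de-duplication, and a content column. A guessed minimal schema created through plain JDBC, assuming a MySQL database with its driver on the classpath; the connection string, credentials, and column types are all assumptions, not the author's actual setup:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class CreateChapterTable {
	public static void main(String[] args) throws Exception {
		// Connection details are placeholders, not the author's configuration
		try (Connection conn = DriverManager.getConnection(
				"jdbc:mysql://localhost:3306/spider", "user", "password");
			Statement st = conn.createStatement()) {
			st.executeUpdate(
				"CREATE TABLE IF NOT EXISTS SPIDER_NOVEL_chapter ("
				+ " id INT NOT NULL,"             // novel id shared by all of its chapters
				+ " title VARCHAR(255) NOT NULL," // chapter title, used for de-duplication
				+ " content MEDIUMTEXT"           // cleaned chapter text
				+ ")");
		}
	}
}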
