java 爬虫数据

最新推荐文章于 2024-06-26 14:44:53 发布
Vincebran
最新推荐文章于 2024-06-26 14:44:53 发布
阅读量223
点赞数
文章标签： java
本文链接：https://blog.csdn.net/Vincebran/article/details/106574758
版权
package net.aykj.util;

import java.io.File;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.servlet.ServletContext;

import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;

import net.aykj.pojo.Annex;
import net.aykj.pojo.Article;
import net.aykj.pojo.Rule;
import net.aykj.service.AnnexService;
import net.aykj.service.ArticleService;
import net.aykj.service.RuleService;
import sun.security.x509.GeneralName;

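/**
 * Crawler ("spider") thread that collects articles for one rule set.
 * Update by Bingyong.Wang: when the detail page shows no publish time, the time from the list page is used.
 */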
@SuppressWarnings({"unchecked", "unused"})
public class SpiderThreadUtil extends Thread {
	
	// Stop flag. It is written through clear()/setStop() and polled by the crawl loop in run();
	// volatile guarantees the crawling thread sees a stop request issued from another thread.
	private volatile boolean stop = false;
	// Collaborators and identifiers handed in through the constructor.
	private RuleService ruleService = null;
	private ArticleService articleService = null;
	private AnnexService annexService =null;
	private ServletContext servletContext = null;
	private Integer rulesId = null;
	private String rulesName = null;
	
	/**
	 * Creates a spider thread; the arguments are only stored, no work happens until the thread runs.
	 *
	 * @param ruleService    service used to load the rules of the rule set
	 * @param articleService service used to look up and save articles
	 * @param annexService   service used to save attachments
	 * @param servletContext context in which the progress counters are published
	 * @param rulesName      name of the rule set; used as prefix of the counter attribute names
	 * @param rulesId        id of the rule set whose rules are loaded in {@link #run()}
	 */
	public SpiderThreadUtil(
			RuleService ruleService,
			ArticleService articleService,
			AnnexService annexService,
			ServletContext servletContext,
			String rulesName,
			Integer rulesId) {
		super();
		this.rulesName = rulesName;
		this.rulesId = rulesId;
		this.servletContext = servletContext;
		this.annexService = annexService;
		this.articleService = articleService;
		this.ruleService = ruleService;
	}
	
	/**
	 * Thread entry point: loads the rules of the rule set, collects the detail-page links of every
	 * rule, publishes the total under {@code <rulesName>_totalCount} and then crawls the articles.
	 * <p>
	 * A rule for which no link list is returned contributes zero links instead of aborting the run.
	 * When a stop is detected while links are being collected, the method returns immediately so
	 * the counters that were just removed are not re-created.
	 * Any exception is printed and ends the run.
	 */
	@Override
	public void run() {
		try {
			// Names of the servlet-context attributes that expose the progress counters.
			String totalCount = rulesName + "_totalCount";
			// number of articles collected in this run
			String getCount = rulesName + "_getCount";
			// number of articles that already existed
			String gotCount = rulesName + "_gotCount";
			String errorCount = rulesName + "_errorCount";

			List<Rule> ruleList = ruleService.queryRuleListByRulesId(rulesId, true);
			if (ruleList == null || ruleList.isEmpty()) {
				return;
			}

			List<Rule> newRuleList = new ArrayList<Rule>();
			int total = 0;
			for (Rule rule : ruleList) {
				// Detail-page links; a link may carry a list-page time appended after "VT".
				List<String> viewUrlList = queryViewUrlList(rule, errorCount);
				if (viewUrlList == null) {
					// No list URL configured for this rule: treat as "nothing to crawl".
					viewUrlList = new ArrayList<String>();
				}

				System.out.println("翻转的详情连接======" + viewUrlList);

				total = total + viewUrlList.size();
				System.out.println("-----------------------"+rule.getNewsClassId()+"栏目，查到" + total + "篇---------------------");

				// links that still have to be crawled for this rule
				rule.setViewUrlList(viewUrlList);
				newRuleList.add(rule);

				// A stop request already cleared the counters; do not publish or crawl anything.
				if (isStop(totalCount, getCount, gotCount, errorCount)) {
					return;
				}
			}

			// total number of articles that will be attempted
			servletContext.setAttribute(totalCount, total);

			catchArticles(newRuleList, totalCount, getCount, gotCount, errorCount);

			System.out.println("----------------------- 数据采集完成，共采集到" + total + "篇 ----------------------");
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
	
	/**
	 * Crawls every detail-page link attached to the given rules.
	 * <p>
	 * A link entry is either a plain link or {@code link + "VT" + time}, where the time was taken
	 * from the list page. The link and time are derived per entry, so nothing carries over from a
	 * previous entry, and an entry without "VT" is crawled with its own link.
	 *
	 * @param newRuleList rules whose {@code viewUrlList} has been filled
	 * @param totalCount  attribute name of the total counter
	 * @param getCount    attribute name of the "collected" counter
	 * @param gotCount    attribute name of the "already existed" counter
	 * @param errorCount  attribute name of the error counter
	 * @throws Exception propagated from fetching, parsing or saving an article
	 */
	private void catchArticles(List<Rule> newRuleList, String totalCount, String getCount, String gotCount, String errorCount) throws Exception {
		for (Rule rule : newRuleList) {
			List<String> viewUrlList = (List<String>) rule.getViewUrlList();
			if (viewUrlList == null) {
				continue;
			}
			for (String viewUrl : viewUrlList) {
				// stop requested: nothing further is crawled for any rule
				if (isStop(totalCount, getCount, gotCount, errorCount)) {
					return;
				}

				String linkPart = viewUrl;
				String createtime = null;
				// The time is appended last, so the last "VT" is the separator.
				int separatorAt = viewUrl.lastIndexOf("VT");
				if (separatorAt > 0) {
					linkPart = viewUrl.substring(0, separatorAt);
					createtime = viewUrl.substring(separatorAt + "VT".length());
				}
				// strip a surrounding href="..." wrapper left by the link regex
				String detailUrl = linkPart.replaceAll("href=\"|\"", "");
				System.out.println(detailUrl);

				if (GeneralUtil.isNotNull(detailUrl) && GeneralUtil.isNotNull(createtime)) {
					// link plus list-page time
					catchArticleByViewUrlAndCreatetime(rule, detailUrl, createtime, totalCount, getCount, gotCount, errorCount);
				} else {
					catchArticle(rule, detailUrl, totalCount, getCount, gotCount, errorCount);
				}
			}
		}
	}
	

	/**
	 * Crawls one detail page; the publish time is extracted from the detail page itself.
	 *
	 * @param rule       rule providing the regexes, encoding, time format and target ids
	 * @param viewUrl    detail-page URL
	 * @param totalCount attribute name of the total counter (not used here, kept for the signature)
	 * @param getCount   attribute name of the "collected" counter
	 * @param gotCount   attribute name of the "already existed" counter
	 * @param errorCount attribute name of the error counter
	 * @throws Exception propagated from fetching, date parsing or saving
	 */
	private void catchArticle(Rule rule, String viewUrl, String totalCount, String getCount, String gotCount, String errorCount) throws Exception {
		crawlDetailPage(rule, viewUrl, null, false, getCount, gotCount, errorCount);
	}

	/**
	 * Crawls one detail page; the publish time was already taken from the list page.
	 *
	 * @param rule           rule providing the regexes, encoding, time format and target ids
	 * @param viewUrl        detail-page URL
	 * @param createtimeTemp publish time text taken from the list page
	 * @param totalCount     attribute name of the total counter (not used here, kept for the signature)
	 * @param getCount       attribute name of the "collected" counter
	 * @param gotCount       attribute name of the "already existed" counter
	 * @param errorCount     attribute name of the error counter
	 * @throws Exception propagated from fetching, date parsing or saving
	 * Bingyong.Wang at 2019-12-05
	 */
	private void catchArticleByViewUrlAndCreatetime(Rule rule, String viewUrl, String createtimeTemp, String totalCount, String getCount, String gotCount, String errorCount) throws Exception {
		crawlDetailPage(rule, viewUrl, createtimeTemp, true, getCount, gotCount, errorCount);
	}

	/**
	 * Shared implementation of both entry points: fetch, extract fields, skip duplicates, save the
	 * article and the first PDF attachment, and update the counters.
	 *
	 * @param listCreatetime publish time text from the list page; only read when {@code timeFromList}
	 * @param timeFromList   true for the list-page-time variant, false for the detail-page-time variant
	 */
	private void crawlDetailPage(Rule rule, String viewUrl, String listCreatetime, boolean timeFromList,
			String getCount, String gotCount, String errorCount) throws Exception {
		String encode = rule.getEncode();
		encode = encode == null ? "UTF-8" : encode;
		String content = HttpUtil.get(viewUrl, encode);
		if ("HTTP/1.1 404 Not Found".equals(content)) {
			System.out.println("详细页链接:"+viewUrl+",访问404,跳过");
			this.addErrorCount(errorCount);
			return;
		}

		Integer subsiteId = rule.getSubsiteId();
		Integer newsClassId = rule.getNewsClassId();

		// All six extractions run in both variants: extractStrByPattern bumps the error counter
		// whenever a configured regex finds nothing, and that accounting is kept as it was.
		List<String> titleList = this.extractStrByPattern(content, rule.getTitleRegex(), errorCount, true, rule.getTitleFilterRegex());
		List<String> authorList = this.extractStrByPattern(content, rule.getAuthorRegex(), errorCount, true, rule.getAuthorFilterRegex());
		List<String> createtimeList = this.extractStrByPattern(content, rule.getCreatetimeRegex(), errorCount, true, rule.getCreatetimeFilterRegex());
		List<String> sourceList = this.extractStrByPattern(content, rule.getSourceRegex(), errorCount, true, rule.getSourceFilterRegex());
		List<String> hitsList = this.extractStrByPattern(content, rule.getHitsRegex(), errorCount, true, rule.getHitsFilterRegex());
		List<String> contentList = this.extractStrByPattern(content, rule.getContentRegex(), errorCount, false, rule.getContentFilterRegex());

		String title = titleList != null && titleList.size() > 0 ? titleList.get(0).trim() : null;
		// no title means nothing usable was extracted: count as error and skip
		if (!GeneralUtil.isNotNull(title)) {
			System.out.println("详细页链接:"+viewUrl+",采集到的标题为空");
			this.addErrorCount(errorCount);
			return;
		}

		String timeText;
		String source;
		if (timeFromList) {
			timeText = GeneralUtil.isNull(listCreatetime) ? null : listCreatetime;
			// source text sits between the "来源：" label and "查看"
			source = resolveSource(sourceList, "来源", "来源：", "查看");
		} else {
			timeText = detailPageTime(createtimeList);
			// NOTE(review): start and end marker resolve to the same position here, so the slice is
			// always empty and the default is used whenever the label is present. Kept as it was;
			// confirm the intended end marker.
			source = resolveSource(sourceList, "新闻来源", "新闻来源：", "新闻来源");
		}
		String author = resolveAuthor(authorList);

		String hits = hitsList != null && hitsList.size() > 0 ? hitsList.get(0).trim() : "0";
		String articleContent = contentList != null && contentList.size() > 0 ? contentList.get(0) : null;

		Date createtime = parseCreatetime(timeText, rule.getTimeFormat());

		if (articleExist(title, subsiteId, newsClassId)) {
			this.addGotCount(gotCount);
			return;
		}

		// download the images and point the content at the local copies
		articleContent = catchImageByViewUrl(articleContent, viewUrl);
		Integer aid = addArticle(title, author, createtime, source, articleContent, subsiteId, newsClassId, hits);
		saveFirstPdfAnnex(articleContent, createtime, aid);

		this.addGetCount(getCount);
	}

	/**
	 * Returns the publish time text of the detail page, or null when none was extracted.
	 * When the text contains "发布时间", the part from that label up to the next "作者" (or to the
	 * end of the text when "作者" does not follow) is used, with the "发布时间：" label removed.
	 */
	private static String detailPageTime(List<String> createtimeList) {
		if (createtimeList == null || createtimeList.isEmpty()) {
			return null;
		}
		String text = createtimeList.get(0);
		if (text.contains("发布时间")) {
			int from = text.indexOf("发布时间");
			int to = text.indexOf("作者", from);
			if (to < 0) {
				// previously a missing end marker raised StringIndexOutOfBoundsException
				to = text.length();
			}
			text = text.substring(from, to).replace("发布时间：", "").trim();
		}
		return text;
	}

	/**
	 * Picks the article source: the first extracted value, or "原创" when nothing was extracted.
	 * If the value contains {@code trigger}, the text between {@code startMarker} and
	 * {@code endMarker} is used when it is exactly two characters long, otherwise "原创".
	 */
	private static String resolveSource(List<String> sourceList, String trigger, String startMarker, String endMarker) {
		String source = sourceList != null && sourceList.size() > 0 ? sourceList.get(0) : "原创";
		if (source.contains(trigger)) {
			String slice = sliceBetween(source, source.indexOf(startMarker), source.indexOf(endMarker));
			slice = slice.replace(startMarker, "").replace("\r\n\t\t\t\t", "").trim();
			// NOTE(review): only two-character values are accepted, as before; confirm this is intended.
			source = slice.length() == 2 ? slice : "原创";
		}
		return source;
	}

	/**
	 * Picks the author: the first extracted value (trimmed), or "管理员" when nothing was extracted.
	 */
	private static String resolveAuthor(List<String> authorList) {
		String author = authorList != null && authorList.size() > 0 ? authorList.get(0).trim() : "管理员";
		if (author.contains("作者")) {
			int labelAt = author.indexOf("作者");
			// NOTE(review): the slice starts and ends at the label, so it is always empty and the
			// default is used whenever the label is present. Kept as it was; confirm the intended range.
			String slice = sliceBetween(author, labelAt, labelAt).replace("作者：", "").trim();
			author = slice.length() == 2 ? slice : "管理员";
		}
		return author;
	}

	/** Returns {@code text.substring(from, to)}, or an empty string when the bounds are not a valid range. */
	private static String sliceBetween(String text, int from, int to) {
		if (from < 0 || to < from || to > text.length()) {
			return "";
		}
		return text.substring(from, to);
	}

	/**
	 * Converts the publish time text to a date using the rule's time format.
	 * Falls back to the current time when there is no text, the text is blank after clean-up,
	 * or no time format is configured (previously the date stayed null in the last case).
	 *
	 * @throws ParseException when the text does not match the configured format
	 */
	private static Date parseCreatetime(String timeText, String timeFormat) throws ParseException {
		if (timeText == null) {
			return new Date();
		}
		String cleaned = timeText.replaceAll("\\r.*\\n", "").trim();
		if (cleaned.length() == 0 || timeFormat == null || "".equals(timeFormat)) {
			return new Date();
		}
		return new SimpleDateFormat(timeFormat).parse(cleaned);
	}

	/**
	 * Looks for the first "/static/upload...pdf" path in the article content, downloads it and
	 * stores it as an attachment of the article. Does nothing when no such path is present.
	 */
	private void saveFirstPdfAnnex(String articleContent, Date createtime, Integer aid) throws Exception {
		Matcher matcher = Pattern.compile("/static/upload.*?pdf", Pattern.DOTALL).matcher(articleContent);
		String pdfPath = matcher.find() ? matcher.group() : "";
		if (!GeneralUtil.isNotNull(pdfPath)) {
			return;
		}
		// NOTE(review): the download host is hard-coded; confirm it should not come from the rule.
		String annexPath = downloadFile("http://www.ynsap.org.cn/"+pdfPath);
		Annex annex = new Annex();
		annex.setPath(annexPath);
		annex.setName(pdfPath.substring(pdfPath.lastIndexOf("/")+1));
		annex.setExt("pdf");
		annex.setType("annex");
		annex.setObj("article");
		annex.setCreatetime(createtime);
		annex.setObjId(aid);
		annexService.save(annex);
	}
	
	/**
	 * Downloads every image referenced by an {@code <img src>} in the article content and rewrites
	 * the content to point at the downloaded copies. Relative sources are prefixed with {@code host}.
	 * Sources starting with "http://" or "https://" are used as they are; empty sources are skipped
	 * (replacing an empty string would insert the new path between every character).
	 *
	 * @param articleContent HTML fragment of the article
	 * @param host           prefix put in front of relative image sources
	 * @return the fragment with image sources replaced by the local paths
	 * @throws ParserException      when the fragment cannot be parsed
	 * @throws InterruptedException when the download delay is interrupted
	 */
	private String catchImage(String articleContent, String host) throws ParserException, InterruptedException {
		// wrap so the parser always sees a single root; the wrapper is removed again below
		String html = "<html>" +  articleContent + "</html>";
		Parser parser = new Parser (html);
		SimpleNodeIterator nodeList = parser.extractAllNodesThatMatch(new TagNameFilter("img")).elements();
		while (nodeList.hasMoreNodes()) {
			Tag tag = (Tag) nodeList.nextNode();
			String src = tag.getAttribute("src");
			if (src == null || src.length() == 0) {
				continue;
			}
			String canDownloadSrc = src.replace("\\", "/");
			if (canDownloadSrc.startsWith("/")) {
				canDownloadSrc = host + canDownloadSrc;
			} else if (!canDownloadSrc.startsWith("http://") && !canDownloadSrc.startsWith("https://")) {
				canDownloadSrc = host + "/" + canDownloadSrc;
			}

			String newSrc = downloadFile(canDownloadSrc);
			html = html.replace(src, newSrc);
		}
		return html.substring("<html>".length(), html.length() - "</html>".length());
	}
	
	/**
	 * Downloads every image referenced by an {@code <img src>} in the article content and rewrites
	 * the content to point at the downloaded copies. Image sources are resolved against the
	 * detail-page URL, so document-relative, root-relative and absolute (http/https) sources all
	 * end up as a proper absolute URL. Empty sources are skipped (replacing an empty string would
	 * insert the new path between every character).
	 *
	 * @param articleContent HTML fragment of the article
	 * @param viewUrl        URL of the detail page the fragment came from
	 * @return the fragment with image sources replaced by the local paths
	 * @throws ParserException      when the fragment cannot be parsed
	 * @throws InterruptedException when the download delay is interrupted
	 * Bingyong.Wang at 2019-08-08
	 */
	private String catchImageByViewUrl(String articleContent, String viewUrl) throws ParserException, InterruptedException {
		// wrap so the parser always sees a single root; the wrapper is removed again below
		String html = "<html>" +  articleContent + "</html>";
		Parser parser = new Parser (html);
		SimpleNodeIterator nodeList = parser.extractAllNodesThatMatch(new TagNameFilter("img")).elements();
		while (nodeList.hasMoreNodes()) {
			Tag tag = (Tag) nodeList.nextNode();
			String src = tag.getAttribute("src");
			if (src == null || src.trim().length() == 0) {
				continue;
			}
			String canDownloadSrc = resolveAgainstPage(viewUrl, src.replace("\\", "/"));

			String newSrc = downloadFile(canDownloadSrc);
			html = html.replace(src, newSrc);
		}
		return html.substring("<html>".length(), html.length() - "</html>".length());
	}

	/**
	 * Resolves an image source against the page URL. If either value cannot be parsed as a URL,
	 * falls back to plain concatenation with the page's directory.
	 */
	private static String resolveAgainstPage(String viewUrl, String src) {
		try {
			return new java.net.URL(new java.net.URL(viewUrl), src).toString();
		} catch (java.net.MalformedURLException e) {
			if (src.startsWith("http://") || src.startsWith("https://")) {
				return src;
			}
			return viewUrl.substring(0, viewUrl.lastIndexOf("/") + 1) + src;
		}
	}
	
	/**
	 * Downloads a remote file into the "temp/" directory below the application base path and
	 * returns the web path of the local copy. Waits 500 ms before each download.
	 * The local name is the current time in milliseconds plus the extension of the remote file;
	 * the extension is taken from the last path segment only (query string and fragment ignored)
	 * and is dropped when it is missing or contains characters other than letters and digits.
	 *
	 * @param src absolute URL of the remote file
	 * @return path of the local copy, starting with "/temp/"
	 * @throws InterruptedException when the delay is interrupted
	 */
	private String downloadFile(String src) throws InterruptedException {
		Thread.sleep(500);
		String fileName = System.currentTimeMillis() + extensionSuffix(src);
		String localFile =  net.aykj.listener.InitialListener.basePath + "temp/" + fileName;
		HttpUtil.downloadFile(src, localFile);
		return "/temp/" + fileName;
	}

	/** Returns "." plus the file extension of the URL's last path segment, or "" when there is none. */
	private static String extensionSuffix(String src) {
		String path = src;
		int cut = path.indexOf('?');
		if (cut >= 0) {
			path = path.substring(0, cut);
		}
		cut = path.indexOf('#');
		if (cut >= 0) {
			path = path.substring(0, cut);
		}
		int slash = path.lastIndexOf('/');
		int dot = path.lastIndexOf('.');
		// a dot before the last slash belongs to the host or a directory, not to the file name
		if (dot <= slash || dot == path.length() - 1) {
			return "";
		}
		String ext = path.substring(dot + 1);
		for (int i = 0; i < ext.length(); i++) {
			if (!Character.isLetterOrDigit(ext.charAt(i))) {
				return "";
			}
		}
		return "." + ext;
	}
	
	/**
	 * Tells whether an article with this title already exists for the sub-site and news class.
	 *
	 * @param title       article title
	 * @param subsiteId   sub-site id passed to the count query
	 * @param newsClassId news class id passed to the count query
	 * @return true when the count query reports at least one article; false otherwise,
	 *         including when the query returns no count
	 */
	private boolean articleExist(String title, Integer subsiteId, Integer newsClassId) {
		Long count = articleService.queryArticleCountByTitle(subsiteId, newsClassId, title);
		return count != null && count > 0;
	}
	
	/**
	 * Builds an article from the extracted values and saves it under the given news class.
	 * The article is saved with audit flag 1. A hit count that is missing or not a plain integer
	 * is stored as 0 instead of failing the whole crawl.
	 *
	 * @param title          article title
	 * @param author         article author
	 * @param createtime     publish time
	 * @param source         article source
	 * @param articleContent article body
	 * @param subsiteId      sub-site id (not used when saving, kept for the signature)
	 * @param newsClassId    news class the article is saved under
	 * @param hits           hit count as text
	 * @return the value returned by {@code articleService.saveArticle}
	 * @throws Exception propagated from the article service
	 */
	private Integer addArticle(String title, String author, Date createtime, String source, String articleContent, 
			Integer subsiteId, Integer newsClassId, String hits) throws Exception {
		Article article = new Article();
		article.setTitle(title);
		article.setAuthor(author);
		article.setCreatetime(createtime);
		article.setSource(source);
		article.setContent(articleContent);
		article.setAudit(1);
		article.setHits(parseHits(hits));
		return articleService.saveArticle(article, null, new Integer[]{newsClassId});
	}

	/** Parses the hit count; returns 0 for null, blank or non-numeric text. */
	private static int parseHits(String hits) {
		if (hits == null) {
			return 0;
		}
		String digits = hits.trim();
		if (digits.length() == 0) {
			return 0;
		}
		try {
			return Integer.parseInt(digits);
		} catch (NumberFormatException e) {
			return 0;
		}
	}

	/**
	 * Collects the detail-page links of a rule from all of its list pages.
	 * <p>
	 * {@code listUrl} may hold several comma-separated list pages. For each page the links are
	 * extracted with the rule's view regex. When list-page times are extracted as well, link i is
	 * combined with time i as {@code link + "VT" + time}; links without a matching time stay plain.
	 * The links of each page are reversed before being appended, so inserting them in list order
	 * stores the oldest first. Each page is handled on its own, so earlier pages are not added twice.
	 *
	 * @param rule       rule providing list URLs, regexes, encoding, host and prefix
	 * @param errorCount attribute name of the error counter
	 * @return the collected links; empty when the rule has no list URL
	 * @throws Exception propagated from fetching a list page
	 */
	private List<String> queryViewUrlList(Rule rule, String errorCount) throws Exception {
		List<String> viewUrlList = new ArrayList<String>();
		String listUrl = rule.getListUrl();
		if (listUrl == null) {
			return viewUrlList;
		}

		String encode =  GeneralUtil.isNull(rule.getEncode()) ? "UTF-8" : rule.getEncode();
		String viewRegex = rule.getViewRegex();
		// regex and filter for the publish time shown on the list page
		String listCreatetimeRegex = rule.getListCreatetimeRegex();
		String listCreatetimeFilter = rule.getListCreatetimeFilter();
		String host = rule.getHost();
		// some sites use relative detail links; the rule's prefix completes them
		String prefix = GeneralUtil.isNotNull(rule.getPrefix()) ? rule.getPrefix() : "";

		for (String url : listUrl.split(",")) {
			String content = HttpUtil.get(url, encode);
			if ("HTTP/1.1 404 Not Found".equals(content)) {
				System.out.println("链接:"+url+",访问404,请检查链接");
				continue;
			}

			// optionally narrow the page down to the list container first
			if (GeneralUtil.isNotNull(rule.getListContainerRegex())) {
				List<String> contentList = extractStrByPattern(host, content, rule.getListContainerRegex(), errorCount);
				if (contentList != null && contentList.size() > 0) {
					content = contentList.get(0);
				}
			}

			List<String> list = extractStrByPattern(prefix, content, viewRegex, errorCount);
			List<String> createtimeList = extractStrByPattern(content, listCreatetimeRegex, errorCount, true, listCreatetimeFilter);
			System.out.println(createtimeList);

			if (list == null) {
				// no view regex configured: nothing to collect from this page
				continue;
			}

			boolean haveTimes = GeneralUtil.isNotNull(createtimeList) && createtimeList.size() > 0;
			List<String> pageLinks = new ArrayList<String>(list.size());
			for (int i = 0; i < list.size(); i++) {
				String entry = list.get(i);
				if (haveTimes && i < createtimeList.size()) {
					// bind link and list-page time together, separated by "VT"
					entry = entry + "VT" + createtimeList.get(i).replace("\r\n\t\t\t\t\t\t\t\t", "");
				}
				pageLinks.add(entry);
			}
			// reverse so that inserting in this order stores the articles in page order
			Collections.reverse(pageLinks);
			viewUrlList.addAll(pageLinks);
		}
		return viewUrlList;
	}
	
	
	/** Extracts all matches of {@code regex}; no prefix, no HTML filtering, no extra filter. */
	private List<String> extractStrByPattern(String content, String regex, String errorCount) {
		return extractStrByPattern(null, content, regex, errorCount);
	}

	/** Extracts all matches of {@code regex} and puts {@code prefix} in front of each one. */
	private List<String> extractStrByPattern(String prefix, String content, String regex, String errorCount) {
		return extractStrByPattern(prefix, content, regex, errorCount, false, null);
	}

	/** Extracts all matches of {@code regex}, optionally stripping tags and entities. */
	private List<String> extractStrByPattern(String content, String regex, String errorCount, boolean filterHtml) {
		return extractStrByPattern(null, content, regex, errorCount, filterHtml, null);
	}

	/** Extracts all matches of {@code regex}, optionally stripping tags/entities and {@code filterRegex}. */
	private List<String> extractStrByPattern(String content, String regex, String errorCount, boolean filterHtml, String filterRegex) {
		return extractStrByPattern(null, content, regex, errorCount, filterHtml, filterRegex);
	}

	/**
	 * Collects every match of {@code regex} (compiled with DOTALL) in {@code content}.
	 *
	 * @param prefix      text put in front of each match when it is set
	 * @param content     text to search
	 * @param regex       pattern to look for; when it is not set the method returns null
	 * @param errorCount  attribute name of the error counter, increased once when nothing matches
	 * @param filterHtml  when true, tags and entities are removed from each match
	 * @param filterRegex additional pattern removed from each match when it is set
	 * @return the processed matches in order of appearance (possibly empty), or null without a regex
	 */
	private List<String> extractStrByPattern(String prefix, String content, String regex, String errorCount, boolean filterHtml, String filterRegex) {
		if (!GeneralUtil.isNotNull(regex)) {
			return null;
		}

		boolean withFilter = GeneralUtil.isNotNull(filterRegex);
		boolean withPrefix = GeneralUtil.isNotNull(prefix);
		List<String> matches = new ArrayList<String>();
		Matcher finder = Pattern.compile(regex, Pattern.DOTALL).matcher(content);
		while (finder.find()) {
			String piece = finder.group();
			if (filterHtml) {
				piece = piece.replaceAll("^[\u00ff\uffff]", "");
				piece = piece.replaceAll("<.*?>", "");
				piece = piece.replaceAll("&.*?;", "");
			}
			if (withFilter) {
				piece = piece.replaceAll(filterRegex, "");
			}
			matches.add(withPrefix ? prefix + piece : piece);
		}
		// every match adds one entry, so an empty list means the pattern was not found
		if (matches.isEmpty()) {
			addErrorCount(errorCount);
		}
		return matches;
	}
	
	
	/** Increases the "collected" counter stored under {@code getCount}; a missing counter counts as 0. */
	private void addGetCount(String getCount) {
		Object stored = servletContext.getAttribute(getCount);
		int next = (stored == null ? 0 : ((Integer) stored).intValue()) + 1;
		servletContext.setAttribute(getCount, Integer.valueOf(next));
	}
	
	/** Increases the "already existed" counter stored under {@code gotCount}; a missing counter counts as 0. */
	private void addGotCount(String gotCount) {
		Object stored = servletContext.getAttribute(gotCount);
		int next = (stored == null ? 0 : ((Integer) stored).intValue()) + 1;
		servletContext.setAttribute(gotCount, Integer.valueOf(next));
	}
	
	/** Increases the error counter stored under {@code errorCount}; a missing counter counts as 0. */
	private void addErrorCount(String errorCount) {
		Object stored = servletContext.getAttribute(errorCount);
		int next = (stored == null ? 0 : ((Integer) stored).intValue()) + 1;
		servletContext.setAttribute(errorCount, Integer.valueOf(next));
	}
	
	/**
	 * Checks the stop flag. When it is set, the rule-set attribute and all four counter attributes
	 * are removed from the servlet context.
	 *
	 * @return true when the thread has been asked to stop
	 */
	private boolean isStop(String totalCount, String getCount, String gotCount, String errorCount)	{
		if (!stop) {
			return false;
		}
		String[] attributeNames = { rulesName, totalCount, getCount, gotCount, errorCount };
		for (String attributeName : attributeNames) {
			servletContext.removeAttribute(attributeName);
		}
		return true;
	}
	
	/**
	 * Requests the thread to stop and immediately removes the rule-set attribute and the four
	 * counter attributes from the servlet context.
	 *
	 * @param totalCount attribute name of the total counter
	 * @param getCount   attribute name of the "collected" counter
	 * @param gotCount   attribute name of the "already existed" counter
	 * @param errorCount attribute name of the error counter
	 */
	public void clear(String totalCount, String getCount, String gotCount, String errorCount) {
		this.stop = true;
		// with the flag set, isStop performs the attribute clean-up
		isStop(totalCount, getCount, gotCount, errorCount);
	}

	/**
	 * @return the current value of the stop flag
	 */
	public boolean isStop() {
		return stop;
	}

	/**
	 * Sets the stop flag without touching the servlet-context attributes.
	 *
	 * @param stop new value of the stop flag
	 */
	public void setStop(boolean stop) {
		this.stop = stop;
	}
	
	
	
	/**
	 * Manual check of a link regex: reads F:\test.txt as UTF-8 and prints every match of the
	 * "E_ReadNews.asp?NewsID=<digits>" pattern, one per line.
	 */
	public static void main(String[] args) throws ParseException {
		File sampleFile = new File("F:\\test.txt");
		String sampleText = FileUtil.readFileToString(sampleFile, "UTF-8");
		Matcher linkMatcher = Pattern.compile("E_ReadNews.asp\\?NewsID=[0-9]*", Pattern.MULTILINE).matcher(sampleText);
		while (linkMatcher.find()) {
			System.out.println(linkMatcher.group());
		}
	}
	
}

/**
 * Downloads a remote file to a local path.
 * <p>
 * Chinese characters in the URL are percent-encoded (UTF-8) before the request is sent.
 * The local file is only written when the server answers with status 200, so error pages are
 * no longer stored as if they were the requested file. Missing parent directories are created.
 *
 * @param remoteFile URL of the file to download
 * @param localFile  target path; backslashes are treated as path separators
 * @return "200" when the file was downloaded, the status text for any other status,
 *         or null when the request or the write failed with an exception
 */
public static String downloadFile(String remoteFile, String localFile) {
	// percent-encode every Chinese character found in the URL
	String zwRegex = "[\u4e00-\u9fa5]";
	Pattern pattern = Pattern.compile(zwRegex, Pattern.DOTALL);
	Matcher matcher = pattern.matcher(remoteFile);
	String newRemoteFile = remoteFile;
	while (matcher.find()) {
		String zw = matcher.group();
		try {
			String zwbm = URLEncoder.encode(zw, "utf-8");
			// literal replacement: neither side is meant to be a regex
			newRemoteFile = newRemoteFile.replace(zw, zwbm);
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		}
	}
	remoteFile = newRemoteFile;

	FileOutputStream output = null;
	String message = null;
	GetMethod get = null;
	try {
		HttpClient client = new HttpClient();
		get = new GetMethod(remoteFile);
		client.executeMethod(get);

		if (get.getStatusCode() != 200) {
			message = get.getStatusText();
		} else {
			byte[] body = get.getResponseBody();

			localFile = localFile.replace("\\", "/");
			int lastSlash = localFile.lastIndexOf("/");
			if (lastSlash > 0) {
				File dirFile = new File(localFile.substring(0, lastSlash));
				if (!dirFile.exists()) dirFile.mkdirs();
			}

			output = new FileOutputStream(new File(localFile));
			if (body != null) {
				output.write(body);
			}
			message = get.getStatusCode() + "";
		}
	} catch (HttpException e) {
		e.printStackTrace();
	} catch (IOException e) {
		e.printStackTrace();
	} finally {
		// close the file and the connection independently so one failure cannot skip the other
		if (output != null) {
			try {
				output.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
		if (get != null) get.abort();
	}
	return message;
}
Vincebran
关注
0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
java 爬虫数据

package net.aykj.util;import java.io.File;import java.text.ParseException;import java.text.SimpleDateFormat;import java.util.ArrayList;import java.util.Arrays;import java.util.Collections;import java.util.Date;import java.util.HashMap;import jav.
复制链接

扫一扫