Java Executor多线程框架 Demo
package com.ws.springsplider.service;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
/**
 * Demo of the Executor framework: simulates crawling a number of pages on a fixed-size
 * thread pool and reports when every page task has finished.
 *
 * <p>Each task receives its own page number when it is created, so no mutable counter is
 * shared between the submitting thread and the pool workers.
 */
public class ExecutorTest {

    /** Number of pages crawled by {@link #main(String[])}. */
    private static final int TOTAL_PAGES = 100;

    /** Pause between two task submissions in {@link #main(String[])}, in milliseconds. */
    private static final long SUBMIT_INTERVAL_MILLIS = 100L;

    /** Number of worker threads in the pool. */
    private static final int POOL_SIZE = 10;

    /**
     * Runs the demo with 100 pages and a 0.1 second pause between submissions.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        crawl(TOTAL_PAGES, SUBMIT_INTERVAL_MILLIS);
    }

    /**
     * Submits one print task per page (pages are numbered from 1) to a fixed pool of ten
     * threads, waits until all of them have run, then prints the completion message.
     *
     * <p>If the calling thread is interrupted, submission stops, the interrupt flag is
     * restored and the completion message is not printed.
     *
     * @param totalPages           number of page tasks to submit; values below 1 submit nothing
     * @param submitIntervalMillis pause after each submission in milliseconds; values of 0 or
     *                             less disable the pause
     */
    public static void crawl(int totalPages, long submitIntervalMillis) {
        // Fixed thread pool with ten worker threads.
        ExecutorService executorService = Executors.newFixedThreadPool(POOL_SIZE);
        try {
            for (int page = 1; page <= totalPages; page++) {
                // Snapshot of the page number owned by this task only.
                final int currentPage = page;
                executorService.execute(new Runnable() {
                    @Override
                    public void run() {
                        System.out.println("爬取了第" + currentPage + "网页...");
                    }
                });
                if (submitIntervalMillis > 0) {
                    Thread.sleep(submitIntervalMillis);
                }
            }
            // Stop accepting work, then block until every queued task has completed.
            executorService.shutdown();
            if (executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS)) {
                System.out.println("爬虫任务已经完成");
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        } finally {
            // No-op after a normal termination; cancels pending tasks after an interrupt or error.
            executorService.shutdownNow();
        }
    }
}
线程池的submit和execute方法区别
线程池中的execute方法,即开启线程执行池中的任务,
还有一个方法submit也可以做到,它的功能是提交指定的任务去执行并且返回一个Future对象,通过Future.get()可以拿到任务的执行结果(或者任务执行时抛出的异常)
下面简要介绍一下两者的三个区别:
1、接收的参数不一样:execute只能接收Runnable,submit既可以接收Runnable,也可以接收Callable
2、submit有返回值,而execute没有
用到返回值的例子,比如说我有很多个做validation的task,我希望所有的task执行完,然后每个task告诉我它的执行结果,是成功还是失败,如果是失败,原因是什么。
然后我就可以把所有失败的原因综合起来发给调用者。 另外,通过submit返回的Future还可以取消任务(cancel execution),不过个人觉得这个用处不大,很少有需要去取消执行的,最大的用处还是上面说的获取执行结果
3、submit方便Exception处理
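意思就是如果你的task里会抛出checked或者unchecked exception(checked exception只能由Callable抛出,Runnable的run方法不能声明checked exception),
而你又希望外面的调用者能够感知这些exception并做出及时的处理,那么就需要用到submit:任务里抛出的异常会被包装成ExecutionException,在调用Future.get时抛出,再通过getCause()就可以拿到原始的异常。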
下面一个小程序演示一下submit方法
/**
 * Small program contrasting {@code execute} and {@code submit} on a two-thread pool.
 *
 * <p>Both tasks are {@code RunnableTest} instances, whose {@code run()} always throws a
 * {@link RuntimeException}; the second task shows how that failure surfaces through the
 * returned {@link Future}.
 */
public class RunnableTestMain {

    /**
     * Runs one task with {@code execute} and one with {@code submit}, prints the outcome of
     * the submitted task, and shuts the pool down so the JVM can exit.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        ExecutorService pool = Executors.newFixedThreadPool(2);
        try {
            // execute(Runnable) returns nothing: the task runs, but the caller cannot tell
            // whether it completed successfully.
            pool.execute(new RunnableTest("Task1"));

            // submit(Runnable) returns a Future that tells the caller how the task ended.
            Future<?> future = pool.submit(new RunnableTest("Task2"));
            try {
                // For a Runnable, get() returns null once the task has completed normally.
                if (future.get() == null) {
                    System.out.println("任务完成");
                }
            } catch (InterruptedException e) {
                // Keep the interrupt visible to whoever owns this thread.
                Thread.currentThread().interrupt();
            } catch (ExecutionException e) {
                // The task failed; the original exception is available as the cause.
                System.out.println(e.getCause().getMessage());
            }
        } finally {
            // Without this the pool's non-daemon worker threads keep the JVM running.
            pool.shutdown();
        }
    }
}
/**
 * Demo task that announces itself on standard output and then always fails with a
 * {@link RuntimeException} naming the task.
 */
public class RunnableTest implements Runnable {

    /** Label used in the printed line and in the exception message. */
    private final String label;

    /**
     * @param taskName label identifying this task in its output
     */
    public RunnableTest(final String taskName) {
        label = taskName;
    }

    /** Prints {@code "Inside <label>"} and then throws; never returns normally. */
    @Override
    public void run() {
        final String failureMessage = "RuntimeException from inside " + label;
        System.out.println("Inside " + label);
        throw new RuntimeException(failureMessage);
    }
}
Executor多线程框架使用
/**
 * Crawls forum content with multiple threads, one task per distinct stock code.
 *
 * <p>Reads the workbook at a hard-coded path, collects the distinct values of the
 * {@code code} column in first-seen order, and submits one {@code EastMoneyContentThread}
 * per code to a pool of five threads. The method returns once the tasks are submitted; it
 * does not wait for them to finish.
 *
 * <p>If the workbook does not exist, the stack trace is printed and nothing is submitted.
 */
public void crawlingGbContentData() {
    //String path = "E:/localgbcontent/gbcontent.xlsx"; // Windows path
    String path = "/home/splider/gbcontent/gbcontent.xlsx"; // Linux path

    Collection<Map> excelList;
    InputStream inputStream = null;
    try {
        inputStream = new FileInputStream(new File(path));
        ExcelLogs logs = new ExcelLogs();
        excelList = ExcelUtil.importExcel(Map.class, inputStream, "yyyy/MM/dd HH:mm:ss", logs, 0);
    } catch (FileNotFoundException e) {
        // Without the workbook there is nothing to crawl; do not hand a null stream to the importer.
        e.printStackTrace();
        return;
    } finally {
        if (inputStream != null) {
            try {
                inputStream.close();
            } catch (java.io.IOException e) {
                e.printStackTrace();
            }
        }
    }
    System.out.println(excelList.size());

    // An insertion-ordered set removes duplicates while keeping first-seen order.
    Collection<String> codes = new java.util.LinkedHashSet<String>();
    for (Map<?, ?> row : excelList) {
        Object code = row.get("code");
        // Rows without a code would otherwise become the literal code "null".
        if (code != null) {
            codes.add(String.valueOf(code));
        }
    }
    System.out.println(codes.toString());

    // Thread pool with five workers.
    ExecutorService executorService = Executors.newFixedThreadPool(5);
    try {
        for (String code : codes) {
            EastMoneyContentThread emThread = new EastMoneyContentThread(code, this);
            // submit returns a Future (execute does not); it is not used here.
            executorService.submit(emThread);
        }
    } finally {
        // Lets the already submitted tasks finish and then releases the worker threads.
        executorService.shutdown();
    }
}
实现Runnable 接口(线程池executorService.submit执行Runnable 接口的业务任务实现)
package com.ws.springsplider.thread;
import com.ws.springsplider.model.EastMoney_JMZY;
import com.ws.springsplider.service.CrawlingEastMoneyContentService;
import com.ws.springsplider.utils.TimeUtil;
import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.logging.log4j.util.Strings;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.List;
public class EastMoneyContentThread implements Runnable {
public static Log logger = LogFactory.getLog(EastMoneyContentThread.class);
private String code;
private CrawlingEastMoneyContentService emcService;
public EastMoneyContentThread(String code, CrawlingEastMoneyContentService emcService) {
this.code = code;
this.emcService = emcService;
}
public void run() {
try {
System.out.println("进入 子线程 ********" + Thread.currentThread().getName());
logger.info("进入 子线程 ********" + Thread.currentThread().getName());
logger.info("开始获取数据库列表......" + Thread.currentThread().getName());
System.out.println("开始获取数据库列表......" + Thread.currentThread().getName());
System.out.println("code="+code+"------" + Thread.currentThread().getName());
List<EastMoney_JMZY> emList = emcService.selectEastMoneyListByCode(code);
logger.info("emList数据大小="+emList.size());
System.out.println("emList数据大小="+emList.size());
int i=0;
for(EastMoney_JMZY jmzy: emList){
if(Strings.isNotBlank(jmzy.getContentUrl())){
logger.info("开始爬取内容数据......"+ Thread.currentThread().getName());
System.out.println("开始爬取内容数据......"+ Thread.currentThread().getName());
// 爬取内容页数据
String gbContent = emcService.postSubmit(jmzy.getContentUrl());
Document docTitle = Jsoup.parse(gbContent);
if (docTitle != null) {
if (docTitle.getElementById("zwcontent") != null) {
logger.info("进入解析内容流程********"+ Thread.currentThread().getName());
System.out.println("进入解析内容流程********" + Thread.currentThread().getName());
//获取内容页面时间
String date = docTitle.getElementsByClass("zwfbtime").get(0).text().trim();
if(Strings.isNotBlank(date)){
String dateSub = date.substring(date.indexOf(" "), date.lastIndexOf(" "));
String timeFormat = TimeUtil.formatDate(dateSub.trim(), "yyyy-MM-dd HH:mm:ss");
int ftime = TimeUtil.getTimeNumber(timeFormat);
int dtime = 20170101;
//如果时间小于2017-01-01,跳出循环,接着遍历下一条数据
if(ftime>dtime){
//爬取的内容保存到本地
//String path = "E:/gbfiles/" + code + "/" + "_" + i + ".txt"; //Window路径
String path = "/home/splider/gbfiles/" + code + "/" + "_" + i + ".txt"; //Linux路径
File file = new File(path);
if (!file.exists() || file.length() == 0) {
saveParseData(gbContent, file);
logger.info("文件 :" + file + "保存" + "==" + new Date());
System.out.println("文件 :" + file + "保存" + "==" + new Date());
}
//更新字段数据到对象,准备向数据库更新
jmzy.setPublishTime(timeFormat);
jmzy.setLastUpdateTime(TimeUtil.getDate(new Date()));
jmzy.setContent(docTitle.getElementsByClass("stockcodec").text());
//保存文件的路径,将保存本地文件的路径也更新到数据库
jmzy.setContentFileUrl(path);
//爬取的内容根据id更新数据
int mark = emcService.updateSelective(jmzy);
if (mark != 0) {
System.out.println("code="+code+"--"+"id="+jmzy.getId()+"--"+file + "-- 更新成功 ==" + new Date());
logger.info("code="+code+"--"+"id="+jmzy.getId()+"--"+file + "-- 更新成功 ==" + new Date());
} else {
System.out.println("code="+code+"--"+"id="+jmzy.getId()+"--"+file + "-- 更新失败 ==" + new Date());
logger.info("code="+code+"--"+"id="+jmzy.getId()+"--"+file + "-- 更新失败 ==" + new Date());
}
}
}
} else {
//内容为空的情况(可能IP被封),此时跳出循环,接着遍历下一条数据
break;
}
}
}
i++;
Thread.sleep(500);
}
} catch (Exception e) {
e.printStackTrace();
}
}
//保存文件到磁盘
public static void saveParseData(String context, File file) {
String oSystem = System.getProperty("os.name");
//if (oSystem.contains("Windows")) {
// 将数据写入文件
// File file = new File(path);
try {
FileUtils.writeStringToFile(file, context, "UTF-8");
} catch (IOException e) {
e.printStackTrace();
}
//}
}
}
发送http请求,获取页面内容
/**
 * Sends an HTTP GET request (despite the method name) with browser-like headers and
 * returns the response body decoded as UTF-8.
 *
 * <p>Failures are not propagated: the stack trace is printed and {@code null} is returned,
 * so callers must check the result for {@code null}.
 *
 * @param URL address of the page to fetch
 * @return the response body, or {@code null} when the request failed or had no body
 * @throws Exception declared by the original signature; this implementation catches
 *                   request failures itself
 */
public static String postSubmit(String URL) throws Exception {
    CloseableHttpClient httpClient = null;
    CloseableHttpResponse response = null;
    String responseContent = null;
    try {
        HttpGet request = new HttpGet(URL);
        // Finite timeouts (milliseconds): a value of 0 means "wait forever", which lets a
        // single stalled server block the calling thread indefinitely.
        RequestConfig config = RequestConfig.custom()
                .setSocketTimeout(30000)
                .setConnectTimeout(10000)
                .setConnectionRequestTimeout(10000)
                .build();
        request.setConfig(config);
        // Idea noted by the original author, not implemented here: switch to a proxy IP
        // depending on the response status, taking IPs from a queue of crawled proxies.
        request.setHeader("Accept",
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        request.setHeader("Accept-Encoding", "gzip, deflate");
        request.setHeader("Accept-Language", "zh-CN,zh;q=0.9");
        request.setHeader("Cache-Control", "no-cache");
        request.setHeader("Connection", "keep-alive");
        // NOTE(review): the Host header is fixed, so this only suits guba.eastmoney.com URLs.
        request.setHeader("Host", "guba.eastmoney.com");
        request.setHeader("Pragma", "no-cache");
        request.setHeader("Upgrade-Insecure-Requests", "1");
        request.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36");

        httpClient = HttpClients.createDefault();
        // Execute the request.
        response = httpClient.execute(request);
        org.apache.http.HttpEntity entity = response.getEntity();
        if (entity != null) {
            responseContent = EntityUtils.toString(entity, "UTF-8");
        }
        System.out.println("执行请求");
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Close each resource on its own so a failure on one does not leak the other.
        if (response != null) {
            try {
                response.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (httpClient != null) {
            try {
                httpClient.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return responseContent;
}