多线程读db,并将数据写入csv文件

22 篇文章 0 订阅
package com.ad.action;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock;
import com.cap.dao.MongoControl;
import com.mongodb.BasicDBObject;
import com.mongodb.DBCursor;

/**
 * Exports rows read through {@code MongoControl} to a single CSV file, using a thread pool to
 * write buffered batches.
 *
 * <p>Rows are produced on the calling thread by {@link #loadDB(CallBack)} and buffered in a
 * {@link WriteDataHandle}. Each time the buffer reports that it is full, a {@link TaskWithResult}
 * is submitted to the pool to drain it into the shared {@link BufferedWriter}. Rows still buffered
 * at the end are written by {@code WriteDataHandle.flush}, which also closes the writer.
 *
 * @author lyuan
 */
public class TBuilderRoomSqlFileTool {
	/** 1 MiB size constant; not referenced anywhere inside this class. */
	final static int BSIZE = 1024 * 1024;
	/** Number of buffered rows after which a write task is submitted. */
	final static int DATACACHENUM = 10000;
	/** Number of submitted write tasks that have not been awaited yet; only updated from the loading thread. */
	static int currThreadCount = 0;
	/** Size of the worker pool and upper bound on un-awaited write tasks. */
	static int maxThreadCount = 20;
	/** Destination CSV file. */
	static File dataFile = new File("c://dataFile.csv");

	/**
	 * Opens {@code dataFile} for appending UTF-8 text, creating the file first when it is missing.
	 *
	 * @param dataFile file to append to
	 * @return a buffered writer positioned at the end of the file; the caller must close it
	 * @throws Exception if the file cannot be created or opened
	 */
	public static BufferedWriter initDataWrite(File dataFile) throws Exception {
		if (!dataFile.exists()) {
			if (!dataFile.createNewFile()) {
				// createNewFile() returns false when the file appeared between the two checks.
				System.err.println("创建文件失败,已存在:" + dataFile.getAbsolutePath());
			}
		}
		return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(dataFile, true), "UTF-8"));
	}

	/**
	 * Iterates over everything returned by {@code MongoControl.select()} and passes each document
	 * to {@code callBack} as one CSV line of the form {@code id,startTime,endTime,fileId}.
	 *
	 * @param callBack receives the 1-based row number and the CSV line for every non-null document
	 * @throws Exception if the query or the iteration fails
	 */
	public static void loadDB(CallBack<Void> callBack) throws Exception {
		// NOTE(review): the two arguments are presumably database and collection names - confirm against MongoControl.
		MongoControl mongo = new MongoControl("smg", "t_schedule");
		DBCursor cur = mongo.select();
		try {
			int num = 0;
			while (cur.hasNext()) {
				BasicDBObject bdbObj = (BasicDBObject) cur.next();
				if (bdbObj != null) {
					String line = bdbObj.getString("id") + "," + bdbObj.getString("startTime") + ","
							+ bdbObj.getString("endTime") + "," + bdbObj.getString("fileId");
					num++;
					callBack.call(num, line);
				}
			}
		} finally {
			// Release the server-side cursor even when the callback or the driver throws.
			cur.close();
		}
	}

	/**
	 * Prints a log line to standard output.
	 *
	 * @param str    message text
	 * @param values optional detail values; when present they are appended to {@code str},
	 *               separated by single spaces, so that details such as exception messages are
	 *               not silently dropped
	 */
	public static void writeLog(String str, Object... values) {
		if (values == null || values.length == 0) {
			System.out.println(str);
			return;
		}
		StringBuilder message = new StringBuilder(String.valueOf(str));
		for (Object value : values) {
			message.append(' ').append(value);
		}
		System.out.println(message);
	}

	/**
	 * Blocks until every task in {@code pending} has finished, then empties the list.
	 * A task that failed is logged and does not stop the wait for the remaining tasks.
	 *
	 * @param pending futures of submitted write tasks
	 * @throws InterruptedException if the waiting thread is interrupted; the list is left untouched
	 */
	private static void awaitAll(List<Future<String>> pending) throws InterruptedException {
		for (Future<String> task : pending) {
			try {
				task.get();
			} catch (InterruptedException e) {
				throw e;
			} catch (Exception e) {
				writeLog("写入任务失败:", e);
			}
		}
		pending.clear();
	}

	/**
	 * Runs the export: loads all rows, writes full batches on the pool, waits for those batches,
	 * flushes the remainder and prints the elapsed time.
	 *
	 * @param args unused
	 * @throws Exception if opening the file, loading the data or the final flush fails
	 */
	public static void main(String[] args) throws Exception {
		final BufferedWriter bw = initDataWrite(dataFile); // writer shared by all write tasks
		final ExecutorService threadPool = Executors.newFixedThreadPool(maxThreadCount);
		final List<Future<String>> threadResultList = new ArrayList<Future<String>>();
		final WriteDataHandle writeDataFile = new WriteDataHandle(DATACACHENUM);
		StopWatch stopWatch = new StopWatch();
		stopWatch.start();
		boolean flushed = false;
		try {
			loadDB(new CallBack<Void>() {

				@Override
				public Void call(int num, String data) {
					try {
						// add() reports true once the buffer has reached its limit.
						if (writeDataFile.add(data)) {
							// Too many un-awaited tasks: collect them before submitting another one.
							if (threadResultList.size() >= maxThreadCount) {
								awaitAll(threadResultList);
							}
							threadResultList.add(threadPool.submit(new TaskWithResult(writeDataFile, bw)));
							currThreadCount = threadResultList.size();
						}
					} catch (InterruptedException e) {
						// Keep the interrupt visible to the code that drives the load.
						Thread.currentThread().interrupt();
						writeLog("录入错误的数据::0", e.getMessage());
					} catch (Exception e) {
						writeLog("录入错误的数据::0", e.getMessage());
					}
					return null;
				}
			});
			// The final flush closes the writer, so every submitted task must be finished first.
			awaitAll(threadResultList);
			currThreadCount = 0;
			writeDataFile.flush(bw);
			flushed = true;
		} finally {
			// Pool threads are non-daemon; without shutdown a failure would keep the JVM alive.
			threadPool.shutdown();
			if (!flushed) {
				try {
					bw.close();
				} catch (Exception closeError) {
					writeLog("关闭文件失败:", closeError);
				}
			}
		}
		stopWatch.stop();
		System.out.println(String.format("任务完成时间:%s ms", stopWatch.getTime()));
	}
}

/**
 * Pool task that asks a {@link WriteDataHandle} to save its buffered rows to a writer and
 * reports the name of the thread that executed it.
 */
class TaskWithResult implements Callable<String> {
	WriteDataHandle handle;

	BufferedWriter bufferedWriter;

	/**
	 * @param handle         buffer whose {@code save} method is invoked by this task
	 * @param bufferedWriter writer handed to {@code handle.save}
	 */
	public TaskWithResult(WriteDataHandle handle, BufferedWriter bufferedWriter) {
		this.bufferedWriter = bufferedWriter;
		this.handle = handle;
	}

	/**
	 * Calls {@code handle.save(bufferedWriter)} on the executing thread.
	 *
	 * @return the name of the executing thread, captured before the save starts
	 * @throws Exception whatever {@code save} throws
	 */
	@Override
	public String call() throws Exception {
		final String workerName = Thread.currentThread().getName();
		this.handle.save(this.bufferedWriter);
		return workerName;
	}

}

/**
 * Buffer of text rows that is written to a {@link BufferedWriter} in batches.
 *
 * <p>{@link #add(String)}, {@link #save(BufferedWriter)} and {@link #flush(BufferedWriter)} all
 * run under the same write lock, so the row list and its counter are only modified by one thread
 * at a time.
 */
class WriteDataHandle {
	ReentrantReadWriteLock readWriteLock = new ReentrantReadWriteLock(); // only the write lock is used

	WriteLock writeLock = readWriteLock.writeLock();

	/** Rows waiting to be written. */
	List<String> cacheList;

	/** Number of rows currently buffered. */
	int currItemCount = 0;

	/** Row count at which the buffer is considered full. */
	int dataCacheNum;

	/**
	 * Creates a buffer with a limit of 0, so {@link #isCacheExpires()} is true from the start.
	 */
	public WriteDataHandle() {
		cacheList = new ArrayList<String>();
	}

	/**
	 * @param dataCacheNum row count at which the buffer is considered full; also used as the
	 *                     initial capacity of the row list
	 */
	public WriteDataHandle(int dataCacheNum) {
		this.dataCacheNum = dataCacheNum;
		cacheList = new ArrayList<String>(dataCacheNum);
	}

	/**
	 * @return {@code true} when the number of buffered rows has reached the limit
	 */
	public boolean isCacheExpires() {
		return currItemCount >= dataCacheNum;
	}

	/**
	 * Appends one row to the buffer.
	 *
	 * @param sqlStr row text
	 * @return {@code true} when the buffer has reached its limit after this row was added
	 */
	public boolean add(String sqlStr) {
		// Acquire outside try: if lock() itself failed, finally must not unlock a lock we do not hold.
		writeLock.lock();
		try {
			cacheList.add(sqlStr);
			currItemCount++;
			return isCacheExpires();
		} finally {
			writeLock.unlock();
		}
	}

	/**
	 * Writes all buffered rows to {@code bw}, each terminated by CRLF, and empties the buffer.
	 * Does nothing when the buffer has not reached its limit. The writer is neither flushed nor
	 * closed.
	 *
	 * @param bw destination writer
	 * @throws Exception if writing fails; in that case the buffer and its counter are left unchanged
	 */
	public void save(BufferedWriter bw) throws Exception {
		writeLock.lock();
		try {
			// Another task may already have drained the buffer; nothing to do then.
			if (!isCacheExpires()) {
				return;
			}
			long begin = System.currentTimeMillis();
			for (String str : cacheList) {
				bw.write(str + "\r\n");
			}
			long elapsed = System.currentTimeMillis() - begin;
			System.out.println(String.format("%s,消费完成,耗费时间:%s ms,消费数据长度:%s",Thread.currentThread().getName(), elapsed,
					cacheList.size()));
			// Reset list and counter together so they cannot drift apart.
			cacheList.clear();
			currItemCount = 0;
		} finally {
			writeLock.unlock();
		}
	}

	/**
	 * Writes every remaining row to {@code bw} regardless of the limit, empties the buffer, and
	 * then flushes and closes the writer. The writer is closed even when writing fails.
	 *
	 * @param bw destination writer; unusable after this call
	 * @throws Exception if writing, flushing or closing fails
	 */
	public void flush(BufferedWriter bw) throws Exception {
		writeLock.lock();
		try {
			System.out.println(String.format("flush线程:%s, 需要保存数据的集合长度:%s", Thread.currentThread().getName(), cacheList.size()));
			for (String str : cacheList) {
				bw.write(str + "\r\n");
			}
			System.out.println(String.format("flush线程:%s, 消费完成,消费数据长度:%s", Thread.currentThread().getName(), cacheList.size()));
			cacheList.clear();
			currItemCount = 0;
		} finally {
			try {
				closeWrite(bw);
			} finally {
				writeLock.unlock();
			}
		}
	}

	/**
	 * Flushes and closes {@code bw}; the close is attempted even when the flush fails.
	 */
	private void closeWrite(BufferedWriter bw) throws Exception {
		try {
			bw.flush();
		} finally {
			bw.close();
		}
	}

}

/**
 * Tiny elapsed-time helper based on {@link System#currentTimeMillis()}.
 * Call {@link #start()}, then {@link #stop()}, then read {@link #getTime()}.
 */
class StopWatch {
	/** Timestamp in milliseconds captured by {@link #start()}. */
	long begin;
	/** Timestamp in milliseconds captured by {@link #stop()}. */
	long end;

	/** Records the current time as the beginning of the measured interval. */
	public void start() {
		this.begin = System.currentTimeMillis();
	}

	/** Records the current time as the end of the measured interval. */
	public void stop() {
		this.end = System.currentTimeMillis();
	}

	/**
	 * @return the difference {@code end - begin} in milliseconds
	 */
	public long getTime() {
		final long elapsedMillis = this.end - this.begin;
		return elapsedMillis;
	}
}

/**
 * Callback that receives one numbered text item at a time.
 *
 * @param <T> type of the value returned by {@link #call(int, String)}
 */
interface CallBack<T> {
	/**
	 * Handles one item.
	 *
	 * @param num number associated with the item
	 * @param str text of the item
	 * @return a result of type {@code T}; its meaning is defined by the implementation
	 */
	T call(int num, String str);
}

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
可以使用Python的多线程模块`threading`来实现多线程爬取数据,并使用`csv`模块将数据写入CSV文件。

以下是一个示例代码,用于爬取豆瓣电影Top250的电影名称、评分和链接,并将数据写入CSV文件中。

```python
import requests
import csv
import threading
from bs4 import BeautifulSoup


def get_movie_info(start):
    url = f'https://movie.douban.com/top250?start={start}'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    movie_list = soup.find(class_='grid_view').find_all('li')
    for movie in movie_list:
        title = movie.find(class_='title').text
        rating = movie.find(class_='rating_num').text
        link = movie.find('a')['href']
        movie_info = [title, rating, link]
        write_to_csv(movie_info)


def write_to_csv(movie_info):
    with open('douban_top250.csv', 'a', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(movie_info)


if __name__ == '__main__':
    threads = []
    for i in range(0, 250, 25):
        t = threading.Thread(target=get_movie_info, args=(i,))
        threads.append(t)
        t.start()
    for t in threads:
        t.join()
```

在上面的代码中,我们首先定义了一个`get_movie_info`函数用于爬取每一页的电影信息。在函数中,我们使用`requests`模块发送HTTP请求,获取网页的HTML源代码,并使用`BeautifulSoup`模块解析HTML文档,获取电影名称、评分和链接信息。

然后,我们定义了一个`write_to_csv`函数用于将获取到的电影信息写入CSV文件中。使用`csv`模块的`writer`函数将电影信息写入CSV文件。

在`if __name__ == '__main__':`中,我们创建了一个线程列表,并循环创建线程,每个线程负责爬取一页电影信息。我们使用`threading.Thread`函数创建线程,将`get_movie_info`函数作为线程的目标函数,并将页码作为参数传递给函数。

然后,我们循环启动所有线程,等待所有线程完成后再退出程序。使用`join`函数等待所有线程完成。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值