Java CrawlController.waitUntilFinish Method Code Examples

This article collects typical usage examples of the Java method edu.uci.ics.crawler4j.crawler.CrawlController.waitUntilFinish. If you are wondering what CrawlController.waitUntilFinish does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples of its enclosing class, edu.uci.ics.crawler4j.crawler.CrawlController.

Five code examples of CrawlController.waitUntilFinish are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code samples.
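waitUntilFinish() blocks the calling thread until every crawler thread started by the CrawlController has terminated. It is typically paired with startNonBlocking(), which returns immediately so the caller can sleep, monitor progress, or request a shutdown before waiting. The following is a minimal sketch of that pattern, assuming crawler4j's standard setters and a WebCrawler subclass named MyCrawler (as in Example 1 below); the storage folder and seed URL are placeholders, not taken from the projects listed here.

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class WaitUntilFinishDemo {
    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder("/tmp/crawl");   // placeholder folder for intermediate data

        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtServer robotstxtServer = new RobotstxtServer(new RobotstxtConfig(), pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

        controller.addSeed("http://www.ics.uci.edu/");

        // startNonBlocking returns at once; the crawl runs on background threads.
        controller.startNonBlocking(MyCrawler.class, 2);

        // The calling thread is free here (e.g. to monitor or call controller.shutdown()).

        // Block until all crawler threads have finished.
        controller.waitUntilFinish();
        System.out.println("Crawl finished.");
    }
}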

Example 1: execute

Likes: 2

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the class/package this method depends on

public static void execute() throws Exception {
    urlMap = GetURL.getAllUrl();

    String crawlStorageFolder = "/data/crawl/root";
    // Number of concurrent crawler threads
    int numberOfCrawlers = 2;

    CrawlConfig config = new CrawlConfig();
    // Folder where intermediate crawl data is stored
    config.setCrawlStorageFolder(crawlStorageFolder);
    // Maximum crawl depth
    config.setMaxDepthOfCrawling(0);
    // Whether to crawl pages with binary content
    config.setIncludeBinaryContentInCrawling(false);
    // Be polite: wait at least 200 ms (the default) between requests to avoid overloading the server
    config.setPolitenessDelay(200);
    // Resumable crawling (restart an interrupted crawl)
    //config.setResumableCrawling(true);

    // Initialize the crawl controller
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    /*
     * Add the seed pages for the crawler; links discovered on each page become new crawl targets.
     * Here the URLs to crawl are loaded from the database and added to the seed list.
     */
    // note: check whether map.values and map.keySet iterate in the same order
    for (String url : urlMap.keySet()) {
        controller.addSeed(url);
    }

    /*
     * Start the crawl in non-blocking mode: startNonBlocking returns immediately,
     * and waitUntilFinish() below blocks until crawling is finished.
     */
    controller.startNonBlocking(MyCrawler.class, numberOfCrawlers);
    // Pause 1 second to avoid being blocked by the site for crawling too aggressively
    Thread.sleep(1000);
    controller.waitUntilFinish();
}

Developer: wrayzheng, Project: webpage-update-subscribe, Lines of code: 54
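Example 1 passes MyCrawler.class to startNonBlocking but does not show that class. Below is a minimal sketch of such a WebCrawler subclass, assuming the older crawler4j API used on this page, where shouldVisit takes only a WebURL (newer versions add a referring Page parameter); the hard-coded domain filter and the println are illustrative, not taken from the original project.

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class MyCrawler extends WebCrawler {

    // Decide whether a discovered link should be fetched; here only links on one site are kept.
    @Override
    public boolean shouldVisit(WebURL url) {
        return url.getURL().toLowerCase().startsWith("http://www.ics.uci.edu/");
    }

    // Called once for every page that was successfully fetched and parsed.
    @Override
    public void visit(Page page) {
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            System.out.println(page.getWebURL().getURL()
                    + " (" + htmlParseData.getOutgoingUrls().size() + " outgoing links)");
        }
    }
}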

Example 2: main

Likes: 2

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the class/package this method depends on

public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        System.out.println("Needed parameter: ");
        System.out.println("\t rootFolder (it will contain intermediate crawl data)");
        return;
    }

    /*
     * crawlStorageFolder is a folder where intermediate crawl data is stored.
     */
    String crawlStorageFolder = args[0];

    CrawlConfig config1 = new CrawlConfig();
    CrawlConfig config2 = new CrawlConfig();

    /*
     * The two crawlers should have different storage folders for their intermediate data.
     */
    config1.crawlStorageFolder_$eq(crawlStorageFolder + "/crawler1");
    config2.crawlStorageFolder_$eq(crawlStorageFolder + "/crawler2");

    config1.politenessDelay_$eq(1000);
    config2.politenessDelay_$eq(2000);

    config1.maxPagesToFetch_$eq(50);
    config2.maxPagesToFetch_$eq(100);

    /*
     * We will use different PageFetchers for the two crawlers.
     */
    PageFetcher pageFetcher1 = new PageFetcher(config1);
    PageFetcher pageFetcher2 = new PageFetcher(config2);

    /*
     * We will use the same RobotsTxtServer for both of the crawlers.
     */
    RobotsTxtConfig robotstxtConfig = new RobotsTxtConfig();
    RobotsTxtServer robotstxtServer = new RobotsTxtServer(robotstxtConfig, pageFetcher1);

    CrawlController controller1 = new CrawlController(config1, pageFetcher1, robotstxtServer);
    CrawlController controller2 = new CrawlController(config2, pageFetcher2, robotstxtServer);

    String[] crawler1Domains = new String[] { "http://www.ics.uci.edu/", "http://www.cnn.com/" };
    String[] crawler2Domains = new String[] { "http://en.wikipedia.org/" };

    controller1.setCustomData(crawler1Domains);
    controller2.setCustomData(crawler2Domains);

    controller1.addSeed("http://www.ics.uci.edu/");
    controller1.addSeed("http://www.cnn.com/");
    controller1.addSeed("http://www.ics.uci.edu/~lopes/");
    controller1.addSeed("http://www.cnn.com/POLITICS/");

    controller2.addSeed("http://en.wikipedia.org/wiki/Main_Page");
    controller2.addSeed("http://en.wikipedia.org/wiki/Obama");
    controller2.addSeed("http://en.wikipedia.org/wiki/Bing");

    /*
     * The first crawler will have 5 concurrent threads and the second crawler will have 7 threads.
     */
    controller1.startNonBlocking(BasicCrawler.class, 5);
    controller2.startNonBlocking(BasicCrawler.class, 7);

    controller1.waitUntilFinish();
    System.out.println("Crawler 1 is finished.");

    controller2.waitUntilFinish();
    System.out.println("Crawler 2 is finished.");
}

Developer: sapienapps, Project: scrawler, Lines of code: 73
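Example 2 (and Example 4 later on) hands each controller an array of allowed domains via setCustomData, but never shows how the crawler reads it back. A sketch of that, assuming the older crawler4j API in which WebCrawler.getMyController().getCustomData() is still available (it was deprecated and later removed in favour of per-crawler factories), might look like this; the class body is illustrative, not taken from the original project.

import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

public class BasicCrawler extends WebCrawler {

    // Read back the String[] of domains that the controller received via setCustomData(...)
    // and only visit URLs that start with one of them.
    @Override
    public boolean shouldVisit(WebURL url) {
        String[] myDomains = (String[]) getMyController().getCustomData();
        String href = url.getURL().toLowerCase();
        for (String domain : myDomains) {
            if (href.startsWith(domain)) {
                return true;
            }
        }
        return false;
    }
}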

Example 3: main

Likes: 2

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the class/package this method depends on

public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        System.out.println("Needed parameters: ");
        System.out.println("\t rootFolder (it will contain intermediate crawl data)");
        System.out.println("\t numberOfCrawlers (number of concurrent threads)");
        return;
    }

    /*
     * crawlStorageFolder is a folder where intermediate crawl data is stored.
     */
    String crawlStorageFolder = args[0];

    /*
     * numberOfCrawlers shows the number of concurrent threads that should be initiated for crawling.
     */
    int numberOfCrawlers = Integer.parseInt(args[1]);

    CrawlConfig config = new CrawlConfig();
    config.crawlStorageFolder_$eq(crawlStorageFolder);
    config.politenessDelay_$eq(1000);

    // Unlimited number of pages can be crawled.
    config.maxPagesToFetch_$eq(-1);

    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotsTxtConfig robotstxtConfig = new RobotsTxtConfig();
    RobotsTxtServer robotstxtServer = new RobotsTxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    /*
     * For each crawl, you need to add some seed URLs. These are the first
     * URLs that are fetched; the crawler then starts following links found in these pages.
     */
    controller.addSeed("http://www.ics.uci.edu/~welling/");
    controller.addSeed("http://www.ics.uci.edu/~lopes/");
    controller.addSeed("http://www.ics.uci.edu/");

    /*
     * Start the crawl in non-blocking mode so that it can be shut down from this thread.
     */
    controller.startNonBlocking(BasicCrawler.class, numberOfCrawlers);

    // Wait for 30 seconds
    Thread.sleep(30 * 1000);

    // Send the shutdown request and then wait for finishing
    controller.shutdown();
    controller.waitUntilFinish();
}

Developer: sapienapps, Project: scrawler, Lines of code: 60

Example 4: main

Likes: 2

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the class/package this method depends on

public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        System.out.println("Needed parameter: ");
        System.out.println("\t rootFolder (it will contain intermediate crawl data)");
        return;
    }

    /*
     * crawlStorageFolder is a folder where intermediate crawl data is stored.
     */
    String crawlStorageFolder = args[0];

    CrawlConfig config1 = new CrawlConfig();
    CrawlConfig config2 = new CrawlConfig();

    /*
     * The two crawlers should have different storage folders for their intermediate data.
     */
    config1.setCrawlStorageFolder(crawlStorageFolder + "/crawler1");
    config2.setCrawlStorageFolder(crawlStorageFolder + "/crawler2");

    config1.setPolitenessDelay(1000);
    config2.setPolitenessDelay(2000);

    config1.setMaxPagesToFetch(50);
    config2.setMaxPagesToFetch(100);

    /*
     * We will use different PageFetchers for the two crawlers.
     */
    PageFetcher pageFetcher1 = new PageFetcher(config1);
    PageFetcher pageFetcher2 = new PageFetcher(config2);

    /*
     * We will use the same RobotstxtServer for both of the crawlers.
     */
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher1);

    CrawlController controller1 = new CrawlController(config1, pageFetcher1, robotstxtServer);
    CrawlController controller2 = new CrawlController(config2, pageFetcher2, robotstxtServer);

    String[] crawler1Domains = new String[] { "http://www.ics.uci.edu/", "http://www.cnn.com/" };
    String[] crawler2Domains = new String[] { "http://en.wikipedia.org/" };

    controller1.setCustomData(crawler1Domains);
    controller2.setCustomData(crawler2Domains);

    controller1.addSeed("http://www.ics.uci.edu/");
    controller1.addSeed("http://www.cnn.com/");
    controller1.addSeed("http://www.ics.uci.edu/~lopes/");
    controller1.addSeed("http://www.cnn.com/POLITICS/");

    controller2.addSeed("http://en.wikipedia.org/wiki/Main_Page");
    controller2.addSeed("http://en.wikipedia.org/wiki/Obama");
    controller2.addSeed("http://en.wikipedia.org/wiki/Bing");

    /*
     * The first crawler will have 5 concurrent threads and the second crawler will have 7 threads.
     */
    controller1.startNonBlocking(BasicCrawler.class, 5);
    controller2.startNonBlocking(BasicCrawler.class, 7);

    controller1.waitUntilFinish();
    System.out.println("Crawler 1 is finished.");

    controller2.waitUntilFinish();
    System.out.println("Crawler 2 is finished.");
}

Developer: Chaiavi, Project: Crawler4j, Lines of code: 73

Example 5: main

Likes: 2

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the class/package this method depends on

public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        System.out.println("Needed parameters: ");
        System.out.println("\t rootFolder (it will contain intermediate crawl data)");
        System.out.println("\t numberOfCrawlers (number of concurrent threads)");
        return;
    }

    /*
     * crawlStorageFolder is a folder where intermediate crawl data is stored.
     */
    String crawlStorageFolder = args[0];

    /*
     * numberOfCrawlers shows the number of concurrent threads that should be initiated for crawling.
     */
    int numberOfCrawlers = Integer.parseInt(args[1]);

    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorageFolder);
    config.setPolitenessDelay(1000);

    // Unlimited number of pages can be crawled.
    config.setMaxPagesToFetch(-1);

    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    /*
     * For each crawl, you need to add some seed URLs. These are the first
     * URLs that are fetched; the crawler then starts following links found in these pages.
     */
    controller.addSeed("http://www.ics.uci.edu/~welling/");
    controller.addSeed("http://www.ics.uci.edu/~lopes/");
    controller.addSeed("http://www.ics.uci.edu/");

    /*
     * Start the crawl in non-blocking mode so that it can be shut down from this thread.
     */
    controller.startNonBlocking(BasicCrawler.class, numberOfCrawlers);

    // Wait for 30 seconds
    Thread.sleep(30 * 1000);

    // Send the shutdown request and then wait for finishing
    controller.shutdown();
    controller.waitUntilFinish();
}

Developer: Chaiavi, Project: Crawler4j, Lines of code: 60

Note: The edu.uci.ics.crawler4j.crawler.CrawlController.waitUntilFinish examples in this article were collected from source-code and documentation hosting platforms such as GitHub and MSDocs. The snippets come from open-source projects contributed by their authors; copyright remains with the original authors, and any distribution or use should follow each project's license. Do not reproduce without permission.
