Java CrawlController.waitUntilFinish Method Code Examples

This article collects typical usage examples of the Java method edu.uci.ics.crawler4j.crawler.CrawlController.waitUntilFinish. If you are wondering what CrawlController.waitUntilFinish does, how to call it, or what it looks like in real code, the curated examples below should help. You can also explore further usage examples of its enclosing class, edu.uci.ics.crawler4j.crawler.CrawlController.

Five code examples of CrawlController.waitUntilFinish are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code samples.
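waitUntilFinish() blocks the calling thread until every crawler thread started by the CrawlController has terminated. It is typically paired with startNonBlocking(), which returns immediately so the caller can sleep, monitor progress, or request a shutdown before waiting. The following is a minimal sketch of that pattern, assuming crawler4j's standard setters and a WebCrawler subclass named MyCrawler (as in Example 1 below); the storage folder and seed URL are placeholders, not taken from the projects listed here.

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class WaitUntilFinishDemo {
    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder("/tmp/crawl");   // placeholder folder for intermediate data

        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtServer robotstxtServer = new RobotstxtServer(new RobotstxtConfig(), pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

        controller.addSeed("http://www.ics.uci.edu/");

        // startNonBlocking returns at once; the crawl runs on background threads.
        controller.startNonBlocking(MyCrawler.class, 2);

        // The calling thread is free here (e.g. to monitor or call controller.shutdown()).

        // Block until all crawler threads have finished.
        controller.waitUntilFinish();
        System.out.println("Crawl finished.");
    }
}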

Example 1: execute

Likes: 2

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the class/package this method depends on

public static void execute() throws Exception {
    urlMap = GetURL.getAllUrl();

    String crawlStorageFolder = "/data/crawl/root";
    // Number of concurrent crawler threads
    int numberOfCrawlers = 2;

    CrawlConfig config = new CrawlConfig();
    // Folder where intermediate crawl data is stored
    config.setCrawlStorageFolder(crawlStorageFolder);
    // Maximum crawl depth
    config.setMaxDepthOfCrawling(0);
    // Whether to crawl pages with binary content
    config.setIncludeBinaryContentInCrawling(false);
    // Be polite: wait at least 200 ms (the default) between requests to avoid overloading the server
    config.setPolitenessDelay(200);
    // Resumable crawling (restart an interrupted crawl)
    //config.setResumableCrawling(true);

    // Initialize the crawl controller
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    /*
     * Add the seed pages for the crawler; links discovered on each page become new crawl targets.
     * Here the URLs to crawl are loaded from the database and added to the seed list.
     */
    // note: check whether map.values and map.keySet iterate in the same order
    for (String url : urlMap.keySet()) {
        controller.addSeed(url);
    }

    /*
     * Start the crawl in non-blocking mode: startNonBlocking returns immediately,
     * and waitUntilFinish() below blocks until crawling is finished.
     */
    controller.startNonBlocking(MyCrawler.class, numberOfCrawlers);
    // Pause 1 second to avoid being blocked by the site for crawling too aggressively
    Thread.sleep(1000);
    controller.waitUntilFinish();
}

Developer: wrayzheng, Project: webpage-update-subscribe, Lines of code: 54
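Example 1 passes MyCrawler.class to startNonBlocking but does not show that class. Below is a minimal sketch of such a WebCrawler subclass, assuming the older crawler4j API used on this page, where shouldVisit takes only a WebURL (newer versions add a referring Page parameter); the hard-coded domain filter and the println are illustrative, not taken from the original project.

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class MyCrawler extends WebCrawler {

    // Decide whether a discovered link should be fetched; here only links on one site are kept.
    @Override
    public boolean shouldVisit(WebURL url) {
        return url.getURL().toLowerCase().startsWith("http://www.ics.uci.edu/");
    }

    // Called once for every page that was successfully fetched and parsed.
    @Override
    public void visit(Page page) {
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            System.out.println(page.getWebURL().getURL()
                    + " (" + htmlParseData.getOutgoingUrls().size() + " outgoing links)");
        }
    }
}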

Example 2: main

Likes: 2

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the class/package this method depends on

public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        System.out.println("Needed parameter: ");
        System.out.println("\t rootFolder (it will contain intermediate crawl data)");
        return;
    }

    /*
     * crawlStorageFolder is a folder where intermediate crawl data is stored.
     */
    String crawlStorageFolder = args[0];

    CrawlConfig config1 = new CrawlConfig();
    CrawlConfig config2 = new CrawlConfig();

    /*
     * The two crawlers should have different storage folders for their intermediate data.
     */
    config1.crawlStorageFolder_$eq(crawlStorageFolder + "/crawler1");
    config2.crawlStorageFolder_$eq(crawlStorageFolder + "/crawler2");

    config1.politenessDelay_$eq(1000);
    config2.politenessDelay_$eq(2000);

    config1.maxPagesToFetch_$eq(50);
    config2.maxPagesToFetch_$eq(100);

    /*
     * We will use different PageFetchers for the two crawlers.
     */
    PageFetcher pageFetcher1 = new PageFetcher(config1);
    PageFetcher pageFetcher2 = new PageFetcher(config2);

    /*
     * We will use the same RobotsTxtServer for both of the crawlers.
     */
    RobotsTxtConfig robotstxtConfig = new RobotsTxtConfig();
    RobotsTxtServer robotstxtServer = new RobotsTxtServer(robotstxtConfig, pageFetcher1);

    CrawlController controller1 = new CrawlController(config1, pageFetcher1, robotstxtServer);
    CrawlController controller2 = new CrawlController(config2, pageFetcher2, robotstxtServer);

    String[] crawler1Domains = new String[] { "http://www.ics.uci.edu/", "http://www.cnn.com/" };
    String[] crawler2Domains = new String[] { "http://en.wikipedia.org/" };

    controller1.setCustomData(crawler1Domains);
    controller2.setCustomData(crawler2Domains);

    controller1.addSeed("http://www.ics.uci.edu/");
    controller1.addSeed("http://www.cnn.com/");
    controller1.addSeed("http://www.ics.uci.edu/~lopes/");
    controller1.addSeed("http://www.cnn.com/POLITICS/");

    controller2.addSeed("http://en.wikipedia.org/wiki/Main_Page");
    controller2.addSeed("http://en.wikipedia.org/wiki/Obama");
    controller2.addSeed("http://en.wikipedia.org/wiki/Bing");

    /*
     * The first crawler will have 5 concurrent threads and the second crawler will have 7 threads.
     */
    controller1.startNonBlocking(BasicCrawler.class, 5);
    controller2.startNonBlocking(BasicCrawler.class, 7);

    controller1.waitUntilFinish();
    System.out.println("Crawler 1 is finished.");

    controller2.waitUntilFinish();
    System.out.println("Crawler 2 is finished.");
}

Developer: sapienapps, Project: scrawler, Lines of code: 73
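Example 2 (and Example 4 later on) hands each controller an array of allowed domains via setCustomData, but never shows how the crawler reads it back. A sketch of that, assuming the older crawler4j API in which WebCrawler.getMyController().getCustomData() is still available (it was deprecated and later removed in favour of per-crawler factories), might look like this; the class body is illustrative, not taken from the original project.

import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

public class BasicCrawler extends WebCrawler {

    // Read back the String[] of domains that the controller received via setCustomData(...)
    // and only visit URLs that start with one of them.
    @Override
    public boolean shouldVisit(WebURL url) {
        String[] myDomains = (String[]) getMyController().getCustomData();
        String href = url.getURL().toLowerCase();
        for (String domain : myDomains) {
            if (href.startsWith(domain)) {
                return true;
            }
        }
        return false;
    }
}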

Example 3: main

Likes: 2

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the class/package this method depends on

public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        System.out.println("Needed parameters: ");
        System.out.println("\t rootFolder (it will contain intermediate crawl data)");
        System.out.println("\t numberOfCrawlers (number of concurrent threads)");
        return;
    }

    /*
     * crawlStorageFolder is a folder where intermediate crawl data is stored.
     */
    String crawlStorageFolder = args[0];

    /*
     * numberOfCrawlers shows the number of concurrent threads that should be initiated for crawling.
     */
    int numberOfCrawlers = Integer.parseInt(args[1]);

    CrawlConfig config = new CrawlConfig();
    config.crawlStorageFolder_$eq(crawlStorageFolder);
    config.politenessDelay_$eq(1000);

    // Unlimited number of pages can be crawled.
    config.maxPagesToFetch_$eq(-1);

    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotsTxtConfig robotstxtConfig = new RobotsTxtConfig();
    RobotsTxtServer robotstxtServer = new RobotsTxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    /*
     * For each crawl, you need to add some seed URLs. These are the first
     * URLs that are fetched; the crawler then starts following links found in these pages.
     */
    controller.addSeed("http://www.ics.uci.edu/~welling/");
    controller.addSeed("http://www.ics.uci.edu/~lopes/");
    controller.addSeed("http://www.ics.uci.edu/");

    /*
     * Start the crawl in non-blocking mode so that it can be shut down from this thread.
     */
    controller.startNonBlocking(BasicCrawler.class, numberOfCrawlers);

    // Wait for 30 seconds
    Thread.sleep(30 * 1000);

    // Send the shutdown request and then wait for finishing
    controller.shutdown();
    controller.waitUntilFinish();
}

Developer: sapienapps, Project: scrawler, Lines of code: 60

Example 4: main

Likes: 2

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the class/package this method depends on

public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        System.out.println("Needed parameter: ");
        System.out.println("\t rootFolder (it will contain intermediate crawl data)");
        return;
    }

    /*
     * crawlStorageFolder is a folder where intermediate crawl data is stored.
     */
    String crawlStorageFolder = args[0];

    CrawlConfig config1 = new CrawlConfig();
    CrawlConfig config2 = new CrawlConfig();

    /*
     * The two crawlers should have different storage folders for their intermediate data.
     */
    config1.setCrawlStorageFolder(crawlStorageFolder + "/crawler1");
    config2.setCrawlStorageFolder(crawlStorageFolder + "/crawler2");

    config1.setPolitenessDelay(1000);
    config2.setPolitenessDelay(2000);

    config1.setMaxPagesToFetch(50);
    config2.setMaxPagesToFetch(100);

    /*
     * We will use different PageFetchers for the two crawlers.
     */
    PageFetcher pageFetcher1 = new PageFetcher(config1);
    PageFetcher pageFetcher2 = new PageFetcher(config2);

    /*
     * We will use the same RobotstxtServer for both of the crawlers.
     */
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher1);

    CrawlController controller1 = new CrawlController(config1, pageFetcher1, robotstxtServer);
    CrawlController controller2 = new CrawlController(config2, pageFetcher2, robotstxtServer);

    String[] crawler1Domains = new String[] { "http://www.ics.uci.edu/", "http://www.cnn.com/" };
    String[] crawler2Domains = new String[] { "http://en.wikipedia.org/" };

    controller1.setCustomData(crawler1Domains);
    controller2.setCustomData(crawler2Domains);

    controller1.addSeed("http://www.ics.uci.edu/");
    controller1.addSeed("http://www.cnn.com/");
    controller1.addSeed("http://www.ics.uci.edu/~lopes/");
    controller1.addSeed("http://www.cnn.com/POLITICS/");

    controller2.addSeed("http://en.wikipedia.org/wiki/Main_Page");
    controller2.addSeed("http://en.wikipedia.org/wiki/Obama");
    controller2.addSeed("http://en.wikipedia.org/wiki/Bing");

    /*
     * The first crawler will have 5 concurrent threads and the second crawler will have 7 threads.
     */
    controller1.startNonBlocking(BasicCrawler.class, 5);
    controller2.startNonBlocking(BasicCrawler.class, 7);

    controller1.waitUntilFinish();
    System.out.println("Crawler 1 is finished.");

    controller2.waitUntilFinish();
    System.out.println("Crawler 2 is finished.");
}

Developer: Chaiavi, Project: Crawler4j, Lines of code: 73

Example 5: main

Likes: 2

import edu.uci.ics.crawler4j.crawler.CrawlController; // import the class/package this method depends on

public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        System.out.println("Needed parameters: ");
        System.out.println("\t rootFolder (it will contain intermediate crawl data)");
        System.out.println("\t numberOfCrawlers (number of concurrent threads)");
        return;
    }

    /*
     * crawlStorageFolder is a folder where intermediate crawl data is stored.
     */
    String crawlStorageFolder = args[0];

    /*
     * numberOfCrawlers shows the number of concurrent threads that should be initiated for crawling.
     */
    int numberOfCrawlers = Integer.parseInt(args[1]);

    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorageFolder);
    config.setPolitenessDelay(1000);

    // Unlimited number of pages can be crawled.
    config.setMaxPagesToFetch(-1);

    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    /*
     * For each crawl, you need to add some seed URLs. These are the first
     * URLs that are fetched; the crawler then starts following links found in these pages.
     */
    controller.addSeed("http://www.ics.uci.edu/~welling/");
    controller.addSeed("http://www.ics.uci.edu/~lopes/");
    controller.addSeed("http://www.ics.uci.edu/");

    /*
     * Start the crawl in non-blocking mode so that it can be shut down from this thread.
     */
    controller.startNonBlocking(BasicCrawler.class, numberOfCrawlers);

    // Wait for 30 seconds
    Thread.sleep(30 * 1000);

    // Send the shutdown request and then wait for finishing
    controller.shutdown();
    controller.waitUntilFinish();
}

Developer: Chaiavi, Project: Crawler4j, Lines of code: 60

Note: The edu.uci.ics.crawler4j.crawler.CrawlController.waitUntilFinish examples in this article were collected from source-code and documentation hosting platforms such as GitHub and MSDocs. The snippets come from open-source projects contributed by their authors; copyright remains with the original authors, and any distribution or use should follow each project's license. Do not reproduce without permission.
