生成合成数据集(synthetic DataSet)的程序

原创 2012年03月23日 09:07:55

版权(Copyright)属于论文 The Skyline Operator 的作者,该论文的文献信息如下:


@inproceedings{DBLP:conf/icde/BorzsonyiKS01,
  author    = {Stephan B{\"o}rzs{\"o}nyi and
               Donald Kossmann and
               Konrad Stocker},
  title     = {The Skyline Operator},
  booktitle = {ICDE},
  year      = {2001},
  pages     = {421-430},
  ee        = {http://doi.ieeecomputersociety.org/10.1109/ICDE.2001.914855},
  crossref  = {DBLP:conf/icde/2001},
  bibsource = {DBLP, http://dblp.uni-trier.de}
}

@proceedings{DBLP:conf/icde/2001,
  editor    = {Dimitrios Georgakopoulos and
               Alexander Buchmann},
  title     = {Proceedings of the 17th International Conference on Data
               Engineering, April 2-6, 2001, Heidelberg, Germany},
  publisher = {IEEE Computer Society},
  year      = {2001},
  isbn      = {0-7695-1001-9},
  bibsource = {DBLP, http://dblp.uni-trier.de}
}



需要使用g++编译器进行编译

// When defined, main() reads dimension, center count, distribution and output
// path from the command line instead of keeping its built-in defaults.
#define FOR_RELEASE

// config.h is pulled in only when the build defines HAVE_CONFIG_H.
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <assert.h>
#include <string.h>

#include <climits>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Bounds of the coordinate range used by the generators: uniform draws are
// taken from [LOWER, UPPER], and the correlated / anti-correlated generators
// reject any point with a coordinate outside [LOWER, UPPER).
static const double UPPER = 0.95;
static const double LOWER = 0.05;

// Overridable at compile time (e.g. -DRADIUS_BOUND=...). Not referenced by the
// code visible in this file.
#ifndef RADIUS_BOUND
#define RADIUS_BOUND 0.05
#endif

// Factor main() multiplies every generated coordinate by before writing it out.
#define DOMAIN 10000

using namespace std;

/**
 * Draws one pseudo-random value between min and max (both ends reachable).
 *
 * Consumes exactly one rand() call; seeding is left to the caller.
 *
 * @param min value returned when rand() yields 0
 * @param max value returned when rand() yields RAND_MAX
 * @return min + u * (max - min) with u = rand() / RAND_MAX
 */
double RandomEqual(double min,double max)
{
	const double span = max - min;
	const double unit = static_cast<double>(rand()) / RAND_MAX;
	return min + unit * span;
}

/**
 * Averages `dim` uniform draws from [0, 1] and maps the mean onto [min, max].
 *
 * Consumes `dim` calls to RandomEqual (none when dim <= 0; the division by
 * `dim` is still performed in that case).
 *
 * @param min value the result takes when the mean is 0
 * @param max value the result takes when the mean is 1
 * @param dim number of uniform draws to average
 * @return min + mean * (max - min)
 */
double RandomPeak(double min,double max,int dim)
{
	double total = 0.0;
	for (int left = dim; left > 0; --left)
	{
		total += RandomEqual(0,1);
	}

	const double mean = total / dim;
	return min + mean * (max - min);
}

/**
 * Returns a value in [med - var, med + var] obtained by averaging 12 uniform
 * draws (delegates to RandomPeak).
 *
 * @param med center of the output interval
 * @param var half-width of the output interval
 */
double RandomNormal(double med,double var)
{
	const double low = med - var;
	const double high = med + var;
	return RandomPeak(low, high, 12);
}

/**
 * Rounds v up to a multiple of s.
 *
 * The quotient v / s is truncated toward zero and bumped by one whenever the
 * truncation dropped a positive fraction, so values already on a multiple are
 * returned unchanged.
 *
 * @param v value to snap
 * @param s grid spacing (the quotient must fit in a long)
 * @return the snapped value, i.e. (rounded-up quotient) * s
 */
double loc(double v, double s)
{
	const double ratio = v / s;
	long cells = static_cast<long>(ratio);
	if (ratio > static_cast<double>(cells))
	{
		++cells;
	}
	return static_cast<double>(cells) * s;
}

/**
 * Fills `center` with Count points whose coordinates are drawn independently
 * from [LOWER, UPPER] via RandomEqual.
 *
 * @param center     Count rows, each with room for Dimensions doubles,
 *                   allocated by the caller; every entry is overwritten
 * @param Dimensions number of coordinates per point
 * @param Count      number of points to generate
 * @param DupRatio   1 stores the raw draws; any other value rounds each
 *                   coordinate up to a multiple of DupRatio / Count via loc()
 */
void GenerateDataEqually(double** center, int Dimensions, int Count, int DupRatio)
{
	// Grid spacing; only consulted when DupRatio != 1.
	const double step = 1.0 / (static_cast<double>(Count) / static_cast<double>(DupRatio));
	const bool snapToGrid = (DupRatio != 1);

	for (int i = 0; i < Count; ++i)
	{
		for (int d = 0; d < Dimensions; ++d)
		{
			// Each draw goes straight into the output row. The previous
			// variable-length scratch array was a compiler extension, not
			// standard C++, and its contents were never read back.
			const double value = RandomEqual(LOWER, UPPER);
			center[i][d] = snapToGrid ? loc(value, step) : value;
		}
	}
}

/**
 * Fills `center` with Count correlated points.
 *
 * Every point starts with all coordinates equal to one RandomPeak draw; then,
 * for each dimension d, a RandomNormal offset is added to coordinate d and
 * subtracted from coordinate (d + 1) % Dimensions. A candidate is discarded
 * and redrawn from scratch if any coordinate leaves [LOWER, UPPER).
 *
 * @param center     Count rows, each with room for Dimensions doubles,
 *                   allocated by the caller; every entry is overwritten
 * @param Dimensions number of coordinates per point
 * @param Count      number of points to generate
 * @param DupRatio   1 stores the accepted coordinates as-is; any other value
 *                   rounds each one up to a multiple of DupRatio / Count via loc()
 */
void GenerateDataCorrelated(double** center, int Dimensions, int Count, int DupRatio)
{
	// Grid spacing; only consulted when DupRatio != 1.
	const double step = 1.0 / (static_cast<double>(Count) / static_cast<double>(DupRatio));
	const double middle = (LOWER + UPPER) / 2;

	for (long i = 0; i < Count; ++i)
	{
		// The candidate is built directly in the output row, which replaces the
		// former variable-length scratch array (a compiler extension, not
		// standard C++). Rejected candidates are simply overwritten.
		double* x = center[i];

		bool accepted = false;
		while (!accepted)
		{
			const double v = RandomPeak(LOWER, UPPER, Dimensions);
			for (int d = 0; d < Dimensions; ++d) x[d] = v;

			// NOTE(review): an earlier revision written for the [0, 1) domain used
			// l = min(v, 1 - v) and RandomNormal(0, l). The LOWER/UPPER forms here
			// are not an exact translation of that; confirm the intended spread.
			// The rejection test below keeps the output in range either way.
			const double l = (v <= middle) ? v : UPPER - v;
			for (int d = 0; d < Dimensions; ++d)
			{
				const double h = RandomNormal(LOWER, l);
				x[d] += h;
				x[(d + 1) % Dimensions] -= h;
			}

			accepted = true;
			for (int d = 0; d < Dimensions; ++d)
			{
				if (x[d] < LOWER || x[d] >= UPPER)
				{
					accepted = false;
					break;
				}
			}
		}

		if (DupRatio != 1)
		{
			for (int d = 0; d < Dimensions; ++d)
			{
				x[d] = loc(x[d], step);
			}
		}
	}
}

/**
 * Fills `center` with Count anti-correlated points.
 *
 * Every point starts with all coordinates equal to one RandomNormal draw
 * centered on (LOWER + UPPER) / 2; then, for each dimension d, a uniform offset
 * from [-l, l] is added to coordinate d and subtracted from coordinate
 * (d + 1) % Dimensions. A candidate is discarded and redrawn from scratch if
 * any coordinate leaves [LOWER, UPPER).
 *
 * @param center     Count rows, each with room for Dimensions doubles,
 *                   allocated by the caller; every entry is overwritten
 * @param Dimensions number of coordinates per point
 * @param Count      number of points to generate
 * @param DupRatio   1 stores the accepted coordinates as-is; any other value
 *                   rounds each one up to a multiple of DupRatio / Count via loc()
 */
void GenerateDataAnticorrelated(double** center, int Dimensions, int Count, int DupRatio)
{
	// Grid spacing; only consulted when DupRatio != 1.
	const double step = 1.0 / (static_cast<double>(Count) / static_cast<double>(DupRatio));
	const double middle = (LOWER + UPPER) / 2;

	for (long i = 0; i < Count; ++i)
	{
		// The candidate is built directly in the output row, which replaces the
		// former variable-length scratch array (a compiler extension, not
		// standard C++). Rejected candidates are simply overwritten.
		double* x = center[i];

		bool accepted = false;
		while (!accepted)
		{
			const double v = RandomNormal(middle, (LOWER + UPPER) / 4);
			for (int d = 0; d < Dimensions; ++d) x[d] = v;

			// NOTE(review): an earlier revision written for the [0, 1) domain used
			// l = min(v, 1 - v); confirm that v / UPPER - v is the intended
			// adaptation. The rejection test below keeps the output in range.
			const double l = (v <= middle) ? v : UPPER - v;
			for (int d = 0; d < Dimensions; ++d)
			{
				const double h = RandomEqual(-l, l);
				x[d] += h;
				x[(d + 1) % Dimensions] -= h;
			}

			accepted = true;
			for (int d = 0; d < Dimensions; ++d)
			{
				if (x[d] < LOWER || x[d] >= UPPER)
				{
					accepted = false;
					break;
				}
			}
		}

		if (DupRatio != 1)
		{
			for (int d = 0; d < Dimensions; ++d)
			{
				x[d] = loc(x[d], step);
			}
		}
	}
}

/**
 * Validates the request and fills `center` using the generator selected by
 * `Distribution`.
 *
 * On invalid input a message is printed to standard output and `center` is
 * left untouched. The random generator is re-seeded from the current time
 * right before generating.
 *
 * @param center       Count rows, each with room for Dimensions doubles,
 *                     allocated by the caller
 * @param Dimensions   coordinates per point; must be at least 2
 * @param Distribution 'E'/'e' (equal), 'C'/'c' (correlated) or
 *                     'A'/'a' (anti-correlated)
 * @param Count        number of points; must be greater than 0
 */
void GenerateCenter(double** center, int Dimensions, char Distribution, int Count)
{
	if (Count <= 0) {
		std::cout << "Amount should be greater than 0" << std::endl;
		return;
	}
	if (Dimensions < 2) {
		// The check accepts exactly 2, so the message says "at least".
		std::cout << "Dimension should be at least 2" << std::endl;
		return;
	}

	typedef void (*Generator)(double**, int, int, int);
	Generator generate = NULL;
	switch (Distribution) {
		case 'E':
		case 'e': generate = GenerateDataEqually; break;
		case 'C':
		case 'c': generate = GenerateDataCorrelated; break;
		case 'A':
		case 'a': generate = GenerateDataAnticorrelated; break;
		default: std::cout << "Unknown data distribution error." << std::endl; return;
	}

	srand(static_cast<unsigned>(time(NULL)));
	// DupRatio is fixed to 1: coordinates are stored without grid snapping.
	generate(center, Dimensions, Count, 1);
}


/**
 * Generates the group centers and writes them to
 * "<path>/<dimension>d_<distribution>_<count>.txt".
 *
 * Command line (read only when FOR_RELEASE is defined):
 *   argv[1]: dimension (at least 2)
 *   argv[2]: number of centers to generate (greater than 0)
 *   argv[3]: distribution, first character E | C | A (either case)
 *   argv[4]: directory the output file is written to
 *
 * File format: a first line "<count> <dimension>", then one line per center
 * holding its coordinates multiplied by DOMAIN, in fixed notation.
 *
 * @return EXIT_SUCCESS once the file is written; EXIT_FAILURE on missing or
 *         invalid arguments or when the file cannot be opened or written
 */
int main(int argc, char *argv[])
{
	int dimension = 4;
	long groupNo = 5000;
	char type = 'E';
	std::string path = "/home/yingz/work/SO/exp/center";

#ifdef FOR_RELEASE
	if (argc <= 4)
	{
		std::cerr << "data generator for pskyline" << std::endl
				<< "Usage: " << argv[0] << std::endl
				<< "  dimension: " << std::endl
				<< "  the number of centers to be generated: " << std::endl
				<< "  distribution for the centers of groups (E(qually) | C(orrelated) | A(nti-correlated)): " << std::endl
				<< "  path " << std::endl;
		// Stop here: carrying on would silently use the built-in defaults.
		return EXIT_FAILURE;
	}

	dimension = atoi(argv[1]);
	groupNo = atol(argv[2]);
	type = argv[3][0];
	path = argv[4];
#endif

	// GenerateCenter reports bad input but returns nothing, so validate up
	// front; otherwise a rejected request would still produce an output file.
	if (dimension < 2)
	{
		std::cerr << "dimension must be at least 2" << std::endl;
		return EXIT_FAILURE;
	}
	if (groupNo <= 0 || groupNo > INT_MAX)
	{
		std::cerr << "the number of centers must be between 1 and " << INT_MAX << std::endl;
		return EXIT_FAILURE;
	}
	switch (type)
	{
		case 'E': case 'e':
		case 'C': case 'c':
		case 'A': case 'a':
			break;
		default:
			std::cerr << "distribution must be E, C or A" << std::endl;
			return EXIT_FAILURE;
	}

	// Built with a stream so that neither the path length nor the width of
	// groupNo can overflow a fixed buffer or mismatch a printf conversion.
	std::ostringstream nameBuilder;
	nameBuilder << path << '/' << dimension << "d_" << type << '_' << groupNo << ".txt";
	const std::string filename = nameBuilder.str();

	std::ofstream out(filename.c_str(), std::ios_base::trunc);
	if (!out.is_open())
	{
		std::cerr << "cannot open output file: " << filename << std::endl;
		return EXIT_FAILURE;
	}

	out.setf(std::ios::fixed, std::ios::floatfield);
	out << groupNo << " " << dimension << '\n';

	srand(static_cast<unsigned>(time(NULL)));

	// One contiguous buffer plus a table of row pointers gives GenerateCenter
	// the double** it expects while the vectors own (and release) the memory.
	const int rows = static_cast<int>(groupNo);
	std::vector<double> storage(static_cast<std::size_t>(rows) * static_cast<std::size_t>(dimension));
	std::vector<double*> center(static_cast<std::size_t>(rows));
	for (int i = 0; i < rows; ++i)
	{
		center[i] = &storage[static_cast<std::size_t>(i) * static_cast<std::size_t>(dimension)];
	}

	GenerateCenter(&center[0], dimension, type, rows);

	for (int i = 0; i < rows; ++i)
	{
		for (int d = 0; d < dimension; ++d)
		{
			out << center[i][d] * DOMAIN << " ";
		}
		out << '\n';
	}

	out.close();
	if (out.fail())
	{
		std::cerr << "failed to write output file: " << filename << std::endl;
		return EXIT_FAILURE;
	}

	std::cout << "generation complete" << std::endl;

	return EXIT_SUCCESS;
}
编译完成后运行 ./a.out 2 100 E .(四个参数依次为:维度、要生成的中心点个数、分布类型 E/C/A、输出目录;本例会在当前目录生成 2d_E_100.txt)

2D15数据集

  • 2015年06月08日 19:25
  • 30KB
  • 下载

各种数据集汇总

转:http://bbs.w3china.org/blog/more.asp?name=idmer&id=24017 大家做数据挖掘研究时,常常为找不到合适的数据而发愁。在KDNuggets上有...
  • u010062397
  • u010062397
  • 2015年11月05日 10:46
  • 4363

论文阅读:Synthetic Data for Text Localisation in Natural Images

重新把注意力放到了 自然场景文本检测与识别 上了。已经从这块方向离开了一年多了。再回来,已经物是人非。都不像以前那么玩了,论文赶紧看起来。 上次阅读完 Reading Text in the Wild...
  • u010167269
  • u010167269
  • 2016年08月31日 22:24
  • 5369

推荐|caffe-orc主流ocr算法:CNN+BLSTM+CTC架构实现!

——免费加入AI技术专家社群>> ——免费加入AI高管投资者群>> 简介 caffe_ocr是一个对现有主流ocr算法研究实验性的项目,目前实现了CNN+BLSTM+CTC的识别架构...
  • CygqjBABx875u
  • CygqjBABx875u
  • 2017年10月29日 00:00
  • 510

100款机器学习数据集

Kaggle 书籍推荐数据集(goodreads/上万图书/百万评价)【Kaggle】 https://www.kaggle.com/zygmunt/goodbooks-10k...
  • Touch_Dream
  • Touch_Dream
  • 2017年09月05日 08:23
  • 1885

产生synthetic DataSet的数据集

Copyright 属于 The Skyline Operator的作者 @inproceedings{DBLP:conf/icde/BorzsonyiKS01, author ...
  • com_stu_zhang
  • com_stu_zhang
  • 2012年03月23日 09:07
  • 1734

Java中的synthetic

转自:http://www.linuxidc.com/Linux/2010-05/26396.htm 有synthetic标记的field和method是class内部使用的,正常的源代码里不会出现...
  • wonengxing
  • wonengxing
  • 2014年06月20日 15:02
  • 3257

Multiview RGB-D Dataset for Object Instance Detection

大致译文,可以看看,参考一下。转载请说明 摘要 基于9个厨房场景,本文提供了一个新的多视角RGB-D数据集。场景中的物体有一部分来自于BigBird数据集。Scenes被密集采取,使用boundi...
  • h1418792278
  • h1418792278
  • 2017年03月02日 16:22
  • 619

使用 matlab 数据集的生成(generate datasets)

一般手工生成的数据集(artificial datasets),通常用于实验部分最开始的演示和示意,用于对结果的一种精确计算和量化分析。1. Swiss/Helix/Twinpeaks/Broken ...
  • lanchunhui
  • lanchunhui
  • 2016年11月30日 11:49
  • 2280

Dataset导入string变量

 在工作中有个小任务,要把某个格式的XML以表格的形式显示出来,思路很简单,就是把XML文件导入Dataset 即可。可当我想XML文件内容读到一个string变量内,再把这个变量到到dataset对...
  • sycyb
  • sycyb
  • 2007年01月04日 11:33
  • 836
内容举报
返回顶部
收藏助手
不良信息举报
您举报文章:产生synthetic DataSet的数据集
举报原因:
原因补充:

(最多只允许输入30个字)