大数据-sparkday01-模拟数据20230202

package com.producedate2hive;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.Map.Entry;

import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;

import com.bjsxt.spark.util.DateUtils;
import com.bjsxt.spark.util.StringUtils;

/**
 * Generates mock traffic-monitoring data and writes it to two local text files:
 * one with vehicle pass records and one with the monitor-to-camera mapping
 * derived from those records.
 *
 * Record layout of the flow file (tab-separated):
 * date, monitor_id, camera_id, car, action_time, speed, road_id, area_id
 */
public class Data2File {
	/** Path of the file receiving the mocked vehicle pass records. */
	public static String MONITOR_FLOW_ACTION ="./monitor_flow_action";
	/** Path of the file receiving the mocked monitor_id / camera_id pairs. */
	public static String MONITOR_CAMERA_INFO ="./monitor_camera_info";

	/**
	 * Recreates both output files and fills them with mock data.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		CreateFile(MONITOR_FLOW_ACTION);
		CreateFile(MONITOR_CAMERA_INFO);
		System.out.println("running... ...");
		mock();
		System.out.println("finished");
	}

	/**
	 * Creates an empty file at the given path, deleting any existing file first.
	 *
	 * @param pathFileName path of the file to (re)create
	 * @return true if a new file was created; false if the old file could not
	 *         be deleted, the file could not be created, or an I/O error occurred
	 */
	public static Boolean CreateFile(String pathFileName){
		try {
			File file = new File(pathFileName);
			if (file.exists() && !file.delete()) {
				System.out.println("delete file "+pathFileName+" failed!");
				return false;
			}
			boolean created = file.createNewFile();
			// Only report success when the file really was created.
			if (created) {
				System.out.println("create file "+pathFileName+" success!");
			} else {
				System.out.println("create file "+pathFileName+" failed!");
			}
			return created;
		} catch (IOException e) {
			e.printStackTrace();
		}
		return false;
	}

	/**
	 * Appends one line (content followed by a newline) to the given file,
	 * encoded as UTF-8.
	 *
	 * @param pathFileName path of the file to append to
	 * @param newContent   line content, without the trailing newline
	 */
	public static void WriteDataToFile(String pathFileName,String newContent){
		// try-with-resources closes the streams in reverse order of opening,
		// also when an exception is thrown part-way through.
		try (FileOutputStream fos = new FileOutputStream(new File(pathFileName), true);
				OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8");
				PrintWriter pw = new PrintWriter(osw)) {
			pw.write(newContent+"\n");
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Generates the mock data: writes the vehicle pass records to
	 * {@link #MONITOR_FLOW_ACTION}, then writes the monitor-to-camera mapping
	 * derived from them to {@link #MONITOR_CAMERA_INFO}.
	 */
	public static void mock() {
		Random random = new Random();
		List<Row> dataList = writeFlowActions(random);
		Map<String,Set<String>> monitorAndCameras = groupCamerasByMonitor(dataList, random);
		writeCameraInfo(monitorAndCameras);
	}

	/** Generates the pass records of 3000 mock cars, appends them to the flow file and returns them as rows. */
	private static List<Row> writeFlowActions(Random random) {
		List<Row> dataList = new ArrayList<Row>();
		String[] locations = new String[]{"鲁","京","京","京","沪","京","京","深","京","京"};
		String date = DateUtils.getTodayDate();

		// The file is opened once for the whole run instead of once per line.
		try (FileOutputStream fos = new FileOutputStream(new File(MONITOR_FLOW_ACTION), true);
				OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8");
				PrintWriter pw = new PrintWriter(osw)) {
			// Simulate 3000 cars.
			for (int i = 0; i < 3000; i++) {
				// Plate: region character + letter A-Z + number
				// (fulfuill presumably zero-pads to 5 digits -- helper not visible here).
				String car = locations[random.nextInt(locations.length)]
						+ (char)('A'+random.nextInt(26))
						+ StringUtils.fulfuill(5,random.nextInt(100000)+"");

				// Starting hour of day for this car.
				int hour = random.nextInt(24);

				// Each iteration is one pass of this car through some monitor/camera.
				// NOTE: the bound is re-drawn on every check, so the number of records
				// per car is not uniform in 1..300 and skews small; kept as-is to
				// preserve the volume of generated data.
				for (int j = 0; j < random.nextInt(300)+1; j++) {
					// After every 30 records the clock advances one hour. The hour
					// wraps at 24 so that no invalid time such as "24:05:10" is emitted.
					if (j % 30 == 0 && j != 0) {
						hour = (hour + 1) % 24;
					}

					// Time of the pass, e.g. 2017-10-01 20:09:10
					String actionTime = date + " " + StringUtils.fulfuill(hour+"") + ":"
							+ StringUtils.fulfuill(random.nextInt(60)+"") + ":"
							+ StringUtils.fulfuill(random.nextInt(60)+"");

					// 9 distinct monitor ids (0..8)
					String monitorId = StringUtils.fulfuill(4, random.nextInt(9)+"");
					// speed in 1..260
					String speed = random.nextInt(260)+1+"";
					// road id in 1..50
					String roadId = random.nextInt(50)+1+"";
					// camera id in 0..99999
					String cameraId = StringUtils.fulfuill(5, random.nextInt(100000)+"");
					// area id in 1..8
					String areaId = StringUtils.fulfuill(2,random.nextInt(8)+1+"");

					String content = date+"\t"+monitorId+"\t"+cameraId+"\t"+car+"\t"+actionTime+"\t"+speed+"\t"+roadId+"\t"+areaId;
					pw.write(content+"\n");
					dataList.add(RowFactory.create(date,monitorId,cameraId,car,actionTime,speed,roadId,areaId));
				}
			}
			// PrintWriter does not throw on write errors; surface them explicitly.
			if (pw.checkError()) {
				System.err.println("write file "+MONITOR_FLOW_ACTION+" failed!");
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
		return dataList;
	}

	/**
	 * Builds the monitor_id -> camera_id set mapping from the generated rows.
	 * Every 1000th row additionally contributes one random camera id, which
	 * need not occur in the flow data.
	 */
	private static Map<String,Set<String>> groupCamerasByMonitor(List<Row> dataList, Random random) {
		Map<String,Set<String>> monitorAndCameras = new HashMap<>();
		int index = 0;
		for (Row row : dataList) {
			// column 1 is monitor_id
			String monitorId = row.getString(1);
			Set<String> cameras = monitorAndCameras.get(monitorId);
			if (cameras == null) {
				cameras = new HashSet<>();
				monitorAndCameras.put(monitorId, cameras);
			}
			index++;
			if (index % 1000 == 0) {
				cameras.add(StringUtils.fulfuill(5, random.nextInt(100000)+""));
			}
			// column 2 is camera_id
			cameras.add(row.getString(2));
		}
		return monitorAndCameras;
	}

	/** Appends one "monitor_id TAB camera_id" line per mapping entry to the camera-info file. */
	private static void writeCameraInfo(Map<String,Set<String>> monitorAndCameras) {
		try (FileOutputStream fos = new FileOutputStream(new File(MONITOR_CAMERA_INFO), true);
				OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8");
				PrintWriter pw = new PrintWriter(osw)) {
			for (Entry<String, Set<String>> entry : monitorAndCameras.entrySet()) {
				String monitorId = entry.getKey();
				for (String cameraId : entry.getValue()) {
					pw.write(monitorId+"\t"+cameraId+"\n");
				}
			}
			if (pw.checkError()) {
				System.err.println("write file "+MONITOR_CAMERA_INFO+" failed!");
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}

模拟数据的代码

package com.spark.spark.test;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import java.util.Set;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

import com.bjsxt.spark.util.DateUtils;
import com.bjsxt.spark.util.StringUtils;


/**
 * Generates mock traffic-monitoring data and registers it as two temporary tables.
 *
 * Columns of the flow table (all strings):
 * date, monitor_id, camera_id, car, action_time, speed, road_id, area_id
 *
 * Temporary tables registered:
 * monitor_flow_action
 * monitor_camera_info
 *
 * @author Administrator
 */
public class MockData {
	/**
	 * Builds the mock vehicle pass records and the monitor-to-camera mapping
	 * derived from them, registers them as the temporary tables
	 * "monitor_flow_action" and "monitor_camera_info", and prints a sample of each.
	 *
	 * @param sc         context used to turn the generated rows into RDDs
	 * @param sqlContext context used to create the DataFrames and register the tables
	 */
	public static void mock(JavaSparkContext sc, SQLContext sqlContext) {
		List<Row> dataList = new ArrayList<Row>();
		Random random = new Random();

		String[] locations = new String[]{"鲁","京","京","京","沪","京","京","深","京","京"};
		// date, e.g. 2018-01-01
		String date = DateUtils.getTodayDate();

		// Simulate 3000 cars.
		for (int i = 0; i < 3000; i++) {
			// Plate, e.g. 京A00001
			// (fulfuill presumably zero-pads to 5 digits -- helper not visible here).
			String car = locations[random.nextInt(locations.length)]
					+ (char)('A'+random.nextInt(26))
					+ StringUtils.fulfuill(5,random.nextInt(100000)+"");

			// Starting hour of day for this car.
			int hour = random.nextInt(24);

			// Each iteration is one pass of this car through some monitor/camera.
			// NOTE: the bound is re-drawn on every check, so the number of records
			// per car is not uniform in 1..300 and skews small; kept as-is to
			// preserve the volume of generated data.
			for (int j = 0; j < (random.nextInt(300)+1); j++) {
				// After every 30 records the clock advances one hour. The hour
				// wraps at 24 so that no invalid time such as "24:05:10" is produced.
				if (j % 30 == 0 && j != 0) {
					hour = (hour + 1) % 24;
				}

				// Time of the pass, e.g. 2018-01-01 20:09:10
				String actionTime = date + " " + StringUtils.fulfuill(hour+"") + ":"
						+ StringUtils.fulfuill(random.nextInt(60)+"") + ":"
						+ StringUtils.fulfuill(random.nextInt(60)+"");

				// 9 distinct monitor ids (0..8), padded to 4 characters
				String monitorId = StringUtils.fulfuill(4, random.nextInt(9)+"");
				// speed in 1..260
				String speed = (random.nextInt(260)+1)+"";
				// road id in 1..50
				String roadId = random.nextInt(50)+1+"";
				// camera id in 0..99999
				String cameraId = StringUtils.fulfuill(5, random.nextInt(100000)+"");
				// area id in 1..8
				String areaId = StringUtils.fulfuill(2,random.nextInt(8)+1+"");

				dataList.add(RowFactory.create(date,monitorId,cameraId,car,actionTime,speed,roadId,areaId));
			}
		}

		JavaRDD<Row> rowRdd = sc.parallelize(dataList);

		StructType cameraFlowSchema = DataTypes.createStructType(Arrays.asList(
				DataTypes.createStructField("date", DataTypes.StringType, true),
				DataTypes.createStructField("monitor_id", DataTypes.StringType, true),
				DataTypes.createStructField("camera_id", DataTypes.StringType, true),
				DataTypes.createStructField("car", DataTypes.StringType, true),
				DataTypes.createStructField("action_time", DataTypes.StringType, true),
				DataTypes.createStructField("speed", DataTypes.StringType, true),
				DataTypes.createStructField("road_id", DataTypes.StringType, true),
				DataTypes.createStructField("area_id", DataTypes.StringType, true)
				));

		DataFrame df = sqlContext.createDataFrame(rowRdd, cameraFlowSchema);

		// show() prints the first 20 rows by default
		System.out.println("----打印 车辆信息数据----");
		df.show();
		df.registerTempTable("monitor_flow_action");

		// monitorAndCameras: key = monitor_id, value = set of camera_id.
		// This is the reference table of which cameras belong to which monitor,
		// derived from the generated flow data.
		Map<String,Set<String>> monitorAndCameras = new HashMap<>();

		int index = 0;
		for (Row row : dataList) {
			// column 1 is monitor_id
			String monitorId = row.getString(1);
			Set<String> cameras = monitorAndCameras.get(monitorId);
			if (cameras == null) {
				cameras = new HashSet<>();
				monitorAndCameras.put(monitorId, cameras);
			}
			// Every 1000th row adds one extra random camera id to the reference
			// table. Such a camera need not appear in the flow data, which lets
			// the data show monitors with cameras that recorded nothing.
			index++;
			if (index % 1000 == 0) {
				cameras.add(StringUtils.fulfuill(5, random.nextInt(100000)+""));
			}
			// column 2 is camera_id
			cameras.add(row.getString(2));
		}

		// A separate list is used here rather than clearing and refilling dataList:
		// the first RDD was created from dataList and may still reference it, so
		// mutating that list could change the contents of monitor_flow_action.
		List<Row> monitorRows = new ArrayList<Row>();
		for (Entry<String, Set<String>> entry : monitorAndCameras.entrySet()) {
			String monitorId = entry.getKey();
			for (String cameraId : entry.getValue()) {
				monitorRows.add(RowFactory.create(monitorId,cameraId));
			}
		}

		StructType monitorSchema = DataTypes.createStructType(Arrays.asList(
				DataTypes.createStructField("monitor_id", DataTypes.StringType, true),
				DataTypes.createStructField("camera_id", DataTypes.StringType, true)
				));

		JavaRDD<Row> monitorRdd = sc.parallelize(monitorRows);

		DataFrame monitorDF = sqlContext.createDataFrame(monitorRdd, monitorSchema);
		monitorDF.registerTempTable("monitor_camera_info");
		System.out.println("----打印 卡扣号对应摄像头号 数据----");
		monitorDF.show();
	}
}














package com.spark.spark.test;

import java.util.Properties;
import java.util.Random;

import com.bjsxt.spark.util.DateUtils;
import com.bjsxt.spark.util.StringUtils;

import kafka.javaapi.producer.Producer;
import kafka.producer.KeyedMessage;
import kafka.producer.ProducerConfig;

/**
 * Thread that continuously sends mock vehicle pass records to the Kafka topic
 * "RoadRealTimeLog", roughly one record every 50 ms, until it is interrupted.
 *
 * Record layout (tab-separated):
 * date, monitor_id, camera_id, car, action_time, speed, road_id, area_id
 */
public class MockRealTimeData extends Thread {

	private static final Random random = new Random();
	private static final String[] locations = new String[]{"鲁","京","京","京","沪","京","京","深","京","京"};
	private final Producer<String, String> producer;

	/** Creates the thread together with its Kafka producer. */
	public MockRealTimeData() {
		producer = new Producer<String, String>(createProducerConfig());
	}

	/** Builds the producer configuration: string serializer and the broker list. */
	private ProducerConfig createProducerConfig() {
		Properties props = new Properties();
		props.put("serializer.class", "kafka.serializer.StringEncoder");
		props.put("metadata.broker.list", "node1:9092,node2:9092,node3:9092");
		return new ProducerConfig(props);
	}

	/**
	 * Generates and sends one mock record per iteration until the thread is
	 * interrupted, then closes the producer.
	 */
	@Override
	public void run() {
		try {
			while (!Thread.currentThread().isInterrupted()) {
				String date = DateUtils.getTodayDate();
				// Hour of day in 0..23. It was previously drawn as nextInt(24)+1,
				// which produced the invalid hour "24" and never produced "00".
				String baseActionTime = date + " " + StringUtils.fulfuill(random.nextInt(24)+"");
				String actionTime = baseActionTime + ":" + StringUtils.fulfuill(random.nextInt(60)+"") + ":" + StringUtils.fulfuill(random.nextInt(60)+"");
				// 9 distinct monitor ids (0..8)
				String monitorId = StringUtils.fulfuill(4, random.nextInt(9)+"");
				// NOTE(review): the car number, speed and camera id below use slightly
				// different bounds than the batch generators (99999 / 260 without +1 /
				// 9999); left unchanged -- confirm whether that is intended.
				String car = locations[random.nextInt(locations.length)] + (char)('A'+random.nextInt(26))+StringUtils.fulfuill(5,random.nextInt(99999)+"");
				String speed = random.nextInt(260)+"";
				String roadId = random.nextInt(50)+1+"";
				String cameraId = StringUtils.fulfuill(5, random.nextInt(9999)+"");
				// Area id in 1..8, matching the batch mock data (previously 0..7).
				String areaId = StringUtils.fulfuill(2,random.nextInt(8)+1+"");
				producer.send(new KeyedMessage<String, String>("RoadRealTimeLog", date+"\t"+monitorId+"\t"+cameraId+"\t"+car + "\t" + actionTime + "\t" + speed + "\t" + roadId + "\t" + areaId));

				try {
					Thread.sleep(50);
				} catch (InterruptedException e) {
					// Restore the interrupt flag so the loop condition ends the thread.
					Thread.currentThread().interrupt();
				}
			}
		} finally {
			producer.close();
		}
	}

	/**
	 * Starts the Kafka producer thread.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		MockRealTimeData producer = new MockRealTimeData();
		producer.start();
	}

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值