Flume + Kafka + Spark Streaming + HBase

Crawler code

Monitor

package ln;

import java.io.File;

// Watches the crawler's output file and deletes it once it grows past
// roughly 100 MB, so the disk does not fill up.
public class Monitor extends Thread {
    @Override
    public void run() {
        super.run();
        File file = new File("Data.txt");
        while (true) {
            // File.length() returns the size in bytes (1 MB = 1,048,576 bytes).
            System.out.println("File size: " + file.length() + " bytes");
            if (file.length() >= 100000000) {
                if (file.delete()) {
                    System.out.println("File deleted");
                }
            }
            try {
                sleep(2000);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }
}

Catcher

package ln;

import com.alibaba.fastjson.JSONObject;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import org.apache.commons.io.FileUtils;

import java.io.File;
import java.io.IOException;

public class Catcher {
    public static void main(String[] args) throws IOException, InterruptedException {
        // Stock list URL: returns the market flag (f13) and stock code (f12) for every listing
        String stockListUrl = "http://43.push2.eastmoney.com/api/qt/clist/get?&pn=1&pz=10000&po=0&np=1&fltt=2&invt=2&fid=f12&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23&fields=f12,f13";
        WebClient webClient = new WebClient(BrowserVersion.CHROME);
        // The endpoints return plain JSON, so JavaScript and CSS processing are unnecessary.
        webClient.getOptions().setJavaScriptEnabled(false);
        webClient.getOptions().setCssEnabled(false);
        Monitor monitor = new Monitor();
        monitor.start();
        // JSON stock list
        String contentAsString = webClient.getPage(stockListUrl).getWebResponse().getContentAsString();
        Object[] objects = JSONObject.parseObject(contentAsString).getJSONObject("data").getJSONArray("diff").toArray();
        for (Object object : objects) {
            JSONObject stock = (JSONObject) object;
            String f12 = stock.getString("f12");
            String f13 = stock.getString("f13");
            String dataUrl = "http://push2his.eastmoney.com/api/qt/stock/kline/get?fields1=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13&fields2=f51,f52,f53,f54,f55,f56,f57,f58,f59,f60,f61&beg=0&end=20500101&rtntype=6&secid=" + f13 + "." + f12 + "&klt=101&fqt=1";
            // Raw kline data: one JSON document per line, appended to Data.txt
            String klineJson = webClient.getPage(dataUrl).getWebResponse().getContentAsString();
            Thread.sleep(5000);
            FileUtils.writeStringToFile(new File("Data.txt"), klineJson + "\n", "UTF-8", true);
        }
    }
}
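
The streaming job below consumes these lines from Kafka and assumes each one is a complete JSON document whose data.klines entries are comma-separated day records. A minimal standalone sketch of that contract in Scala (the sample line is hypothetical and only mirrors the fields the code above requests; fastjson is the same parser declared in the Spark pom below):

import com.alibaba.fastjson.JSON

object KlineLineDemo {
  def main(args: Array[String]): Unit = {
    // Hypothetical one-line sample, shaped like the kline endpoint's response.
    val line = """{"data":{"market":1,"code":"600000","name":"demo","klines":["2021-06-01,10.0,10.2,10.3,9.9,12345,67890,4.0,2.0,0.2,0.5"]}}"""
    val data = JSON.parseObject(line).getJSONObject("data")
    val klines = data.getJSONArray("klines")
    klines.forEach { k =>
      // Split order matches the streaming job: date, open, end, highest,
      // lowest, ts, tn, zf, zdf, zde, ch
      val fields = k.toString.split(",")
      println(s"${data.getString("code")} ${fields(0)} close=${fields(2)}")
    }
  }
}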

pom.xml

<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>org.example</groupId>
  <artifactId>Spider</artifactId>
  <version>1.0-SNAPSHOT</version>

  <name>Spider</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <!-- HtmlUnit 2.49 requires Java 8 -->
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
    <!-- crawler component -->
    <dependency>
      <groupId>net.sourceforge.htmlunit</groupId>
      <artifactId>htmlunit</artifactId>
      <version>2.49.1</version>
    </dependency>
    <!-- JSON parsing component -->
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.76</version>
    </dependency>
    <!-- file output component -->
    <dependency>
      <groupId>commons-io</groupId>
      <artifactId>commons-io</artifactId>
      <version>2.8.0</version>
    </dependency>
  </dependencies>

  <build>
    <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
      <plugins>
        <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.1.0</version>
        </plugin>
        <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.8.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.22.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
        <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
        <plugin>
          <artifactId>maven-site-plugin</artifactId>
          <version>3.7.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-project-info-reports-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>

Starting the crawler

nohup java -jar xxxx.jar > /opt/spiderlog.file  2>&1 &

Flume configuration file

The exec source tails Data.txt with tail -F, which follows the file by name, so ingestion keeps working after the Monitor thread deletes the file and the crawler recreates it. Note that the memory channel trades durability for speed: events buffered in it are lost if the agent dies.

a1.sources = r1
a1.sinks = k1
a1.channels = c1
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /home/server/Data.txt
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.kafka.topic = mytopic
a1.sinks.k1.kafka.bootstrap.servers = server1:9092,server2:9092,server3:9092
a1.sinks.k1.kafka.flumeBatchSize = 20
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1

Command to start Flume

cd /opt/flume
nohup bin/flume-ng agent --conf conf --conf-file conf/example.conf --name a1 -Dflume.root.logger=INFO,console > /opt/flumelog.file  2>&1 &

Kafka commands

# start a broker
bin/kafka-server-start.sh config/server.properties &
# create the topic and check its configuration
bin/kafka-topics.sh --create --topic mytopic --bootstrap-server localhost:9092
bin/kafka-topics.sh --describe --topic mytopic --bootstrap-server localhost:9092
# manual produce/consume, useful for smoke-testing the pipeline
bin/kafka-console-producer.sh --topic mytopic --bootstrap-server localhost:9092
bin/kafka-console-consumer.sh --topic mytopic --from-beginning --bootstrap-server localhost:9092
# list all topics
bin/kafka-topics.sh --list --bootstrap-server localhost:9092
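
Beyond the console consumer, a short programmatic check can confirm that the crawler's lines actually reach the topic. A minimal sketch, assuming the kafka-clients jar (pulled in by the Spark Kafka integration below) is on the classpath; the group id smoke-test is arbitrary:

import java.time.Duration
import java.util.{Collections, Properties}
import org.apache.kafka.clients.consumer.KafkaConsumer

object TopicSmokeTest {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "server1:9092,server2:9092,server3:9092")
    props.put("group.id", "smoke-test")
    props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    props.put("auto.offset.reset", "earliest")
    val consumer = new KafkaConsumer[String, String](props)
    consumer.subscribe(Collections.singletonList("mytopic"))
    try {
      // Poll once and print a prefix of each message.
      val records = consumer.poll(Duration.ofSeconds(5))
      records.forEach(r => println(s"offset=${r.offset()} value=${r.value().take(80)}"))
    } finally consumer.close()
  }
}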

Creating the HBase-mapped table in Hive

CREATE TABLE StockInfo(
stockkey string,
stockmarket string,
stockcode string,
stockname string,
stockdate string,
stockopen string,
stockend string,
highest string,
lowest string,
ts string,
tn string,
zf string,
zdf string,
zde string,
ch string)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
-- the mapping must be a single comma-separated string with no whitespace,
-- otherwise the storage handler cannot resolve the column families
WITH SERDEPROPERTIES ("hbase.columns.mapping" =
":key,info:stockmarket,info:stockcode,info:stockname,info:stockdate,info:stockopen,info:stockend,info:highest,info:lowest,info:ts,info:tn,info:zf,info:zdf,info:zde,info:ch")
TBLPROPERTIES ("hbase.table.name" = "StockInfo");
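
Once the streaming job has written some rows, scanning the underlying HBase table directly is a quick way to confirm that the row keys and the info family line up with this mapping. A minimal sketch using the same hbase-client dependency as the job (the quorum address matches the one in the job below):

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Scan}
import org.apache.hadoop.hbase.util.Bytes

object StockInfoScan {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "10.206.0.6")
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    val conn = ConnectionFactory.createConnection(conf)
    val table = conn.getTable(TableName.valueOf("StockInfo"))
    try {
      // Print the first ten rows: row key plus the stored stock name.
      val scanner = table.getScanner(new Scan())
      val it = scanner.iterator()
      var printed = 0
      while (it.hasNext && printed < 10) {
        val result = it.next()
        val key = Bytes.toString(result.getRow)
        val name = Bytes.toString(result.getValue(Bytes.toBytes("info"), Bytes.toBytes("stockname")))
        println(s"$key -> $name")
        printed += 1
      }
      scanner.close()
    } finally {
      table.close()
      conn.close()
    }
  }
}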

Spark Streaming

SparkStreamTest

import com.alibaba.fastjson.JSON.parseObject
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.streaming._
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe

object SparkStreamTest {
  Logger.getLogger("org").setLevel(Level.ERROR)

  def main(args: Array[String]): Unit = {
    println("Starting Spark...")
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName("StreamingKafkaTest")
    val ssc = new StreamingContext(conf, Seconds(1))
    ssc.checkpoint("hdfs://server1:9000/spark-checkpoint")
    val kafkaTopic = Array("mytopic")
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "server1:9092,server2:9092,server3:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "1",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    val inputStream: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        Subscribe[String, String](kafkaTopic, kafkaParams))
    val dataDStream = inputStream.map(_.value)

    dataDStream.foreachRDD { rdd =>
      // Open one HBase connection per partition rather than per record,
      // and make sure it is closed once the partition has been written.
      rdd.foreachPartition { records =>
        val hbaseConf = HBaseConfiguration.create()
        hbaseConf.set("hbase.zookeeper.quorum", "10.206.0.6")
        hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
        val conn = ConnectionFactory.createConnection(hbaseConf)
        val table = conn.getTable(TableName.valueOf("StockInfo"))
        try {
          records.foreach { x =>
            val data = parseObject(x).getJSONObject("data")
            val market = data.getString("market")
            val code = data.getString("code")
            val name = data.getString("name")
            val dayKlines = data.getJSONArray("klines")
            dayKlines.forEach { kline =>
              // Each kline is one comma-separated day of quotes.
              val s = kline.toString.split(",")
              val date = s(0)
              val open = s(1)
              val end = s(2)
              val highest = s(3)
              val lowest = s(4)
              val ts = s(5)
              val tn = s(6)
              val zf = s(7)
              val zdf = s(8)
              val zde = s(9)
              val ch = s(10)

              // Row key "market.code,date" matches the :key column of the Hive table.
              val putin = new Put(Bytes.toBytes(market + "." + code + "," + date))
              putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("stockname"), Bytes.toBytes(name))
              putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("stockmarket"), Bytes.toBytes(market))
              putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("stockcode"), Bytes.toBytes(code))
              putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("stockdate"), Bytes.toBytes(date))
              putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("stockopen"), Bytes.toBytes(open))
              putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("stockend"), Bytes.toBytes(end))
              putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("highest"), Bytes.toBytes(highest))
              putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("lowest"), Bytes.toBytes(lowest))
              putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("ts"), Bytes.toBytes(ts))
              putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("tn"), Bytes.toBytes(tn))
              putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("zf"), Bytes.toBytes(zf))
              putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("zdf"), Bytes.toBytes(zdf))
              putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("zde"), Bytes.toBytes(zde))
              putin.addColumn(Bytes.toBytes("info"), Bytes.toBytes("ch"), Bytes.toBytes(ch))

              table.put(putin)
            }
          }
        } finally {
          table.close()
          conn.close()
        }
      }
    }
    ssc.start()
    ssc.awaitTermination()
  }
}
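
One operational note on the code above: with enable.auto.commit set to false, progress is tracked only through the checkpoint directory, so Kafka itself never records the group's position. If you also want kafka-consumer-groups.sh to report lag, the kafka-0-10 integration can commit offsets back after each batch; a sketch of the documented commitAsync pattern, applied to inputStream:

import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges}

inputStream.foreachRDD { rdd =>
  // Capture the offset ranges on the raw stream, before any transformation.
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  // ... write the batch to HBase as in foreachPartition above ...
  // Commit only once the batch has been persisted.
  inputStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}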

pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>org.example</groupId>
  <artifactId>SparkProgram</artifactId>
  <version>1.0-SNAPSHOT</version>
  <inceptionYear>2008</inceptionYear>
  <properties>
    <scala.version>2.12.10</scala.version>
  </properties>

  <!-- the old scala-tools.org repository is offline; the plugins below resolve from Maven Central -->

  <dependencies>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.4</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.specs</groupId>
      <artifactId>specs</artifactId>
      <version>1.2.5</version>
      <scope>test</scope>
    </dependency>
    <!-- archetype defaults above; project dependencies below -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.12</artifactId>
      <version>3.1.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.12</artifactId>
      <version>3.1.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-kafka-0-10_2.12</artifactId>
      <version>3.1.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-core</artifactId>
      <version>2.7.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>2.7.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-common</artifactId>
      <version>2.7.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
      <version>2.7.2</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-client</artifactId>
      <version>1.4.13</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-server</artifactId>
      <version>1.4.13</version>
    </dependency>
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.76</version>
    </dependency>

  </dependencies>

  <build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <testSourceDirectory>src/test/scala</testSourceDirectory>
    <plugins>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
          </execution>
        </executions>
        <configuration>
          <scalaVersion>${scala.version}</scalaVersion>
          <args>
            <arg>-target:jvm-1.8</arg>
          </args>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-eclipse-plugin</artifactId>
        <configuration>
          <downloadSources>true</downloadSources>
          <buildcommands>
            <buildcommand>ch.epfl.lamp.sdt.core.scalabuilder</buildcommand>
          </buildcommands>
          <additionalProjectnatures>
            <projectnature>ch.epfl.lamp.sdt.core.scalanature</projectnature>
          </additionalProjectnatures>
          <classpathContainers>
            <classpathContainer>org.eclipse.jdt.launching.JRE_CONTAINER</classpathContainer>
            <classpathContainer>ch.epfl.lamp.sdt.launching.SCALA_CONTAINER</classpathContainer>
          </classpathContainers>
        </configuration>
      </plugin>
    </plugins>
  </build>
  <reporting>
    <plugins>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <configuration>
          <scalaVersion>${scala.version}</scalaVersion>
        </configuration>
      </plugin>
    </plugins>
  </reporting>
</project>

Launch command

Note that --class must precede the application jar; anything after the jar is passed to the program as arguments. The jar must also bundle, or the command must add via --jars, the dependencies the Spark distribution does not ship (spark-streaming-kafka-0-10, fastjson, the HBase client), for example with the maven-shade-plugin.

nohup bin/spark-submit --class SparkStreamTest /home/server/SparkProgram.jar > /opt/sparkstreamlog  2>&1 &