Spark: read data from FTP and write it into a Hive table

Problem:
  Reading a large file from FTP caused an out-of-memory error (java.lang.OutOfMemoryError: Java heap space).
Key points to check:
  Check whether the code contains an infinite loop or runaway recursion.
  Check whether a large loop keeps creating new object instances.
  Check whether data loading pulls the entire dataset in one go. In general, reading millions of records into memory at once can cause an out-of-memory error.
  Check whether collections such as List and Map are cleared after use. These collections hold references to their elements, which prevents those objects from being garbage collected.
Solution:
  The only fundamental fix for a Java out-of-memory error is to change the program: release unused objects promptly to free memory, and avoid loading too much data at once. The fix for this FTP OOM problem was to 1) release objects promptly and 2) load the data into memory in batches. The loop below reads 500,000 rows per batch; adjust this value to suit your situation.
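  The batching idea on its own looks like the following minimal sketch (not the full job): lines are buffered up to a batch size, the batch is handed to a processing callback (in the real job, parallelize and insert into Hive), and the buffer is cleared so the previous batch can be garbage collected. The readInBatches helper and its batchSize/processBatch parameters are illustrative, not part of the original code.

import java.io.BufferedReader

import scala.collection.mutable.ArrayBuffer

object BatchedReadSketch {
  // Read a stream line by line, flushing every `batchSize` lines so that
  // at most one batch is held in memory at a time.
  def readInBatches(br: BufferedReader, batchSize: Int)(processBatch: Seq[String] => Unit): Unit = {
    val buffer = new ArrayBuffer[String](batchSize)
    var line = br.readLine()
    while (line != null) {
      buffer += line
      if (buffer.length >= batchSize) {
        processBatch(buffer)   // in the real job: parallelize and insert into Hive
        buffer.clear()         // drop the references so the batch can be GC'd
      }
      line = br.readLine()
    }
    if (buffer.nonEmpty) processBatch(buffer)  // flush the last partial batch
  }
}

  In the job further below, the same pattern is written inline with a manual counter i and a 500,000-row threshold.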

  1. Spark dependencies (pom.xml)
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.lenovo.cpp</groupId>
    <artifactId>cpp</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <scala.version>2.11.8</scala.version>
        <spark.version>2.0.0</spark.version>
        <hadoop.version>2.6.4</hadoop.version>
    </properties>

    <!-- project dependencies -->
    <dependencies>
        <!-- Scala standard library -->
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
            <scope>provided</scope>
        </dependency>

        <!-- Spark Core -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
        </dependency>

        <!-- Spark SQL -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
        </dependency>

        <!-- Spark Hive support -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
        </dependency>
        <!-- SQL Server JDBC driver -->
        <dependency>
            <groupId>com.microsoft.sqlserver</groupId>
            <artifactId>mssql-jdbc</artifactId>
            <version>7.0.0.jre8</version>
            <scope>provided</scope>
        </dependency>
        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.25</version>
            <!--<scope>provided</scope>-->
        </dependency>


        <!-- commons-lang3, used to wrap some data types -->
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.1</version>
            <scope>provided</scope>
        </dependency>

        <!-- log -->
        <dependency>
            <groupId>commons-logging</groupId>
            <artifactId>commons-logging</artifactId>
            <version>1.1.1</version>
            <type>jar</type>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.9</version>
            <scope>provided</scope>
        </dependency>

        <!-- Scala unit testing -->
        <dependency>
            <groupId>org.specs</groupId>
            <artifactId>specs</artifactId>
            <version>1.2.5</version>
            <scope>provided</scope>
        </dependency>

        <!-- Java unit testing -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>provided</scope>
        </dependency>

    </dependencies>

    <!-- build and packaging plugins -->
    <build>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <!--<testSourceDirectory>src/test/scala</testSourceDirectory>-->
        <plugins>
            <!-- plugin for compiling Scala -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>

            <!-- plugin for compiling Java -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <source>1.7</source>
                    <target>1.7</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>

            <!-- packaging (shade) plugin -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
  2. MySQL configuration table (the FTP files to read and the Hive tables to write to)
    Table name: tb_cpp_config_data_from_ftp_to_ludp (screenshot of the table contents omitted)
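
  The job only uses two pieces of information per configuration row: the path of the file on the FTP server (read as config(0)) and the name of the target Hive table (read as config(1)). The column and table names in the sketch below are assumptions for illustration; the post does not show the actual table definition.

// Assumed shape of one row in tb_cpp_config_data_from_ftp_to_ludp (field names are hypothetical).
case class FtpToHiveConfig(ftpFilePath: String, hiveTableName: String)

// Example value, for illustration only:
val exampleConfig = FtpToHiveConfig("/export/sample_file.txt", "dw.tb_sample_target")
// In the job below, ftpFilePath corresponds to config(0) and hiveTableName to config(1).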
  3. FTP utility class
package com.lenovo.ftp

import java.io.{BufferedReader, InputStreamReader}
import java.net.InetSocketAddress

import sun.net.ftp.FtpClient

class FtpUtil {
  /**
    * Connect to the FTP server.
    *
    * @param url      FTP server IP address
    * @param port     port number
    * @param username user name
    * @param password password
    * @return a connected FtpClient
    */
  def connectFtp(url:String,port:Int,username:String,password:String): FtpClient ={
    val address = new InetSocketAddress(url,port)
    val ftp = FtpClient.create()
    ftp.connect(address)
    ftp.login(username,password.toCharArray)
    ftp.setBinaryType()
    ftp.setConnectTimeout(120000)
    ftp
  }

  /**
    * Read the first line of the file on the FTP server, i.e. the column names (header line).
    *
    * @param ftpFile path of the file on the FTP server
    * @param ftp     connected FtpClient
    * @return the header line
    */
  def downLoadColumn(ftpFile: String, ftp: FtpClient): String = {
    var column = ""
    val fs = ftp.getFileStream(ftpFile)
    val br = new BufferedReader(new InputStreamReader(fs,"UTF-8"))
    column += br.readLine()
    br.close()
    fs.close()
    column
  }
}
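
  A quick usage sketch of FtpUtil (the host, port, credentials and file path below are placeholders). Note that sun.net.ftp.FtpClient is a JDK-internal class, so this utility depends on a JDK that still exposes it (e.g. JDK 8, matching the toolchain above).

package com.lenovo.ftp

object FtpUtilExample {
  def main(args: Array[String]): Unit = {
    val ftpUtil = new FtpUtil()
    // Placeholder connection details; replace with real values.
    val ftpClient = ftpUtil.connectFtp("10.0.0.1", 21, "user", "password")
    // Read only the header (first line) of a pipe-delimited export file.
    val header = ftpUtil.downLoadColumn("/export/sample_file.txt", ftpClient)
    println(s"header line: $header")
    ftpClient.close()
  }
}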

  4. Spark job
package com.lenovo.ftp

import java.io.{BufferedReader, InputStreamReader}
import java.text.SimpleDateFormat
import java.util.Date

import com.lenovo.mysql.LinkPara
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

import scala.collection.mutable.{ArrayBuffer, ListBuffer}

object DataFromFtpToHive {
  def main(args: Array[String]): Unit = {
    val ss = SparkSession
      .builder()
      .appName("DataFromFtpToHive")
      .master("yarn")
      //.config("spark.sql.warehouse.dir", "file:///D://lenovo_pj//cpp//cpp")
      .enableHiveSupport()
      .getOrCreate()


    val link = new LinkPara()
    val ftpUtil = new FtpUtil()

    val sdf = new SimpleDateFormat("yyyy-MM-dd")
    val date = sdf.format(new Date())

    // connect to FTP
    val ftpClient = ftpUtil.connectFtp("10.122.2.126",21,"user","password")

    // load the configuration data from MySQL
    val tb_config = "tb_cpp_config_data_from_ftp_to_ludp"
    val config_arr = ss.read
      .format("jdbc")
      .option("url", link.url).option("driver", link.driver)
      .option("user", link.user).option("password", link.password)
      .option("dbtable", tb_config)
      .load().rdd
      .collect()

    var arr:ArrayBuffer[String] = ArrayBuffer()
    var str = ""
    var i = 0
    for(config <- config_arr){
      ss.sql("truncate table "+config(1))
      // load the data file
      val column = ftpUtil.downLoadColumn(config(0).toString,ftpClient)
      val column_info = schemaInfo(column)

      val fs = ftpClient.getFileStream(config(0).toString)
      val br = new BufferedReader(new InputStreamReader(fs,"UTF-8"))
      while (str != null){
        if(i == 0){
          br.readLine()
          i += 1
        }else if(i>0 && i<=500000){
          str = br.readLine()
          arr += str
          i += 1
        }else{
          // write this batch to Hive
          val values = ss.sparkContext
            .parallelize(arr)
            .filter(line =>{
            line != null && line.toString.count(_ == '|') == column_info._3-1
          })
            .map(row => {
              var arr = row.toString.split("\\|",column_info._3)
              arr.update(0,date)
              Row.fromSeq(arr.toSeq)
            })
          ss.createDataFrame(values,column_info._1)
            .createOrReplaceTempView("ludp")
          ss.sql("insert into table "+config(1)+" select "+column_info._2+" from ludp")

          // reset the counter and buffer for the next batch
          i = 1
          arr.clear()
          str = br.readLine()
          arr += str
          i += 1
        }
      }
      if(arr.length != 0){
        // write the remaining rows to Hive
        val values = ss.sparkContext
          .parallelize(arr)
          .filter(line =>{
            line != null && line.toString.count(_ == '|') == column_info._3-1
          })
          .map(row => {
            var arr = row.toString.split("\\|",column_info._3)
            arr.update(0,date)
            Row.fromSeq(arr.toSeq)
          })
        ss.createDataFrame(values,column_info._1)
          .createOrReplaceTempView("ludp")
        ss.sql("insert into table "+config(1)+" select "+column_info._2+" from ludp")
      }
      br.close()
      fs.close()
      i = 0
      str = ""
      arr.clear()
    }
    ss.stop()
  }

  /**
    * Convert the first line (header) of the FTP file into schema information.
    * @param column the pipe-delimited header line
    * @return (schema, column list for the INSERT statement, number of columns)
    */
  def schemaInfo(column :String)={
    val columnArr = column.split("\\|")
    var columns = "record_date"
    var structFieldList = new ListBuffer[StructField]()
    structFieldList += StructField("record_date",StringType,true)
    for(i <- 1 until columnArr.length){
      structFieldList += StructField(columnArr(i).toLowerCase,StringType,true)
      if(columnArr(i).toLowerCase == "type"){
        columns += (","+"`"+columnArr(i).toLowerCase+"`")
      }else{
        columns += (","+columnArr(i).toLowerCase)
      }
    }
    val schema = StructType(structFieldList)
    structFieldList.clear()
    (schema,columns,columnArr.length)
  }
}
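
  To make schemaInfo concrete, here is a small illustrative call (the header line is made up). The helper replaces the first column with record_date, lower-cases the remaining names, counts the columns, and back-quotes the reserved word type for the later INSERT ... SELECT.

package com.lenovo.ftp

object SchemaInfoExample {
  def main(args: Array[String]): Unit = {
    // Hypothetical pipe-delimited header line from an FTP export.
    val header = "LOAD_DATE|TYPE|PRODUCT|QTY"
    val (schema, columns, count) = DataFromFtpToHive.schemaInfo(header)

    println(schema.fieldNames.mkString(","))  // record_date,type,product,qty
    println(columns)                          // record_date,`type`,product,qty
    println(count)                            // 4
  }
}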

Appendix: reading text files on a Linux SFTP server with Spark. The original spark-sftp jar only supports JSON, CSV and similar formats; support for bcp and txt files was added. Below is an example:

public static void main(String[] args) throws Exception {
    SparkConf conf = new SparkConf().setMaster("local").setAppName("SparkDataFrame");
    JavaSparkContext javacontext = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(javacontext);

    // Read a bcp file from the SFTP server via the spark-sftp data source.
    Dataset<Row> df = sqlContext.read()
            .format("com.springml.spark.sftp")
            .option("host", "192.168.1.3")
            .option("username", "root")
            .option("password", "111111")
            .option("fileType", "bcp")
            .load("/sparktest/sparkfile0.bcp");

    /*
    List<Row> list = df.collectAsList();
    for (Row row : list) {
        String[] words = new String(row.getString(0).getBytes(), 0, row.getString(0).length(), "UTF-8").split(" ", -1);
        for (int i = 0; i < words.length; i++) {
            System.out.println("words===" + words[i]);
        }
    }
    */

    // Split each line into words; each word becomes one field of the resulting Row.
    JavaRDD<Row> rowRdd = df.javaRDD();
    JavaRDD<Row> words_bcp = rowRdd.map(new Function<Row, Row>() {
        @Override
        public Row call(Row row) throws Exception {
            String line = row.getString(0);
            String[] words = new String(line.getBytes(), 0, line.getBytes().length, "utf-8").split(" ", -1);
            return RowFactory.create(words);
        }
    });

    List<Row> list = words_bcp.collect();
    for (Row row : list) {
        System.out.println("row1==" + row.getString(0));
    }

    // Write the DataFrame back to the SFTP server as a bcp file.
    df.write().format("com.springml.spark.sftp")
            .option("host", "192.168.1.3")
            .option("username", "root")
            .option("password", "111111")
            .option("fileType", "bcp")
            .save("/sparktest/luozhao.bcp");

    df.show();
    javacontext.close();
}