Problem:
Reading a large file from FTP causes an out-of-memory error (java.lang.OutOfMemoryError: Java heap space).
Key points to check:
Check the code for infinite loops or runaway recursion.
Check whether a large loop keeps creating new object instances.
Check whether the data-loading code fetches the entire dataset at once. As a rule of thumb, pulling a million or more records into memory in one go is likely to cause an out-of-memory error.
Check whether collection objects such as List and Map are cleared after use. These collections hold references to the objects they contain, which prevents those objects from being reclaimed by the GC (see the sketch below).
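For the last point, a minimal Scala sketch (hypothetical names) of how a long-lived collection pins objects in memory, and how clearing it makes them eligible for GC:

import scala.collection.mutable.ArrayBuffer

object CollectionLeakSketch {
  // A long-lived buffer keeps every appended row reachable, so the GC cannot
  // reclaim those rows until the buffer is cleared.
  val rows: ArrayBuffer[String] = ArrayBuffer()

  def process(batch: Seq[String]): Unit = {
    rows ++= batch
    // ... work with rows ...
    rows.clear() // drop the references once the batch is done, making the rows GC-eligible
  }
}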
Solution:
The only fundamental fix for a Java out-of-memory error is to change the program: release objects that are no longer needed so their memory can be reclaimed, and avoid loading too much data at once. The fix applied to this FTP problem was twofold: 1) release objects promptly; 2) load the data into memory in batches. The loop in the code below reads 500,000 rows per batch; adjust this value to fit your environment.
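Stripped of Spark and FTP, the batching pattern looks roughly like the sketch below (the local file and the helper name readInBatches are assumptions for illustration; the real job reads from FTP and writes each batch to Hive):

import java.io.{BufferedReader, FileInputStream, InputStreamReader}
import scala.collection.mutable.ArrayBuffer

object BatchReadSketch {
  // Read a text file in batches of at most `batchSize` lines, handing each
  // batch to `process` and clearing the buffer before reading the next one.
  def readInBatches(path: String, batchSize: Int)(process: Seq[String] => Unit): Unit = {
    val br = new BufferedReader(new InputStreamReader(new FileInputStream(path), "UTF-8"))
    try {
      val batch = ArrayBuffer[String]()
      var line = br.readLine()
      while (line != null) {
        batch += line
        if (batch.length >= batchSize) {
          process(batch) // e.g. write the batch to Hive
          batch.clear()  // release the references so the rows can be GC'd
        }
        line = br.readLine()
      }
      if (batch.nonEmpty) process(batch) // flush the final partial batch
    } finally {
      br.close()
    }
  }
}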
- Spark dependency pom file
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.lenovo.cpp</groupId>
<artifactId>cpp</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<scala.version>2.11.8</scala.version>
<spark.version>2.0.0</spark.version>
<hadoop.version>2.6.4</hadoop.version>
</properties>
<!-- Project dependencies -->
<dependencies>
<!-- Scala language -->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
<scope>provided</scope>
</dependency>
<!--spark core-->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<!--spark sql-->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<!--spark hive-->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<!-- SQL Server database access -->
<dependency>
<groupId>com.microsoft.sqlserver</groupId>
<artifactId>mssql-jdbc</artifactId>
<version>7.0.0.jre8</version>
<scope>provided</scope>
</dependency>
<!-- MySQL database access -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.25</version>
<!--<scope>provided</scope>-->
</dependency>
<!-- Utilities for wrapping some data types -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.1</version>
<scope>provided</scope>
</dependency>
<!-- log -->
<dependency>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
<version>1.1.1</version>
<type>jar</type>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.9</version>
<scope>provided</scope>
</dependency>
<!-- Scala unit testing -->
<dependency>
<groupId>org.specs</groupId>
<artifactId>specs</artifactId>
<version>1.2.5</version>
<scope>provided</scope>
</dependency>
<!-- Java unit testing -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>provided</scope>
</dependency>
</dependencies>
<!-- Build and packaging plugins -->
<build>
<sourceDirectory>src/main/scala</sourceDirectory>
<!--<testSourceDirectory>src/test/scala</testSourceDirectory>-->
<plugins>
<!-- Plugin to compile Scala -->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.2</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>
<!-- Plugin to compile Java -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>1.7</source>
<target>1.7</target>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
<!-- Packaging (shade) plugin -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.3</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
- MySQL configuration table (lists the FTP files to read and the Hive tables to write to)
Table name: tb_cpp_config_data_from_ftp_to_ludp
- FTP utility class
package com.lenovo.ftp
import java.io.{BufferedReader, InputStreamReader}
import java.net.InetSocketAddress
import sun.net.ftp.FtpClient
class FtpUtil {
/**
* Connect to the FTP server
*
* @param url      host / IP address
* @param port     port number
* @param username user name
* @param password password
* @return a connected FtpClient
*/
def connectFtp(url:String,port:Int,username:String,password:String): FtpClient ={
val address = new InetSocketAddress(url,port)
val ftp = FtpClient.create()
ftp.connect(address)
ftp.login(username,password.toCharArray)
ftp.setBinaryType()
ftp.setConnectTimeout(120000)
ftp
}
/**
* Read the first line of a file on the FTP server, i.e. the column names
*
* @param ftpFile path of the file on the FTP server
* @param ftp     a connected FtpClient
* @return the header line
*/
def downLoadColumn(ftpFile: String, ftp: FtpClient): String = {
var column = ""
val fs = ftp.getFileStream(ftpFile)
val br = new BufferedReader(new InputStreamReader(fs,"UTF-8"))
column += br.readLine()
br.close()
fs.close()
column
}
}
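An illustrative, standalone use of FtpUtil (host, credentials and file path are placeholders, not values from this project):

package com.lenovo.ftp

object FtpUtilExample {
  def main(args: Array[String]): Unit = {
    val ftpUtil = new FtpUtil()
    // connect to the FTP server (placeholder address and credentials)
    val ftp = ftpUtil.connectFtp("ftp.example.com", 21, "user", "password")
    // read the header line (column names) of a file on the server
    val header = ftpUtil.downLoadColumn("/data/sample.txt", ftp)
    println(s"columns: $header")
    ftp.close() // FtpClient implements Closeable, so release the connection when done
  }
}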
- Spark code
package com.lenovo.ftp
import java.io.{BufferedReader, InputStreamReader}
import java.text.SimpleDateFormat
import java.util.Date
import com.lenovo.mysql.LinkPara
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import scala.collection.mutable.{ArrayBuffer, ListBuffer}
object DataFromFtpToHive {
def main(args: Array[String]): Unit = {
val ss = SparkSession
.builder()
.appName("DataFromFtpToHive")
.master("yarn")
//.config("spark.sql.warehouse.dir", "file:///D://lenovo_pj//cpp//cpp")
.enableHiveSupport()
.getOrCreate()
val link = new LinkPara()
val ftpUtil = new FtpUtil()
val sdf = new SimpleDateFormat("yyyy-MM-dd")
val date = sdf.format(new Date())
//Connect to the FTP server
val ftpClient = ftpUtil.connectFtp("10.122.2.126",21,"user","password")
//Load the config data
val tb_config = "tb_cpp_config_data_from_ftp_to_ludp"
val config_arr = ss.read
.format("jdbc")
.option("url", link.url).option("driver", link.driver)
.option("user", link.user).option("password", link.password)
.option("dbtable", tb_config)
.load().rdd
.collect()
var arr:ArrayBuffer[String] = ArrayBuffer()
var str = ""
var i = 0
for(config <- config_arr){
ss.sql("truncate table "+config(1))
//Load the data
val column = ftpUtil.downLoadColumn(config(0).toString,ftpClient)
val column_info = schemaInfo(column)
val fs = ftpClient.getFileStream(config(0).toString)
val br = new BufferedReader(new InputStreamReader(fs,"UTF-8"))
while (str != null){
if(i == 0){
br.readLine()
i += 1
}else if(i>0 && i<=500000){
str = br.readLine()
arr += str
i += 1
}else{
//Write the batch to Hive
val values = ss.sparkContext
.parallelize(arr)
.filter(line =>{
line != null && line.toString.count(_ == '|') == column_info._3-1
})
.map(row => {
var arr = row.toString.split("\\|",column_info._3)
arr.update(0,date)
Row.fromSeq(arr.toSeq)
})
ss.createDataFrame(values,column_info._1)
.createOrReplaceTempView("ludp")
ss.sql("insert into table "+config(1)+" select "+column_info._2+" from ludp")
//Reset the batch state
i = 1
arr.clear()
str = br.readLine()
arr += str
i += 1
}
}
if(arr.length != 0){
//Write the remaining rows to Hive
val values = ss.sparkContext
.parallelize(arr)
.filter(line =>{
line != null && line.toString.count(_ == '|') == column_info._3-1
})
.map(row => {
var arr = row.toString.split("\\|",column_info._3)
arr.update(0,date)
Row.fromSeq(arr.toSeq)
})
ss.createDataFrame(values,column_info._1)
.createOrReplaceTempView("ludp")
ss.sql("insert into table "+config(1)+" select "+column_info._2+" from ludp")
}
br.close()
fs.close()
i = 0
str = ""
arr.clear()
}
ss.stop()
}
/**
* Convert the header line of the FTP file into schema information
* @param column the header line ('|'-separated column names)
* @return (StructType schema, comma-separated column list, column count)
*/
def schemaInfo(column :String)={
val columnArr = column.split("\\|")
var columns = "record_date"
var structFieldList = new ListBuffer[StructField]()
structFieldList += StructField("record_date",StringType,true)
for(i <- 1 until columnArr.length){
structFieldList += StructField(columnArr(i).toLowerCase,StringType,true)
if(columnArr(i).toLowerCase == "type"){
columns += (","+"`"+columnArr(i).toLowerCase+"`")
}else{
columns += (","+columnArr(i).toLowerCase)
}
}
val schema = StructType(structFieldList)
structFieldList.clear()
(schema,columns,columnArr.length)
}
}
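The LinkPara class imported from com.lenovo.mysql is not shown in this post; a minimal sketch of its assumed shape is below. It only needs to expose the JDBC connection parameters consumed by the .option(...) calls above, and all values here are placeholders:

package com.lenovo.mysql

class LinkPara {
  // JDBC connection parameters for the MySQL config database (placeholder values)
  val driver: String = "com.mysql.jdbc.Driver"
  val url: String = "jdbc:mysql://db-host:3306/cpp"
  val user: String = "user"
  val password: String = "password"
}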