spark导入csv文件,其中string数据类型为'NULL'字符串,解决方案!

  • 创建导入数据的外部表
create EXTERNAL table cyf_ce(
  attr1 STRING ,
  attr2 STRING ,
  attr3 STRING ,
  attr4 STRING ,
  attr5 STRING ,
  channel_account STRING ,
  channel_type STRING ,
  device STRING ,
  employee STRING ,
  inbound INT ,
  field_location STRING ,
  referrer STRING ,
  referrer_name STRING ,
  tag STRING ,
  target_id STRING ,
  target_name STRING ,
  campaign STRING ,
  campaign_name STRING ,
  content_name STRING ,
  source STRING ,
  summary STRING ,
  internal_content_id STRING ,
  last_forwarder STRING ,
  score INT ,
  last_updated BIGINT ,
  ref_score INT ,
  page_id STRING ,
  event_group STRING ,
  batch_id STRING ,
  from_collector INT ,
  app_name STRING ,
  app_version STRING ,
  quantity1 BIGINT ,
  quantity2 BIGINT ,
  amount1 DOUBLE ,
  amount2 DOUBLE ,
  identity_value STRING ,
  identity_type STRING ,
  identity_value2 STRING ,
  identity_type2 STRING ,
  identity_value3 STRING ,
  identity_type3 STRING ,
  is_first_try INT ,
  is_first_engage INT ,
  duration INT ,
  utma STRING ,
  utm STRING ,
  quantity3 BIGINT ,
  date1 BIGINT ,
  date2 BIGINT ,
  platform STRING ,
  os STRING ,
  os_version STRING ,
  browser STRING ,
  browser_version STRING ,
  model STRING ,
  network STRING ,
  ip STRING ,
  ip_county STRING ,
  ip_city STRING ,
  ip_province STRING ,
  ip_country STRING ,
  screen_width STRING ,
  screen_height STRING ,
  domain STRING ,
  path STRING ,
  attr6 STRING ,
  attr7 STRING ,
  attr8 STRING ,
  attr9 STRING ,
  attr10 STRING ,
  amount3 DOUBLE ,
  date3 BIGINT ,
  page_type STRING ,
  session_id STRING ,
  url STRING ,
  attr11 STRING ,
  attr12 STRING ,
  attr13 STRING ,
  attr14 STRING ,
  attr15 STRING ,
  attr16 STRING ,
  attr17 STRING ,
  attr18 STRING ,
  attr19 STRING ,
  attr20 STRING ,
  attr21 STRING ,
  attr22 STRING ,
  attr23 STRING ,
  attr24 STRING ,
  attr25 STRING ,
  attr26 STRING ,
  attr27 STRING ,
  attr28 STRING ,
  attr29 STRING ,
  attr30 STRING ,
  attr31 STRING ,
  attr32 STRING ,
  attr33 STRING ,
  attr34 STRING ,
  attr35 STRING ,
  attr36 STRING ,
  attr37 STRING ,
  attr38 STRING ,
  attr39 STRING ,
  attr40 STRING ,
  date4 BIGINT ,
  date5 BIGINT ,
  date6 BIGINT ,
  date7 BIGINT ,
  date8 BIGINT ,
  date9 BIGINT ,
  date10 BIGINT ,
  amount4 DOUBLE ,
  amount5 DOUBLE ,
  amount6 DOUBLE ,
  amount7 DOUBLE ,
  amount8 DOUBLE ,
  amount9 DOUBLE ,
  amount10 DOUBLE ,
  longitude DOUBLE ,
  latitude DOUBLE ,
  source1 STRING ,
  source2 STRING ,
  source3 STRING ,
  source4 STRING ,
  source5 STRING ,
  source6 STRING ,
  source7 STRING ,
  source8 STRING ,
  flow_id INT ,
  flow_version INT ,
  step_id INT 
 )ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
  • 通过spark导入数据
sql("load data local inpath '/home/cyftest/long_text_2020-04-16-15-57-36.csv' overwrite into table ccic_dmhub_cdp.cyf_ce")

其中string类型的数据都为NULL

  • 通过替换数据将数据替换成null
var s="""select 
  case when attr1 = 'NULL' then null else attr1 end attr1,
  case when attr2 = 'NULL' then null else attr2 end attr2,
  case when attr3 = 'NULL' then null else attr3 end attr3,
  case when attr4 = 'NULL' then null else attr4 end attr4,
  case when attr5 = 'NULL' then null else attr5 end attr5,
  case when channel_account = 'NULL' then null else channel_account end channel_account,
  case when channel_type = 'NULL' then null else channel_type end channel_type,
  case when device = 'NULL' then null else device end device,
  case when employee = 'NULL' then null else employee end employee,
  inbound,
  case when field_location = 'NULL' then null else field_location end field_location,
  case when referrer = 'NULL' then null else referrer end referrer,
  case when referrer_name = 'NULL' then null else referrer_name end referrer_name,
  case when tag = 'NULL' then null else tag end tag,
  case when target_id = 'NULL' then null else target_id end target_id,
  case when target_name = 'NULL' then null else target_name end target_name,
  case when campaign = 'NULL' then null else campaign end campaign,
  case when campaign_name = 'NULL' then null else campaign_name end campaign_name,
  case when content_name = 'NULL' then null else content_name end content_name,
  case when source = 'NULL' then null else source end source,
  case when summary = 'NULL' then null else summary end summary,
  case when internal_content_id = 'NULL' then null else internal_content_id end internal_content_id,
  case when last_forwarder = 'NULL' then null else last_forwarder end last_forwarder,
  score,
  last_updated,
  ref_score,
  case when page_id = 'NULL' then null else page_id end page_id,
  case when event_group = 'NULL' then null else event_group end event_group,
  case when batch_id = 'NULL' then null else batch_id end batch_id,
  from_collector,
  case when app_name = 'NULL' then null else app_name end app_name,
  case when app_version = 'NULL' then null else app_version end app_version,
  quantity1,
  quantity2,
  amount1,
  amount2,
  case when identity_value = 'NULL' then null else identity_value end identity_value,
  case when identity_type = 'NULL' then null else identity_type end identity_type,
  case when identity_value2 = 'NULL' then null else identity_value2 end identity_value2,
  case when identity_type2 = 'NULL' then null else identity_type2 end identity_type2,
  case when identity_value3 = 'NULL' then null else identity_value3 end identity_value3,
  case when identity_type3 = 'NULL' then null else identity_type3 end identity_type3,
  is_first_try,
  is_first_engage,
  duration,
  case when utma = 'NULL' then null else utma end utma,
  case when utm = 'NULL' then null else utm end utm,
  quantity3,
  date1,
  date2,
  case when platform = 'NULL' then null else platform end platform,
  case when os = 'NULL' then null else os end os,
  case when os_version = 'NULL' then null else os_version end os_version,
  case when browser = 'NULL' then null else browser end browser,
  case when browser_version = 'NULL' then null else browser_version end browser_version,
  case when model = 'NULL' then null else model end model,
  case when network = 'NULL' then null else network end network,
  case when ip = 'NULL' then null else ip end ip,
  case when ip_county = 'NULL' then null else ip_county end ip_county,
  case when ip_city = 'NULL' then null else ip_city end ip_city,
  case when ip_province = 'NULL' then null else ip_province end ip_province,
  case when ip_country = 'NULL' then null else ip_country end ip_country,
  case when screen_width = 'NULL' then null else screen_width end screen_width,
  case when screen_height = 'NULL' then null else screen_height end screen_height,
  case when domain = 'NULL' then null else domain end domain,
  case when path = 'NULL' then null else path end path,
  case when attr6 = 'NULL' then null else attr6 end attr6,
  case when attr7 = 'NULL' then null else attr7 end attr7,
  case when attr8 = 'NULL' then null else attr8 end attr8,
  case when attr9 = 'NULL' then null else attr9 end attr9,
  case when attr10 = 'NULL' then null else attr10 end attr10,
  amount3,
  date3,
  case when page_type = 'NULL' then null else page_type end page_type,
  case when session_id = 'NULL' then null else session_id end session_id,
  case when url = 'NULL' then null else url end url,
  case when attr11 = 'NULL' then null else attr11 end attr11,
  case when attr12 = 'NULL' then null else attr12 end attr12,
  case when attr13 = 'NULL' then null else attr13 end attr13,
  case when attr14 = 'NULL' then null else attr14 end attr14,
  case when attr15 = 'NULL' then null else attr15 end attr15,
  case when attr16 = 'NULL' then null else attr16 end attr16,
  case when attr17 = 'NULL' then null else attr17 end attr17,
  case when attr18 = 'NULL' then null else attr18 end attr18,
  case when attr19 = 'NULL' then null else attr19 end attr19,
  case when attr20 = 'NULL' then null else attr20 end attr20,
  case when attr21 = 'NULL' then null else attr21 end attr21,
  case when attr22 = 'NULL' then null else attr22 end attr22,
  case when attr23 = 'NULL' then null else attr23 end attr23,
  case when attr24 = 'NULL' then null else attr24 end attr24,
  case when attr25 = 'NULL' then null else attr25 end attr25,
  case when attr26 = 'NULL' then null else attr26 end attr26,
  case when attr27 = 'NULL' then null else attr27 end attr27,
  case when attr28 = 'NULL' then null else attr28 end attr28,
  case when attr29 = 'NULL' then null else attr29 end attr29,
  case when attr30 = 'NULL' then null else attr30 end attr30,
  case when attr31 = 'NULL' then null else attr31 end attr31,
  case when attr32 = 'NULL' then null else attr32 end attr32,
  case when attr33 = 'NULL' then null else attr33 end attr33,
  case when attr34 = 'NULL' then null else attr34 end attr34,
  case when attr35 = 'NULL' then null else attr35 end attr35,
  case when attr36 = 'NULL' then null else attr36 end attr36,
  case when attr37 = 'NULL' then null else attr37 end attr37,
  case when attr38 = 'NULL' then null else attr38 end attr38,
  case when attr39 = 'NULL' then null else attr39 end attr39,
  case when attr40 = 'NULL' then null else attr40 end attr40,
  date4,
  date5,
  date6,
  date7,
  date8,
  date9,
  date10,
  amount4,
  amount5,
  amount6,
  amount7,
  amount8,
  amount9,
  amount10,
  longitude,
  latitude,
  case when source1 = 'NULL' then null else source1 end source1,
  case when source2 = 'NULL' then null else source2 end source2,
  case when source3 = 'NULL' then null else source3 end source3,
  case when source4 = 'NULL' then null else source4 end source4,
  case when source5 = 'NULL' then null else source5 end source5,
  case when source6 = 'NULL' then null else source6 end source6,
  case when source7 = 'NULL' then null else source7 end source7,
  case when source8 = 'NULL' then null else source8 end source8,
  flow_id,
  flow_version,
  step_id
from cyf_ce"""

//创建df
var df=sql(s)

//注册临时表 或者dataframe.registerTempTable("temp_ce") 

//创建表
sql("create table customer_event_c stored as TEXTFILE as select * from temp_ce")
  • 结果
    转换结果
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值