2021-01-26

 

CSV to Excel

$ pip install pyexcel pyexcel-xlsx
With these packages installed, you can do it in a few lines of Python:

from pyexcel.cookbook import merge_all_to_a_book
# import pyexcel.ext.xlsx # no longer required if you use pyexcel >= 0.2.2 
import glob


merge_all_to_a_book(glob.glob("your_csv_directory/*.csv"), "output.xlsx")

Each CSV gets its own sheet, named after its file name.
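
As a quick sanity check, the sheet names of the generated workbook can be listed with pyexcel itself (a minimal sketch, assuming the output.xlsx produced above and that pyexcel-xlsx is installed as per the pip line):

import pyexcel

book = pyexcel.get_book(file_name="output.xlsx")
print(book.sheet_names())   # one sheet per CSV, named after the file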
===========================================================================

# Export a Hive query to CSV (tabs replaced with commas), then convert the CSV to xlsx.
import glob
import os

from pyexcel.cookbook import merge_all_to_a_book
# import pyexcel.ext.xlsx  # no longer required if you use pyexcel >= 0.2.2

path = '/mnt2/joe/example.csv'
path_out = '/mnt2/joe/example.xlsx'

sql = 'select * from dw_dazhongdianping.dazhongdianping_2017 limit 100'
cmd = "hive -e '" + sql + "' | sed 's/\\t/,/g' > " + path
print(cmd)

os.system(cmd)                                  # dump the query result to CSV
merge_all_to_a_book(glob.glob(path), path_out)  # convert the CSV into a workbook

Elasticsearch with Hive

 

PUT /land 
{
  "mappings": {
    "land": {
      "properties": {
        "location": {
          "type": "geo_point"
        }
      }
    }
  }
}


CREATE external TABLE joe.`es_land_json`(
   id string, 
  `area_code` string, 
  `area_name` string, 
  `es_num` string, 
  `project_name` string, 
  `project_location` string, 
  `area` string, 
  `land_source` string, 
  `land_use` string, 
  `land_way` string, 
  `land_usage_term` string, 
  `industry` string, 
  `land_grade` string, 
  `tran_price` string, 
  `land_user` string, 
  `lower_limit` string, 
  `upper_limit` string, 
  `agreed_delivery_time` string, 
  `agreed_startup_time` string, 
  `agreed_completion_time` string, 
  `actual_startup_time` string, 
  `actual_completion_time` string, 
  `approved_unit` string, 
  `contract_signing_date` string, 
  `payable_by_instalments` string, 
  location struct<lat:double,lon:double>)
 STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler'
TBLPROPERTIES(
'es.resource' = 'land/land', 
'es.nodes'='your_es_node_ip',
'es.port'='9200',
'es.nodes.discovery'='true',
'es.mapping.id'='id');

CREATE TABLE joe.`es_land`(
  `id` string, 
  `area_code` string, 
  `area_name` string, 
  `es_num` string, 
  `project_name` string, 
  `project_location` string, 
  `area` string, 
  `land_source` string, 
  `land_use` string, 
  `land_way` string, 
  `land_usage_term` string, 
  `industry` string, 
  `land_grade` string, 
  `tran_price` string, 
  `land_user` string, 
  `lower_limit` string, 
  `upper_limit` string, 
  `agreed_delivery_time` string, 
  `agreed_startup_time` string, 
  `agreed_completion_time` string, 
  `actual_startup_time` string, 
  `actual_completion_time` string, 
  `approved_unit` string, 
  `contract_signing_date` string, 
  `payable_by_instalments` string, 
  location struct<lat:double,lon:double>
  );

insert overwrite table joe.`es_land`
select `id` , 
  `area_code` , 
  `area_name` , 
  `es_num` , 
  `project_name` , 
  `project_location` , 
  `area` , 
  `land_source` , 
  `land_use` , 
  `land_way` , 
  `land_usage_term` , 
  `industry` , 
  `land_grade` , 
  `tran_price` , 
  `land_user` , 
  `lower_limit` , 
  `upper_limit` , 
  `agreed_delivery_time` , 
  `agreed_startup_time` , 
  `agreed_completion_time` , 
  `actual_startup_time` , 
  `actual_completion_time` , 
  `approved_unit` , 
  `contract_signing_date` , 
  `payable_by_instalments` , 
named_struct('lat',lat,'lon',lng) from dw_land.land_jingweidu_id_new_double
 where lat is not NULL and lng is not NULL;


insert overwrite table joe.`es_land_json`
select * from joe.`es_land` ;




GET /land/land/_search
{
  "query": {
    "geo_bounding_box": {
      "location": {
        "top_left": {
          "lat": 39.991851,
          "lon": 116.2326041
        },
        "bottom_right": {
          "lat": 39.845465,
          "lon": 116.48393
        }
      }
    }
  }
}
====================================================================
GET /land/land/_search
{
  "query": {
    "bool": {
      "must": [
        { "match_all": {} }
      ],
      "filter": {
        "geo_bounding_box": {
          "location": {
            "wkt": "BBOX (115.4883048272, 117.4438576474, 40.9549832419, 39.5294569963)"
          }
        }
      }
    }
  }
}

====================================================================
GET /land/land/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "match_all": {}
        }
      ],
      "filter": {
        "geo_distance": {
          "distance": "2km",
          "location": {
            "lat": 45.7905013332,
            "lon": 126.573
          }
        }
      }
    }
  }
}
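
The same searches can be issued programmatically. Below is a minimal Python sketch for the geo_distance query above, assuming the cluster is reachable at http://localhost:9200 (adjust host/port to your 'es.nodes' setting) and that the requests package is installed:

import requests

query = {
    "query": {
        "bool": {
            "must": [{"match_all": {}}],
            "filter": {
                "geo_distance": {
                    "distance": "2km",
                    "location": {"lat": 45.7905013332, "lon": 126.573},
                }
            },
        }
    }
}

resp = requests.get("http://localhost:9200/land/land/_search", json=query)
resp.raise_for_status()
for hit in resp.json()["hits"]["hits"]:
    print(hit["_id"], hit["_source"].get("project_name"))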

 HDFS: archive small files with HAR (Hadoop Archive)

# Archive the files under /user/hdfs/economy/united_nations_trade2/UN_2006 into 2006.har, placing the archive in /user/hdfs/economy/united_nations_trade2
hadoop archive -archiveName 2006.har -p /user/hdfs/economy/united_nations_trade2/UN_2006   /user/hdfs/economy/united_nations_trade2

## Hive: create a table from a CSV file
create  external table esri.beijing_langfang_1901_grid
 (WKT string,
 OBJECTID string,
 Join_Count string,
 TARGET_FID string,
 tid string,
 TAZID string,
 Shape_Leng string,
 Shape_Area string,
 name string,
 Shape_Le_1 string,
 Shape_Ar_1 string)
 ROW FORMAT SERDE 
  'org.apache.hadoop.hive.serde2.OpenCSVSerde' 
WITH SERDEPROPERTIES ( 
  'escapeChar'='\\', 
  'quoteChar'='\"', 
  'separatorChar'=',')
  STORED AS INPUTFORMAT 
  'org.apache.hadoop.mapred.TextInputFormat' 
OUTPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'

location 'hdfs://172.30.10.229:9000/user/hdfs/grid/beijing_1901';

#---------------------------------------------------------------------
# Hive: create a table from JSON
WAY 1: map the JSON fields directly with a JSON SerDe

-- Note: the SerDe class below (org.apache.hive.hcatalog.data.JsonSerDe) ships in
-- hive-hcatalog-core.jar; the openx json-serde jar added here provides
-- org.openx.data.jsonserde.JsonSerDe instead, so add whichever jar matches the SerDe you use.
add jar /mnt/modules/hive/lib/json-serde-1.3.6-jar-with-dependencies.jar;
create external table json_hive (
id string,
name string,
h_index string,
n_pubs string,
tags string,
pubs string,
n_citation string,
orgs string)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE;
LOAD DATA INPATH 'hdfs://172.30.10.229:9000/user/hdfs/study/thesis/aminer_authors/joe.json' OVERWRITE INTO TABLE json_hive;


CREATE external TABLE aminer_authors (
id string,
name string,
h_index string,
n_pubs string,
tags array<string>,
pubs array<map<string,string>>,
n_citation  string,
orgs array<string>)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE;

LOAD DATA  INPATH 'hdfs://172.30.10.229:9000/user/hdfs/study/thesis/aminer_authors/aminer_authors_0.txt'  INTO TABLE  aminer_authors;



WAY 2: load each JSON record as a single string column and parse it with get_json_object
CREATE external TABLE aminer_papers_json (
json string)
location 'hdfs://172.30.10.229:9000/user/hive/warehouse/ods_thesis.db/aminer_papers';

select 
 get_json_object(t.json,'$.id') as id,
 get_json_object(t.json,'$.title') as title ,
 get_json_object(t.json,'$.authors') as authors,
 get_json_object(t.json,'$.venue') as venue,
 get_json_object(t.json,'$.year') as year,
 get_json_object(t.json,'$.keywords') as keywords,
 get_json_object(t.json,'$.n_citation') as n_citation,
 get_json_object(t.json,'$.page_start') as page_start,
 get_json_object(t.json,'$.page_end') as page_end,
 get_json_object(t.json,'$.lang') as lang,
 get_json_object(t.json,'$.volume') as volume,
 get_json_object(t.json,'$.pdf') as pdf ,
 get_json_object(t.json,'$.issue') as issue,
 get_json_object(t.json,'$.abstract') as abstract,
 get_json_object(t.json,'$.issn') as issn,
 get_json_object(t.json,'$.doi') as doi,
 get_json_object(t.json,'$.url') as url
 
 from ods_thesis.aminer_papers_json t limit 30; 
#------------------------------------------------------------------------

## Get Hive table metadata from the MySQL metastore

SELECT p.TBL_ID,FROM_UNIXTIME(p.CREATE_TIME),p.DB_ID,p.TBL_NAME,p.TBL_TYPE,p.DB_LOCATION_URI,p.NAME,p.totalSize,p.numRows,p.`comment`,group_concat(COLUMNS_V2.COLUMN_NAME) FROM
(
SELECT 
z.TBL_ID,z.CREATE_TIME,z.DB_ID,z.TBL_NAME,
(CASE when z.TBL_TYPE='MANAGED_TABLE' THEN 'managed table' when z.TBL_TYPE='EXTERNAL_TABLE' THEN 'external table' else '' end) as TBL_TYPE,
z.DB_LOCATION_URI,z.NAME ,concat(ROUND(SUM(z.totalSize)/1024/1024,2),' MB') as totalSize ,SUM(z.numRows) as numRows,z.`comment` FROM
(
SELECT t.*,
(CASE when s.PARAM_KEY='totalSize' THEN PARAM_VALUE else '' end) as totalSize,
(CASE when s.PARAM_KEY='numRows' THEN PARAM_VALUE   else '' end) as numRows,
(CASE when s.PARAM_KEY='comment' THEN PARAM_VALUE   else '' end) as 'comment'
 
 FROM 
(SELECT TBL_ID,CREATE_TIME,TBLS.DB_ID,TBL_NAME,TBL_TYPE,DB_LOCATION_URI,NAME FROM TBLS LEFT JOIN DBS ON TBLS.DB_ID=DBS.DB_ID ) t 
RIGHT join TABLE_PARAMS s on t.TBL_ID=s.TBL_ID 
) z  GROUP BY z.TBL_ID,z.CREATE_TIME,z.DB_ID,z.TBL_NAME,z.TBL_TYPE,z.DB_LOCATION_URI,z.NAME
) p LEFT JOIN COLUMNS_V2  ON p.TBL_ID=COLUMNS_V2.CD_ID 
GROUP BY p.TBL_ID,p.CREATE_TIME,p.DB_ID,p.TBL_NAME,p.TBL_TYPE,p.DB_LOCATION_URI,p.NAME,p.totalSize,p.numRows,p.`comment`
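
The same kind of query can be run from Python as well; here is a minimal sketch using the pymysql package, assuming the metastore lives in a MySQL database named "hive" (host, user and password below are placeholders):

import pymysql

conn = pymysql.connect(host="metastore-mysql-host", user="hive",
                       password="***", database="hive")
try:
    with conn.cursor() as cur:
        # A simplified version of the query above: database, table, creation time and type.
        cur.execute("""
            SELECT d.NAME, t.TBL_NAME, FROM_UNIXTIME(t.CREATE_TIME), t.TBL_TYPE
            FROM TBLS t JOIN DBS d ON t.DB_ID = d.DB_ID
            ORDER BY d.NAME, t.TBL_NAME
        """)
        for db_name, tbl_name, created, tbl_type in cur.fetchall():
            print(db_name, tbl_name, created, tbl_type)
finally:
    conn.close()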

##====================================

/*

how to control the number of shuffle partitions (and reduce small output files)

*/

set spark.sql.shuffle.partitions=1;
add jar ${HIVE_HOME}/lib/hive-contrib-2.3.3.jar;
create temporary function row_sequence as 'org.apache.hadoop.hive.contrib.udf.UDFRowSequence';
select row_sequence(), column1, column2, ... from tablename group by column1, column2, ...;

/*
PS:
First write the job's output to a temporary partition (e.g. tmp);
then run a second job with parallelism 1, along the lines of:

insert overwrite target_table select * from tmp_partition

But the number of small files still did not drop, which was puzzling at first.
After repeated tests the cause turned out to be that 'select * from tmp_partition' is a narrow dependency in Spark;
the Spark DAG distinguishes wide and narrow dependencies, and only wide dependencies trigger a shuffle,
so spark.sql.shuffle.partitions=1 never takes effect for this job.

Since the data volume was not particularly large, a group by (a wide dependency in Spark) was used instead, along the lines of:

insert overwrite target_table select * from tmp_partition group by <all columns>

(see the PySpark sketch below)
*/
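
The group-by workaround described above looks roughly like this in PySpark (a sketch only; target_table, tmp_table and the column names are placeholders for your own tables):

from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("merge_small_files")
         .enableHiveSupport()
         .getOrCreate())

# Only takes effect when a shuffle actually happens.
spark.conf.set("spark.sql.shuffle.partitions", "1")

# A plain 'select *' is a narrow dependency (no shuffle), so the small files survive.
# Grouping by all columns forces a wide dependency (shuffle), and the single shuffle
# partition writes one output file.
spark.sql("""
    INSERT OVERWRITE TABLE target_table
    SELECT col1, col2, col3
    FROM tmp_table
    GROUP BY col1, col2, col3
""")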





 MongoDB to HDFS

/**
spark-shell  \
--conf "spark.mongodb.input.uri=mongodb://thupdi2016:THUPDI_DATA@ip:27017/house.lj_agent" \
--conf "spark.mongodb.output.uri=mongodb://thupdi2016:THUPDI_DATA@ip:27017/house.out" \
--packages org.mongodb.spark:mongo-spark-connector_2.11:2.2.2 --master yarn 
*/
import com.mongodb.spark._
import com.mongodb.spark.config._
import org.bson.Document 
import com.mongodb.spark.MongoSpark
import org.apache.spark.sql.SparkSession
val sparkSession = SparkSession.builder().master("local").appName("MongoSparkConnectorIntro").config("spark.mongodb.input.uri", "mongodb://thupdi2016:THUPDI_DATA@ip:27017/house.lj_agent").config("spark.mongodb.output.uri", "mongodb://thupdi2016:THUPDI_DATA@ip:27017/house.out").getOrCreate()

case class lj_agent (
_id:String, 
city_id:String, 
app_id:String, 
score_desc:String, 
review_count:String, 
agent_ucid:String, 
agent_position:String, 
shop_name:String, 
se_status:String, 
bad_rate:String, 
photo_url:String, 
job_year:String, 
mobile_phone:String, 
online_status:String, 
agent_level:String, 
soso_rate:String, 
desc:String, 
feedback_count:String, 
name:String, 
m_url:String, 
agent_code:String, 
bit_status:String, 
good_rate:String)

val characters = MongoSpark.load[lj_agent](sparkSession)

characters.createOrReplaceTempView("lj_agent")
val centenarians = sparkSession.sql("select * from lj_agent")
centenarians.show()
centenarians.write.csv("hdfs://master:9000/user/hdfs/mongodb_hadoop/house")


MongoSpark.load(sc).take(1).foreach(println)


val df = MongoSpark.load(sparkSession)  // Uses the SparkSession
df.sparkSession.sql("show tables") 
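
For completeness, a PySpark equivalent of the pipeline above (a sketch only: it assumes pyspark is launched with the same connector, e.g. --packages org.mongodb.spark:mongo-spark-connector_2.11:2.2.2, and the connection URI below is a placeholder):

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = (SparkSession.builder
         .appName("MongoToHdfs")
         .config("spark.mongodb.input.uri",
                 "mongodb://user:password@your_mongo_host:27017/house.lj_agent")
         .getOrCreate())

df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()

# Mongo's ObjectId is inferred as a struct; flatten it so the CSV writer accepts it
# (assumes the connector's default {oid: string} representation of _id).
df = df.withColumn("_id", col("_id.oid"))

df.createOrReplaceTempView("lj_agent")
spark.sql("select * from lj_agent") \
    .write.csv("hdfs://master:9000/user/hdfs/mongodb_hadoop/house")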

How to export data from Hive and Presto

Presto 

/mnt2/modules/presto-server-0.189/./presto --server presto_master:9999 --catalog hive --schema dw_unicom --execute "select distinct p_city from act_popu_dis_age limit 50;"  --output-format CSV_HEADER > Classify3.csv

Hive 
hive -e 'select t.date_dt,t.code,t.industyco,sum(exits_num) from (select date_dt,code,industyco,exits_num from dw_project_0.w_industry_longxin_qx  group by  date_dt,code,industyco,exits_num) t  group by t.date_dt,t.code,t.industyco' | sed 's/\t/,/g' >/mnt2/joe/district.csv
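
If you prefer to drive the export from Python, a minimal sketch (the query and paths below just reuse the examples above as placeholders; it assumes the hive CLI is on PATH and pandas is installed):

import subprocess
import pandas as pd

sql = "select distinct p_city from dw_unicom.act_popu_dis_age limit 50"
out_path = "/mnt2/joe/p_city.csv"

with open(out_path, "w") as f:
    # hive prints tab-separated rows; turn the tabs into commas as in the commands above
    hive = subprocess.Popen(["hive", "-e", sql], stdout=subprocess.PIPE)
    subprocess.run(["sed", "s/\\t/,/g"], stdin=hive.stdout, stdout=f, check=True)
    hive.wait()

df = pd.read_csv(out_path, header=None)
print(df.head())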
