常用笔记
@(我的第一个笔记本)[不慕将来, 不负当下, ]
conf
hdfs配置项,块位置信息开启
<property>
<name>dfs.client.file-block-storage-locations.timeout</name>
<value>10000</value>
</property>
<property>
<name>dfs.datanode.hdfs-blocks-metadata.enabled</name>
<value>true</value>
</property>
sql
spark
spark sql
create table
drop table httpdata;
create EXTERNAL table httpdata(imsi string) partitioned by(day int) stored as textfile location '/home/etl/export/parquet/mobile/httpData/';
alter table httpdata add partition(day=20170211);
create EXTERNAL table shandong_lac_ci(lac_ci string) stored as textfile location '/xiaohui/shandong_lac_ci';
select count(*) from cellinfo_20171124 where concat(cast(lac as string),",",cast(ci as string)) in (select lac_ci from shandong_lac_ci);
select count(*) from cellinfo_20171124 ci join shandong_lac_ci slc on concat(cast(ci.lac as string),",",cast(ci.ci as string)) =slc.lac_ci;
beeline
spark sql语句执行位置
beeline -u jdbc:hive2://dmp162:10000/default -n root
beeline -u jdbc:hive2://dmp8:10000 -n root --showHeader=false --delimiterForDSV='|' --outputformat=csv2 -e "select * from location_gd_00_result where length(msisdn)=11" >/home/00.csv
spark-submit
提交任务
spark-submit --master spark://dmp8:7077 --class com.everdata.mlib.SparkGBTs PhoneScam-0.0.1-SNAPSHOT.jar /xiaohui/gbdt/app_res_output_7days.log
java -classpath eversec-0.1.jar com.eversec.AreaImport importArea_20171030_00.csv >zdqy2017103000.txt
spark-submit --master spark://evercloud47:7077 --jars /home/GN/hbase-1.2.2/lib/hbase-annotations-1.2.2.jar,/home/GN/hbase-1.2.2/lib/hbase-annotations-1.2.2-tests.jar,/home/GN/hbase-1.2.2/lib/hbase-client-1.2.2.jar,/home/GN/hbase-1.2.2/lib/hbase-common-1.2.2.jar,/home/GN/hbase-1.2.2/lib/hbase-common-1.2.2-tests.jar,/home/GN/hbase-1.2.2/lib/hbase-examples-1.2.2.jar,/home/GN/hbase-1.2.2/lib/hbase-external-blockcache-1.2.2.jar,/home/GN/hbase-1.2.2/lib/hbase-hadoop2-compat-1.2.2.jar,/home/GN/hbase-1.2.2/lib/hbase-hadoop-compat-1.2.2.jar,/home/GN/hbase-1.2.2/lib/hbase-it-1.2.2.jar,/home/GN/hbase-1.2.2/lib/hbase-it-1.2.2-tests.jar,/home/GN/hbase-1.2.2/lib/hbase-prefix-tree-1.2.2.jar,/home/GN/hbase-1.2.2/lib/hbase-procedure-1.2.2.jar,/home/GN/hbase-1.2.2/lib/hbase-protocol-1.2.2.jar,/home/GN/hbase-1.2.2/lib/hbase-resource-bundle-1.2.2.jar,/home/GN/hbase-1.2.2/lib/hbase-rest-1.2.2.jar,/home/GN/hbase-1.2.2/lib/hbase-server-1.2.2.jar,/home/GN/hbase-1.2.2/lib/hbase-server-1.2.2-tests.jar,/home/GN/hbase-1.2.2/lib/hbase-shell-1.2.2.jar,/home/GN/hbase-1.2.2/lib/hbase-thrift-1.2.2.jar,/home/GN/hbase-1.2.2/lib/metrics-core-2.2.0.jar --class com.everdata.hbase.ReadParquetSort /home/GN/hbase-1.2.2/PhoneScam-0.0.1-SNAPSHOT.jar /location_hour/day=$day/hour=$hour
spark conf
spark-default.conf
spark.serializer org.apache.spark.serializer.KryoSerializer
spark.driver.memory 30g
spark.executor.memory 40g
spark.sql.autoBroadcastJoinThreshold 2147483624
spark.scheduler.mode FAIR
spark.default.parallelism 100
spark.kryoserializer.buffer.max 2047
#spark.executor.extraJavaOptions -XX:+PrintGCTimeStamps -XX:+UseParNewGC -XX:SurvivorRatio=3 -XX:NewRatio=1 -XX:MaxTenuringThreshold=1000 -XX:MaxPermSize=1024m
spark.executor.extraJavaOptions -XX:+CMSClassUnloadingEnabled -XX:MaxTenuringThreshold=5 -XX:+UseConcMarkSweepGC -XX:+CMSParallelRemarkEnabled -XX:+UseCMSCompactAtFullCollection -XX:+UseCMSInitiatingOccupancyOnly -XX:CMSInitiatingOccupancyFraction=70 -XX:PermSize=2048M -XX:MaxPermSize=4096m -XX:+UseCompressedOops
spark.eventLog.enabled true
spark.eventLog.dir hdfs://logSave/tmp/historydir
#spark.storage.memoryFraction 0.5
spark.executor.extraClassPath=/home/xiaohui/hbase-1.2.2/lib/mysql-connector-java-5.1.21.jar
spark.driver.extraClassPath=/home/xiaohui/hbase-1.2.2/lib/mysql-connector-java-5.1.21.jar
spark.sql.autoBroadcastJoinThreshold 2147483647
spark.rpc.askTimeout 300
spark.locality.wait 10
spark.driver.maxResultSize 10g
spark.shuffle.blockTransferService nio
spark-env.sh
export HADOOP_CONF_DIR=/home/dmp/hadoop/etc/hadoop
export SPARK_MASTER_IP=dmp8
export SPARK_WORKER_CORES=10
export SPARK_WORKER_MEMORY=80g
export SPARK_WORKER_INSTANCES=1
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export SPARK_MASTER_WEBUI_PORT=8098
export SPARK_WORKER_WEBUI_PORT=8099
impala sql
impala-shell
查询结果并输出到文件,自定义分隔符
impala-shell -q "select r.msisdn,r.day,r.hour,c.lat,c.lon,r.num from(select msisdn,lac,ci,day,hour,count(*) as num from roaming_destination_south_xj where day=20171016 and (hour>='8' and hour<'12') group by msisdn,lac,ci,day,hour) r join cellinfo c on c.mnc=1 and r.lac=c.lac and r.ci=c.ci;" -B --output_delimiter="|" -o zdqy2017101612.txt
impala-shell -q "select distinct concat(cast(lac as string),',',cast(ci as string)) from statistic_hour_user_cell where day>20171110 and day<20171118;" -B --output_delimiter="|" -o lac_ci.txt
更改表名
alter table oldtablename rename to newtablename;
创建删除表
drop table cellinfo_20171124;
CREATE TABLE cellinfo_20171124(
mcc int,
mnc int,
lac int,
ci int,
lat double,
lon double,
acc int,
date int,
validity int,
addr string,
province string,
city string,
district string,
township string
)row format delimited FIELDS terminated by '\t' stored as textfile location '/cellinfo/20171124';
插入表
insert overwrite table statistic_hour_app_user partition(day=20171123,hour='12') select appid,msisdn,count(msisdn) as pv,sum(totalflow) as flow from base_dmp where day=20171124 and hour='12' group by appid,msisdn
hive sql
hbase
hbase conf
配置:hbase.master.maxclockske
<property>
<name>hbase.master.maxclockskew</name>
<value>200000</value>
<description>Time difference of regionserver from master</description>
</property>
hbase sql
create 'location',{NAME =>'info', COMPRESSION => 'SNAPPY'}, SPLITS => ['01|','02|','03|','04|','05|','06|','07|','08|','09|','10|','11|','12|','13|','14|','15|','16|','17|','18|','19|','20|','21|','22|','23|','24|','25|','26|','27|','28|','29|','30|','31|','32|','33|','34|','35|','36|','37|','38|','39|','40|','41|','42|','43|','44|','45|','46|','47|','48|','49|','50|','51|','52|','53|','54|','55|','56|','57|','58|','59|','60|','61|','62|','63|','64|','65|','66|','67|','68|','69|','70|','71|','72|','73|','74|','75|','76|','77|','78|','79|','80|','81|','82|','83|','84|','85|','86|','87|','88|','89|','90|','91|','92|','93|','94|','95|','96|','97|','98|','99|']
scan 'location',{COLUMNS=>'info:flow',STARTROW=>'96942102671',ENDROW=>'96942102672'}
96942102671
count 'location',{ENDROW=>'01'}
scan 'location:info', {COLUMNS => ['flow'], LIMIT => 10, STARTROW => '99444008681'}
scan 'location',{LIMIT => 10}
describe 'location'
disable 'location'
drop 'location'
put 'location','11111','flow:age','24'
@(修改hbase表列超时时间,ttl单位是s)
1. disable ‘location’
2. alter “location”,NAME=>’flow’,TTL=>’200’
3. enable ‘location’
4. scan ‘location’,{LIMIT => 5}
mysql
修改密码
update mysql.user set passwd='xxx' where user='root';
update mysql.user set authentication_string="xxx" where user="root";
GRANT ALL PRIVILEGES ON *.* TO 'root'@'%' WITH GRANT OPTION;
FLUSH PRIVILEGES;
grant all privileges on *.* to root@'%' identified by 'xxx';
update mysql.user set password=password("xxx") where user="root";
update mysql.user set password=password("xxx") where user="hive1";
update mysql.user set password=password("starl1ghtbreak_S1B_666") where user="hive";
GRANT ALL PRIVILEGES ON *.TO 'hive'@'%' IDENTIFIED BY 'starl1ghtbreak_S1B_666' WITH GRANT OPTION;
flush privileges;
sql语句
explain select sb.sld,sum(s2.pv),sum(s2.uv) from sld_bc sb join sld_20170927 s2 on s2.sld like concat("%",sb.sld);
regexp_extract(t.sld,".*(\\.[a-zA-Z0-9]+\\.[a-zA-Z0-9]+)",1)
linux
profile
export JAVA_HOME=/usr/java/jdk1.8.0_101
export PATH=$JAVA_HOME/bin:$PATH
export CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar
/etc/security/limits.conf
root soft nofile 655350
root hard nofile 655350
root soft nproc unlimited
root hard nproc unlimited
命令
split
split -b 200M test.csv -a -d 3 location_ 每200M分隔一个文件
split -l 100 test.csv -a -d 3 location_ 每100行分隔一个文件
jstack
jstack -l pid
ls
按时间排序:
ll -ht
按大小排序:
ll -hS
date
date -d "1 days ago" +%Y-%m-%d
kafka
kafka命令
基本操作
创建kafka topic(一般4台机器都为12个分区)
./kafka-topics.sh --zookeeper evercloud44:2181 --partitions 12 --replication-factor 3 --create --topic window-behavior-log
查看topic
./kafka-topics.sh --describe --zookeeper dmp33:2181 --topic cdr
删除topic
./kafka-topics.sh --delete --zookeeper dmp33:2181 --topic cdr
查看topic数据条数
./kafka-run-class.sh kafka.tools.GetOffsetShell --broker-list evercloud44:9092 --time -1 --topic window-behavior-log|awk -F ':' 'BEGIN{sum=0}{sum+=$3;print sum}'
查看topic具体数据
./kafka-console-consumer.sh --zookeeper evercloud44:2181 --topic cdr --from-beginning -max-messages 10
高级操作
修改topic分区数(只能扩容)
./kafka-topics.sh --alter --zookeeper localhost:42182 --topic myboys1 --partitions 2
修改topic备份数
./kafka-reassign-partitions.sh --zookeeper localhost:42182 --reassignment-json-file ../partitions-extension-push-token-topic.json --execute
partitions-extension-push-token-topic.json文件配置
{
"partitions":
[
{
"topic": "myboys1",
"partition": 0,
"replicas": [1,2] //指定副本所在brokerID
},
{
"topic": "myboys1",
"partition": 1,
"replicas": [2,3]
},
{
"topic": "myboys1",
"partition": 2,
"replicas": [1,3]
}
],
"version":1
}
ETL
启动命令
/home/dmp/etl/bin/etl.sh -d -c /home/dmp/etl/config/cdr-qh-unicom_car.config –Xmx20g –Xms10g -Xss2m
/home/dmp/etl/bin/etl.sh -d -c /home/dmp/etl/config/cdr-hb-unicom-simplify-car.config –Xmx20g –Xms10g -Xss2m
/home/dmp/etl/bin/etl.sh -d -c /home/dmp/etl/config/hb-unicom-kafka-getMessageForWeb.config –Xmx20g –Xms10g -Xss2m
/home/etl/bin/etl.sh -d -c /home/etl/config/cdr-hubei-unicom_car.config –Xmx20g –Xms10g -Xss2m
插件
{
"plugin": "elasticsearch",
"enable": true,
"config": {
"es_config": {
"transport_address": [
"192.168.30.17:9300 ","192.168.30.18:9300
","192.168.30.19:9300
"
],
"client_config": {
"cluster.name
": "es-ha",
"client.transport.ignore_cluster_name": "false"
}
},
"index_config": {
"index": "window_message_log",
"type": "message_log",
"bulk_mode": "insert",
"bulk_size": "10m",
"bulk_thread": 6,
"bulk_interval": "180s",
"expand_index_by_time": true,
"expand_interval": "day",
"expand_date_fields": [
"_date_datetime"
],
"id_fields": [],
"fields": [
"datetime",
"msisdn",
"confirmContent",
"content",
"recommend",
"order"
]
},
"parent_config": {
"enable": false,
"parent_bulk_mode": "insert",
"parent_id_fields": [
"msisdn"
],
"parent_fields": []
}
},
"filters": [
]
}
elasticsearch
index
curl -XPUT 'http://192.168.30.17:9200/_template/window_message_log_day_tmp
' -d'{
"template": "window_message_log*",
"settings": {
"number_of_shards": 12,
"number_of_replicas": 1
},
"mappings": {
"order_log": {
"_all": {
"enabled": false
},
"properties": {
"datetime": {
"type": "date",
"format": "yyyy-MM-dd HH:mm:ss",
"ignore_malformed": true },
"confirmContent": {
"type": "string",
"index": "not_analyzed" },
"msisdn": {
"type": "string",
"index": "not_analyzed" },
"content": {
"type": "string",
"index": "not_analyzed" },
"recommend": {
"type": "string",
"index": "not_analyzed" },
"order": {
"type": "string",
"index": "not_analyzed" }
}
}
}
}'
“`