在提取训练样本时,常常需要按 1:10 的比例抽取正负样本。最开始是按下面的写法抽取的,但这种写法中 order by 和 limit 作用于 union 之后的整个结果:先对合并后的结果排序,然后总共只取 1000 条数据,而不是 100 条正样本加 1000 条负样本。
-- Pitfall example (intentionally left unchanged): first attempt at building a
-- 1:10 positive/negative training sample from partition dt='20171122'.
-- Intended result: 100 rows with tag='1' plus 1000 random rows with tag='0'.
drop table default.test_dp1;
create table default.test_dp1
as
-- Positive branch: up to 100 rows with tag='1'.
select uuid, newsid, ts, tag, place, tagname, showtype, newstype
from l1_feature_project.l1_user_news_neg_pos
where dt='20171122' and tag='1' limit 100
union
-- Negative branch: rows with tag='0'.
select uuid, newsid, ts, tag, place, tagname, showtype, newstype
from l1_feature_project.l1_user_news_neg_pos
where dt='20171122' and tag='0'
-- The ORDER BY / LIMIT below is not enclosed in a subquery, so it applies to
-- the whole UNION result instead of only the tag='0' branch. The table ends
-- up with at most 1000 rows in total, not 100 positives + 1000 negatives.
order by rand() limit 1000;
要让随机排序和 limit 只作用于负样本,需要把负样本的 order by rand() limit 1000 放进子查询中,需改成:
-- Build a 1:10 positive/negative training sample from partition dt='20171122'
-- of l1_feature_project.l1_user_news_neg_pos into default.test_dp1:
--   * up to 100 rows with tag='1'
--   * up to 1000 randomly chosen rows with tag='0'
-- Each branch keeps its own LIMIT inside a subquery so that the clause applies
-- to that branch only and not to the combined result.
-- "if exists" keeps the script re-runnable when the table is not there yet.
drop table if exists default.test_dp1;
create table default.test_dp1
as
select uuid, newsid, ts, tag, place, tagname, showtype, newstype
from
(
-- NOTE(review): no ORDER BY here, so these are arbitrary (not random) rows
-- with tag='1'; add "order by rand()" if a random positive sample is wanted.
select uuid, newsid, ts, tag, place, tagname, showtype, newstype
from l1_feature_project.l1_user_news_neg_pos
where dt='20171122' and tag='1'
limit 100
) pos
-- UNION ALL instead of UNION: the two branches are disjoint on tag, and plain
-- UNION would de-duplicate rows, which could shrink the sample below the
-- intended 100 + 1000 rows and costs an extra distinct stage.
union all
select uuid, newsid, ts, tag, place, tagname, showtype, newstype
from
(
-- Random ordering and LIMIT are scoped to the tag='0' rows only.
select uuid, newsid, ts, tag, place, tagname, showtype, newstype
from l1_feature_project.l1_user_news_neg_pos
where dt='20171122' and tag='0'
order by rand()
limit 1000
) neg;