第十一题:成绩表行转列
CREATE TABLE sql1_11 (`id` int NOT NULL COMMENT '主键 id',`name` string COMMENT '姓名',`course` string COMMENT '课程',`score` int COMMENT '成绩'
);INSERT INTO sql1_11 VALUES (1,'张三', '语文', 95),
(2,'李四', '语文', 99),(3,'王五', '语文', 80),(4,'张三', '数学', 86),(5,'李四', '数学', 96),
(6,'王五', '数学', 81),(7,'张三', '英语', 78),(8,'李四', '英语', 88),(9,'王五', '英语', 87);
这道题是将成绩表进行行转列——case + sum的使用
id name course score
1 张三 语文 95 变成下面这种name 语文 数学 英语
张三 95 86 78select name,sum(case course when '语文' then score else 0 end) `语文`,sum(case course when '数学' then score else 0 end) `数学`,sum(case course when '英语' then score else 0 end) `英语`
from sql1_11 group by name;
第十二题:订单数、用户数、新客数统计
create table sql1_12(dt string,
order_id string,
user_id string,
amount decimal(10,2))
row format delimited fields terminated by ',';load data local inpath '/home/homedata/sql_1/sql1_12.txt' into table sql1_12;dt order_id user_id amount
2017-01-01,10029001,1,33.57
2017-01-03,10029002,1,53.57
2017-01-11,10029003,2,63.57
2017-01-11,10029013,3,63.57
2017-02-02,10029004,1,73.57
2017-02-07,10029005,1,83.57
2017-02-15,10029006,1,93.57
2017-03-04,10029007,2,33.57
2017-04-01,10029008,1,53.57
2017-05-11,10029009,1,63.57
2017-05-21,10029010,2,73.57
2017-06-01,10029011,1,93.57
2017-06-07,10029012,1,13.57
2017-07-09,10029013,2,33.57
2017-07-20,10029014,1,23.57
2017-08-01,10029015,2,73.57
2017-09-09,10029016,2,93.57
2017-10-01,10029017,1,33.57
2017-11-11,10029018,1,36.57
2017-11-12,10029119,6,136.57
2017-11-21,10029019,1,37.57
2017-12-11,10029020,1,38.57
第一问只是简单的汇总 ,第二题需要exists 的使用
-- 1、给出2017年每个月的订单数,用户数,总成交金额select substr(dt,1,7),count(order_id) orderCount,count(distinct user_id) userCount,sum(amount) sumAmount
from sql1_12 where substr(dt,1,4) =2017 group by substr(dt,1,7);-- 2、给出2017年11月的新客数(指在11月才有第一笔订单)with t1 as (select distinct user_id from sql1_12 where substr(dt,1,7) = "2017-11"
),t2 as (select distinct user_id from sql1_12 where substr(dt,1,7) < "2017-11"
)
select count(*) from t1 where not exists (select * from t2 where t1.user_id = t2.user_id);
第十三题:有50W个京东店铺,每个访客访问任何一个店铺的任何一个商品时都会产生一条访问日志
create table sql1_13(user_id string,shop string
)row format delimited
fields terminated by '\t';
load data local inpath '/home/homedata/sql_1/sql1_13.txt' into table sql1_13;user_id shop
u1 a
u2 b
u1 b
u1 a
u3 c
u4 b
u1 a
u2 c
u5 b
u4 b
u6 c
u2 c
u1 b
u2 a
u2 a
u3 a
u5 a
u5 a
u5 a
第一题只是汇总,第二题是count(*) over +rank的使用
-- 1)每个店铺的UV(访客数,需要去重)
select shop,count(distinct user_id) `访客数` from sql1_13 group by shop;-- 2)每个店铺访问次数top3的访客信息。输出店铺名称、访客id、访问次数with t1 as (select distinct shop,user_id,count(*) over (partition by shop,user_id) userCount from sql1_13
),t2 as (select *,rank() over (partition by shop order by userCount desc ) rankNum from t1
)select shop, user_id, userCount from t2 where rankNum <=3;
第十四题:店铺访问数据统计
create table sql1_14 (userID string,visitDate string,visitCount int
)
row format delimited
fields terminated by ' ';load data local inpath '/home/homedata/sql_1/sql1_14.txt' into table sql1_14;userID visitDate visitCount
u01 2017/1/21 5
u02 2017/1/23 6
u03 2017/1/22 8
u04 2017/1/20 3
u01 2017/1/23 6
u01 2017/2/21 8
u02 2017/1/23 6
u01 2017/2/22 4
首先在于日期的格式不好,然后是使用sum 进行开窗时 使用order 进行排序,窗口大小会逐渐增大,从而达到累计的效果。
-- 要求使用SQL统计出每个用户的累计访问次数,和该月的累计访问次数方式一:日期格式化
with t1 as (select userID, substr(from_unixtime(unix_timestamp(visitDate,"yyyy/MM/dd"),"yyyy-MM-dd"),1,7) visitDate, visitCount from sql1_14
) select distinct userID,visitDate,sum(visitCount) over (partition by userID,visitDate) `小计`,sum(visitCount)over (partition by userID order by visitDate) `累计` from t1;方式二:字符串拼接
with t1 as( select userID,substr(visitDate,1,6) month,sum(visitCount) count from sql1_14 group by userID,substr(visitDate,1,6) )
select userID `用户`,concat_ws('-0',split(month,'/')) `月份`, count `小计`,sum(count)over (partition by userID order by month) `累计` from t1;
第十五题:计算连续用户完成订单间隔最大的天数和最小的天数
create table sql1_15(user_id string,dt string,amt int
)
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.+?)\\s+(.+?)\\s+(\\d+)','output.format.string'='%1$s %2$s %3$s'
);
load data local inpath '/home/homedata/sql_1/sql1_15.txt' into table sql1_15;user_id dt amt
uid_1 20200501 10
uid_1 20200503 160
uid_1 20200508 201
uid_1 20200511 103
uid_1 20200516 119
首先是日期格式的问题,然后就是lag的使用。
-- 计算每个用户两个订单间隔时间最长和最短的天数
with t1 as( select user_id, concat(substr(dt,1,4),'-',substr(dt,5,2),'-',substr(dt,7,2)) dt from sql1_15 ),t2 as(select user_id,datediff(dt,lag(dt,1,dt) over (partition by user_id order by dt))-1 dt from t1 )
select user_id,max(dt) max_day,min(dt) min_day from t2 where dt >= 0 group by user_id;