# A Comprehensive Hive Case Study: User Search Log Analysis
## 1. Case Background
An e-commerce platform wants to analyze user search behavior in order to improve its search algorithm and recommendation system. The raw search logs are stored in HDFS, and Hive is used to clean, transform, and analyze them.
## 2. Data Preparation
### 2.1 Raw Log Format
```
192.168.1.1 - - [15/Mar/2023:10:12:45 +0800] "GET /search?query=手机&user_id=1001&category=electronics HTTP/1.1" 200 3452
```
### 2.2 Creating the Hive External Table
```sql
-- Each raw line is loaded as a single STRING value; the '\n' field delimiter never occurs
-- inside a line, so the whole line lands in log_string for later parsing
CREATE EXTERNAL TABLE raw_search_logs (
log_string STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\n'
STORED AS TEXTFILE
LOCATION '/user/hive/warehouse/search_logs/raw';
```
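Once the external table exists, a quick sanity check is to preview a few raw lines directly from the table defined above:
```sql
-- Preview a handful of raw log lines to confirm the external table sees the HDFS files
SELECT log_string
FROM raw_search_logs
LIMIT 5;
```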
## 3. Data Cleaning and Transformation
### 3.1 Parsing Logs with Regular Expressions
```sql
CREATE TABLE parsed_search_logs AS
SELECT
  regexp_extract(log_string, '^(\\S+)', 1) AS ip,
  regexp_extract(log_string, '\\[(.+?)\\]', 1) AS log_time,  -- named log_time to avoid the reserved word `timestamp`
  regexp_extract(log_string, 'query=([^&]+)', 1) AS search_query,
  regexp_extract(log_string, 'user_id=(\\d+)', 1) AS user_id,
  regexp_extract(log_string, 'category=([^&\\s]+)', 1) AS category,
  regexp_extract(log_string, 'HTTP/\\d\\.\\d"\\s(\\d+)', 1) AS status_code,
  regexp_extract(log_string, '\\s(\\d+)$', 1) AS response_size
FROM raw_search_logs
WHERE log_string LIKE '%GET /search%';
```
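Before cleaning, it is worth checking how many lines failed to parse. Since regexp_extract yields an empty (or missing) value when a pattern does not match, a minimal check is:
```sql
-- Count rows where key fields could not be extracted
SELECT count(*) AS unparsed_rows
FROM parsed_search_logs
WHERE user_id IS NULL OR user_id = ''
   OR search_query IS NULL OR search_query = '';
```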
### 3.2 Data Quality Checks and Cleaning
```sql
CREATE TABLE cleaned_search_logs AS
SELECT
  ip,
  from_unixtime(unix_timestamp(log_time, 'dd/MMM/yyyy:HH:mm:ss Z')) AS search_time,
  lower(trim(search_query)) AS search_term,
  cast(user_id AS INT) AS user_id,
  CASE
    WHEN category IS NULL OR category = '' THEN 'uncategorized'
    ELSE lower(category)
  END AS category,
  cast(status_code AS INT) AS status_code,
  cast(response_size AS INT) AS response_size
FROM parsed_search_logs
WHERE status_code = '200'
  -- regexp_extract can yield empty strings, so filter those out as well as NULLs
  AND user_id IS NOT NULL AND user_id != ''
  AND search_query IS NOT NULL AND search_query != '';
```
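One optional refinement: in real access logs the query= parameter is usually URL-encoded (a Chinese query such as 手机 would typically arrive percent-encoded, even though the sample line shows it decoded). If that applies to your logs, the built-in reflect UDF can decode terms; this is only a sketch, not part of the pipeline above:
```sql
-- Decode percent-encoded search terms via java.net.URLDecoder (assumes UTF-8-encoded URLs)
SELECT
  search_term,
  reflect('java.net.URLDecoder', 'decode', search_term, 'UTF-8') AS decoded_term
FROM cleaned_search_logs
LIMIT 10;
```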
## 4. Data Analysis
### 4.1 Popular Search Term Analysis
```sql
-- Daily top-10 search terms
-- A window-function alias cannot be filtered in HAVING, so rank inside a subquery and filter outside
CREATE TABLE daily_top_search_terms AS
SELECT search_date, search_term, search_count, search_rank
FROM (
  SELECT
    to_date(search_time) AS search_date,
    search_term,
    count(*) AS search_count,
    dense_rank() OVER (PARTITION BY to_date(search_time) ORDER BY count(*) DESC) AS search_rank
  FROM cleaned_search_logs
  GROUP BY to_date(search_time), search_term
) ranked
WHERE search_rank <= 10;
```
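For example, to pull the top terms for a single day (using the date from the sample log line):
```sql
-- Top-ranked terms for one day
SELECT search_term, search_count, search_rank
FROM daily_top_search_terms
WHERE search_date = '2023-03-15'
ORDER BY search_rank;
```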
### 4.2 User Search Behavior Analysis
```sql
-- User search pattern analysis
CREATE TABLE user_search_behavior AS
SELECT
user_id,
count(DISTINCT search_term) AS unique_searches,
count(*) AS total_searches,
avg(length(search_term)) AS avg_query_length,
count(DISTINCT category) AS categories_searched,
min(search_time) AS first_search_time,
max(search_time) AS last_search_time
FROM cleaned_search_logs
GROUP BY user_id;
```
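Building on this table, a simple follow-up is to segment users into activity tiers; the thresholds below are purely illustrative, not derived from the data:
```sql
-- Distribution of users across illustrative activity tiers
SELECT
  CASE
    WHEN total_searches >= 50 THEN 'heavy'
    WHEN total_searches >= 10 THEN 'regular'
    ELSE 'light'
  END AS activity_tier,
  count(*) AS users,
  avg(unique_searches) AS avg_unique_searches
FROM user_search_behavior
GROUP BY
  CASE
    WHEN total_searches >= 50 THEN 'heavy'
    WHEN total_searches >= 10 THEN 'regular'
    ELSE 'light'
  END;
```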
### 4.3 Search Conversion Funnel Analysis
```sql
-- User-behavior funnel after a search
-- click_logs and purchase_logs are companion event tables keyed by user_id with event timestamps;
-- note that the non-equi join conditions below require a Hive version that accepts them in the ON clause
CREATE TABLE search_conversion_funnel AS
WITH search_actions AS (
  SELECT
    s.user_id,
    s.search_term,
    s.search_time,
    min(c.click_time) AS first_click_time,
    min(p.purchase_time) AS first_purchase_time
  FROM cleaned_search_logs s
  LEFT JOIN click_logs c ON s.user_id = c.user_id
    -- clicks within 24 hours after the search
    AND unix_timestamp(c.click_time) > unix_timestamp(s.search_time)
    AND unix_timestamp(c.click_time) < unix_timestamp(s.search_time) + 24 * 60 * 60
  LEFT JOIN purchase_logs p ON s.user_id = p.user_id
    -- purchases within 7 days after the search
    AND unix_timestamp(p.purchase_time) > unix_timestamp(s.search_time)
    AND unix_timestamp(p.purchase_time) < unix_timestamp(s.search_time) + 7 * 24 * 60 * 60
  GROUP BY s.user_id, s.search_term, s.search_time
)
SELECT
  search_term,
  count(*) AS total_searches,
  count(CASE WHEN first_click_time IS NOT NULL THEN 1 END) AS click_throughs,
  count(CASE WHEN first_purchase_time IS NOT NULL THEN 1 END) AS conversions,
  count(CASE WHEN first_click_time IS NOT NULL THEN 1 END) / count(*) AS ctr,
  count(CASE WHEN first_purchase_time IS NOT NULL THEN 1 END) /
    count(CASE WHEN first_click_time IS NOT NULL THEN 1 END) AS conversion_rate
FROM search_actions
GROUP BY search_term
HAVING count(*) > 100
ORDER BY conversions DESC
LIMIT 100;
```
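Because conversion_rate divides by click_throughs, terms with no clicks come out as NULL (Hive returns NULL on division by zero), so downstream queries should filter them, for example:
```sql
-- Terms with at least one click, ordered by conversion rate
SELECT search_term, total_searches, ctr, conversion_rate
FROM search_conversion_funnel
WHERE conversion_rate IS NOT NULL
ORDER BY conversion_rate DESC
LIMIT 20;
```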
## 5. Advanced Analysis
### 5.1 Search Term Association Analysis
```sql
-- Frequently co-occurring search-term pairs
CREATE TABLE search_term_pairs AS
WITH user_search_sequences AS (
SELECT
user_id,
search_term,
lag(search_term, 1) OVER (PARTITION BY user_id ORDER BY search_time) AS previous_search
FROM cleaned_search_logs
)
SELECT
previous_search AS term1,
search_term AS term2,
count(*) AS cooccurrence_count
FROM user_search_sequences
WHERE previous_search IS NOT NULL
AND previous_search != search_term
GROUP BY previous_search, search_term
HAVING count(*) > 10
ORDER BY cooccurrence_count DESC;
```
### 5.2 Time-Series Analysis
```sql
-- Search-volume time series
CREATE TABLE search_trends AS
SELECT
date_format(search_time, 'yyyy-MM-dd HH') AS hour,
category,
count(*) AS search_count,
count(DISTINCT user_id) AS unique_users
FROM cleaned_search_logs
GROUP BY date_format(search_time, 'yyyy-MM-dd HH'), category
ORDER BY hour, category;
```
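From this table the hourly buckets can be rolled up further, for instance to an hour-of-day profile for spotting daily peaks (the substr offsets assume the 'yyyy-MM-dd HH' format used above):
```sql
-- Hour-of-day totals across all days and categories
SELECT
  substr(hour, 12, 2) AS hour_of_day,
  sum(search_count) AS total_searches,
  sum(unique_users) AS total_unique_users
FROM search_trends
GROUP BY substr(hour, 12, 2)
ORDER BY hour_of_day;
```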
## 6. Preparing Data for Visualization
### 6.1 Creating Views over the Analysis Results
```sql
-- View of daily top search terms
CREATE VIEW v_top_search_terms AS
SELECT
  search_date,
  search_term,
  search_count,
  search_rank
FROM daily_top_search_terms;
-- View of user search behavior
CREATE VIEW v_user_behavior AS
SELECT
user_id,
unique_searches,
total_searches,
categories_searched,
datediff(last_search_time, first_search_time) AS usage_days
FROM user_search_behavior;
```
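If the visualization tool reads flat files rather than querying Hive directly, results can be exported with INSERT OVERWRITE DIRECTORY; the output path here is only an example:
```sql
-- Export the top-terms view as comma-delimited files for an external dashboard (example path)
INSERT OVERWRITE DIRECTORY '/user/hive/export/top_search_terms'
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
SELECT search_date, search_term, search_count, search_rank
FROM v_top_search_terms;
```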
## 7. Performance Tuning Tips
### 7.1 Partitioned Table Design
```sql
-- Search log table partitioned by date
CREATE TABLE partitioned_search_logs (
ip STRING,
search_term STRING,
user_id INT,
category STRING,
status_code INT,
response_size INT
)
PARTITIONED BY (search_date STRING)
STORED AS ORC;
-- Dynamic partition insert (the partition column must be the last column in the SELECT)
SET hive.exec.dynamic.partition = true;
SET hive.exec.dynamic.partition.mode = nonstrict;
INSERT INTO TABLE partitioned_search_logs PARTITION(search_date)
SELECT
ip,
search_term,
user_id,
category,
status_code,
response_size,
to_date(search_time) AS search_date
FROM cleaned_search_logs;
```
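Queries that filter on the partition column then scan only the matching partitions:
```sql
-- List the partitions created by the dynamic insert
SHOW PARTITIONS partitioned_search_logs;

-- Only the 2023-03-15 partition is scanned here (partition pruning)
SELECT search_term, count(*) AS searches
FROM partitioned_search_logs
WHERE search_date = '2023-03-15'
GROUP BY search_term;
```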
### 7.2 Index Optimization
```sql
-- Create a compact index on a frequently filtered column
-- (Hive indexes were removed in Hive 3.0; see the ORC bloom-filter alternative below)
CREATE INDEX search_term_idx ON TABLE cleaned_search_logs (search_term)
AS 'org.apache.hadoop.hive.ql.index.compact.CompactIndexHandler'
WITH DEFERRED REBUILD;
-- Build the index data
ALTER INDEX search_term_idx ON cleaned_search_logs REBUILD;
```
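On Hive 3.x, where CREATE INDEX is no longer available, a common substitute is to store the data as ORC with a bloom filter on the frequently filtered column. The table name below is illustrative:
```sql
-- ORC copy with a bloom filter on search_term as an index substitute (illustrative table name)
CREATE TABLE cleaned_search_logs_orc
STORED AS ORC
TBLPROPERTIES ('orc.bloom.filter.columns' = 'search_term')
AS
SELECT * FROM cleaned_search_logs;
```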
## 8. Case Summary
In this case study we have:
1. Parsed and cleaned the raw log data
2. Analyzed user search behavior along multiple dimensions
3. Built a search conversion funnel
4. Discovered association patterns between search terms
5. Prepared the analysis results for visualization
These results can help to:
- Tune the search algorithm and improve result quality for popular queries
- Improve the recommendation system by suggesting related products based on users' search history
- Identify emerging trends in search demand
- Surface pain points in the search experience (such as queries that return no results)