搜索重要指标:召回率、准确率。
核心指标:relevance相关性、freshness时效性、quality质量、ctr点击率、confidence权威度、cold_start冷启动。最终的排序依赖这几个字段计算分数
业务接口返回6000-9000个字段
搜索接口查了7个搜索链路
PM需求:
选取某一天人物的query,标识出同时出type=12及type=3的query,计算同时出现的概率,另外把同时出的query给出来,分析使用
query选取:人物top1000 、随机1000
引擎接口传参加上调试信息cmd=xx后的接口返回43846个字段
引擎接口不加调试信息cmd=xx的接口返回23944个字段
如果从业务接口去请求,不可,因为业务接口过滤了引擎返回的大量字段
所以只能测引擎接口
难点:如何从上万个返回字段里取到这个“特殊的字段”。查了很多个query,返回的json体积太大,人工阅读非常困难,粘贴到在线json解析工具后页面直接无响应。
最终:遍历resultList这个jsonArray下的每个jsonObject元素,取其中的doc_source字段即可
QA测试设计:
数据源准备:去FBI捞取或去ODPS读取top1000个人物query、任意1000个人物query
读取引擎接口(对内),判断接口返回,取标志性字段(type类型),for循环遍历JSONObject下的value。分支判断,组合各种场景实现需求要的数据统计
发现开发代码的bug:引擎接口召回不稳定,同一个query请求2次,时而返回3 和12类型,时而仅返回12类型
写给自己的bug:跑完数据才发现,召回比率应该改为百分比
修改后:
代码实现:
模块划分:
1. HTTPCommonMethod为拼接http请求的工具类
package com.xx.searchRecall.utils;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.util.EncodingUtil;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
/**
 * Utility for building and sending HTTP GET requests with commons-httpclient.
 */
public class HTTPCommonMethod {
    /** Full URL (base URL plus encoded query string) of the most recent request built by {@link #doGet}. */
    public static String requestURL;

    /**
     * Sends a GET request; only the varying parameters need to be passed in {@code params}.
     *
     * @param url_pre base URL, optionally already containing a query string; when {@code null}
     *                no request is sent
     * @param params  extra query parameters appended to {@code url_pre} (form-url-encoded as UTF-8);
     *                may be {@code null} or empty
     * @param count   index of the record being processed, used only in the error log line
     * @return the response body decoded as UTF-8; an empty string when {@code url_pre} is
     *         {@code null}, the status code is not 200 or the body is absent; {@code null}
     *         when an exception occurred while sending or reading the request
     */
    public static String doGet(String url_pre, Map<String, String> params, int count) {
        String response = "";
        if (url_pre == null) {
            return response;
        }
        GetMethod getMethod = null;
        try {
            List<NameValuePair> qparams_pre = getParamsList_pre(params);
            if (qparams_pre != null && !qparams_pre.isEmpty()) {
                String formatParams = EncodingUtil.formUrlEncode(
                        qparams_pre.toArray(new NameValuePair[qparams_pre.size()]), "utf-8");
                // Use '?' for the first parameter, '&' when the URL already carries a query string.
                url_pre = url_pre.indexOf("?") < 0 ? url_pre + "?" + formatParams : url_pre + "&" + formatParams;
            }
            requestURL = url_pre;

            HttpClient httpClient = new HttpClient();
            getMethod = new GetMethod(url_pre);
            getMethod.addRequestHeader(new Header("Content-type", "application/json"));

            int statusCode = httpClient.executeMethod(getMethod);
            if (statusCode != 200) {
                // Report the record index and the failing status code.
                System.out.println("第【" + count + "】条日志,预发环境请求出错,错误码为=======" + statusCode);
                return response;
            }
            byte[] body = getMethod.getResponseBody();
            return body == null ? response : new String(body, "utf-8");
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            // Always hand the connection back, otherwise it stays allocated after every call.
            if (getMethod != null) {
                getMethod.releaseConnection();
            }
        }
    }

    /**
     * Converts the parameter map into the name/value pair list expected by
     * {@code EncodingUtil.formUrlEncode}.
     *
     * @param paramsMap parameters to convert; may be {@code null}
     * @return one pair per map entry, or {@code null} when the map is {@code null} or empty
     */
    private static List<NameValuePair> getParamsList_pre(Map<String, String> paramsMap) {
        if (paramsMap == null || paramsMap.isEmpty()) {
            return null;
        }
        List<NameValuePair> params = new ArrayList<NameValuePair>(paramsMap.size());
        for (Map.Entry<String, String> entry : paramsMap.entrySet()) {
            // Concatenating with "" turns a null key/value into the text "null" instead of passing null on.
            params.add(new NameValuePair(entry.getKey() + "", entry.getValue() + ""));
        }
        return params;
    }
}
2. OdpsUtil为连接数据库的工具类
package com.xx.searchRecall.utils;
import com.aliyun.odps.Instance;
import com.aliyun.odps.Odps;
import com.aliyun.odps.OdpsException;
import com.aliyun.odps.account.Account;
import com.aliyun.odps.account.AliyunAccount;
import com.aliyun.odps.data.Record;
import com.aliyun.odps.task.SQLTask;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
 * Utility for running SQL against ODPS and reading the result records.
 */
public class OdpsUtil {
    // Connection parameters for the ODPS development environment.
    private static String accessId = "xx";
    private static String accessKey = "xx";
    private static String odpsUrl = "http://xx.com/api";
    // Default project used for every query issued by this class.
    // private static String project = "xx";
    private static String project = "xx";

    /**
     * Runs {@code sql} using the credentials configured in this class.
     *
     * @param sql the SQL statement to run
     * @return the result records; an empty list when the task fails with an {@code OdpsException}
     */
    public static List<Record> getSQLResult(String sql) {
        return getSQLResult(sql, accessId, accessKey);
    }

    /**
     * Runs {@code sql} using the caller's own credentials.
     *
     * @param sql           the SQL statement to run
     * @param accessSelfId  access id of the account to run as
     * @param accessSelfKey access key of the account to run as
     * @return the result records; an empty list when the task fails with an {@code OdpsException}
     */
    public static List<Record> getSQLResult(String sql, String accessSelfId, String accessSelfKey) {
        Account account = new AliyunAccount(accessSelfId, accessSelfKey);
        Odps odps = new Odps(account);
        odps.setEndpoint(odpsUrl);
        odps.setDefaultProject(project);

        List<Record> records = new ArrayList<>();
        try {
            Instance instance = SQLTask.run(odps, sql);
            instance.waitForSuccess();
            records = SQLTask.getResult(instance);
        } catch (OdpsException e) {
            // Best effort: report the failure and fall through with the empty list.
            e.printStackTrace();
        }
        return records;
    }

    /**
     * Extracts the first column of every record as a string.
     *
     * @param list records to read; may be {@code null}
     * @return the first-column values in record order; empty when {@code list} is {@code null} or empty
     */
    public static List<String> record2wordList(List<Record> list) {
        List<String> listFile = new ArrayList<>();
        if (list == null) {
            return listFile;
        }
        for (Record record : list) {
            if (record == null) {
                // Skip null entries rather than failing the whole conversion.
                continue;
            }
            listFile.add(record.getString(0));
        }
        return listFile;
    }
}
3. RunProcess为程序入口方法,传入要查询的SQL,调用odps工具类按行读取数据源(top1000的query)
package com.xx.searchRecall;
import com.xx.searchRecall.utils.OdpsUtil;
import com.xx.searchRecall.utils.TimeTransfer;
import com.xx.searchRecall.utils.logOnlineReadODPS;
import com.xx.odps.data.Record;
import java.text.SimpleDateFormat;
import java.util.List;
/**
 * Program entry point: queries ODPS for person-card queries and hands the rows to
 * {@code logOnlineReadODPS.startSearch}.
 */
public class RunProcess {
    private static String accessId = "xx";
    private static String accessKey = "xx";

    /**
     * Runs the ODPS query, starts the search over the returned rows, and prints the
     * start time, end time, elapsed seconds and row count.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // The value is not used further in this method; the call itself is kept.
        String currentDay = TimeTransfer.getCurrentTime();

        String personCardSql = buildPersonCardSql();

        // User-defined date/time pattern used for both the start and end timestamps.
        SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy年MM月dd日 HH:mm:ss");

        long startedAt = System.currentTimeMillis();
        // Epoch milliseconds are formatted directly into a readable timestamp.
        String startedAtText = timestampFormat.format(startedAt);
        System.out.println("===============查询客户端传参日志SQL开始执行了,startTimeRun为=================【" + startedAtText + "】");

        List<Record> rows = OdpsUtil.getSQLResult(personCardSql, accessId, accessKey);
        logOnlineReadODPS.startSearch(rows);

        long finishedAt = System.currentTimeMillis();
        String finishedAtText = timestampFormat.format(finishedAt);
        System.out.println("==========读取odps当前日期的传参日志完毕,endTimeRun为===========【" + finishedAtText + "】");

        long elapsedSeconds = (finishedAt - startedAt) / 1000;
        System.out.println("==========从连接到读取数据库日志的时长,ReadTime为===========【" + elapsedSeconds + "】秒");
        System.err.println("list.size=" + rows.size());
    }

    /**
     * Builds the SQL that selects up to 1000 person-card queries within the fixed date range.
     *
     * @return the SQL statement text
     */
    private static String buildPersonCardSql() {
        return "SELECT t0t.query AS f1 FROM( \n" +
                "\n" +
                "SELECT ftbl_1t.type AS type\n" +
                " , ftbl_1t.xx AS xx\n" +
                " , ftbl_1t.xx AS xx\n" +
                " , ftbl_1t.ctr AS ctr\n" +
                " , ftbl_1t.xx AS xx\n" +
                " , ftbl_1t.xx AS xx\n" +
                " , ftbl_1t.xx AS xx\n" +
                " , ftbl_1t.vv AS vv\n" +
                " , ftbl_1t.ts AS ts\n" +
                " , ftbl_1t.xx AS xx\n" +
                " , ftbl_1t.query AS query\n" +
                " , ftbl_1t.ds AS ds\n" +
                "FROM xx.xx ftbl_1t\n" +
                "\n" +
                "\n" +
                " )t0t WHERE ((t0t.ds >= '20200419') AND(t0t.ds < '20200519')) AND(t0t.type = '人物卡片') ORDER BY TO_DATE(t0t.ds,'yyyymmdd') DESC LIMIT 1000;";
    }
}
4. ReadFiles为读取本地数据的工具类(参考百度搜到的一段代码),本地文件格式:query之间以英文逗号","分隔,示例如下:
赵露思,周星驰,陈芊芊,林正英,迪丽热巴,杨烁,刘德华,吴亦凡
package com.alibaba.searchRecall.utils;
import java.io.*;
import java.util.Arrays;
/**
 * Reads a local text file of comma-separated keywords.
 */
public class ReadFiles {
    /**
     * Reads {@code filePath} as UTF-8 and returns its comma-separated tokens.
     * Lines are joined with a comma, so tokens may be separated by commas, line breaks or both.
     * Each line is echoed to standard output while it is read.
     *
     * @param filePath path of the file to read; may be {@code null}
     * @return the tokens in file order; an empty array when the path is {@code null}, the file
     *         does not exist, or nothing could be read
     */
    public static String[] readTxt(String filePath) {
        File file = filePath == null ? null : new File(filePath);
        if (file == null || !file.isFile()) {
            System.out.println("文件不存在!");
            return new String[0];
        }

        StringBuilder builder = new StringBuilder();
        // try-with-resources closes the reader even when readLine() fails part-way through.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), "utf-8"))) {
            String lineTxt;
            while ((lineTxt = br.readLine()) != null) {
                System.out.println(lineTxt);
                builder.append(lineTxt);
                builder.append(",");
            }
        } catch (IOException e) {
            System.out.println("文件读取错误!");
            e.printStackTrace();
        }

        // "".split(",") yields a single empty token; return a truly empty array instead so
        // callers do not issue a request with an empty keyword.
        if (builder.length() == 0) {
            return new String[0];
        }
        return builder.toString().split(",");
    }

    /**
     * Manual check: reads the sample keyword file and prints the parsed tokens.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String filePath = "/Users/xx/searchRecall/utils/person.txt";
        System.out.println(filePath);
        String[] strings = readTxt(filePath);
        System.out.println("strings:" + Arrays.toString(strings));
    }
}
5. logOnlineReadODPS为从数据库类取到源数据后请求接口,接口返回解析
package com.xx.searchRecall.utils;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.aliyun.odps.data.Record;
import java.util.*;
/**
 * Requests the search endpoint for every query read from ODPS and reports how often
 * doc_source 3 and doc_source 12 are returned alone, together, or not at all.
 */
public class logOnlineReadODPS {
    public static void main(String[] args) {
        // startSearch();
    }

    // Collection intended to hold search queries.
    public static List<String> list = new ArrayList<>();

    /**
     * Sends one request per record (keyword taken from column {@code f1}), classifies each
     * response with {@link #responseToParse}, and prints the counts, percentages and the
     * comma-joined queries of each category.
     *
     * @param list records whose {@code f1} column holds the query; {@code null} is treated as empty
     */
    public static void startSearch(List<Record> list) {
        int only3 = 0;
        int only12 = 0;
        int both3_12 = 0;
        int no3_12 = 0;
        StringBuilder query3 = new StringBuilder();
        StringBuilder query12 = new StringBuilder();
        StringBuilder query3_12 = new StringBuilder();
        StringBuilder queryNO3_12 = new StringBuilder();

        int totalCount = list == null ? 0 : list.size();
        for (int i = 0; i < totalCount; i++) {
            // Query text of the current row.
            Record record = list.get(i);
            String keywords = record.getString("f1");
            Map<String, String> query = new HashMap<>();
            query.put("keyword", keywords);

            String url_pre = "http://xx/query?noqc=0&xx=xx&pg=1&nocache=1&sdkver=xx";
            System.out.println("第" + (i + 1) + "条数据==" + query);
            String response = HTTPCommonMethod.doGet(url_pre, query, i);

            JSONObject responseJson = null;
            try {
                responseJson = JSONObject.parseObject(response);
            } catch (RuntimeException e) {
                // One malformed body must not abort the whole run; it is counted as "neither".
                e.printStackTrace();
            }

            int type = responseToParse(i, keywords, responseJson);
            if (type == 1) {
                only3++;
                query3.append(keywords).append(",");
            } else if (type == 2) {
                only12++;
                query12.append(keywords).append(",");
            } else if (type == 3) {
                both3_12++;
                query3_12.append(keywords).append(",");
            } else {
                no3_12++;
                queryNO3_12.append(keywords).append(",");
            }
        }

        System.out.println("totalCount==" + totalCount);
        String rate3 = toPercent(only3, totalCount);
        String rate12 = toPercent(only12, totalCount);
        String rate3_12 = toPercent(both3_12, totalCount);
        String rateNO3_12 = toPercent(no3_12, totalCount);
        System.out.println("------------------------------------------------------------------------------------------------");
        System.out.println("------------------------------------------------------------------------------------------------g");
        System.out.println("only3---只召回自频道==【" + only3 + "】---比率为==【" + rate3 + "】---query3==【" + query3 + "】");
        System.out.println("only12---只召回人物==【" + only12 + "】---比率为==【" + rate12 + "】---query12==【" + query12 + "】");
        System.out.println("both3-12---同时召回自频道和人物==【" + both3_12 + "】---比率为==【" + rate3_12 + "】---query3_12==【" + query3_12 + "】");
        System.out.println("no3-12---均未召回自频道和人物==【" + no3_12 + "】---比率为==【" + rateNO3_12 + "】---queryNO3_12==【" + queryNO3_12 + "】");
    }

    /**
     * Formats {@code count / total} as a percentage string.
     *
     * @return the percentage followed by {@code %}; {@code 0.0%} when {@code total} is not positive
     */
    private static String toPercent(int count, int total) {
        if (total <= 0) {
            // Avoid printing NaN% when there was nothing to query.
            return 0.0f + "%";
        }
        return ((float) count / (float) total) * 100 + "%";
    }

    /**
     * Classifies a response by the {@code doc_source} values found under {@code resultList}.
     * Items that are null or carry no {@code doc_source} are skipped.
     *
     * @param count    index of the record, used only in the log line
     * @param query    the query text, used only in the log line
     * @param response parsed response; may be {@code null}
     * @return 1: only doc_source 3 present; 2: only doc_source 12 present; 3: both present;
     *         0: neither present, the response is null/empty, or parsing failed
     */
    public static int responseToParse(int count, String query, JSONObject response) {
        if (response == null || response.isEmpty()) {
            System.err.println("第【" + count + "】条日志,搜索query为==【" + query + "】,接口返回为空");
            return 0;
        }

        boolean docSource3 = false;
        boolean docSource12 = false;
        try {
            JSONArray jsonArray = response.getJSONArray("resultList");
            if (jsonArray == null) {
                return 0;
            }
            for (int i = 0; i < jsonArray.size(); i++) {
                JSONObject item = jsonArray.getJSONObject(i);
                if (item == null) {
                    continue;
                }
                Integer docSource = item.getInteger("doc_source");
                if (docSource == null) {
                    // An item without the field must not discard the types already seen.
                    continue;
                }
                int value = docSource.intValue();
                if (value == 3) {
                    docSource3 = true;
                } else if (value == 12) {
                    docSource12 = true;
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
            return 0;
        }

        if (docSource3 && docSource12) {
            return 3;
        }
        if (docSource3) {
            return 1;
        }
        if (docSource12) {
            return 2;
        }
        return 0;
    }

    public static JSONObject jsonObject = new JSONObject();
}
6. logOnlineReadFiles为读取本地数据源,请求接口,接口返回解析
package com.alibaba.searchRecall.utils;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import java.util.*;
import static com.alibaba.searchRecall.utils.ReadFiles.readTxt;
/**
 * Requests the search endpoint for every keyword read from a local file and reports how often
 * doc_source 3 and doc_source 12 are returned alone, together, or not at all.
 */
public class logOnlineReadFiles {
    public static void main(String[] args) {
        startSearch();
    }

    // Collection intended to hold search scenarios.
    public static List<String> list = new ArrayList<>();

    /**
     * Reads the keyword file, sends one request per non-empty keyword, classifies each response
     * with {@link #responseToParse}, and prints the counts, percentages and the comma-joined
     * keywords of each category.
     */
    public static void startSearch() {
        // Approach 1: keywords exported to a local file, then sent to the endpoint one by one.
        // Approach 2 (reading the keywords through the ODPS utility) lives in logOnlineReadODPS.
        String filePath = "/Users/lishan/Desktop/xx/xx/src/main/java/com/xx/searchRecall/person.txt";
        System.out.println(filePath);
        String[] keywords = readTxt(filePath);
        System.out.println("strings:" + Arrays.toString(keywords));

        int only3 = 0;
        int only12 = 0;
        int both3_12 = 0;
        int no3_12 = 0;
        StringBuilder query3 = new StringBuilder();
        StringBuilder query12 = new StringBuilder();
        StringBuilder query3_12 = new StringBuilder();
        StringBuilder queryNO3_12 = new StringBuilder();
        int totalCount = 0;

        int keywordCount = keywords == null ? 0 : keywords.length;
        for (int i = 0; i < keywordCount; i++) {
            String keyword = keywords[i];
            if (keyword == null || keyword.isEmpty()) {
                // An empty token (missing file, ",,") is not a real query and would skew the rates.
                continue;
            }
            Map<String, String> query = new HashMap<>();
            query.put("keyword", keyword);

            String url_pre = "http://xx/query?noqc=0&pg=1&nocache=1&xx=308";
            System.out.println("第" + (i + 1) + "条数据==" + query);
            String response = HTTPCommonMethod.doGet(url_pre, query, i);

            JSONObject responseJson = null;
            try {
                responseJson = JSONObject.parseObject(response);
            } catch (RuntimeException e) {
                // One malformed body must not abort the whole run; it is counted as "neither".
                e.printStackTrace();
            }

            int type = responseToParse(i, keyword, responseJson);
            if (type == 1) {
                only3++;
                query3.append(keyword).append(",");
            } else if (type == 2) {
                only12++;
                query12.append(keyword).append(",");
            } else if (type == 3) {
                both3_12++;
                query3_12.append(keyword).append(",");
            } else {
                no3_12++;
                queryNO3_12.append(keyword).append(",");
            }
            totalCount++;
        }

        System.out.println("totalCount==" + totalCount);
        // Rates are reported as percentages, in the same form logOnlineReadODPS prints them.
        String rate3 = toPercent(only3, totalCount);
        String rate12 = toPercent(only12, totalCount);
        String rate3_12 = toPercent(both3_12, totalCount);
        String rateNO3_12 = toPercent(no3_12, totalCount);
        System.out.println("------------------------------------------------------------------------------------------------");
        System.out.println("------------------------------------------------------------------------------------------------g");
        System.out.println("only3---只召回自频道==【" + only3 + "】---比率为==【" + rate3 + "】---query3==【" + query3 + "】");
        System.out.println("only12---只召回人物==【" + only12 + "】---比率为==【" + rate12 + "】---query12==【" + query12 + "】");
        System.out.println("both3-12---同时召回自频道和人物==【" + both3_12 + "】---比率为==【" + rate3_12 + "】---query3_12==【" + query3_12 + "】");
        System.out.println("no3-12---均未召回自频道和人物==【" + no3_12 + "】---比率为==【" + rateNO3_12 + "】---queryNO3_12==【" + queryNO3_12 + "】");
    }

    /**
     * Formats {@code count / total} as a percentage string.
     *
     * @return the percentage followed by {@code %}; {@code 0.0%} when {@code total} is not positive
     */
    private static String toPercent(int count, int total) {
        if (total <= 0) {
            // Avoid printing NaN% when there was nothing to query.
            return 0.0f + "%";
        }
        return ((float) count / (float) total) * 100 + "%";
    }

    /**
     * Classifies a response by the {@code doc_source} values found under {@code resultList}.
     * Items that are null or carry no {@code doc_source} are skipped.
     *
     * @param count    index of the keyword, used only in the log line
     * @param query    the query text, used only in the log line
     * @param response parsed response; may be {@code null}
     * @return 1: only doc_source 3 present; 2: only doc_source 12 present; 3: both present;
     *         0: neither present, the response is null/empty, or parsing failed
     */
    public static int responseToParse(int count, String query, JSONObject response) {
        if (response == null || response.isEmpty()) {
            System.err.println("第【" + count + "】条日志,搜索query为==【" + query + "】,接口返回为空");
            return 0;
        }

        boolean docSource3 = false;
        boolean docSource12 = false;
        try {
            JSONArray jsonArray = response.getJSONArray("resultList");
            if (jsonArray == null) {
                return 0;
            }
            for (int i = 0; i < jsonArray.size(); i++) {
                JSONObject item = jsonArray.getJSONObject(i);
                if (item == null) {
                    continue;
                }
                Integer docSource = item.getInteger("doc_source");
                if (docSource == null) {
                    // An item without the field must not discard the types already seen.
                    continue;
                }
                int value = docSource.intValue();
                if (value == 3) {
                    docSource3 = true;
                } else if (value == 12) {
                    docSource12 = true;
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
            return 0;
        }

        if (docSource3 && docSource12) {
            return 3;
        }
        if (docSource3) {
            return 1;
        }
        if (docSource12) {
            return 2;
        }
        return 0;
    }

    public static JSONObject jsonObject = new JSONObject();
}
最终提供给pm的样子
。。。