1.将指定ss文件处理后按照要求处理后输出为另一个ss文件(第一列为id 第二列为json):
Scope 部分:
#DECLARE InputSS string = @"/local/dev-kekambli/EntityMatching/Testing3/SampleReference.ss";
#DECLARE OutputSS string = @"/users/v-tiapeng@microsoft.com/scope/2021-02-23/result.ss";
Input = SSTREAM @InputSS;
sampleReference =
SELECT Key, Annotation, Values, Contexts, AggregatedConfidence, PropertiesForReference
FROM Input;
sampleReferenceProcessed = PROCESS sampleReference
PRODUCE Sid, Json
USING JsonProcessor;
OUTPUT sampleReferenceProcessed
TO SSTREAM @OutputSS
CLUSTERED BY Sid;
C# 部分:
using Microsoft.SCOPE.Types;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;
using Newtonsoft.Json;
using System.Linq;
public class JsonProcessor : Processor
{
public const string ReferenceEntityValueDelimiter = "#<>#";
public const string ReferenceEntityConfidenceDelimiter = ",";
String[] SchemaString = { "Sid", "Json" };
public override Schema Produces(string[] requested_columns, string[] args, Schema input_schema)
{
var output_schema = new Schema(SchemaString);
return output_schema;
}
public override IEnumerable<Row> Process(RowSet input_rowset, Row output_row, string[] args)
{
var jsonDictionary = new Dictionary<string, EntityDocument>();
foreach (Row input_row in input_rowset.Rows)
{
var sid = input_row[0].String.Remove(0, "http://knowledge.microsoft.com/".Length);
var annotation = input_row[1].String;
var values = input_row[2].String;
var contexts = input_row[3].String;
var aggregatedConfidence = input_row[4].String;
var propertiesForReference = input_row[5].String;
if (!jsonDictionary.ContainsKey(sid))
{
jsonDictionary.Add(sid, new EntityDocument());
}
var entity = new ReferenceEntityProperty();
entity.Annotation = annotation;
entity.Values = values.Split(new string[] { ReferenceEntityValueDelimiter }, StringSplitOptions.RemoveEmptyEntries).ToList();
entity.AggregatedConfidence = aggregatedConfidence.Split(new string[] { ReferenceEntityConfidenceDelimiter }, StringSplitOptions.RemoveEmptyEntries).ToList();
entity.PropertiesForReference = propertiesForReference;
jsonDictionary[sid].ReferenceEntityProperties.Add(entity);
}
foreach (var sid in jsonDictionary.Keys)
{
var entityDocument= jsonDictionary[sid];
output_row[0].Set(sid);//Sid
output_row[1].Set(JsonConvert.SerializeObject(entityDocument));//entityDocuments Json
yield return output_row;
}
}
}
class EntityDocument
{
[JsonProperty("ReferenceEntityProperties")]
public List<ReferenceEntityProperty> ReferenceEntityProperties { get; set; }
public EntityDocument()
{
this.ReferenceEntityProperties = new List<ReferenceEntityProperty>();
}
}
class ReferenceEntityProperty
{
[JsonProperty("Annotation")]
public string Annotation { get; set; }
[JsonProperty("Values")]
public List<string> Values { get; set; }
[JsonProperty("AggregatedConfidence")]
public List<string> AggregatedConfidence { get; set; }
[JsonProperty("PropertiesForReference")]
public string PropertiesForReference { get; set; }
}
2.将ss文件输出为tsv文件:
#DECLARE StreamExpiry string = @"@@StreamExpiry@@";
#DECLARE InputSS string = @@InputSS@@;
#DECLARE OutTsv string = @@OutTsv@@;
Input = SSTREAM @InputSS;
LinkingResults =
SELECT *
FROM Input;
OUTPUT LinkingResults
TO @OutTsv
WITH STREAMEXPIRY @StreamExpiry;
3.根据输入tsv文件中的sid,抓取bing页面上的指定内容转换为html panel并输出为另一个tsv文件:
Scope 部分:
#DECLARE InputPath string = @@InputPath@@;
#DECLARE OutputPath string = @@OutputPath@@;
searchlog =
EXTRACT Sid : string,
Query : string,
Urls : string
FROM @InputPath
USING DefaultTextExtractor();
rs1 = PROCESS searchlog
PRODUCE Sid, Urls, Html
USING HtmlPanelProcessor;
OUTPUT rs1
TO @OutputPath;
C# 部分:
using Microsoft.SCOPE.Types;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;
using System.Threading.Tasks;
using System.Net;
using HtmlAgilityPack;
using Polly;
using System.Web;
using System.Net.Http;
public class HtmlPanelProcessor : Processor
{
String[] SchemaString = {"Sid", "Urls", "Html"};
public delegate Task<string> del(string entityId);
static del d;
public override Schema Produces(string[] requested_columns, string[] args, Schema input_schema)
{
var output_schema = new Schema(SchemaString);
return output_schema;
}
public override IEnumerable<Row> Process(RowSet input_rowset, Row output_row, string[] args)
{
foreach (Row input_row in input_rowset.Rows)
{
var satoriId = input_row[0].String;
var displayQuery = input_row[1].String;
var Url = input_row[2].String;
output_row[0].Set(satoriId);//Sid
output_row[1].Set(Url);//Url
Task<string> task = ScrapeBing(satoriId);
task.Wait();
output_row[2].Set(task.Result);//Html Panel
yield return output_row;
}
}
//normal way
private static async Task<string> ScrapeBing(string entityId)
{
var document = await ScrapeWorkerAsync(entityId);
var nodes = document.DocumentNode.SelectSingleNode("//div[@class=\"b_entityTP\"]");
Console.WriteLine($"nodes = {nodes}");
StringBuilder result = new StringBuilder();
if (nodes != null)
{
var imageNode = nodes.SelectSingleNode("//div[@class=\"b_FBleedModule\"]");
var entityFacts = nodes.SelectSingleNode("//div[@class=\"b_sideBleed\"]");
if (imageNode != null)
{
result.Append(imageNode.InnerHtml);
}
if (entityFacts != null)
{
result.Append(entityFacts.InnerHtml);
}
var entityPaneData = result.ToString();
entityPaneData = entityPaneData.Replace(System.Environment.NewLine, "");
entityPaneData = entityPaneData.Replace("/th?id=", "https://www.bing.com/th?id=");
entityPaneData = entityPaneData.Replace("https://www.bing.comhttps://www.bing.com/", "https://www.bing.com/");
return entityPaneData;
}
return null;
}
private static async Task<HtmlDocument> ScrapeWorkerAsync(string satoriId)
{
var url = $"https://www.bing.com/search?q=abc&FORM=MONITR&setflight=0&filters=sid%3a%22{satoriId}%22";
HttpClient httpClient = new HttpClient();
var response = await httpClient.GetAsync(url);
response.EnsureSuccessStatusCode();
var result = await response.Content.ReadAsStringAsync();
var document = new HtmlDocument();
document.LoadHtml(result);
return document;
}
}
4.processor基础教程,用于修改某列数据:
Scope部分:
searchlog = VIEW "/users/v-tiapeng@microsoft.com/scope/SearchLog.view";
rs1 =
SELECT Start, Region
FROM searchlog;
rs2 = PROCESS rs1
PRODUCE Start, Region
USING ModifyProcessor;
OUTPUT rs2
TO "/users/v-tiapeng@microsoft.com/scope/output_ModifyProcessor.tsv";
C#部分:
using Microsoft.SCOPE.Types;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;
public class ModifyProcessor : Processor
{
public override Schema Produces(string[] requested_columns, string[] args, Schema input_schema)
{
var output_schema = input_schema.Clone();
return output_schema;
}
public override IEnumerable<Row> Process(RowSet input_rowset, Row output_row, string[] args)
{
foreach (Row input_row in input_rowset.Rows)
{
input_row.CopyTo(output_row);
string start = input_row[1].String;
output_row[1].Set("Region: " + start);
yield return output_row;
}
}
}
5.Scope里面没有limit关键字,于是利用TOP和EXCEPT进行分页
Scope部分:
#DECLARE InputSS string = @"/projects/merlin/QnACurationNew/xCUVStream/GoogleIntentFiltered_es.2021-03-31_09_58_12.ss";
#DECLARE OutputSS string = @"/users/v-tiapeng@microsoft.com/QuantusScrapper/GoogleIntentFiltered_es.2021-03-31_09_58_12_100000-150000.ss";
#DECLARE OutputSS2 string = @"/users/v-tiapeng@microsoft.com/QuantusScrapper/Count.ss";
Input = SSTREAM @InputSS;
rs =
SELECT Query
FROM Input;
rs2 =
PROCESS rs
PRODUCE Query,
Market
USING ScrapperProcessor;
current =
SELECT TOP 150000 Query, Market
FROM rs2
EXCEPT
SELECT TOP 100000 Query, Market
FROM rs2;
count =
SELECT COUNT(1) AS Counts
FROM current;
OUTPUT current
TO SSTREAM @OutputSS;
OUTPUT count
TO SSTREAM @OutputSS2;
C#部分:
using Microsoft.SCOPE.Types;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;
public class ScrapperProcessor : Processor
{
String[] SchemaString = { "Query", "Market" };
public override Schema Produces(string[] requested_columns, string[] args, Schema input_schema)
{
var output_schema = new Schema(SchemaString);
return output_schema;
}
public override IEnumerable<Row> Process(RowSet input_rowset, Row output_row, string[] args)
{
foreach (Row input_row in input_rowset.Rows)
{
output_row[0].Set(input_row[0].String);
output_row[1].Set("-");
yield return output_row;
}
}
}
6.利用cross join来合并两个只有单行的表,并进行两列的计算.
#DECLARE InputSS string = @"/projects/merlin/QnACurationNew/monitoring/QnAMonitoring.full.2021-04-07.ss";
#DECLARE OutputSS string = @"/users/v-tiapeng@microsoft.com/scope/2021-04-08/test.ss";
Input = SSTREAM @InputSS;
rs =
SELECT Value
FROM Input
WHERE Key == "ar-AE_static_Desktop_QnACurationNew_Coverage" AND Date == DateTime.Parse("04/07/2021");
rs2 =
SELECT Value
FROM Input
WHERE Key == "ar-AE_static_Desktop_QnACurationNew_Coverage" AND Date == DateTime.Parse("04/06/2021");
rs3 =
SELECT rs.Value AS Current_Value,
rs2.Value AS Previous_Value
FROM rs
CROSS JOIN
rs2;
rs4 =
SELECT Current_Value / Previous_Value AS Quotient
FROM rs3;
OUTPUT rs4
TO SSTREAM @OutputSS;
7.判断一个表里的Query是否在另一个表里出现,如果存在输出yes,不存在输出no.其中Regex.Replace()方法可以用来使query正则化
Scope部分:
#DECLARE InputSS string = "/users/v-tiapeng%40microsoft.com/scope/2021-03-31/QueriesSS/zh-xx.ss";
#DECLARE InputSS2 string = "/local/OpenDataAnswer/data/qna/feeds/flights/QnACurationNew_zh-CN.2021-04-05_09_33_40.ss";
#DECLARE OutputSS string = "/users/v-tiapeng%40microsoft.com/WPSBS/QnACurationResults/zh-xx.ss";
Input = SSTREAM @InputSS;
Input2 = SSTREAM @InputSS2;
rs =
SELECT Regex.Replace(Query.ToLowerInvariant(), @"\s+", " ").Trim(new Char[]{ ' ', '?', '.'}) AS Query
FROM Input;
rs2 =
SELECT Regex.Replace(Query.ToLowerInvariant(), @"\s+", " ").Trim(new Char[]{ ' ', '?', '.'}) AS Query,
Answer
FROM Input2;
rs3 =
SELECT Query,
Answer
FROM rs
LEFT JOIN
rs2
ON rs.Query == rs2.Query;
rs4 =
PROCESS rs3
PRODUCE Query,
QueryPresentInCuration
USING QueryProcessor;
OUTPUT rs4
TO SSTREAM @OutputSS;
C#部分:
using Microsoft.SCOPE.Types;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;
public class QueryProcessor : Processor
{
string[] schema = { "Query", "QueryPresentInCuration" };
public override Schema Produces(string[] requested_columns, string[] args, Schema input_schema)
{
var output_schema = new Schema(schema);
return output_schema;
}
public override IEnumerable<Row> Process(RowSet input_rowset, Row output_row, string[] args)
{
foreach (Row input_row in input_rowset.Rows)
{
output_row[0].Set(input_row[0].String);
if(input_row[1].String != null)
{
output_row[1].Set("YES");
}
else
{
output_row[1].Set("NO");
}
yield return output_row;
}
}
}
8.从view里面查询信息:
Dataset =
SELECT (string.IsNullOrEmpty(SuggestionString) ? Query_RawQuery : SuggestionString).ToLowerInvariant() AS Query,ReferenceAnswerSubScenario
FROM (
VIEW @"/shares/searchDM/distrib/released/xSLAPI/xSLAPI.view"
PARAMS
(
Start = "2021-03-30", // "2010-06-25"
End = "2021-03-30", // "2014-12-12"
Dataset = "Bing.com", // Bing.com,Mobile,Cortana,Charm,Opal
Traffic = "Normal", // Internal,Normal,Abnormal
QueryType = "Web" // Web,MM,AutoSuggest,Homepage,API,News,All
)
)
WHERE Market IN ("ar-AE") AND ReferenceAnswerSubScenario == "QnACurationNew";
OUTPUT Dataset
TO SSTREAM "/users/v-tiapeng@microsoft.com/scope/2021-04-08/SLAPiLog_2021-03-30.ss";
9.使用UNION合并多个表中的查询记录(与EXCEPT相反).
#DECLARE InputSS string = "/users/v-tiapeng@microsoft.com/scope/MetricsResults/xCUVStream-2021-03-19/Result_NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguageL30.ss";
#DECLARE InputSS2 string = "/users/v-tiapeng@microsoft.com/scope/MetricsResults/xCUVStream-2021-03-20/Result_NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguageL30.ss";
#DECLARE InputSS3 string = "/users/v-tiapeng@microsoft.com/scope/MetricsResults/xCUVStream-2021-03-21/Result_NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguageL30.ss";
#DECLARE InputSS4 string = "/users/v-tiapeng@microsoft.com/scope/MetricsResults/xCUVStream-2021-03-22/Result_NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguageL30.ss";
#DECLARE InputSS5 string = "/users/v-tiapeng@microsoft.com/scope/MetricsResults/xCUVStream-2021-03-23/Result_NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguageL30.ss";
#DECLARE InputSS6 string = "/users/v-tiapeng@microsoft.com/scope/MetricsResults/xCUVStream-2021-03-24/Result_NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguageL30.ss";
#DECLARE InputSS7 string = "/users/v-tiapeng@microsoft.com/scope/MetricsResults/xCUVStream-2021-03-25/Result_NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguageL30.ss";
#DECLARE InputSS8 string = "/users/v-tiapeng@microsoft.com/scope/MetricsResults/xCUVStream-2021-03-26/Result_NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguageL30.ss";
#DECLARE InputSS9 string = "/users/v-tiapeng@microsoft.com/scope/MetricsResults/xCUVStream-2021-03-27/Result_NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguageL30.ss";
#DECLARE OutputSS string = "/users/v-tiapeng@microsoft.com/scope/2021-03-30/summary.ss";
Input = SSTREAM @InputSS;
Input2 = SSTREAM @InputSS2;
Input3 = SSTREAM @InputSS3;
Input4 = SSTREAM @InputSS4;
Input5 = SSTREAM @InputSS5;
Input6 = SSTREAM @InputSS6;
Input7 = SSTREAM @InputSS7;
Input8 = SSTREAM @InputSS8;
Input9 =
SSTREAM @InputSS9;
rs =
SELECT Language,
Date,
NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguage,
NumberOfQueriesWithThisQueryLanguage
FROM Input
WHERE Date == "3/19/2021"
UNION
SELECT Language,
Date,
NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguage,
NumberOfQueriesWithThisQueryLanguage
FROM Input2
WHERE Date == "3/20/2021"
UNION
SELECT Language,
Date,
NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguage,
NumberOfQueriesWithThisQueryLanguage
FROM Input3
WHERE Date == "3/21/2021"
UNION
SELECT Language,
Date,
NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguage,
NumberOfQueriesWithThisQueryLanguage
FROM Input4
WHERE Date == "3/22/2021"
UNION
SELECT Language,
Date,
NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguage,
NumberOfQueriesWithThisQueryLanguage
FROM Input5
WHERE Date == "3/23/2021"
UNION
SELECT Language,
Date,
NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguage,
NumberOfQueriesWithThisQueryLanguage
FROM Input6
WHERE Date == "3/24/2021"
UNION
SELECT Language,
Date,
NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguage,
NumberOfQueriesWithThisQueryLanguage
FROM Input7
WHERE Date == "3/25/2021"
UNION
SELECT Language,
Date,
NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguage,
NumberOfQueriesWithThisQueryLanguage
FROM Input8
WHERE Date == "3/26/2021"
UNION
SELECT Language,
Date,
NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguage,
NumberOfQueriesWithThisQueryLanguage
FROM Input9
WHERE Date == "3/27/2021";
OUTPUT rs
TO SSTREAM @OutputSS;
10.查询过去30天的记录:
//#DECLARE InputSS string = @"/projects/merlin/xCUV/Daily/2021-03-05.ss";
//#DECLARE OutputSS string = @"/users/v-tiapeng@microsoft.com/scope/2021-03-23/Result_AverageNumberOfQueriesPerDayByMarketL30.ss";
#DECLARE InputSS string = @@InputSS@@;
#DECLARE OutputSS string = @@OutputSS@@;
#DECLARE DATECOLUMN string;
#SET DATECOLUMN = "@@DateColumn@@";
Input = SSTREAM @InputSS;
xcuv =
SELECT date,
rawquery,
Market
FROM Input;
rs1 =
SELECT COUNT(DISTINCT date) AS TotalDays,
COUNT(DISTINCT rawquery) AS TotalQueries,
((float) COUNT(DISTINCT rawquery) / 30) AS AverageNumberOfQueriesPer30Days,
Market
FROM xcuv
//WHERE DateTime.Parse(date) >= DateTime.Parse("3/5/2021").AddDays(-30) AND DateTime.Parse(date) <= DateTime.Parse("3/5/2021")
WHERE DateTime.Parse(date) >= DateTime.Parse(@DATECOLUMN).AddDays(-30) AND DateTime.Parse(date) <= DateTime.Parse(@DATECOLUMN)
GROUP BY Market;
OUTPUT rs1
TO SSTREAM @OutputSS
CLUSTERED BY Market;
11.TOP k 当 k大于10000的时候无法与关键字ORDER BY 进行连用,因此解决办法是先用SORT BY… DESC来生成第一个ss文件确保排序,然后再用TOP关键字在这个文件中进行查询:
第一段scope:
rs = SELECT Query, GoogleViewsCount FROM Input;
OUTPUT rs TO SSTREAM @OutputSS CLUSTERED BY Query SORTED BY GoogleViewsCount DESC;
第二段scope:
rs = SELECT TOP @@SampleNumber@@ Query, GoogleViewsCount FROM Input;
OUTPUT rs TO SSTREAM @OutputSS CLUSTERED BY Query SORTED BY GoogleViewsCount DESC;