微软查询Cosmos记录

1.将指定ss文件处理后按照要求处理后输出为另一个ss文件(第一列为id 第二列为json):

Scope 部分:

#DECLARE InputSS string = @"/local/dev-kekambli/EntityMatching/Testing3/SampleReference.ss";
#DECLARE OutputSS string = @"/users/v-tiapeng@microsoft.com/scope/2021-02-23/result.ss";

Input = SSTREAM @InputSS;

sampleReference = 
    SELECT Key, Annotation, Values, Contexts, AggregatedConfidence, PropertiesForReference
    FROM Input;

sampleReferenceProcessed = PROCESS sampleReference
      PRODUCE Sid, Json
      USING JsonProcessor;


OUTPUT sampleReferenceProcessed
TO SSTREAM @OutputSS
   CLUSTERED BY Sid;

C# 部分:

using Microsoft.SCOPE.Types;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;
using Newtonsoft.Json;
using System.Linq;

public class JsonProcessor : Processor
{
    public const string ReferenceEntityValueDelimiter = "#<>#";
    public const string ReferenceEntityConfidenceDelimiter = ",";

    String[] SchemaString = { "Sid", "Json" };

    public override Schema Produces(string[] requested_columns, string[] args, Schema input_schema)
    {
        var output_schema = new Schema(SchemaString);
        return output_schema;
    }

    public override IEnumerable<Row> Process(RowSet input_rowset, Row output_row, string[] args)
    {
        var jsonDictionary = new Dictionary<string, EntityDocument>();

        foreach (Row input_row in input_rowset.Rows)
        {
            var sid = input_row[0].String.Remove(0, "http://knowledge.microsoft.com/".Length);
            var annotation = input_row[1].String;
            var values = input_row[2].String;
            var contexts = input_row[3].String;
            var aggregatedConfidence = input_row[4].String;
            var propertiesForReference = input_row[5].String;

            if (!jsonDictionary.ContainsKey(sid))
            {
                jsonDictionary.Add(sid, new EntityDocument());
            }

            var entity = new ReferenceEntityProperty();
            entity.Annotation = annotation;
            entity.Values = values.Split(new string[] { ReferenceEntityValueDelimiter }, StringSplitOptions.RemoveEmptyEntries).ToList();
            entity.AggregatedConfidence = aggregatedConfidence.Split(new string[] { ReferenceEntityConfidenceDelimiter }, StringSplitOptions.RemoveEmptyEntries).ToList();
            entity.PropertiesForReference = propertiesForReference;
            jsonDictionary[sid].ReferenceEntityProperties.Add(entity);     
        }

        foreach (var sid in jsonDictionary.Keys)
        {
            var entityDocument= jsonDictionary[sid];

            output_row[0].Set(sid);//Sid
            output_row[1].Set(JsonConvert.SerializeObject(entityDocument));//entityDocuments Json


            yield return output_row;
        }       
    }
}

class EntityDocument
{

    [JsonProperty("ReferenceEntityProperties")]
    public List<ReferenceEntityProperty> ReferenceEntityProperties { get; set; }

    public EntityDocument()
    {
        this.ReferenceEntityProperties = new List<ReferenceEntityProperty>();
    }
}

class ReferenceEntityProperty
{
    [JsonProperty("Annotation")]
    public string Annotation { get; set; }

    [JsonProperty("Values")]
    public List<string> Values { get; set; }

    [JsonProperty("AggregatedConfidence")]
    public List<string> AggregatedConfidence { get; set; }

    [JsonProperty("PropertiesForReference")]
    public string PropertiesForReference { get; set; }
}

2.将ss文件输出为tsv文件:

#DECLARE StreamExpiry string = @"@@StreamExpiry@@";

#DECLARE InputSS string = @@InputSS@@;

#DECLARE OutTsv string = @@OutTsv@@;

Input = SSTREAM @InputSS;

LinkingResults =
    SELECT *
    
    FROM Input;

OUTPUT LinkingResults
TO @OutTsv
WITH STREAMEXPIRY @StreamExpiry;

3.根据输入tsv文件中的sid,抓取bing页面上的指定内容转换为html panel并输出为另一个tsv文件:

Scope 部分:

#DECLARE InputPath string = @@InputPath@@;
#DECLARE OutputPath string = @@OutputPath@@;

searchlog =
    EXTRACT Sid : string,
            Query : string,
            Urls : string   
    FROM @InputPath
    USING DefaultTextExtractor();

rs1 = PROCESS searchlog
      PRODUCE Sid, Urls, Html
      USING HtmlPanelProcessor;

OUTPUT rs1
    TO @OutputPath;

C# 部分:

using Microsoft.SCOPE.Types;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;
using System.Threading.Tasks;
using System.Net;
using HtmlAgilityPack;
using Polly;
using System.Web;
using System.Net.Http;

public class HtmlPanelProcessor : Processor
{
    String[] SchemaString = {"Sid", "Urls", "Html"};

    public delegate Task<string> del(string entityId);
    static del d;

    public override Schema Produces(string[] requested_columns, string[] args, Schema input_schema)
    {
        var output_schema = new Schema(SchemaString);
        return output_schema;
    }

    public override IEnumerable<Row> Process(RowSet input_rowset, Row output_row, string[] args)
    {
        foreach (Row input_row in input_rowset.Rows)
        {
            var satoriId = input_row[0].String;
            var displayQuery = input_row[1].String;
            var Url = input_row[2].String;

            output_row[0].Set(satoriId);//Sid
            output_row[1].Set(Url);//Url
            Task<string> task = ScrapeBing(satoriId);
            task.Wait();
            output_row[2].Set(task.Result);//Html Panel

            yield return output_row;
        }
    }

    //normal way
    private static async Task<string> ScrapeBing(string entityId)
    {
        var document = await ScrapeWorkerAsync(entityId);
        var nodes = document.DocumentNode.SelectSingleNode("//div[@class=\"b_entityTP\"]");
        Console.WriteLine($"nodes = {nodes}");
        StringBuilder result = new StringBuilder();
        if (nodes != null)
        {
            var imageNode = nodes.SelectSingleNode("//div[@class=\"b_FBleedModule\"]");
            var entityFacts = nodes.SelectSingleNode("//div[@class=\"b_sideBleed\"]");
            if (imageNode != null)
            {
                result.Append(imageNode.InnerHtml);
            }
            if (entityFacts != null)
            {
                result.Append(entityFacts.InnerHtml);
            }

            var entityPaneData = result.ToString();

            entityPaneData = entityPaneData.Replace(System.Environment.NewLine, "");
            entityPaneData = entityPaneData.Replace("/th?id=", "https://www.bing.com/th?id=");
            entityPaneData = entityPaneData.Replace("https://www.bing.comhttps://www.bing.com/", "https://www.bing.com/");
            return entityPaneData;
        }
        return null;

    }

    private static async Task<HtmlDocument> ScrapeWorkerAsync(string satoriId)
    {
        var url = $"https://www.bing.com/search?q=abc&FORM=MONITR&setflight=0&filters=sid%3a%22{satoriId}%22";
        HttpClient httpClient = new HttpClient();
        var response = await httpClient.GetAsync(url);
        response.EnsureSuccessStatusCode();
        var result = await response.Content.ReadAsStringAsync();
        var document = new HtmlDocument();
        document.LoadHtml(result);

        return document;
    }

}

4.processor基础教程,用于修改某列数据:

Scope部分:

searchlog = VIEW "/users/v-tiapeng@microsoft.com/scope/SearchLog.view";

rs1 =
 SELECT Start, Region
 FROM searchlog;

rs2 = PROCESS rs1
      PRODUCE Start, Region
      USING ModifyProcessor;

OUTPUT rs2
 TO "/users/v-tiapeng@microsoft.com/scope/output_ModifyProcessor.tsv";

C#部分:

using Microsoft.SCOPE.Types;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;


public class ModifyProcessor : Processor
{
    public override Schema Produces(string[] requested_columns, string[] args, Schema input_schema)
    {
        var output_schema = input_schema.Clone();
        return output_schema;
    }

    public override IEnumerable<Row> Process(RowSet input_rowset, Row output_row, string[] args)
    {
        foreach (Row input_row in input_rowset.Rows)
        {
            input_row.CopyTo(output_row);
            string start = input_row[1].String;
            output_row[1].Set("Region: " + start);
            yield return output_row;
        }
    }
}

5.Scope里面没有limit关键字,于是利用TOP和EXCEPT进行分页

Scope部分:

#DECLARE InputSS string = @"/projects/merlin/QnACurationNew/xCUVStream/GoogleIntentFiltered_es.2021-03-31_09_58_12.ss";
#DECLARE OutputSS string = @"/users/v-tiapeng@microsoft.com/QuantusScrapper/GoogleIntentFiltered_es.2021-03-31_09_58_12_100000-150000.ss";
#DECLARE OutputSS2 string = @"/users/v-tiapeng@microsoft.com/QuantusScrapper/Count.ss";

Input = SSTREAM @InputSS;

rs =
    SELECT Query            
    FROM Input;

rs2 =
    PROCESS rs
    PRODUCE Query,
            Market
    USING ScrapperProcessor;

current =
    SELECT TOP 150000 Query, Market
    FROM rs2
    EXCEPT
    SELECT TOP 100000 Query, Market
    FROM rs2;
    

count =
    SELECT COUNT(1) AS Counts
    FROM current;

OUTPUT current
TO SSTREAM @OutputSS;

OUTPUT count
TO SSTREAM @OutputSS2;

C#部分:

using Microsoft.SCOPE.Types;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;

public class ScrapperProcessor : Processor
{
    String[] SchemaString = { "Query", "Market" };

    public override Schema Produces(string[] requested_columns, string[] args, Schema input_schema)
    {
        var output_schema = new Schema(SchemaString);
        return output_schema;
    }

    public override IEnumerable<Row> Process(RowSet input_rowset, Row output_row, string[] args)
    {
        foreach (Row input_row in input_rowset.Rows)
        {

            output_row[0].Set(input_row[0].String);
            output_row[1].Set("-");

            yield return output_row;
        }
    }
}

6.利用cross join来合并两个只有单行的表,并进行两列的计算.

#DECLARE InputSS string = @"/projects/merlin/QnACurationNew/monitoring/QnAMonitoring.full.2021-04-07.ss";
#DECLARE OutputSS string = @"/users/v-tiapeng@microsoft.com/scope/2021-04-08/test.ss";

Input = SSTREAM @InputSS;

rs =
    SELECT Value
    FROM Input
    WHERE Key == "ar-AE_static_Desktop_QnACurationNew_Coverage" AND Date == DateTime.Parse("04/07/2021");

rs2 = 
    SELECT Value
    FROM Input
    WHERE Key == "ar-AE_static_Desktop_QnACurationNew_Coverage" AND Date == DateTime.Parse("04/06/2021");


rs3 =
    SELECT rs.Value AS Current_Value,
           rs2.Value AS Previous_Value
    FROM rs
         CROSS JOIN
             rs2;

rs4 =
    SELECT Current_Value / Previous_Value AS Quotient
    FROM rs3;


OUTPUT rs4
TO SSTREAM @OutputSS;

7.判断一个表里的Query是否在另一个表里出现,如果存在输出yes,不存在输出no.其中Regex.Replace()方法可以用来使query正则化

Scope部分:

#DECLARE InputSS string = "/users/v-tiapeng%40microsoft.com/scope/2021-03-31/QueriesSS/zh-xx.ss";
#DECLARE InputSS2 string = "/local/OpenDataAnswer/data/qna/feeds/flights/QnACurationNew_zh-CN.2021-04-05_09_33_40.ss";
#DECLARE OutputSS string = "/users/v-tiapeng%40microsoft.com/WPSBS/QnACurationResults/zh-xx.ss";

Input = SSTREAM @InputSS;
Input2 = SSTREAM @InputSS2;

rs =
    SELECT Regex.Replace(Query.ToLowerInvariant(), @"\s+", " ").Trim(new Char[]{ ' ', '?', '.'}) AS Query
    FROM Input;

rs2 =
    SELECT Regex.Replace(Query.ToLowerInvariant(), @"\s+", " ").Trim(new Char[]{ ' ', '?', '.'}) AS Query,
           Answer
    FROM Input2;

rs3 =
    SELECT Query,
           Answer
    FROM rs
         LEFT JOIN
             rs2
         ON rs.Query == rs2.Query;

rs4 =
    PROCESS rs3
    PRODUCE Query,
            QueryPresentInCuration
    USING QueryProcessor;

OUTPUT rs4
TO SSTREAM @OutputSS;

C#部分:

using Microsoft.SCOPE.Types;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;

public class QueryProcessor : Processor
{
    string[] schema = { "Query", "QueryPresentInCuration" };

    public override Schema Produces(string[] requested_columns, string[] args, Schema input_schema)
    {
        var output_schema = new Schema(schema);
        return output_schema;
    }

    public override IEnumerable<Row> Process(RowSet input_rowset, Row output_row, string[] args)
    {
        foreach (Row input_row in input_rowset.Rows)
        {
            output_row[0].Set(input_row[0].String);
            if(input_row[1].String != null)
            {
                output_row[1].Set("YES");
            }
            else
            {
                output_row[1].Set("NO");
            }
            yield return output_row;
        }
    }
}

8.从view里面查询信息:

Dataset =
    SELECT (string.IsNullOrEmpty(SuggestionString) ? Query_RawQuery : SuggestionString).ToLowerInvariant() AS Query,ReferenceAnswerSubScenario
    FROM (
        VIEW @"/shares/searchDM/distrib/released/xSLAPI/xSLAPI.view"
        PARAMS
        (
            Start = "2021-03-30",      // "2010-06-25"
            End = "2021-03-30",          // "2014-12-12"
            Dataset = "Bing.com",      // Bing.com,Mobile,Cortana,Charm,Opal
            Traffic = "Normal",      // Internal,Normal,Abnormal
            QueryType = "Web"   // Web,MM,AutoSuggest,Homepage,API,News,All
        )
    )
    WHERE Market IN ("ar-AE") AND ReferenceAnswerSubScenario == "QnACurationNew";

OUTPUT Dataset
TO SSTREAM "/users/v-tiapeng@microsoft.com/scope/2021-04-08/SLAPiLog_2021-03-30.ss";

9.使用UNION合并多个表中的查询记录(与EXCEPT相反).

#DECLARE InputSS string = "/users/v-tiapeng@microsoft.com/scope/MetricsResults/xCUVStream-2021-03-19/Result_NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguageL30.ss";
#DECLARE InputSS2 string = "/users/v-tiapeng@microsoft.com/scope/MetricsResults/xCUVStream-2021-03-20/Result_NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguageL30.ss";
#DECLARE InputSS3 string = "/users/v-tiapeng@microsoft.com/scope/MetricsResults/xCUVStream-2021-03-21/Result_NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguageL30.ss";
#DECLARE InputSS4 string = "/users/v-tiapeng@microsoft.com/scope/MetricsResults/xCUVStream-2021-03-22/Result_NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguageL30.ss";
#DECLARE InputSS5 string = "/users/v-tiapeng@microsoft.com/scope/MetricsResults/xCUVStream-2021-03-23/Result_NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguageL30.ss";
#DECLARE InputSS6 string = "/users/v-tiapeng@microsoft.com/scope/MetricsResults/xCUVStream-2021-03-24/Result_NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguageL30.ss";
#DECLARE InputSS7 string = "/users/v-tiapeng@microsoft.com/scope/MetricsResults/xCUVStream-2021-03-25/Result_NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguageL30.ss";
#DECLARE InputSS8 string = "/users/v-tiapeng@microsoft.com/scope/MetricsResults/xCUVStream-2021-03-26/Result_NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguageL30.ss";
#DECLARE InputSS9 string = "/users/v-tiapeng@microsoft.com/scope/MetricsResults/xCUVStream-2021-03-27/Result_NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguageL30.ss";
#DECLARE OutputSS string = "/users/v-tiapeng@microsoft.com/scope/2021-03-30/summary.ss";

Input = SSTREAM @InputSS;
Input2 = SSTREAM @InputSS2;
Input3 = SSTREAM @InputSS3;
Input4 = SSTREAM @InputSS4;
Input5 = SSTREAM @InputSS5;
Input6 = SSTREAM @InputSS6;
Input7 = SSTREAM @InputSS7;
Input8 = SSTREAM @InputSS8;
Input9 =
    SSTREAM @InputSS9;


rs = 
    SELECT Language,
           Date,
           NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguage,
           NumberOfQueriesWithThisQueryLanguage
    FROM Input
    WHERE Date == "3/19/2021"
    UNION
    SELECT Language,
           Date,
           NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguage,
           NumberOfQueriesWithThisQueryLanguage
    FROM Input2
    WHERE Date == "3/20/2021"
    UNION
    SELECT Language,
           Date,
           NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguage,
           NumberOfQueriesWithThisQueryLanguage
    FROM Input3
    WHERE Date == "3/21/2021"
    UNION
    SELECT Language,
           Date,
           NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguage,
           NumberOfQueriesWithThisQueryLanguage
    FROM Input4
    WHERE Date == "3/22/2021"
    UNION
    SELECT Language,
           Date,
           NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguage,
           NumberOfQueriesWithThisQueryLanguage
    FROM Input5
    WHERE Date == "3/23/2021"
    UNION
    SELECT Language,
           Date,
           NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguage,
           NumberOfQueriesWithThisQueryLanguage
    FROM Input6
    WHERE Date == "3/24/2021"
    UNION
    SELECT Language,
           Date,
           NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguage,
           NumberOfQueriesWithThisQueryLanguage
    FROM Input7
    WHERE Date == "3/25/2021"
    UNION
    SELECT Language,
           Date,
           NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguage,
           NumberOfQueriesWithThisQueryLanguage
    FROM Input8
    WHERE Date == "3/26/2021"
    UNION
    SELECT Language,
           Date,
           NumberOfQueriesWithThisQueryLanguageAndMarketContainsThisLanguage,
           NumberOfQueriesWithThisQueryLanguage
    FROM Input9
    WHERE Date == "3/27/2021";

OUTPUT rs
TO SSTREAM @OutputSS;

10.查询过去30天的记录:

//#DECLARE InputSS string = @"/projects/merlin/xCUV/Daily/2021-03-05.ss";
//#DECLARE OutputSS string = @"/users/v-tiapeng@microsoft.com/scope/2021-03-23/Result_AverageNumberOfQueriesPerDayByMarketL30.ss";

#DECLARE InputSS string = @@InputSS@@;
#DECLARE OutputSS string = @@OutputSS@@;

#DECLARE DATECOLUMN string;
#SET DATECOLUMN = "@@DateColumn@@"; 

Input = SSTREAM @InputSS;

xcuv =
    SELECT date,
           rawquery,
           Market
    FROM Input;

rs1 =
    SELECT COUNT(DISTINCT date) AS TotalDays,
           COUNT(DISTINCT rawquery) AS TotalQueries,
           ((float) COUNT(DISTINCT rawquery) / 30) AS AverageNumberOfQueriesPer30Days,
           Market
    FROM xcuv
    //WHERE DateTime.Parse(date) >= DateTime.Parse("3/5/2021").AddDays(-30) AND DateTime.Parse(date) <= DateTime.Parse("3/5/2021")
    WHERE DateTime.Parse(date) >= DateTime.Parse(@DATECOLUMN).AddDays(-30) AND DateTime.Parse(date) <= DateTime.Parse(@DATECOLUMN)
    GROUP BY Market;

OUTPUT rs1
TO SSTREAM @OutputSS
CLUSTERED BY Market;

11.TOP k 当 k大于10000的时候无法与关键字ORDER BY 进行连用,因此解决办法是先用SORT BY… DESC来生成第一个ss文件确保排序,然后再用TOP关键字在这个文件中进行查询:

第一段scope:

rs = SELECT Query, GoogleViewsCount FROM Input;
OUTPUT rs TO SSTREAM @OutputSS CLUSTERED BY Query SORTED BY GoogleViewsCount DESC;

第二段scope:

rs = SELECT TOP @@SampleNumber@@ Query, GoogleViewsCount FROM Input;
OUTPUT rs TO SSTREAM @OutputSS CLUSTERED BY Query SORTED BY GoogleViewsCount DESC;
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值