count sketch


<!doctype html>
<html lang="en">

<head>
<meta charset="utf-8">

<title>A practical introduction to the Count-Min Sketch</title>

<meta name="description" content="A practical introduction to the Count-Min Sketch">
<meta name="author" content="Hannes Korte">

<meta name="apple-mobile-web-app-capable" content="yes" />
<meta name="apple-mobile-web-app-status-bar-style" content="black-translucent" />

<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no">

<link rel="stylesheet" href="http://hkorte.github.io/slides/cmsketch/css/cmsketch.css">
<link rel="stylesheet" href="http://hkorte.github.io/slides/cmsketch/css/reveal.min.css">
<link rel="stylesheet" href="http://hkorte.github.io/slides/cmsketch/css/theme/serif.css" id="theme">

<!-- For syntax highlighting -->
<link rel="stylesheet" href="http://hkorte.github.io/slides/cmsketch/lib/css/zenburn.css">

<!-- If the query includes 'print-pdf', use the PDF print sheet -->
<script>
	// Attach the print stylesheet: 'pdf.css' when the query string contains
	// 'print-pdf' (case-insensitive), 'paper.css' otherwise.
	// The <link> is created through the DOM rather than document.write(),
	// whose use the HTML standard discourages.
	(function() {
		var sheetName = /print-pdf/i.test(window.location.search) ? 'pdf' : 'paper';
		var printSheet = document.createElement('link');
		printSheet.rel = 'stylesheet';
		printSheet.type = 'text/css';
		printSheet.media = 'print';
		printSheet.href = 'css/print/' + sheetName + '.css';
		// getElementsByTagName is used instead of document.head because the
		// page still ships an html5shiv fallback for IE < 9.
		document.getElementsByTagName('head')[0].appendChild(printSheet);
	})();
</script>

<script src="http://hkorte.github.io/slides/cmsketch/lib/js/jquery-1.9.1.min.js"></script>

<!--[if lt IE 9]>
		<script src="lib/js/html5shiv.js"></script>
		<![endif]-->
</head>

<body>

	<div class="reveal">

		<!-- Any section element inside of this container is displayed as a slide -->
		<div class="slides">

			<section>
				<h3>A practical introduction to the</h3>
				<h2>Count-Min Sketch</h2>
				<p>
					<small>Hannes Korte<br/>Algorithms &amp; Data Challenges Berlin Meetup<br/>Mar 19, 2013</small>
				</p>
			</section>

			<section>
				<h3>Imagine:</h3>
				<ul>
					<li>You want to count elements.</li>
					<li>You don't need exact results.</li>
				</ul>
			</section>

			<section>
				<h3>Simple solution in Java:</h3>
				<p>Construct a map from elements to counts.</p>
				<pre><code> Map&lt;String, Integer&gt; counter = new HashMap&lt;String, Integer&gt;(); </code></pre>
				
				<p>You probably want to use libraries like <em>Guava</em> or <em>FastUtil</em> for convenience and better performance.</p>
			
			</section>

			<section>
				<h3>Problem:</h3>
				<ul>
					<li>The number of distinct elements might be very large.</li>
					<li>For realtime applications you need runtime guarantees.</li>
				</ul>
				<aside class="notes">
					<p>Too large to fit into memory.</p>
					<p>Long lookup times. See chart later.</p>
				</aside>
			</section>
			
			<section>
				<h3>Solution: The Count-Min Sketch</h3>
				<ul>
					<li>The trick: Don't store the distinct elements, but just the counters.</li>
					<li>Create an integer array of length x initially filled with 0s.</li>
					<li>Each incoming element gets mapped to a number between 0 and x - 1.</li>
					<li>The corresponding counter in the array gets incremented.</li>
					<li>To query an element's count, simply return the integer value at its position.</li>
				</ul>
				<p class="fragment">
					You are completely right:<br/><strong>There will be collisions!</strong>
				</p>
			</section>
			
			<section>
				<h3>The Count-Min Sketch</h3>
				<ul>
					<li>Use multiple arrays with different hash functions to compute the index.</li>
					<li>When queried, return the minimum of the numbers in the arrays. &rarr; Count-<em>Min</em> Sketch</li>
				</ul>
				<p class="fragment">
					You are completely right:<br/><strong>There will still be collisions!</strong>
				</p>
				<p class="fragment">
					<strong>...but less.</strong>
				</p>
			</section>
			
			<section>
				<h3>Some properties</h3>
				<ul>
					<li>Only over-estimates, never under-estimates the true count.</li>
					<li>Has a <strong>constant</strong> memory and time consumption independent of the number of elements.</li>
					<li>The relative error may be high for low-frequency elements.</li>
				</ul>
			</section>

			<section>
				<h3>Demo sketch:</h3>
				<table id="demosketch" style="text-align: center;">
					<tr><td class="hashfct"><math><mrow><msub><mi>h</mi><mn>0</mn></msub></mrow></math></td>
						<td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td></tr>
					<tr><td class="hashfct"><math><mrow><msub><mi>h</mi><mn>1</mn></msub></mrow></math></td>
						<td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td></tr>
					<tr><td class="hashfct"><math><mrow><msub><mi>h</mi><mn>2</mn></msub></mrow></math></td>
						<td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td><td>0</td></tr>
				</table>
				<form id="demoform" class="topmargin">add a new element: <input style="font-size: 100%" type="text" id="demoinput"/></form>
				<p id="demostats" class="topmargin"></p>
<script>
$(function() {
	var map = {};
	var hashA = [3, 5, 7];
	/**
	 * Maps a string to a column index of the demo sketch.
	 *
	 * Every character code is scaled by the multiplier `a` and folded into a
	 * running hash with shifts by 6 and 16 bits. The absolute value of the
	 * final hash is reduced modulo 7, the number of counter cells in each row
	 * of the #demosketch table.
	 *
	 * @param {number} a   multiplier that distinguishes one hash function from another
	 * @param {string} str the element to hash
	 * @returns {number} an integer index in the range 0..6 (0 for the empty string)
	 */
	var hashindex = function(a, str) {
		var hash = 0;
		for (var i = 0; i < str.length; i++) {
			// The first shift count was 76. JavaScript only honours the low
			// five bits of a shift count, so that silently acted as << 12;
			// 6 is the in-range value used by the sdbm-style recurrence.
			hash = (a * str.charCodeAt(i)) + (hash << 6) + (hash << 16) - hash;
		}
		// The running sum may be negative; the index must not be.
		return Math.abs(hash) % 7;
	};
	/**
	 * Adds one to the counter displayed in a cell of the demo table and
	 * flashes the cell: it is made transparent, given a red border and then
	 * faded back in over 500 ms.
	 *
	 * @param {number} row zero-based row of the #demosketch table
	 * @param {number} col zero-based counter column; the first td of every
	 *                     row holds the hash function label, hence the +1
	 */
	var incrementCell = function(row, col) {
		var selector = '#demosketch tr:eq(' + row + ') td:eq(' + (col + 1) + ')';
		var target = $(selector);
		var nextCount = parseInt(target.text()) + 1;
		target
			.css({ opacity: 0 })
			.css('border-color', 'red')
			.animate({ opacity: 1 }, 500)
			.text(nextCount);
	}
	/**
	 * Returns the sketch's estimate for a word: the smallest of the counters
	 * the word hashes to, one counter per row of the #demosketch table.
	 *
	 * The rows are derived from hashA instead of being spelled out one by
	 * one, so the lookup stays in step with the list of hash multipliers.
	 *
	 * @param {string} word the element to look up
	 * @returns {number} the minimum of the counters read from the table
	 */
	var queryCell = function(word) {
		var counts = [];
		for (var row = 0; row < hashA.length; row++) {
			// +1 skips the label cell at the start of every row.
			var col = hashindex(hashA[row], word) + 1;
			var cellText = $('#demosketch tr:eq(' + row + ') td:eq(' + col + ')').text();
			counts.push(parseInt(cellText, 10));
		}
		return Math.min.apply(null, counts);
	}
	// Handles a submitted word: clears the highlight of the previous update,
	// increments one counter per row, records the exact count and prints both
	// the exact count and the sketch estimate.
	$('#demoform').submit(function() {
		$('#demosketch td').css('border-color', 'transparent');
		var input = $('#demoinput');
		var word = input.val();
		input.val('');
		for (var row = 0; row < hashA.length; row++) {
			incrementCell(row, hashindex(hashA[row], word));
		}
		// Prefix the key so that words such as "constructor" or "__proto__"
		// cannot collide with properties inherited from Object.prototype,
		// which previously corrupted the true count.
		var key = 'word:' + word;
		map[key] = (map[key] || 0) + 1;
		// The word is raw user input: escape it before it goes through .html().
		var safeWord = $('<div/>').text(word).html();
		$('#demostats').html('Statistics for word <em>' + safeWord + '</em>:<br/>True count: ' + map[key] + '<br/>Sketch count: ' + queryCell(word));
		// Returning false keeps the browser from submitting the form.
		return false;
	});
})
</script>
			</section>
			
			<section>
				<h3>Extension: The Conservative Update</h3>
				<ul>
					<li>Instead of incrementing every counter, only increment the ones which need an update.</li>
					<li>Experiments show a strong error reduction using this heuristic.</li>
				</ul>
			</section>
			
			<section>
				<img src="http://hkorte.github.io/slides/cmsketch/img/experiments-mem.png" alt="Chart of the memory measurements from the experiments"/><br/>
				Example counting unigrams and bigrams in text documents. 
			</section>
			
			<section>
				<img src="http://hkorte.github.io/slides/cmsketch/img/experiments-time.png" alt="Chart of the time measurements from the experiments"/><br/>
				Example counting unigrams and bigrams in text documents. 
			</section>
			
			<section>
				<h3>Usage</h3>
				<ul>
					<li>General:
						<ul>
							<li>Every counting problem on large data sets, where errors are acceptable.</li>
							<li>Counting elements in data streams.<br/>(e.g. trend detection on twitter)</li>
						</ul>
					</li>
					<li class="fragment topmargin">Mine:
						<ul>
							<li>Feature selection for large data sets using pointwise mutual information.</li>
							<li>All kinds of feature statistics like word co-occurrences, IDF, etc.</li>
						</ul>
					</li>
				</ul>
			</section>
			
			<section>
				<h3>Feature Selection Example</h3>
				<p><strong>Input:</strong> Company websites grouped by industry sector</p>
				<p><strong>Output:</strong> Relevant terms for each sector based on PMI</p>
				<pre><code>Label 01         Landwirtschaft und Jagd
#docs: 14
001.     0,0177864  pflanzen
002.     0,0144118  garten
003.     0,0129956  ernte
004.     0,0113797  gewächshausfläche
005.     0,0113797  dwarf
006.     0,0113797  spätreifende
007.     0,0113797  breiti
008.     0,0113797  kellereiartikel
009.     0,0108276  landschaftspflege
010.     0,0102964  cultivated
011.     0,0102964  reintönige
012.     0,0102964  traubensorte
013.     0,0102964  jahrgängen
014.     0,0102964  schwimmteich
015.     0,0102964  imposanter
016.     0,0102964  innenbegrünung
017.     0,0102964  pflanzenpflege
018.     0,0101218  grünanlagen
019.     0,0101218  blüten
020.     0,0101218  ernten

Label 01.12      Gartenbau
#docs: 6
001.     0,0143696  pflanzen
002.     0,0137515  gewächshausfläche
003.     0,0126628  schwimmteich
004.     0,0126628  pflanzenpflege
005.     0,0123234  gärtner
006.     0,0118892  schwimmteiche
007.     0,0107863  gärtnern
008.     0,0103628  topfpflanzen
009.     0,0099942  gartens
010.     0,0099942  schnittblumen
011.     0,0098929  gärten
012.     0,0096679  baumschule
013.     0,0096679  floristik
014.     0,0093751  eden
015.     0,0091096  gartencenter
016.     0,0091096  stauden
017.     0,0088669  blüten
018.     0,0084301  portrait
019.     0,0080621  ablehnen
020.     0,0080621  edith

Label 15         Ernährungsgewerbe
#docs: 20
001.     0,0167030  leckere
002.     0,0163800  zutaten
003.     0,0157303  geschmack
004.     0,0152514  rezepte
005.     0,0149070  spezialitäten
006.     0,0141082  wurst
007.     0,0138228  aprikosen
008.     0,0138228  hefe
009.     0,0134308  inhaltsstoffe
010.     0,0132752  schinken
011.     0,0132752  fettsäuren
012.     0,0132752  mandeln
013.     0,0131221  getraenke
014.     0,0131221  schlachtung
015.     0,0131221  walnüsse
016.     0,0131221  durstlöscher
017.     0,0131221  glasflasche
018.     0,0131221  geschmacklich
019.     0,0131221  maracuja
020.     0,0123445  vitaminen

Label 15.13      Fleischverarbeitung
#docs: 5
001.     0,0186327  schlachtung
002.     0,0179152  wurst
003.     0,0163253  wurstwaren
004.     0,0142521  lammfleisch
005.     0,0142521  qualitätsfleisch
006.     0,0138747  partyservice
007.     0,0131627  aufschnitt
008.     0,0131627  küchenfertige
009.     0,0131627  hackfleisch
010.     0,0131627  artgerechte
011.     0,0125670  fleisch
012.     0,0123884  züchten
013.     0,0123884  vieh
014.     0,0123884  mariniert
015.     0,0117828  kühlkette
016.     0,0112842  geschmacksverstärker
017.     0,0112842  dlg
018.     0,0112842  aufzucht
019.     0,0107535  zubereitung
020.     0,0104908  fleischerei
</code></pre>
			</section>
			
			<section>
				<h3>References</h3>
				<ul>
					<li><strong><a href="https://sites.google.com/site/countminsketch/">https://sites.google.com/site/countminsketch/</a></strong><br/>
						A great resource of implementations and publications.</li>
					<li class="topmargin"><strong>An Improved Data Stream Summary: The Count-Min Sketch and Its Applications</strong><br/>
						Graham Cormode and S. Muthukrishnan. <em>LATIN 2004: Theoretical Informatics (2004)</em>. (The original paper).<br/>
						<small><a href="http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf">http://dimacs.rutgers.edu/~graham/pubs/papers/cm-full.pdf</a></small>
					</li>
					<li class="topmargin"><strong>Sketching Techniques for Large Scale NLP</strong><br/>
						Amit Goyal, Jagadeesh Jagarlamudi, Hal Daume III, and Suresh Venkatasubramanian. <em>Proceedings of the NAACL HLT 2010 Sixth Web as Corpus Workshop (2010)</em>.
						<small><a href="http://www.umiacs.umd.edu/~jags/pdfs/LSNLPsketchWAC10.pdf">http://www.umiacs.umd.edu/~jags/pdfs/LSNLPsketchWAC10.pdf</a></small>
					</li>
				</ul>
			</section>

		</div>

	</div>

	<script src="http://hkorte.github.io/slides/cmsketch/lib/js/head.min.js"></script>
	<script src="http://hkorte.github.io/slides/cmsketch/js/reveal.min.js"></script>

	<script>
		// Boots the reveal.js presentation. The full list of configuration
		// options is documented at
		// https://github.com/hakimel/reveal.js#configuration
		(function() {
			// True when the browser exposes classList on <body>.
			function supportsClassList() {
				return !!document.body.classList;
			}

			// True when at least one element carries a data-markdown attribute.
			function usesMarkdown() {
				return !!document.querySelector('[data-markdown]');
			}

			// Optional libraries used to extend reveal.js.
			// Not loaded: plugin/search/search.js and plugin/remotes/remotes.js
			// (both would be async and gated on classList support).
			var plugins = [
				// classList polyfill, only for browsers that lack it
				{
					src : 'lib/js/classList.js',
					condition : function() {
						return !supportsClassList();
					}
				},
				// Markdown support, only when a slide asks for it
				{ src : 'plugin/markdown/showdown.js', condition : usesMarkdown },
				{ src : 'plugin/markdown/markdown.js', condition : usesMarkdown },
				// Syntax highlighting, started once the script has loaded
				{
					src : 'plugin/highlight/highlight.js',
					async : true,
					callback : function() {
						hljs.initHighlightingOnLoad();
					}
				},
				{ src : 'plugin/zoom-js/zoom.js', async : true, condition : supportsClassList },
				{ src : 'plugin/notes/notes.js', async : true, condition : supportsClassList }
			];

			// Theme and transition may be overridden through the query string.
			// Available themes are in /css/theme; transitions are
			// default/cube/page/concave/zoom/linear/fade/none.
			var themeFromQuery = Reveal.getQueryHash().theme;
			var transitionFromQuery = Reveal.getQueryHash().transition || 'default';

			Reveal.initialize({
				controls : true,
				progress : true,
				history : true,
				center : true,
				theme : themeFromQuery,
				transition : transitionFromQuery,
				dependencies : plugins
			});
		})();
	</script>


</body>
</html>

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值