<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" dir="ltr" lang="en">
<head>
  <title>the all-thing</title>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
  <link rel="stylesheet" href="/static/style.css" type="text/css" />
  <link rel="alternate" type="application/rss+xml" title="the all-thing RSS feed" href="/index.rss" />
  <link rel="alternate" type="text/plain" title="the all-thing in plain text" href="/index.txt" />
  <script type="text/javascript" src="/static/mootools.js"></script>
  <script type="text/javascript" src="http://music.masanjin.net:9292/waxiest.js"></script>
</head>
<body>

<div id="main">
  <div id="header">
    <h1><a  href="/">the all-thing</a></h1>
    
  </div>
  <div id="sidebar">
    <h3>Recent comments</h3>

    <ul class="sidebar-list">
    
    <li><b><a  href="/whisper-0.5#58174069c046a78e55f02ef81da81e74">Dominique Julia</a></b>
        <i><a  href="/whisper-0.5">Whisper 0.5 released</a></i>
           one week ago
    </li>
    
    <li><b><a  href="/ruby-ncurses-and-thread-blocking#8fa2a0f392d7c0562d630e4936407c11">William Morgan</a></b>
        <i><a  href="/ruby-ncurses-and-thread-blocking">Ruby, Ncurses and blocked threads</a></i>
           three months ago
    </li>
    
    <li><b><a  href="/git-wtf-bf06ab7-released#533654a7a229569e27a6d0afd716c444">William Morgan</a></b>
        <i><a  href="/git-wtf-bf06ab7-released">git wtf bf06ab7 released</a></i>
           three months ago
    </li>
    
    <li><b><a  href="/git-wtf-bf06ab7-released#b7b7a905477674eb6985b34a964a0dca">Joao Nelas</a></b>
        <i><a  href="/git-wtf-bf06ab7-released">git wtf bf06ab7 released</a></i>
           three months ago
    </li>
    
    <li><b><a  href="/ruby-ncurses-and-thread-blocking#b00001114360ac152f87d4ac2a6e0c5b">Ollivier Robert</a></b>
        <i><a  href="/ruby-ncurses-and-thread-blocking">Ruby, Ncurses and blocked threads</a></i>
           three months ago
    </li>
    
    </ul>

    <h3>Authors</h3>
    <ul class="sidebar-list">
    
      <li><a class="author" href="/by/William+Morgan/">William&nbsp;Morgan</a>&nbsp;(65) </li>
    
    </ul>

    <h3>Tags</h3>
    <ul class="sidebar-list">
    
      <li><a class="label" href="/label/releases/">releases</a>&nbsp;(15) </li>
    
      <li><a class="label" href="/label/whisper/">whisper</a>&nbsp;(13) </li>
    
      <li><a class="label" href="/label/git/">git</a>&nbsp;(9) </li>
    
      <li><a class="label" href="/label/stats/">stats</a>&nbsp;(8) </li>
    
      <li><a class="label" href="/label/trollop/">trollop</a>&nbsp;(6) </li>
    
      <li><a class="label" href="/label/ruby/">ruby</a>&nbsp;(6) </li>
    
      <li><a class="label" href="/label/sup/">sup</a>&nbsp;(6) </li>
    
      <li><a class="label" href="/label/git-wtf/">git-wtf</a>&nbsp;(4) </li>
    
      <li><a class="label" href="/label/vm/">vm</a>&nbsp;(4) </li>
    
      <li><a class="label" href="/label/mathml/">mathml</a>&nbsp;(3) </li>
    
      <li><a class="label" href="/label/continuations/">continuations</a>&nbsp;(3) </li>
    
      <li><a class="label" href="/label/ditz/">ditz</a>&nbsp;(3) </li>
    
      <li><a class="label" href="/label/proglang/">proglang</a>&nbsp;(2) </li>
    
      <li><a class="label" href="/label/optimization/">optimization</a>&nbsp;(2) </li>
    
      <li><a class="label" href="/label/benchmarks/">benchmarks</a>&nbsp;(2) </li>
    
      <li><a class="label" href="/label/rubinius/">rubinius</a>&nbsp;(2) </li>
    
      <li><a class="label" href="/label/inlining/">inlining</a>&nbsp;(2) </li>
    
      <li><a class="label" href="/label/ubuntu/">ubuntu</a>&nbsp;(2) </li>
    
      <li><a class="label" href="/label/fibers/">fibers</a>&nbsp;(2) </li>
    
      <li><a class="label" href="/label/ritex/">ritex</a>&nbsp;(2) </li>
    
      <li><a class="label" href="/label/ruby1.9/">ruby1.9</a>&nbsp;(2) </li>
    
      <li><a class="label" href="/label/ncurses/">ncurses</a>&nbsp;(1) </li>
    
      <li><a class="label" href="/label/javascript/">javascript</a>&nbsp;(1) </li>
    
      <li><a class="label" href="/label/media/">media</a>&nbsp;(1) </li>
    
      <li><a class="label" href="/label/vim/">vim</a>&nbsp;(1) </li>
    
      <li><a class="label" href="/label/classification/">classification</a>&nbsp;(1) </li>
    
      <li><a class="label" href="/label/massachusetts/">massachusetts</a>&nbsp;(1) </li>
    
      <li><a class="label" href="/label/greasemonkey/">greasemonkey</a>&nbsp;(1) </li>
    
      <li><a class="label" href="/label/wine/">wine</a>&nbsp;(1) </li>
    
      <li><a class="label" href="/label/readme/">readme</a>&nbsp;(1) </li>
    
      <li><a class="label" href="/label/ancient-greek/">ancient-greek</a>&nbsp;(1) </li>
    
      <li><a class="label" href="/label/web/">web</a>&nbsp;(1) </li>
    
      <li><a class="label" href="/label/current+events/">current&nbsp;events</a>&nbsp;(1) </li>
    
    </ul>

    <h3>Other formats</h3>
    <ul class="sidebar-list">
    <li><a href="/index.rss"><img src="/static/rss-badge.png" alt="RSS feed"/></a></li>
    <li><a href="/index.txt">plain text version</a></li>
    </ul>

    <h3 class="waxiest.author.original">Who is this man?</h3>
    <h3 class="waxiest.author.beautiful" style="display:none">I must find out more about this beautiful creature</h3>
    <h3 class="waxiest.author.beautifulbig" style="display:none">I MUST FIND OUT MORE ABOUT THIS BEAUTIFUL CREATURE</h3>
    <h3 class="waxiest.author.originalbig" style="display:none">WHO IS THIS MAN?</h3>

    <script type="text/javascript">
      // Create the waxiest client (presumably defined by the waxiest.js script loaded in the head).
      // It is kept in the global `w` because the inline onClick handler on the author link below calls w.goalReached().
      var w = waxiest();
      // Register the "author" section with four variant names; these match the suffixes of the
      // waxiest.author.* heading classes above. NOTE(review): presumably waxiest picks which
      // heading variant to display -- confirm against waxiest.js.
      w.optimizeHTMLSection("author", ["original", "beautiful", "beautifulbig", "originalbig"]);
    </script>

    <a href="http://masanjin.net" onClick="w.goalReached('greeting')">William Morgan</a>
  </div>
  <div id="content">
    <h2><a  href="/bayes-vs-mle">Bayes vs <span class="caps">MLE</span>: an estimation theory fairy tale</a></h2>

<div class="byline">
  <a  href="/by/William+Morgan/">William Morgan</a>,
  <span title="17 months ago">October  6, 2008 10:33pm</span>
</div>

  <div class="labels"><span class='label'><a  href="/label/stats/">stats</a></span> <span class='label'><a  href="/label/whisper/">whisper</a></span> </div>


<p class='first'>I found a neat little example in one of my introductory stats books about
Bayesian versus maximum-likelihood estimation for the simple problem of
estimating a binomial distribution given only one sample.</p>
<p>I was going to try and show the math but since Blogger is not making it
possible to actually render MathML I&#8217;ll just hand-wave instead.
<em>[Fixed in <a href="http://masanjin.net/whisper/">Whisper</a>. &#8212;ed.]</em></p>
<p>So let&#8217;s say we&#8217;re trying to estimate a binomial distribution parameterized by
<span title='p' style='white-space: nowrap'><math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mi>p</mi></math></span>, and that we&#8217;ve only seen one sample. For example, someone flips a coin
once, and we have to decide what the coin&#8217;s probability of heads is.</p>
<p>The maximum likelihood estimate for <span title='p' style='white-space: nowrap'><math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mi>p</mi></math></span> is easy: if your single sample is a 1,
then <span title='p=1' style='white-space: nowrap'><math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mi>p</mi><mo>=</mo><mn>1</mn></math></span>, and if your sample is 0, <span title='p=0' style='white-space: nowrap'><math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mi>p</mi><mo>=</mo><mn>0</mn></math></span>. (And if you go through the laborious
process of writing the log likelihood, setting the derivative equal to 0, and
solving it, you come up with the general rule of (# of 1&#8217;s) / (# of 1&#8217;s + # of
0&#8217;s), which is kinda what you would expect.)</p>
<p>In the coin case it seems crazy to say, I saw one head, so I&#8217;m going to assume
that the coin <em>always</em> turns up heads, but that&#8217;s because of our prior
knowledge of how coins behave. If we&#8217;re given a black box with a button and two
lights, and you press the button, and one of the lights comes on, then maybe
estimating that that light always comes on when you press the button makes a
little more sense.</p>
<p>Finding the Bayesian estimate is slightly more complicated. Let&#8217;s use a uniform
prior. Our conditional distribution is <span title='f(1|p)=p' style='white-space: nowrap'><math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mi>f</mi><mo stretchy='false'>(</mo><mn>1</mn><mo stretchy='false'>|</mo><mi>p</mi><mo stretchy='false'>)</mo><mo>=</mo><mi>p</mi></math></span> and <span title='f(0|p)=1-p' style='white-space: nowrap'><math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mi>f</mi><mo stretchy='false'>(</mo><mn>0</mn><mo stretchy='false'>|</mo><mi>p</mi><mo stretchy='false'>)</mo><mo>=</mo><mn>1</mn><mo>&minus;</mo><mi>p</mi></math></span>, and if you work
it out, the posterior ends up as <span title='h(p|1)=2p' style='white-space: nowrap'><math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mi>h</mi><mo stretchy='false'>(</mo><mi>p</mi><mo stretchy='false'>|</mo><mn>1</mn><mo stretchy='false'>)</mo><mo>=</mo><mn>2</mn><mi>p</mi></math></span> and <span title='h(p|0)=2(1-p)' style='white-space: nowrap'><math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mi>h</mi><mo stretchy='false'>(</mo><mi>p</mi><mo stretchy='false'>|</mo><mn>0</mn><mo stretchy='false'>)</mo><mo>=</mo><mn>2</mn><mo stretchy='false'>(</mo><mn>1</mn><mo>&minus;</mo><mi>p</mi><mo stretchy='false'>)</mo></math></span>.</p>
<p>Now if we were in the world of classification, we&#8217;d take the <span class="caps">MAP</span> estimate, which
is a fancy way of saying the value with the biggest probability, or the mode of
the distribution. Since we&#8217;re using a uniform prior, that would end up as the
same as the <span class="caps">MLE</span>. But we&#8217;re not. We&#8217;re in the world of real numbers, so we can
take something better: the expected value, or the mean of the distribution.
This is known as the Bayes estimate, and there are some decision-theoretic
reasons for using it, but informally, it makes more sense than using the <span class="caps">MAP</span>
estimate: you can take into account the entire shape of the distribution, not
just the mode.</p>
<p>Using the Bayes estimate, we arrive at <span title='p=2/3' style='white-space: nowrap'><math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mi>p</mi><mo>=</mo><mn>2</mn><mo>/</mo><mn>3</mn></math></span> if the sample was a 1, and
<span title='p=1/3' style='white-space: nowrap'><math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mi>p</mi><mo>=</mo><mn>1</mn><mo>/</mo><mn>3</mn></math></span> if the sample was a zero. So we&#8217;re at a place where Bayesian logic and
frequentist logic arrive at very different answers, <em>even with a uniform
prior</em>.</p>
<p>Up till now we&#8217;ve been talking about &#8220;estimation theory&#8221;, i.e. the art of
estimating shit. But estimation theory is basically decision theory in
disguise, where your decision space is the same as your parameter space: you&#8217;re
deciding on a value for <span title='p' style='white-space: nowrap'><math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mi>p</mi></math></span>, given your input data, and your prior knowledge, if
any.</p>
<p>Now what&#8217;s cool about moving to the world of decision theory is that we can
say: if I have to decide on a particular value for <span title='p' style='white-space: nowrap'><math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mi>p</mi></math></span>, how can I minimize my
expected cost, aka my risk? A natural choice for a cost, or loss, function, is
squared error. If the true value is <span title='q' style='white-space: nowrap'><math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mi>q</mi></math></span>, I&#8217;d like to estimate <span title='p' style='white-space: nowrap'><math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mi>p</mi></math></span> in such a way
that <span title='E[(q-p)^2]' style='white-space: nowrap'><math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mi>E</mi><mo stretchy='false'>[</mo><mo stretchy='false'>(</mo><mi>q</mi><mo>&minus;</mo><mi>p</mi><msup><mo stretchy='false'>)</mo><mn>2</mn></msup><mo stretchy='false'>]</mo></math></span> is minimized. So we don&#8217;t have to argue philosophically about
<span class="caps">MLE</span> versus <span class="caps">MAP</span> versus minimax versus Bayes estimates; we can quantify how well
each of them does under this framework.</p>
<p>And it turns out that, if you plot the risk for the <span class="caps">MLE</span> estimate and for the
Bayes estimate under different values of the true value <span title='q' style='white-space: nowrap'><math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mi>q</mi></math></span>, then <span class="caps">MOST</span> of the
time, the Bayes estimate has lower risk than the <span class="caps">MLE</span>. It&#8217;s only when <span title='q' style='white-space: nowrap'><math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mi>q</mi></math></span> is
close to 0 or to 1 that <span class="caps">MLE</span> has lower risk.</p>
<p>So that&#8217;s pretty cool. It seems like the Bayes estimate must be a superior
estimate.</p>
<p>Of course, I set this whole thing up. Those &#8220;decision-theoretic reasons&#8221; for
choosing the Bayes estimate I mentioned? Well, they&#8217;re theorems that show that
the Bayes estimate minimizes risk. And, in fact, the Bayes estimate of the mean
of the distribution is <em>specific</em> to squared-error loss. If we chose another
loss function, we could come up with a potentially very different Bayes
estimate.</p>
<p>But my intention wasn&#8217;t really to trick you into believing that Bayes estimates
are awesome. (Though they are!) I wanted to show that:</p>
<ol>
	<li>Bayes and classical approaches can come up with very different estimates,
even with a uniform prior.</li>
	<li>If you cast things in decision-theoretic terms, you can make some real
quantitative statements about different ways of estimating.</li>
</ol>
<p>In the decision theory world, you can <em>customize</em> your estimates to minimize
your particular costs in your particular situation. And that&#8217;s an idea that I
think is very, very powerful.</p>

<h4>Discussion</h4>
<a name="comments"></a>

<ul class="comment-tree">

  <li>
  
  <a name="old-comment-14"></a>
  <div class="comment-body">
    <table class="comment-byline">
      <tr><td class="comment-author">Brendan</td>
          <td class="comment-date" title="17 months ago"><a href="#old-comment-14">October 11, 2008  9:34pm</a></td>
    </tr></table>
    <p class='first'>Very interesting.  But you pushed all the interesting action out to the loss
function.  If you do 1-0 loss &#8212; that is, you get credit if you&#8217;re right, but
for everything else you&#8217;re worthless &#8212; then the mode, not the mean, of the
posterior is optimal.  Therefore <span class="caps">MAP</span>.</p>
<p>It&#8217;s not at all clear to me, for really general estimation settings, whether
1-0 or squared error is better.</p>
    <div class="reply-to-outer">
      <div class="reply-to-header"><a href="#" class="reply-to-link">reply</a></div>
      <div class="reply-to-box" id="reply-to-bayes-vs-mle-old-comment-14" >
        <span class="comment-instructions">To reply, enter your email address. A copy of the comment will be sent to you via email.</span>
        <form id='comment-form-bayes-vs-mle-old-comment-14' action='/comment/bayes-vs-mle' method='post' class='comment-form'>
          <input type='text' name='email' id='textfield-email'/>
          <input type='submit' value='email me' id='submit-email me'/>
        <input type='hidden' name='comment-id' value='old-comment-14'/>
  <span class="form-result" id="form-result-bayes-vs-mle-old-comment-14"><!-- spanna --></span>
</form>

      </div>
    </div>
  </div>

  </li><li>
  
  <a name="old-comment-10"></a>
  <div class="comment-body">
    <table class="comment-byline">
      <tr><td class="comment-author"><a href="http://masanjin.net/">William</a></td>
          <td class="comment-date" title="17 months ago"><a href="#old-comment-10">October 14, 2008  2:45pm</a></td>
    </tr></table>
    <p class='first'>Well 0-1 loss is crazy talk if you&#8217;re estimating a continuous value.</p>
<p>But it looks like it magically all works out for both cases. If you&#8217;re
estimating something discrete, then 0-1 loss means <span class="caps">MAP</span> is optimal, and <span class="caps">MAP</span> is kind
of the only thing you can do anyways.</p>
<p>If you&#8217;re doing regression against a continuous value, then squared-error loss
means that EV is optimal, and that&#8217;s kind of the most natural thing to do too.</p>
<p>Magic.</p>
    <div class="reply-to-outer">
      <div class="reply-to-header"><a href="#" class="reply-to-link">reply</a></div>
      <div class="reply-to-box" id="reply-to-bayes-vs-mle-old-comment-10" >
        <span class="comment-instructions">To reply, enter your email address. A copy of the comment will be sent to you via email.</span>
        <form id='comment-form-bayes-vs-mle-old-comment-10' action='/comment/bayes-vs-mle' method='post' class='comment-form'>
          <input type='text' name='email' id='textfield-email'/>
          <input type='submit' value='email me' id='submit-email me'/>
        <input type='hidden' name='comment-id' value='old-comment-10'/>
  <span class="form-result" id="form-result-bayes-vs-mle-old-comment-10"><!-- spanna --></span>
</form>

      </div>
    </div>
  </div>

  </li>
  
</ul>

<p class="comment-instructions">To leave a new comment, enter your email
address. A copy of the article will be sent to you via email.</p>
<form id='comment-form-bayes-vs-mle' action='/comment/bayes-vs-mle' method='post' class='comment-form'>
  <input type='text' name='email' id='textfield-email'/>
  <input type='submit' value='email me' id='submit-email me'/>
  <span class="form-result" id="form-result-bayes-vs-mle"><!-- spanna --></span>
</form>


<script type="text/javascript">
/* <![CDATA[ */
// Submit every ".comment-form" via an XHR POST instead of a normal page load,
// and write the server's response into that form's own ".form-result" element.
$$('.comment-form').addEvent('submit', function(e) {
  e.stop(); // cancel the browser's default form submission
  var el = this.getElement('.form-result');
  // Clear any previous result and tag the element with "ajax-loading"
  // until the request completes.
  var result = el.empty().addClass('ajax-loading');
  this.set('send', {
    method: 'post',
    onComplete: function(response) {
      result.removeClass('ajax-loading');
      result.set('html', response);
    } // no trailing comma here: ES3 parsers (older IE) treat it as a syntax error
  });
  this.send();
});
/* ]]> */
</script>


<script type="text/javascript">
/* <![CDATA[ */
// Collapse each comment's reply box up front, then reveal it and focus its
// first input when the matching "reply" link is clicked.
$$('.reply-to-link').each(function(anchor) {
  var replyBox = anchor.getParent().getParent().getElement('.reply-to-box');
  // Remember the box's height before collapsing so the click can restore it.
  var savedHeight = replyBox.getStyle('height');
  replyBox.setStyles({ height: 0, opacity: 0 });

  anchor.addEvent('click', function(evt) {
    evt.stop(); // keep the "#" href from jumping to the top of the page
    replyBox.setStyles({ opacity: 1, height: savedHeight });
    replyBox.getElement('input').focus();
  });
});
/* ]]> */
</script>

  </div>

  <div id="footer" style="margin: 0px;">
    Served up by <a href="http://masanjin.net/whisper/">Whisper</a>. Yes!
  </div>
</div>
</body>
</html>
