From 3abdf461d28d500c0966ddab61aba3b50255f4ec Mon Sep 17 00:00:00 2001
From: vwang0 <bwang@go.wustl.edu>
Date: Wed, 28 Oct 2020 11:44:18 -0500
Subject: [PATCH] 20201028

---
 .../estimate_population_1018.ipynb            | 228 ++++++++++++++++++
 .../mark_recapture_model.ipynb                | 160 ++++++++++++
 2 files changed, 388 insertions(+)
 create mode 100644 mark_recapture_model/estimate_population_1018.ipynb
 create mode 100644 mark_recapture_model/mark_recapture_model.ipynb

diff --git a/mark_recapture_model/estimate_population_1018.ipynb b/mark_recapture_model/estimate_population_1018.ipynb
new file mode 100644
index 0000000..261c503
--- /dev/null
+++ b/mark_recapture_model/estimate_population_1018.ipynb
@@ -0,0 +1,228 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import random\n",
+    "import math"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#  eN - population estimate\n",
+    "#  M - number of marked samples before re-capture\n",
+    "#  C - recapture sample size\n",
+    "#  R - number of marked samples found in re-capture\n",
+    "#  eN = M*C/R , to avoid R = 0 case --> eN = ()(M+1)*(C+1)/(R+1))-1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "myPopulationSize = 47500 "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if (1):\n",
+    "    #lets define total number of rounds to simulate\n",
+    "    nRounds = 20\n",
+    "    #lets define our capture size for each recapture round\n",
+    "    C=[None]*nRounds\n",
+    "    for i in range(0,nRounds) :\n",
+    "        C[i]= 40\n",
+    "else:\n",
+    "# Option B - Directly specified [set above if to false to enter here]\n",
+    "    C=[64,32,16,8]\n",
+    "    nRounds = len(C)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# empty set to start with\n",
+    "setN = set()\n",
+    "for i in range(1,myPopulationSize):\n",
+    "    setN.add(i)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "setC=[None]*nRounds\n",
+    "setM=[None]*nRounds\n",
+    "M=[None]*nRounds\n",
+    "R=[None]*nRounds\n",
+    "p=[None]*nRounds"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Round\tMarked \t p \t\t est \t pest \tstdev \t\tLow \tHigh\n",
+      "  11\t 440 \t 0.0023 \t 105600 \t 17600 \t17359 \t\t5090\t-12073\n",
+      "  15\t 599 \t 0.0017 \t 95920 \t 23960 \t23639 \t\t6928\t-16429\n",
+      "real Popualtion 47499\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "#LP_eN=[None]*nRounds\n",
+    "#SeLP_eN=[None]*nRounds\n",
+    "eN=[None]*nRounds\n",
+    "stdev_eN=[None]*nRounds\n",
+    "confLow_eN=[None]*nRounds\n",
+    "confHigh_eN=[None]*nRounds\n",
+    "boundLow_eN=[None]*nRounds\n",
+    "boundHigh_eN=[None]*nRounds\n",
+    "rN=[]\n",
+    "\n",
+    "print(\"Round\\tMarked \\t p \\t\\t est \\t pest \\tstdev \\t\\tLow \\tHigh\")\n",
+    "#Lets get our first capture\n",
+    "setC[0]=set(random.sample(setN,C[0]))\n",
+    "setM[0]=setC[0] #we mark all in round 1\n",
+    "rN.clear()\n",
+    "for r in range(1,len(C)):\n",
+    "    M[r] = len(setM[r-1]) #number of marked at the begining of round r\n",
+    "    setC[r] = set(random.sample(setN,C[r])) # Generating the Recapture round r\n",
+    "    R[r] = len(set(setM[r-1]&setC[r])) # number of recaptures round r\n",
+    "    setM[r] = set(setM[r-1]|setC[r]) # we mark the new finds in capture round r\n",
+    "    p[r] = round(1.0*R[r]/M[r],9) #capture probability of round r\n",
+    "    #Lets calculate our population estimate\n",
+    "    rN.append(r)\n",
+    "    topSum = botSum = 0\n",
+    "\n",
+    "    #for i in rN :\n",
+    "    #  Lincoln-Petersen Estimator\n",
+    "    #    topSum = topSum + (M[i])*(C[i])\n",
+    "    #    botSum = botSum + (R[i]+1E-6)#1E-6 for zero case\n",
+    "    #LP_eN[r] = round(1.0*topSum/botSum) #population estimation on round r\n",
+    "    # 20/02/19\n",
+    "    #for i in rN :\n",
+    "    # Seber 1982 evolution of Lincoln-Petersen method (also avoids R[i] = 0 case)\n",
+    "    #    topSum = topSum + (M[i]+1)*(C[i]+1)\n",
+    "    #    botSum = botSum + (R[i]+1)\n",
+    "    #SeLP_eN[r] = round(1.0*topSum/botSum-len(rN)) #population estimation on round r\n",
+    "    # 20/02/20 : since we are going to skip R[r] = 0 cases -> back to initial variable calculation\n",
+    "    if p[r] > 0 :\n",
+    "        for i in rN :\n",
+    "        #  Lincoln-Petersen Estimator\n",
+    "            topSum = topSum + (M[i])*(C[i])\n",
+    "            botSum = botSum + (R[i])\n",
+    "        eN[r] = round(1.0*topSum/botSum) #population estimation on round r\n",
+    "    #eN[r] = round(1.0*topSum/botSum-len(rN)) #population estimation on round r\n",
+    "    #var_eN[r] = abs(round((M[r]*C[r]*(M[r]-R[r])*(C[r]-R[r]))/(R[r]*R[r]*R[r])))\n",
+    "    #20/02/19: added the +1 below to avoid zero case error, mostly insignificant in comparison to R[r]^3\n",
+    "    #var_eN[r] = abs(round((M[r]*C[r]*(M[r]-R[r])*(C[r]-R[r]))/(R[r]*R[r]*R[r]+1)))\n",
+    "    #20/02/20: variance estimation \"Error\" from original approximation is 50% undershoot if R=1;12%,R=2;4%,R=3\n",
+    "    # Modifying to overshoot, by how much depends on how many R[r] s' you add to both top bottom and +1 or +0.1 ,...\n",
+    "    # Play with relative significance of \"R\"'s to zero eliminating constant \"+1,+0.1,...\"\n",
+    "    # we want the zero case eliminator (+0.1) to influence as little as possible\n",
+    "    # {...}*(R[r]+0.01)/((R[r]^3)*R[r]+0.01) -> 0%,R=1; +0.4%,R=2; +0.3%,R=3; +0.2%,R=4\n",
+    "    # var_eN[r] = abs(round((M[r]*C[r]*(M[r]-R[r])*(C[r]-R[r])*(R[r]+0.01))/((R[r]*R[r]*R[r])*R[r]+0.01)))\n",
+    "    # 20/02/20 : since we are going to skip R[r] = 0 cases -> back to initial variable calculation\n",
+    "        # simplified math.sqrt(var_eN[r]) -> stdev_eN[]\n",
+    "        stdev_eN[r] = round(math.sqrt(abs((M[r]*C[r]*(M[r]-R[r])*(C[r]-R[r]))/(R[r]*R[r]*R[r]))))\n",
+    "    #Confidence Level\t0.70 0.75 0.80 0.85 0.90  0.92 0.95 0.96 0.98 0.99\n",
+    "    #z                  1.04 1.15 1.28 1.44 1.645 0.75 1.96 2.05 2.33 2.58\n",
+    "        confLow_eN[r] = 1.96*math.sqrt(((1-R[r]/M[r])*(R[r]/C[r])*(1-R[r]/C[r]))/(C[r]-1))+1/(2*C[r])\n",
+    "        confHigh_eN[r] = R[r]/C[r]-confLow_eN[r]\n",
+    "        confLow_eN[r] = R[r]/C[r]+confLow_eN[r]\n",
+    "        boundLow_eN[r] = round(M[r]/confLow_eN[r])\n",
+    "        boundHigh_eN[r] = round(M[r]/confHigh_eN[r])\n",
+    "    #std deviation =sqrt(Var)\n",
+    "        #print(\"  %d\\t %d \\t %1.3f \\t %d \\t %d \\t%3.2f \\t\\t%3.2f\\t%3.2f\"%(r,M[r],p[r],eN[r],round(C[r]/(p[r]+1E-6)),stdev_eN[r],M[r]/confLow_eN[r],M[r]/confHigh_eN[r]))\n",
+    "        #lets define when an estimate is valid... conditions:\n",
+    "        # 2*stddev -> 95% of cases contemplated\n",
+    "        # z= 1.96 --> 0.95 confidence level for boundries\n",
+    "        # 1. is estimate > 2*stddev ?\n",
+    "        # 2. is estimate between boundries ?\n",
+    "        # ?3. is pest < est ? <--- maybe --> in large numbers always falls behind\n",
+    "        #if (eN[r]>(stdev_eN[r])) & (eN[r]>boundLow_eN[r]) & (eN[r]<boundHigh_eN[r]):\n",
+    "        #boundries for large numbers doesnt seem to work out\n",
+    "        if (eN[r]>(stdev_eN[r])) :\n",
+    "            print(\"  %d\\t %d \\t %1.4f \\t %d \\t %d \\t%d \\t\\t%d\\t%d\"%(r,M[r],p[r],eN[r],round(C[r]/(p[r])),stdev_eN[r],boundLow_eN[r],boundHigh_eN[r]))\n",
+    "\n",
+    "#eN.remove(None) #None is not accepted in the function median\n",
+    "#print(\"median of last %d rounds: %d\"%(int(nRounds/2),median(eN[int(nRounds/-2):])))\n",
+    "print(\"real Popualtion %d\"%len(setN))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/mark_recapture_model/mark_recapture_model.ipynb b/mark_recapture_model/mark_recapture_model.ipynb
new file mode 100644
index 0000000..2e5177e
--- /dev/null
+++ b/mark_recapture_model/mark_recapture_model.ipynb
@@ -0,0 +1,160 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "I will implement three Bayesian capture-recapture models:   \n",
+    "\n",
+    "the **Lincoln-Petersen model of abundance**, \n",
+    "\n",
+    "the **Cormack-Jolly-Seber model of survival**, and   \n",
+    "\n",
+    "the **Jolly-Seber model of abundance and survival**."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from matplotlib import pyplot as plt\n",
+    "from matplotlib.ticker import StrMethodFormatter\n",
+    "import numpy as np\n",
+    "import pymc3 as pm\n",
+    "from pymc3.distributions.dist_math import binomln, bound, factln\n",
+    "import scipy as sp\n",
+    "import seaborn as sns\n",
+    "import logging\n",
+    "%matplotlib inline\n",
+    "sns.set()\n",
+    "\n",
+    "# PCT_FORMATTER = StrMethodFormatter('{x:.1%}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "SEED = 123456 \n",
+    "np.random.seed(SEED)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### The Lincoln-Petersen model   \n",
+    "The simplest model of abundace, that is, the size of a population, is the Lincoln-Petersen model. While this model is a bit simple for most practical applications, it will introduce some useful modeling concepts and computational techniques.\n",
+    "\n",
+    "The idea of the Lincoln-Petersen model is to visit the observation site twice to capture individuals from the population of interest. The individuals captured during the first visit are marked (often with tags, radio collars, microchips, etc.) and then released. The number of individuals captured, marked, and released is recorded as n1. On the second visit, the number of captured individuals is recorded as n2. If enough individuals are captured on the second visit, chances are quite high that several of them will have been marked on the first visit. The number of marked individuals recaptured on the second visit is recorded as n1,2.\n",
+    "\n",
+    "The Lincoln-Petersen model assumes that: 1. each individuals has an equal probability to be captured on both visits (regardless of whether or not they were marked), 2. no marks fall off or become illegible, and 3. the population is closed, that is, no individuals are born, die, enter, or leave the site between visits.\n",
+    "\n",
+    "The third assumption is quite restrictive, and will be relaxed in the two subsequent models. The first two assumptions can be relaxed in various ways, but we will not do so in this post. First we derive a simple analytic estimator for the total population size given $n_1$, $n_2$, and $n_{1, 2}$, then we fit a Bayesian Lincoln-Petersen model using PyMC3 to set the stage for the (Cormack-)Jolly-Seber models.\n",
+    "\n",
+    "Let $N$ denote the size of the unknown total population, and let p denote the capture probability. We have that\n",
+    "\n",
+    "$\\begin{align*}\n",
+    "n_1, n_2\\ |\\ N, p\n",
+    "    & \\sim \\textrm{Bin}(N, p) \\\\\n",
+    "n_{1, 2}\\ |\\ n_1, p\n",
+    "    & \\sim \\textrm{Bin}(n_1, p).\n",
+    "\\end{align*}$\n",
+    "\n",
+    "Therefore n2N and n1,2n1 are unbiased estimates of p. The Lincoln-Peterson estimator is derived by equating these estimators\n",
+    "\n",
+    "$\\frac{n_2}{\\hat{N}} = \\frac{n_{1, 2}}{n_1}$\n",
+    "\n",
+    "and solving for\n",
+    "\n",
+    "$\\hat{N} = \\frac{n_1 n_2}{n_{1, 2}}.$.\n",
+    "\n",
+    "We now simulate a data set where $N$=1000 and the capture probability is $p$=0.1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "N_LP = 1000\n",
+    "P_LP = 0.1\n",
+    "\n",
+    "x_lp = sp.stats.bernoulli.rvs(P_LP, size=(2, N_LP))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "https://austinrochford.com/posts/2018-01-31-capture-recapture.html   \n",
+    "\n",
+    "https://gist.github.com/paucoma/36343a189ffd62aefa7ee87d71b18fcf   \n",
+    "\n",
+    "https://gist.github.com/AustinRochford/e67cb0c628b3692ecc669190fe86990c"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}