This repository has been archived by the owner on Sep 27, 2023. It is now read-only.

Fixing minor problems in RAPPOR analysis, Ubuntu installation instructions and regression test #89

Open
Wants to merge 11 commits into base: master
2 changes: 1 addition & 1 deletion README.md
@@ -37,7 +37,7 @@ The demo strings together the Python and R code. It:
2. Runs it through the RAPPOR privacy-preserving reporting mechanisms
3. Analyzes and plots the aggregated reports against the true input

The output is written to `_tmp/regtest/results.html`, and can be opened with a
The output is written to `_tmp/cpp/results.html` and `_tmp/python/results.html`, and can be opened with a
browser.

Dependencies
2 changes: 1 addition & 1 deletion analysis/R/association_test.R
@@ -282,7 +282,7 @@ TestCppImplementation <- function() {
fit1 <- RunEmFunction(cond_prob, max_em_iters)

# Assume we're in the repo root
em_cpp <- file.path(getwd(), "analysis/cpp/_tmp/fast_em")
em_cpp <- file.path(getwd(), "analysis/cpp/fast_em")
fit2 <- RunEmExecutable(em_cpp, cond_prob, max_em_iters)

cpp_diff <- abs(fit1 - fit2)
2 changes: 1 addition & 1 deletion apps/rappor-analysis/params.csv
@@ -1,2 +1,2 @@
"k","h","m","p","q","f"
128,2,8,0.5,0.75,0
128,2,8,0.5,0.25,0.5
9 changes: 6 additions & 3 deletions apps/rappor-analysis/server.R
@@ -1,7 +1,9 @@
library(shiny)

source("../../analysis/R/read_input.R")
source("../../analysis/R/decode.R")
setwd("../../") # makes it possible to call libraries analysis/R/util.R and analysis/R/alternative.R from analysis/R/read_input.R and analysis/R/decode.R
source("analysis/R/read_input.R")
source("analysis/R/decode.R")
setwd("apps/rappor-analysis") # go back to the current directory to use *.csv files

# Random number associated with the session used in exported file names.
seed <- sample(10^6, 1)
@@ -192,7 +194,8 @@ shinyServer(function(input, output, session) {
include.rownames = FALSE)

output$example_counts <- renderTable({
counts <- ReadCountsFile("counts.csv")[, 1:15]
params <- Params()
counts <- ReadCountsFile("counts.csv", params)[, 1:15]
cbind(counts, rep("...", nrow(counts)))
},
include.rownames = FALSE, include.colnames = FALSE)
4 changes: 2 additions & 2 deletions demo.sh
@@ -44,14 +44,14 @@ rappor-sim-profile() {
}

quick-python() {
./regtest.sh run-seq '^demo3' python
./regtest.sh run-seq '^demo2' python
}

quick-cpp() {
# For now we build it first. Don't want to build it in parallel.
./build.sh cpp-client

./regtest.sh run-seq '^demo3' cpp
./regtest.sh run-seq '^demo2' cpp
}

quick() {
111 changes: 56 additions & 55 deletions doc/data-flow.md
@@ -10,12 +10,12 @@ Overview

Start with this command:

$ ./demo.sh run
$ ./demo.sh

It takes a minute or so to run. The dependencies listed in the
[README][] must be installed. At the end, it will say:

Wrote _tmp/report.html. Open this in your browser.
Wrote _tmp/\[cpp and python\]/results.html. Open this in your browser.

It should look like [this][example].

@@ -52,26 +52,27 @@ The `tests/gen_sim_input.py` tool generates CSV data, like this:

<!-- TODO: a realistic data set would be nice? How could we generate one? -->

**exp.csv**

client, true_value
1, v6
1, v3
1, v3
1, v5
1, v13
1, v1
1, v8
2, v2
2, v3
2, v1
2, v8
2, v1
2, v30
2, v10
3, v4
**case_true_values.csv**
```
client,cohort,value
c1,1,v16
c1,1,v13
c1,1,v19
c1,1,v16
c1,1,v2
c1,1,v31
c1,1,v8
c1,1,v19
c1,1,v13
c1,1,v21
c2,2,v58
c2,2,v4
c2,2,v26
c2,2,v22
c2,2,v20
c2,2,v2
...

```
*(spaces added for clarity)*
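As a rough picture of what produces this file: the repository generates it with `tests/gen_true_values.R`, and the Python below is only a simplified, uniform-sampling stand-in whose function name is made up for illustration.

```
import csv
import random

def gen_true_values(path, num_values, num_clients, reports_per_client, num_cohorts):
    """Simplified stand-in for gen_true_values.R: uniform sampling only."""
    with open(path, 'w', newline='') as f:
        w = csv.writer(f)
        w.writerow(['client', 'cohort', 'value'])
        for c in range(1, num_clients + 1):
            cohort = random.randint(1, num_cohorts)   # each client stays in one cohort
            for _ in range(reports_per_client):
                w.writerow(['c%d' % c, cohort, 'v%d' % random.randint(1, num_values)])

# Parameters matching the demo spec: 100 values, 100000 clients, 10 reports each, 64 cohorts.
gen_true_values('case_true_values.csv', 100, 100000, 10, 64)
```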

By default we generate 700,000 rows: 7 random values from `v1` to `v50` for
@@ -95,35 +96,33 @@ don't want to know any individual values.
After the RAPPOR transformation, we get another CSV file with 700,000 rows.
Each client is assigned a cohort.

**exp_out.csv**

client, cohort, rappor
1, 63, 1111101011110111
1, 15, 1110110011111100
1, 12, 0110101111100101
1, 0, 1111100111110111
1, 3, 1001110111110011
1, 14, 1011111010110011
1, 33, 0111010100101011
2, 40, 0011011010101001
2, 35, 1010110101110100
2, 58, 1110110110111110
2, 38, 0010001111001010
2, 5, 1110111011100101
2, 36, 0111010100111111
2, 39, 0101101000101101
3, 32, 0011100111111110
**case_reports.csv**
```
client,cohort,bloom,prr,irr
c1,1,00000000000000000010000000000000,11011100100000000010000101100001,01001101100000000010000111000010
c1,1,00000000000000001000000000000000,00011011010000101010010000010100,00011011000001100010010000011100
c1,1,10000000000000000000000000000000,10101000001110000000000000010000,00101010010111101000010100010000
c1,1,00000000000000000010000000000000,11011100100000000010000101100001,10011000100101101110000101100000
c1,1,00000000000000000000000000000001,00100100000000000100001000000100,01000010100100010000011000101101
c1,1,00010000000000000000000000000000,00010001000100000100000011000010,10001001000100001110010011000000
c1,1,00000000010000000000000000000000,00000010001000000001100000101110,10011110001000000101100100101110
c1,1,10000000000000000000000000000000,10101000001110000000000000010000,10101101101110000000000000010000
c1,1,00000000000000001000000000000000,00011011010000101010010000010100,00011011010001101010110110110100
c1,1,00000000010000000000000000000000,00000010001000000001100000101110,00001110001000000001110100101111
c2,2,00000000000010000000000000000000,10011001001000110001000001000000,10011111011011111000000011001010
c2,2,00000000000100000000000000000000,01100000000100000001001000000010,11110000110010010000001010001110
c2,2,00000000000000000000000000001000,10001010101000000000100000001000,00001010101000000100000110101000
...

```
*(spaces added for clarity)*

We also get a one-row CSV file that contains the RAPPOR parameters:

**exp_params.csv**

k,h,m,p,q,f
16,2,64,0.5,0.75,0.5

**case_params.csv**
```
k,h,m,p,q,f
32,1,64,0.25,0.75,0.5
```
These are described in the [paper][]. The parameters that the clients use
must be known to the server, or the decoding will fail. In addition, all
clients must use the same parameters for a given variable.
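To make the `bloom`, `prr`, and `irr` columns above concrete, here is a minimal sketch of the three encoding steps under those parameters. It only illustrates the mechanism; the repository's actual client is `client/python/rappor.py`, and details such as the hash construction and the memoization of the PRR are simplified assumptions here.

```
import hashlib
import random

def encode_report(value, cohort, k, h, f, p, q):
    """Sketch of one RAPPOR report: Bloom filter -> PRR -> IRR."""
    # Bloom filter: h hashes, salted with the cohort, each set one of k bits.
    bloom = 0
    for i in range(h):
        digest = hashlib.md5(('%d:%d:%s' % (cohort, i, value)).encode()).hexdigest()
        bloom |= 1 << (int(digest, 16) % k)

    # PRR ("permanent" noise): keep each bit with probability 1 - f, otherwise
    # replace it with a fair coin flip. A real client memoizes this per value.
    prr = 0
    for bit in range(k):
        r = random.random()
        if r < f / 2:
            prr |= 1 << bit                  # forced to 1
        elif r < f:
            pass                             # forced to 0
        elif bloom & (1 << bit):
            prr |= 1 << bit                  # true bit kept

    # IRR ("instantaneous" noise): report 1 with probability q where the PRR
    # bit is 1, and with probability p where it is 0.
    irr = 0
    for bit in range(k):
        if random.random() < (q if prr & (1 << bit) else p):
            irr |= 1 << bit
    return bloom, prr, irr

bloom, prr, irr = encode_report('v16', cohort=1, k=32, h=1, f=0.5, p=0.25, q=0.75)
print(format(irr, '032b'))   # the only bit string that actually leaves the client
```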
@@ -132,7 +131,7 @@ The `rappor_sim.py` process also writes these files:

- `exp_hist.csv`: The true histogram of input values. This is used only for
comparison. In the real world you obviously won't have this.
- `exp_true_inputs.txt`: A list of the unique values reported, e.g. `v1` ..
- `case_unique_values.txt`: A list of the unique values reported, e.g. `v1` ..
`v50`. You won't have this either, in general. To use RAPPOR, you must
supply *candidate strings*, described below.

@@ -141,17 +140,19 @@

`sum_bits.py` takes the `exp_out.csv` output, and produces the "counts" file:

**exp_counts.csv**

11116,6273,6433,6347,6385,6290,6621,6359,6747,6623,6321,6696,6282,6652,6368,6286,6222
10861,6365,6263,6170,6258,6107,6633,6171,6226,6123,6286,6254,6408,6182,6442,6195,6187
**case_counts.csv**
```
16170,6268,6102,6187,6143,6414,6090,6032,6454,6013,6105,6086,6399,6093,6265,6256,6167,6049,6110,6188,6348,6190,6131,6163,6035,6568,6123,6399,6094,6048,6295,6227,6310
15630,6215,5966,5878,6016,5881,5976,5985,5950,6121,5781,5974,5985,5988,6143,5799,6196,5795,5928,6409,5938,6100,5917,6067,5878,6033,5903,6152,5835,6058,5874,5860,5974
15500,6040,5910,5741,6103,6075,5826,5972,5892,6078,5953,5978,5957,5813,5739,6079,5915,5931,6076,6072,5991,6010,5768,5887,5830,6003,5887,5980,5873,5806,5915,5971,6027
...
```

The file has 64 rows, because the simulation has 64 cohorts by default (`m =
64`). This parameter should be adjusted based on the number of unique true
values expected. <!-- TODO: more detail -->

There are 17 columns. The left-most column is the total number of reports in
There are 1+32 columns. The left-most column is the total number of reports in
the cohort. The remaining 32 columns correspond to the `k = 32` bits in the
Bloom filter. Each column contains the number of reports with that bit set
to 1.
Expand All @@ -163,7 +164,7 @@ So, in general, the "counts" file is a `(k+1) * m` matrix.

In the simulation, we assume that the analyst will come up with a *superset* of
the candidate strings. This is done in the `more-candidates` /
`print-candidates` functions in `demo.sh`.
`print-candidates` functions in `regtest.sh`.

You can also test what happens if you omit true strings from the candidates, by
editing the invocation of `print-candidates` in `run-dist`:
@@ -179,13 +180,13 @@ process.
The candidates are hashed by `hash_candidates.py` to create the "map" file,
before being passed to R for analysis.

**exp_map.csv**
**case_map.csv**

v1,8,13,30,22,37,37,53,53,77,67,89,86,97,97,118,128,139,136,157,<truncated>
v10,13,2,25,28,42,45,58,60,68,66,91,89,108,102,113,125,130,131,<truncated>

The map file has one row per candidate. In this case, there are 60 rows:
50 for the true values and 10 for "fake" values, which make the candidates a
The map file has one row per candidate. In this case, there are 200 rows:
100 for the true values and 100 for "fake" values, which make the candidates a
superset of the true input.
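The hashing itself can be pictured with a short sketch (the real logic is in `hash_candidates.py`; the particular hash construction below is an assumption made for illustration): each candidate is hashed `h` times per cohort, and each resulting bit position is offset by `cohort * k` so that every cohort gets its own block of columns.

```
import hashlib

def map_row(candidate, k, h, m):
    """One map-file row: h hashed bit positions for each of the m cohorts."""
    positions = []
    for cohort in range(m):
        for i in range(h):
            digest = hashlib.md5(('%d:%d:%s' % (cohort, i, candidate)).encode()).hexdigest()
            positions.append(cohort * k + int(digest, 16) % k + 1)   # 1-based column
    return [candidate] + positions

# With the original demo parameters (k=16, h=2, m=64) this yields 128 positions:
print(','.join(str(x) for x in map_row('v1', k=16, h=2, m=64)))
```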

The left-most column is the raw candidate string. Then there are 128 more
4 changes: 0 additions & 4 deletions pipeline/task_spec_test.py
@@ -24,10 +24,6 @@ def testDist(self):
# NOTE: These files are opened, in order to count the reports. Maybe skip
# that step.
f = cStringIO.StringIO("""\
_tmp/counts/2015-12-01/exp_counts.csv
_tmp/counts/2015-12-01/gauss_counts.csv
_tmp/counts/2015-12-02/exp_counts.csv
_tmp/counts/2015-12-02/gauss_counts.csv
""")
input_iter = task_spec.DistInputIter(f)
#for row in input_iter:
2 changes: 1 addition & 1 deletion regtest.sh
@@ -169,7 +169,7 @@ _run-one-instance() {
local instance_dir=$case_dir/$test_instance
mkdir --verbose -p $instance_dir

banner "Generating reports (gen_reports.R)"
banner "Generating reports (gen_true_values.R)" # gen_true_values.R instead ?

# the TRUE_VALUES_PATH environment variable can be used to avoid
# generating new values every time. NOTE: You are responsible for making
2 changes: 1 addition & 1 deletion setup.sh
@@ -34,7 +34,7 @@ r-packages() {
# RUnit: for unit tests
# abind: for decode_test only
sudo R -e \
'install.packages(c("glmnet", "optparse", "limSolve", "RUnit", "abind", "RJSONIO"), repos="http://cran.rstudio.com/")'
'install.packages(c("glmnet", "Cairo", "optparse", "limSolve", "RUnit", "abind", "RJSONIO"), repos="http://cran.rstudio.com/")'
}

# R 3.0.2 on Trusty is out of date with CRAN, so we need this workaround.
12 changes: 6 additions & 6 deletions test.sh
@@ -1,13 +1,13 @@
#!/bin/bash
#
# Copyright 2014 Google Inc. All rights reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -87,7 +87,7 @@ lint() {
banner "Linting Python source files"
py-lint
echo

banner "Linting Documentation files"
doc-lint
}
@@ -126,12 +126,12 @@ r-unit() {
tests/compare_dist_test.R

tests/gen_counts_test.R

tests/gen_true_values_test.R

analysis/R/decode_test.R

analysis/test/run_tests.R
analysis/R/run_tests.R
}

doc-lint() {
3 changes: 2 additions & 1 deletion tests/gen_true_values.R
@@ -18,13 +18,14 @@ source('tests/gen_counts.R')

# Usage:
#
# $ ./gen_true_values.R exp 100 10000 1 foo.csv
# $ ./gen_true_values.R exp 100 10000 1 32 foo.csv
#
# Inputs:
# distribution name
# size of the distribution's support
# number of clients
# reports per client
# number of cohorts
# name of the output file
# Output:
# csv file with reports sampled according to the specified distribution.
4 changes: 2 additions & 2 deletions tests/rappor_sim.py
@@ -16,10 +16,10 @@

"""Run the RAPPOR Python client on simulated input.

It takes a 3-column CSV file as generated by gen_reports.R, and outputs a 5
It takes a 3-column CSV file as generated by gen_true_values.R, and outputs a 5
column CSV of RAPPOR'd data.

Input columns: client,true_value
Input columns: client,cohort,true_value
Output columns: client,cohort,bloom,prr,rappor

TODO:
2 changes: 1 addition & 1 deletion tests/regtest_spec.py
@@ -18,7 +18,7 @@
# (case_name distr num_unique_values num_clients values_per_client)
# (num_bits num_hashes num_cohorts)
# (p q f) (num_additional regexp_to_remove)
('demo1 unif 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'),
('demo1 unif 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'),  # the regex v[0-9]*9$ removes every candidate ending in 9 (v9, v19, ..., v99); is that intentional?
('demo2 gauss 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'),
('demo3 exp 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'),
('demo4 zipf1 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'),
24 changes: 24 additions & 0 deletions ubuntu.txt
@@ -0,0 +1,24 @@
# Maybe these steps should be folded into setup.sh
sudo apt-get update
sudo apt-get install build-essential g++ python-dev graphviz libssl-dev r-base r-base-dev r-cran-ggplot2 r-cran-gplots
# system libraries needed to build the Cairo R package
sudo apt-get install libcairo2-dev
sudo apt-get install libxt-dev



R    # run R in a terminal
> install.packages('Cairo')
> install.packages('limSolve')

./setup.sh
./build.sh


Issues with the web app
----
You may need to upgrade the "shiny" R package on Ubuntu if you get the following error message:
"ERROR: names(x) must be a character vector of the same length as x"
https://www.biostars.org/p/88651/

> install.packages('shiny')