Merge branch 'master' into swig_1_db
twj8CDC authored Jun 23, 2023
2 parents baa9ebc + 439b723 commit 438ac9c
Showing 153 changed files with 20,194 additions and 172 deletions.
7 changes: 6 additions & 1 deletion BART/BART/BART/bart_python/bart_py_setup.py
@@ -7,6 +7,11 @@

# COMMAND ----------

# MAGIC %sh
# MAGIC ls src

# COMMAND ----------

# MAGIC %md
# MAGIC # Run SWIG

@@ -31,4 +36,4 @@

# COMMAND ----------

# MAGIC %sh gcc -v -c example.c example_wrap.c -I/usr/include/python3.10
# MAGIC %sh gcc -v -c example.c example_wrap.c -I/usr/include/python3.10 -I/usr/share/R/include
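The gcc step above only produces object files. A typical SWIG workflow then links example.o and example_wrap.o into a shared module and imports the generated wrapper from Python. The sketch below follows the standard SWIG tutorial layout (module name "example", output _example.so); it is an illustration of those next steps, not code taken from this notebook, and on x86-64 the objects generally need to be compiled with -fPIC for the link to succeed.

    # Sketch only: link the SWIG-generated objects and import the wrapper.
    # File/module names follow the stock SWIG "example" tutorial (assumption).
    import importlib
    import subprocess

    # Link the compiled objects into the shared library CPython will load.
    # (The -c step above may need -fPIC added for this to work on x86-64.)
    subprocess.run(
        ["gcc", "-shared", "example.o", "example_wrap.o", "-o", "_example.so"],
        check=True,
    )

    # SWIG also emits example.py, which loads _example.so under the hood.
    example = importlib.import_module("example")
    print(dir(example))  # the wrapped C functions now live on this module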
Binary file added BART/BART/BART/build/vignette.rds
Binary file not shown.
38 changes: 19 additions & 19 deletions BART/BART/BART/inst/CITATION
@@ -1,19 +1,19 @@
bibentry(bibtype = "Article",
title = "Nonparametric Machine Learning and Efficient Computation with {B}ayesian Additive Regression Trees: The {BART} {R} Package",
author = c(person(given = "Rodney",
family = "Sparapani",
email = "[email protected]"),
person(given = "Charles",
family = "Spanbauer"),
person(given = "Robert",
family = "McCulloch")),
journal = "Journal of Statistical Software",
year = "2021",
volume = "97",
number = "1",
pages = "1--66",
doi = "10.18637/jss.v097.i01",

header = "To cite BART in publications use:"
)

bibentry(bibtype = "Article",
title = "Nonparametric Machine Learning and Efficient Computation with {B}ayesian Additive Regression Trees: The {BART} {R} Package",
author = c(person(given = "Rodney",
family = "Sparapani",
email = "[email protected]"),
person(given = "Charles",
family = "Spanbauer"),
person(given = "Robert",
family = "McCulloch")),
journal = "Journal of Statistical Software",
year = "2021",
volume = "97",
number = "1",
pages = "1--66",
doi = "10.18637/jss.v097.i01",
header = "To cite BART in publications use:"
)
14 changes: 10 additions & 4 deletions BART/BART/BART/inst/cxx-ex/Makefile
@@ -1,17 +1,23 @@

CXX = g++
CXXFLAGS = -std=gnu++11 -Wall -g -O2 -fpic -mtune=native
##CXX = `R CMD config CXX11`
##CXXFLAGS = `R CMD config CXX11STD` `R CMD config CXX11FLAGS` `R CMD config CXX11PICFLAGS`
##CXX = g++
##CXXFLAGS = -std=gnu++11 -Wall -g -O2 -fpic -mtune=native
CXX = `R CMD config CXX11`
CXXFLAGS = `R CMD config CXX11STD` `R CMD config CXX11FLAGS` `R CMD config CXX11PICFLAGS`

## Rmath library for random number generation and other needs

# CPPFLAGS = `R CMD config CPPFLAGS` -I. -DMATHLIB_STANDALONE -DRNG_Rmath
CPPFLAGS = -I/usr/share/R/include -I. -DMATHLIB_STANDALONE -DRNG_Rmath
LIB = `R CMD config LDFLAGS` -lRmath
# RNG_random = 'noR'

## STL random class for random number generation and the Rmath functions for other needs
# CPPFLAGS = -I. -I/usr/local/include -DMATHLIB_STANDALONE -DRNG_random
CPPFLAGS = `R CMD config CPPFLAGS` -I. -DMATHLIB_STANDALONE -DRNG_Rmath -I /usr/share/R/include
LIB = `R CMD config LDFLAGS` -lRmath

## STL random class for random number generation and the Rmath functions for other needs
## CPPFLAGS = -I. -I/usr/local/include -DMATHLIB_STANDALONE -DRNG_random

.cpp.o : Makefile common.h rn.h
$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $*.o
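The Makefile change above replaces hard-coded g++ settings with values queried from R itself via `R CMD config`, so the cxx-ex build uses the same toolchain and flags as the installed R and Rmath. The following is a minimal Python sketch of what those backticked queries resolve to; the helper name and the assembled flag strings are illustrative, mirroring the variables used in the Makefile.

    # Sketch: query the same R CMD config variables the Makefile uses.
    import subprocess

    def r_config(var: str) -> str:
        """Return the value R was configured with for `var` (e.g. CXX11)."""
        out = subprocess.run(["R", "CMD", "config", var],
                             capture_output=True, text=True, check=True)
        return out.stdout.strip()

    cxx = r_config("CXX11")                      # compiler, e.g. g++
    cxxflags = " ".join(r_config(v) for v in
                        ("CXX11STD", "CXX11FLAGS", "CXX11PICFLAGS"))
    cppflags = r_config("CPPFLAGS") + " -I. -DMATHLIB_STANDALONE -DRNG_Rmath"
    ldflags = r_config("LDFLAGS") + " -lRmath"

    print(cxx, cxxflags, cppflags, ldflags, sep="\n")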
9 changes: 8 additions & 1 deletion BART/BART/BART/inst/cxx-ex/build_sh.py
@@ -1,3 +1,10 @@
# Databricks notebook source
# MAGIC %md
# MAGIC R.home('include')

# COMMAND ----------

# MAGIC %sh
# MAGIC make wmain.out check
# MAGIC make clean
# MAGIC make check
# MAGIC
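The notebook cell simply runs make; the markdown cell above it points at R.home('include'), the directory the Makefile's CPPFLAGS must reach. As a hedged illustration (not the notebook's code), the R include path can be resolved from Python and passed to make as a variable override:

    # Illustration: resolve R.home('include') from Python and hand it to make.
    import subprocess

    # Same value R.home('include') returns inside an R session.
    r_include = subprocess.run(
        ["Rscript", "-e", "cat(R.home('include'))"],
        capture_output=True, text=True, check=True,
    ).stdout.strip()

    # Rebuild the cxx-ex example with CPPFLAGS overridden on the command line
    # (the flags mirror the Makefile shown above).
    subprocess.run(["make", "clean"], check=True)
    subprocess.run(
        ["make", "check",
         "CPPFLAGS=-I{} -I. -DMATHLIB_STANDALONE -DRNG_Rmath".format(r_include)],
        check=True,
    )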
292 changes: 146 additions & 146 deletions BART/BART/BART/man/alligator.Rd
@@ -1,146 +1,146 @@
\name{alligator}
\alias{alligator}
\docType{data}
\title{ American alligator Food Choice }

\description{ In 1985, American alligators were harvested by hunters
from August 26 to September 30 in peninsular Florida from lakes Oklawaha
(Putnam County), George (Putnam and Volusia counties), Hancock (Polk
County) and Trafford (Collier County). Lake, length and sex were
recorded for each alligator. Stomachs from a sample of alligators
1.09-3.89m long were frozen prior to analysis. After thawing, stomach
contents were removed and separated and food items were identified and
tallied. Volumes were determined by water displacement. The stomach
contents of 219 alligators were classified into five categories of
primary food choice: Fish
(the most common primary food choice), Invertebrate (snails, insects,
crayfish, etc.), Reptile (turtles, alligators), Bird, and Other
(amphibians, plants, household pets, stones, and other debris). }

\usage{data(alligator)}
\format{
A data frame with 80 observations on the following 5 variables.
\describe{
\item{\code{lake}}{a factor with levels \code{George} \code{Hancock} \code{Oklawaha} \code{Trafford}}
\item{\code{sex}}{a factor with levels \code{female} \code{male}}
\item{\code{size}}{alligator size, a factor with levels \code{large} (>2.3m) \code{small} (<=2.3m)}
\item{\code{food}}{primary food choice, a factor with levels \code{bird} \code{fish} \code{invert} \code{other} \code{reptile}}
\item{\code{count}}{cell frequency, a numeric vector}
}
}

\details{ The table contains a fair number of 0 counts. \code{food} is
the response variable. \code{fish} is the most frequent choice, and
often taken as a baseline category in multinomial response models. }

\source{
Agresti, A. (2002).
\emph{Categorical Data Analysis},
New York: Wiley, 2nd Ed., Table 7.1
}

\references{
Delany MF, Linda SB, Moore CT (1999).
"Diet and condition of American alligators in 4 Florida lakes."
In \emph{Proceedings of the Annual Conference of the Southeastern
Association of Fish and Wildlife Agencies}, \bold{53},
375--389.
}

\examples{

data(alligator)

## nnet::multinom Multinomial logit model fit with neural nets
fit <- multinom(food ~ lake+size+sex, data=alligator, weights=count)

summary(fit$fitted.values)
## 1=bird, 2=fish, 3=invert, 4=other, 5=reptile

(L=length(alligator$count))
(N=sum(alligator$count))
y.train=integer(N)
x.train=matrix(nrow=N, ncol=3)
x.test=matrix(nrow=L, ncol=3)
k=1
for(i in 1:L) {
x.test[i, ]=as.integer(
c(alligator$lake[i], alligator$size[i], alligator$sex[i]))
if(alligator$count[i]>0)
for(j in 1:alligator$count[i]) {
y.train[k]=as.integer(alligator$food[i])
x.train[k, ]=as.integer(
c(alligator$lake[i], alligator$size[i], alligator$sex[i]))
k=k+1
}
}
table(y.train)
##test mbart with token run to ensure installation works
set.seed(99)
check = mbart(x.train, y.train, nskip=1, ndpost=1)

\dontrun{
set.seed(99)
check = mbart(x.train, y.train, nskip=1, ndpost=1)
post=mbart(x.train, y.train, x.test)

##post=mc.mbart(x.train, y.train, x.test, mc.cores=8, seed=99)
##check=predict(post, x.test, mc.cores=8)
##print(cor(post$prob.test.mean, check$prob.test.mean)^2)

par(mfrow=c(3, 2))
K=5
for(j in 1:5) {
h=seq(j, L*K, K)
print(cor(fit$fitted.values[ , j], post$prob.test.mean[h])^2)
plot(fit$fitted.values[ , j], post$prob.test.mean[h],
xlim=0:1, ylim=0:1,
xlab=paste0('NN: Est. Prob. j=', j),
ylab=paste0('BART: Est. Prob. j=', j))
abline(a=0, b=1)
}
par(mfrow=c(1, 1))

L=16
x.test=matrix(nrow=L, ncol=3)
k=1
for(size in 1:2)
for(sex in 1:2)
for(lake in 1:4) {
x.test[k, ]=c(lake, size, sex)
k=k+1
}
x.test

## two sizes: 1=large: >2.3m, 2=small: <=2.3m
pred=predict(post, x.test)
##pred=predict(post, x.test, mc.cores=8)
ndpost=nrow(pred$prob.test)

size.test=matrix(nrow=ndpost, ncol=K*2)
for(i in 1:K) {
j=seq(i, L*K/2, K) ## large
size.test[ , i]=apply(pred$prob.test[ , j], 1, mean)
j=j+L*K/2 ## small
size.test[ , i+K]=apply(pred$prob.test[ , j], 1, mean)
}
size.test.mean=apply(size.test, 2, mean)
size.test.025=apply(size.test, 2, quantile, probs=0.025)
size.test.975=apply(size.test, 2, quantile, probs=0.975)

plot(factor(1:K, labels=c('bird', 'fish', 'invert', 'other', 'reptile')),
rep(1, K), col=1:K, type='n', lwd=1, lty=0,
xlim=c(1, K), ylim=c(0, 0.5), ylab='Prob.',
sub="Multinomial BART\nFriedman's partial dependence function")
points(1:K, size.test.mean[1:K+K], col=1)
lines(1:K, size.test.025[1:K+K], col=1, lty=2)
lines(1:K, size.test.975[1:K+K], col=1, lty=2)
points(1:K, size.test.mean[1:K], col=2)
lines(1:K, size.test.025[1:K], col=2, lty=2)
lines(1:K, size.test.975[1:K], col=2, lty=2)
## legend('topright', legend=c('Small', 'Large'),
## pch=1, col=1:2)

}
}
\keyword{datasets}
2 changes: 1 addition & 1 deletion BART/Snakeify BART for scale.py
@@ -33,7 +33,7 @@
# MAGIC cd ./BART
# MAGIC wget -O BART.pdf https://cran.r-project.org/web/packages/BART/BART.pdf
# MAGIC wget -O BART.tar.gz https://cran.r-project.org/src/contrib/BART_2.9.4.tar.gz
# MAGIC tar -xvf ./BART.tar.gz
# MAGIC tar -m -xvf ./BART.tar.gz

# COMMAND ----------
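For reference, a rough Python sketch of what the shell cell above does; only the CRAN URL is taken from the cell, the rest is illustrative rather than the notebook's code. The newly added `-m` flag makes tar stamp extracted files with the extraction time instead of the archive's stored timestamps, which the sketch emulates with os.utime.

    # Sketch: download and unpack the BART sources, ignoring archive mtimes.
    import os
    import tarfile
    import time
    import urllib.request

    url = "https://cran.r-project.org/src/contrib/BART_2.9.4.tar.gz"
    urllib.request.urlretrieve(url, "BART.tar.gz")

    with tarfile.open("BART.tar.gz") as tar:
        tar.extractall(".")
        members = tar.getnames()

    # Emulate tar -m: use the extraction time so stale or skewed archive
    # timestamps do not confuse later build steps.
    now = time.time()
    for name in members:
        if os.path.isfile(name):
            os.utime(name, (now, now))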
