diff --git a/Examples/SIMPLISMA.png b/Examples/SIMPLISMA.png new file mode 100644 index 0000000..c63a124 Binary files /dev/null and b/Examples/SIMPLISMA.png differ diff --git a/Examples/SIMPLISMA_LCF.png b/Examples/SIMPLISMA_LCF.png new file mode 100644 index 0000000..90f24eb Binary files /dev/null and b/Examples/SIMPLISMA_LCF.png differ diff --git a/Examples/SVD.png b/Examples/SVD.png new file mode 100644 index 0000000..f447d10 Binary files /dev/null and b/Examples/SVD.png differ diff --git a/README.rst b/README.rst index 8384416..22ca589 100644 --- a/README.rst +++ b/README.rst @@ -69,14 +69,7 @@ What it **does** do: This is the core the MCR-ALS methods. - Enable the application of certain constraints (currently): sum-to-one, non-negativity, normalization, maximum limits (closure) - -What it **does not** do: - -- Estimate the number of components in the sample. This is a bonus feature in - some more-advanced MCR-ALS packages. - - - In MATLAB: https://mcrals.wordpress.com/ - - In R: https://cran.r-project.org/web/packages/ALS/index.html +- Estimate the number of components in the data using SVD and the SIMPLISMA Algorithm Dependencies ------------ @@ -232,3 +225,4 @@ Contributors - Charles H Camp Jr - Charles Le Losq (charles.lelosq@anu.edu.au) +- Adam H Clark (adam.clark@psi.ch) diff --git a/pymcr/__init__.py b/pymcr/__init__.py index 45addf9..04dfc9e 100644 --- a/pymcr/__init__.py +++ b/pymcr/__init__.py @@ -2,4 +2,5 @@ from . import mcr from . import constraints from . import metrics -from . import regressors \ No newline at end of file +from . import regressors +from . 
# ======================================================================
# Reconstruction of the Python content this patch adds. The rendered
# diff collapsed all newlines, so the touched files are presented here
# in patch order, separated by file markers.
# ======================================================================

# --- pymcr/__init__.py: the patch appends one export ------------------
# The added line must be the single statement
#     from . import simplisma
# (the hunk as rendered splits it after "from . ", which would be a
# SyntaxError -- it has to stay on one source line).

# --- pymcr/metrics.py: new metric appended after mse() ----------------

import numpy as np


def lof(C, ST, D_actual, D_calculated):
    """Percent lack of fit between measured and reconstructed data.

    Parameters
    ----------
    C, ST : unused
        Kept so every metric shares the (C, ST, D_actual, D_calculated)
        signature that pymcr's error callbacks expect.
    D_actual : ndarray
        Measured data matrix.
    D_calculated : ndarray
        Reconstructed data matrix, same shape as ``D_actual``.

    Returns
    -------
    float
        100 * sqrt(sum((D_actual - D_calculated)**2) / sum(D_actual**2))
    """
    return 100 * np.sqrt(np.sum((D_actual - D_calculated) ** 2)
                         / np.sum(D_actual ** 2))


# --- pymcr/simplisma.py: new module -----------------------------------
# Cleaned imports: matplotlib was imported twice; ``random``, ``sys``
# and ``scipy.optimize`` were imported but never used in this module.

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

from pymcr.lcf import lcf


class simplisma:
    """SIMPLISMA estimation of pure variables from a data matrix.

    BUGFIX: ``pure`` is called through the class by the test script
    (``simplisma.pure(D, ...)``); it is now an explicit @staticmethod
    instead of relying on Python 3's unbound-function behaviour, which
    breaks as soon as an instance is involved.
    """

    def __init__(self, D, nPure, noise, lcf_flag):
        # The constructor only announces the run; the actual work is in
        # the static method so results can be returned to the caller.
        print('Performing Simplisma')

    @staticmethod
    def pure(D, nr, error, lcf_flag):
        """Find the ``nr`` purest variables of ``D`` and plot them.

        Parameters
        ----------
        D : ndarray, shape (nrow, ncol)
            Data matrix; statistics are taken along axis 0, so each
            column is one variable (assumed -- TODO confirm with caller).
        nr : int
            Number of pure components to extract.
        error : float
            Allowed noise percentage (0-100).
        lcf_flag : bool
            If True, additionally run the constrained LCF from
            ``pymcr.lcf`` and plot its result and LOF.

        Returns
        -------
        S, C_u, C_c, LOF
            Pure spectra, unconstrained contributions, and -- when
            ``lcf_flag`` is set -- constrained contributions and percent
            lack of fit (otherwise ``0, 0`` placeholders).
        """
        # The figure grows by two panels when the constrained-LCF
        # results are shown.
        xplt = 5 if lcf_flag else 3

        def wmat(c, imp, irank, jvar):
            # Determinant matrix built from the correlation matrix `c`
            # around candidate variable `jvar` and the previously
            # selected pure variables `imp[:irank]`.
            dm = np.zeros((irank + 1, irank + 1))
            dm[0, 0] = c[jvar, jvar]
            for k in range(irank):
                kvar = int(imp[k])  # np.int was removed in NumPy >= 1.24
                dm[0, k + 1] = c[jvar, kvar]
                dm[k + 1, 0] = c[kvar, jvar]
                for kk in range(irank):
                    kkvar = int(imp[kk])
                    dm[k + 1, kk + 1] = c[kvar, kkvar]
            return dm

        nrow, ncol = D.shape

        dl = np.zeros((nrow, ncol))
        imp = np.zeros(nr, dtype=int)  # indices of the purest variables
        mp = np.zeros(nr)              # corresponding purity values

        w = np.zeros((nr, ncol))
        p = np.zeros((nr, ncol))
        s = np.zeros((nr, ncol))

        # Convert the noise percentage into an offset in data units.
        error = error / 100
        mean = np.mean(D, axis=0)
        error = np.max(mean) * error

        s[0, :] = np.std(D, axis=0)
        w[0, :] = s[0, :] ** 2 + mean ** 2
        p[0, :] = s[0, :] / (mean + error)

        imp[0] = int(np.argmax(p[0, :]))
        mp[0] = p[0, imp[0]]

        # Column lengths used to normalise D before forming the
        # correlation-around-the-origin matrix.
        l = np.sqrt(s[0, :] ** 2 + (mean + error) ** 2)
        for j in range(ncol):
            dl[:, j] = D[:, j] / l[j]

        c = np.dot(dl.T, dl) / nrow

        w[0, :] = w[0, :] / (l ** 2)
        p[0, :] = w[0, :] * p[0, :]
        s[0, :] = w[0, :] * s[0, :]

        print('purest variable 1: ', imp[0] + 1, mp[0])

        for i in range(nr - 1):
            for j in range(ncol):
                dm = wmat(c, imp, i + 1, j)
                w[i + 1, j] = np.linalg.det(dm)
                p[i + 1, j] = w[i + 1, j] * p[0, j]
                s[i + 1, j] = w[i + 1, j] * s[0, j]

            imp[i + 1] = int(np.argmax(p[i + 1, :]))
            mp[i + 1] = p[i + 1, imp[i + 1]]

            print('purest variable ' + str(i + 2) + ': ',
                  imp[i + 1] + 1, mp[i + 1])

        # Collect the purest columns of D as the estimated spectra.
        S = np.zeros((nrow, nr))
        for i in range(nr):
            S[0:nrow, i] = D[0:nrow, imp[i]]

        plt.subplot(xplt, 1, 2)
        plt.plot(S)

        # Unconstrained contributions by least squares.
        C_u = np.dot(D.T, np.linalg.pinv(S.T))

        plt.subplot(xplt, 1, 3)
        for i in range(nr):
            plt.plot(C_u[:, i])
        plt.ylabel('Unconstrained')

        if lcf_flag:
            # Constrained linear-combination fit.
            C_c, LOF = lcf.lcf(D, S)
            plt.subplot(xplt, 1, 4)
            for i in range(nr):
                plt.plot(C_c[:, i])
            plt.ylabel('Constrained')

            plt.subplot(xplt, 1, 5)
            plt.plot(LOF)
            plt.ylabel('LOF (%)')
            plt.show()
            return S, C_u, C_c, LOF

        plt.show()
        return S, C_u, 0, 0


class svd:
    """SVD/PCA estimate of the number of significant components."""

    def __init__(self, D, nSVD):
        # See note on simplisma.__init__ -- announcement only.
        print('Performing SVD')

    @staticmethod
    def svd(D, nSVD):
        """Plot and return eigenvalues and explained-variance ratios.

        Parameters
        ----------
        D : ndarray, shape (n_sample, n_feature)
            Data matrix.
        nSVD : int
            Maximum number of components to examine.

        Returns
        -------
        eigens : ndarray
            Singular values of the first ``nSVD`` PCA components.
        explained_variance_ratio : ndarray
            Variance ratio carried by each rank-1 term of the SVD of
            the mean-centred data.
        """
        D_0mean = D - D.mean(0)
        U, s, Vh = np.linalg.svd(D_0mean, full_matrices=False)

        pca = PCA(n_components=nSVD)
        pca.fit(D)
        eigens = pca.singular_values_
        # BUGFIX/cleanup: the original also computed pca.transform(D),
        # pca.explained_variance_, a `vs`/`explained` loop and a second
        # total_var -- all dead code, removed here.

        n_sample = D.shape[0]
        # Total variance of the centred data; (n_sample - 1) matches
        # sklearn's explained-variance convention.
        total_var = (D_0mean ** 2).sum() / (n_sample - 1)

        rs = np.zeros(nSVD)
        for i in range(nSVD):
            Xi = U[:, i].reshape(-1, 1) * s[i] @ Vh[i].reshape(1, -1)
            rs[i] = np.sum(Xi ** 2) / ((n_sample - 1) * total_var)
        explained_variance_ratio = rs

        plt.subplot(1, 2, 1)
        plt.plot(np.asarray(range(nSVD)) + 1, eigens, 'o-')
        plt.ylabel('Eigenvalues')
        plt.xlabel('Principal Component')  # typo fix: was "Principle"
        plt.xticks(np.arange(0, nSVD + 1, 2))

        plt.subplot(1, 2, 2)
        plt.plot(np.asarray(range(nSVD)) + 1,
                 np.cumsum(explained_variance_ratio), 'o-')
        plt.ylabel('Explained Variance')
        plt.xlabel('Principal Component')
        plt.xticks(np.arange(0, nSVD + 1, 2))

        plt.show()

        return eigens, explained_variance_ratio


# --- pymcr/tests/test_svd_simplisma.py: interactive demo script -------
# In the real file this starts with:
#     import numpy as np
#     import random
#     import matplotlib.pyplot as plt
#     from pymcr.simplisma import svd, simplisma
# NOTE(review): this is an interactive demo (input(), plt.show()), not a
# pytest-collectable test -- consider moving it out of pymcr/tests.

import random

# Generate sample data: random normalised Gaussians mixed into D.
nPure = 5        # number of true spectral components
nSVD = 15        # maximum number of components probed by SVD
noise = 5        # allowed noise percentage for SIMPLISMA
manual = False   # unused flag kept from the original script

x0 = np.zeros(nPure)
sigma = np.zeros(nPure)
for i in range(nPure):
    x0[i] = random.uniform(-100, 100)
    sigma[i] = random.uniform(3, 25)

x = np.linspace(start=-120, stop=120, num=2000)

gx = np.zeros((len(x), nPure))
plt.subplot(5, 1, 1)
plt.subplots_adjust(left=0.1, bottom=0.075, right=0.95, top=0.9,
                    wspace=0.2, hspace=0.5)

for i in range(nPure):
    gx[:, i] = (np.exp(-(x - x0[i]) ** 2 / (2 * sigma[i] ** 2))
                / np.sqrt(2 * np.pi * sigma[i] ** 2))
    plt.plot(gx[:, i])
    plt.title('Real Components')

# Build D from random normalised linear combinations of the Gaussians.
nspec = 200
D = np.zeros((len(x), nspec))
idx = list(range(nPure))

C_r = np.zeros((nspec, nPure))

for i in range(nspec):
    randj = np.zeros(nPure)
    random.shuffle(idx)
    for j in range(nPure):
        if j < nPure - 1:
            randj[j] = random.uniform(0, 1 - np.sum(randj))
        else:
            randj[j] = 1 - np.sum(randj)  # weights sum to exactly one
        D[:, i] = D[:, i] + (gx[:, idx[j]] * randj[j])
        C_r[i, j] = randj[j]

# Run SVD, then let the user pick the component count interactively.
eigens, explained_variance_ratio = svd.svd(D, nSVD)
nPure = int(input('Number of Principal Components for SIMPLISMA :'))
# NOTE(review): entering a value > 5 here makes the gx/x0 re-plot loops
# below index past the arrays built above -- confirm intended usage.

# Run SIMPLISMA without the constrained LCF.
plt.subplot(3, 1, 1)
plt.subplots_adjust(left=0.1, bottom=0.075, right=0.95, top=0.9,
                    wspace=0.2, hspace=0.5)
for i in range(nPure):
    gx[:, i] = (np.exp(-(x - x0[i]) ** 2 / (2 * sigma[i] ** 2))
                / np.sqrt(2 * np.pi * sigma[i] ** 2))
    plt.plot(gx[:, i])
    plt.title('Real Components')
S, C_u, _, _ = simplisma.pure(D, nPure, noise, False)

# Run SIMPLISMA with the constrained LCF.
plt.subplot(5, 1, 1)
plt.subplots_adjust(left=0.1, bottom=0.075, right=0.95, top=0.9,
                    wspace=0.2, hspace=0.5)
for i in range(nPure):
    gx[:, i] = (np.exp(-(x - x0[i]) ** 2 / (2 * sigma[i] ** 2))
                / np.sqrt(2 * np.pi * sigma[i] ** 2))
    plt.plot(gx[:, i])
    plt.title('Real Components')
S, C_u, C_c, LOF = simplisma.pure(D, nPure, noise, True)