From 97b93cd0efc3a4c7641a551a27725677f552afc1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20Kl=C3=BCttermann?=
Date: Thu, 30 Dec 2021 09:23:31 +0100
Subject: [PATCH] initial push

---
 README                              |  15 ++++++
 __pycache__/data.cpython-39.pyc     | Bin 0 -> 665 bytes
 __pycache__/data2.cpython-39.pyc    | Bin 0 -> 666 bytes
 __pycache__/loss.cpython-39.pyc     | Bin 0 -> 1695 bytes
 __pycache__/mu.cpython-39.pyc       | Bin 0 -> 1157 bytes
 __pycache__/n2ulayer.cpython-39.pyc | Bin 0 -> 2106 bytes
 data.py                             |  62 +++++++++++++++++++++++
 loss.py                             |  50 +++++++++++++++++++
 main.py                             |  75 ++++++++++++++++++++++++++++
 mu.py                               |  35 +++++++++++++
 n2ulayer.py                         |  72 ++++++++++++++++++++++++++
 requirements.txt                    |   4 ++
 show.py                             |  23 +++++++++
 13 files changed, 336 insertions(+)
 create mode 100644 README
 create mode 100644 __pycache__/data.cpython-39.pyc
 create mode 100644 __pycache__/data2.cpython-39.pyc
 create mode 100644 __pycache__/loss.cpython-39.pyc
 create mode 100644 __pycache__/mu.cpython-39.pyc
 create mode 100644 __pycache__/n2ulayer.cpython-39.pyc
 create mode 100644 data.py
 create mode 100644 loss.py
 create mode 100644 main.py
 create mode 100644 mu.py
 create mode 100644 n2ulayer.py
 create mode 100644 requirements.txt
 create mode 100644 show.py

diff --git a/README b/README
new file mode 100644
index 0000000..cfb5e10
--- /dev/null
+++ b/README
@@ -0,0 +1,15 @@
+Usual feature selection tries to find the most important features of a given dataset. This is often done to make a downstream machine learning task easier.
+This repository (and the connected thesis) tries to extend the space of selectable features from just the given features to linear combinations of features. This could allow every existing machine learning pipeline relying on feature selection to work better and make lots of data more interpretable.
+
+To do this, it uses a special tensorflow layer (implemented in n2ulayer.py and applied in mu.py) which only allows for special linear combinations: rotations in n-dimensional space, as these do not allow the network to lose information or to create artificial patterns that could be picked up by the loss.
+This loss (defined in loss.py) is minimal for a high contrast between features. It currently only works with 2-dimensional output data, but it is (mostly) differentiable, making it possible to use both it and this kind of rotation network (in main.py) to combine the features of a toy dataset (defined in data.py) in a highly contrasting manner.
+data.py contains a hidden feature that should become visible by comparing x to y-z. But as you can see from multiple runs of main.py, there are also other relations (for example between y and z, as y contains some z) and other combinations of features to be found. show.py plots the three toy features x, y, z against each other, with the designated feature x vs y-z plotted too for reference.
+
+As you might see, the basic idea works, but there are some caveats:
+- The loss is not monotonic over the course of training. Often the final loss is even higher than 1 (the expectation for random data); right now this is only handled by restoring the best weights. It might only be truly solved by a better loss function or a different optimizer.
+- This is important, as it is not clear whether the achieved value is actually the minimum.
+- The loss is implemented only for 2-dimensional output data. For a good feature selection algorithm it would be important to extend this.
+- It is also not clear how well this scales to higher dimensions. The number of layers is proportional to input_dim*output_dim, but it is unclear how well the algorithm converges with a high number of layers.
+- You might be able to solve a restricted version of this algorithm analytically and extend this to a greedy algorithm (but only if this is something you're interested in).
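+
+To try it out (a minimal sketch, assuming the packages from requirements.txt are available):
+    pip install -r requirements.txt
+    python main.py
+main.py prints the learned 2x3 combination matrix and plots the two combined features against each other; python show.py plots the raw feature pairings for comparison.
+
+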
diff --git a/__pycache__/data.cpython-39.pyc b/__pycache__/data.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5f608b51f0c089e35e42101e023802819113388d
Binary files /dev/null and b/__pycache__/data.cpython-39.pyc differ
diff --git a/__pycache__/data2.cpython-39.pyc b/__pycache__/data2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bb1d189537a1cb8ae8042ed2ad9e21baac60aa0d
Binary files /dev/null and b/__pycache__/data2.cpython-39.pyc differ
diff --git a/__pycache__/loss.cpython-39.pyc b/__pycache__/loss.cpython-39.pyc
new file mode 100644
Binary files /dev/null and b/__pycache__/loss.cpython-39.pyc differ
diff --git a/__pycache__/mu.cpython-39.pyc b/__pycache__/mu.cpython-39.pyc
new file mode 100644
Binary files /dev/null and b/__pycache__/mu.cpython-39.pyc differ
diff --git a/__pycache__/n2ulayer.cpython-39.pyc b/__pycache__/n2ulayer.cpython-39.pyc
new file mode 100644
Binary files /dev/null and b/__pycache__/n2ulayer.cpython-39.pyc differ
diff --git a/data.py b/data.py
new file mode 100644
index 0000000..79034ff
--- /dev/null
+++ b/data.py
@@ -0,0 +1,62 @@
+import numpy as np
+
+#obvious solution (rows approximately x and y-z, the intended hidden feature;
+#a nicer version of:)
+#[[ 0.95911082  0.27747321  0.0558132 ]
+# [-0.14426784  0.64893967 -0.74703693]]
+#(maybe)
+#[[ 0.97660659  0.20377678  0.06866321]
+# [-0.09744323  0.70403038 -0.70345294]]
+#saw one (but did not copy it) that looked like
+#[[ 1.0, 0.0, 0.0],
+# [ 0.0, 0.7,-0.7]]
+#but there is a secondary solution
+#[[ 0.34055104  0.35392664 -0.87106881]
+# [-0.64936382  0.75853476  0.0543288 ]]
+#in terms of the generating features x, x**2+y+z, z this translates into
+#a=x+x**2+y+z-2.5*z
+#b=x**2+y+z-(13/15)*x
+#to find the relation a(b), consider a-const*b=const:
+#x-(13/15)*x
+#+(x**2+y+z)-(x**2+y+z)
+#-2.5*z
+#=2x/15-2.5z
+#not great, but...
+
+#tertiary solution
+#[[ 0.86908816  0.3797459  -0.31698411]
+# [-0.49261647  0.60629139 -0.62429142]]
+#which looks like a mirrored version of
+#[[ 0.93229881 -0.19299482  0.30589537]
+# [ 0.3481528   0.70805987 -0.61436217]]
+#which of course makes some sense
+
+
+def data(n=1000):
+    """
+    Generate 3d data with a hidden relation b~a**2. Only the mixed features
+    x=a, y=b+c, z=c are returned, so the relation becomes visible in x vs y-z.
+    """
+    a=np.random.uniform(-1.0,1.0,n)
+    b=a**2+np.random.uniform(-0.2,0.2,n)
+    c=np.random.uniform(-1.0,1.0,n)
+
+    x=a
+    y=b+c
+    z=c
+
+
+    return x,y,z
+
+
+
+if __name__ == '__main__':
+    x,y,z=data()
+
+    import matplotlib.pyplot as plt
+
+    plt.plot(x,y,'.')
+    plt.show()
+    #print(x,y,z)
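+    #per the README, the hidden feature should appear when comparing x to y-z
+    #(y-z removes c and recovers b=a**2+noise); an extra quick look:
+    plt.plot(x,y-z,'.')
+    plt.show()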
diff --git a/loss.py b/loss.py
new file mode 100644
index 0000000..b606d59
--- /dev/null
+++ b/loss.py
@@ -0,0 +1,50 @@
+import tensorflow as tf
+from tensorflow import keras
+from tensorflow.keras import backend as K
+
+
+#K and tf are passed as default arguments so that numpy can be substituted for
+#both (see numpyloss2d below)
+def running_mean(x,n=100,K=K,tf=tf):
+    """
+    Calculate the running mean of an array: (cumsum[i+n]-cumsum[i])/n is the
+    mean of a window of n consecutive entries.
+    """
+    cumsum = tf.cumsum(x)
+    return (cumsum[n:] - cumsum[:-n]) / float(n)
+def running_variance(x,n=100,K=K,tf=tf):
+    """
+    Calculate the running variance of an array as E[x**2]-E[x]**2.
+    """
+    return running_mean(x**2,n=n,K=K,tf=tf)-running_mean(x,n=n,K=K,tf=tf)**2
+def running_std(x,n=100,K=K,tf=tf):
+    """
+    Calculate the running standard deviation of an array.
+    """
+    return K.sqrt(running_variance(x,n=n,K=K,tf=tf))
+
+def loss2d(a,b,n=25,K=K,tf=tf):
+    #keras losses are called as loss(y_true,y_pred); only the prediction b is used
+    q=b
+    x,y=q[:,0],q[:,1]
+    #sort by the first output feature, then measure how much the second one
+    #varies inside windows of n neighbours
+    dex=tf.argsort(x)
+    yy=tf.gather(y,dex)
+
+    ss=running_std(yy,n=n)
+    s=K.std(yy)
+
+    #small when y is locally almost a function of x, about 1 for independent features
+    return K.mean(ss)/s
+
+def numpyloss2d(a,b,n=25):
+    import numpy as np
+    q=np.concatenate((np.expand_dims(a,1),np.expand_dims(b,1)),axis=1)
+    #give numpy a gather function so the tensorflow code path works unchanged
+    np.gather=np.take
+    return loss2d(q,q,n=n,K=np,tf=np)
+
+
+
+if __name__=='__main__':
+    import numpy as np
+    #for independent uniform features the loss should be close to 1
+    x=np.random.uniform(-1,1,size=(1000,2))
+    print(numpyloss2d(x[:,0],x[:,1],n=25))
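+    #an extra sketch (not in the original): on data with a sharp relation the
+    #loss should drop well below 1, since after sorting by the first feature
+    #the second one is locally almost constant
+    a=np.random.uniform(-1,1,1000)
+    b=a**2+np.random.uniform(-0.1,0.1,1000)
+    print(numpyloss2d(a,b,n=25))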
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..df34065
--- /dev/null
+++ b/main.py
@@ -0,0 +1,75 @@
+from data import data
+import numpy as np
+
+x,y,z=data()
+#stack the three features into one (n,3) array
+x=np.concatenate([np.expand_dims(zw,1) for zw in [x,y,z]],axis=1)
+
+
+from tensorflow import keras
+from mu import *
+from n2ulayer import ulayer
+
+from loss import loss2d
+
+dim=int(x.shape[1])
+pdim=2
+
+inp=keras.layers.Input(x.shape[1:])
+q=inp
+
+#randomly ordered rotation layers, then keep only the first pdim features
+q=partr(q,pdim,dim,ulayer)
+q=cutdown(q,pdim)
+
+model=keras.models.Model(inp,q)
+
+model.summary()
+
+#opt=keras.optimizers.Adam(learning_rate=0.0001)
+#opt=keras.optimizers.Adam(learning_rate=0.001)
+opt=keras.optimizers.Adam(learning_rate=0.01)
+
+#the loss ignores the targets, but keras expects some, so x is passed twice
+model.compile(opt,loss=loss2d)
+
+model.fit(x,x,
+          epochs=10000,
+          shuffle=False,
+          validation_split=0.2,
+          callbacks=[keras.callbacks.EarlyStopping(patience=250,monitor="loss",restore_best_weights=True)])
+
+
+#multiply the per-layer rotation matrices together to read off the learned
+#linear combination
+mats=[]
+for lay in model.layers[1:]:
+    if not ("ulayer" in str(type(lay))):continue
+    mats.append(lay.numpify())
+
+mat=None
+for m in mats:
+    if mat is None:
+        mat=m
+    else:
+        mat=np.dot(m,mat)
+
+#only the first pdim rows survive the cutdown
+mat=mat[:pdim]
+
+print(mat)
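+#extra sanity check (an addition, not part of the original flow): the chained
+#rotations form an orthogonal matrix, so after cutting to the first pdim rows,
+#mat.dot(mat.T) should be close to the 2x2 identity
+print(np.dot(mat,mat.T))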
+
+
+
+
+#evaluate and plot on the training part (the first 800 of 1000 samples;
+#validation_split=0.2 takes the last 20 percent)
+loss=model.evaluate(x[:800],x[:800])
+print(loss)
+p=model.predict(x[:800])
+import matplotlib.pyplot as plt
+
+plt.plot(p[:,0],p[:,1],".",alpha=0.75)
+plt.title(str(loss))
+plt.show()
+
+
+
+
diff --git a/mu.py b/mu.py
new file mode 100644
index 0000000..6bdaf3c
--- /dev/null
+++ b/mu.py
@@ -0,0 +1,35 @@
+import numpy as np
+
+
+def determu(q,dim,ulayer):
+    """Apply one rotation layer for every index pair (i,j) with i<j, in a fixed order."""
+    for i in range(dim):
+        for j in range(i+1,dim):
+            q=ulayer(dim,i,j)(q)
+    return q
+def determr(q,dim,ulayer):
+    """Like determu, but the index pairs are applied in random order."""
+    dex=[]
+    for i in range(dim):
+        for j in range(i+1,dim):
+            dex.append([i,j])
+    np.random.shuffle(dex)
+    for i,j in dex:
+        q=ulayer(dim,i,j)(q)
+    return q
+def partu(q,pdim,dim,ulayer):
+    """Only rotate pairs that touch the first pdim dimensions (the ones kept by cutdown)."""
+    for i in range(pdim):
+        for j in range(i+1,dim):
+            q=ulayer(dim,i,j)(q)
+    return q
+def partr(q,pdim,dim,ulayer):
+    """Like partu, but the index pairs are applied in random order."""
+    dex=[]
+    for i in range(pdim):
+        for j in range(i+1,dim):
+            dex.append([i,j])
+    np.random.shuffle(dex)
+    for i,j in dex:
+        q=ulayer(dim,i,j)(q)
+    return q
+def cutdown(q,pdim):
+    """Keep only the first pdim output features."""
+    return q[:,:pdim]
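+
+#layer-count note (matching the README's scaling claim): partu/partr create one
+#layer per index pair (i,j) with i<pdim and i<j<dim, i.e.
+#pdim*dim - pdim*(pdim+1)/2 layers, while determu/determr use all dim*(dim-1)/2 pairs
+if __name__=='__main__':
+    #tiny sketch: the toy setup in main.py (dim=3, pdim=2) needs 3 rotation layers
+    dim,pdim=3,2
+    print(pdim*dim-pdim*(pdim+1)//2)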
diff --git a/n2ulayer.py b/n2ulayer.py
new file mode 100644
index 0000000..5724809
--- /dev/null
+++ b/n2ulayer.py
@@ -0,0 +1,72 @@
+#uses sin/cos to get better gradients than the earlier nulayer
+#(it seems that way, but this is not certain yet)
+#should be renamed at some point
+#this version is also able to export the matrix it applies (see numpify)
+from tensorflow.keras.layers import Layer
+from tensorflow.keras import backend as K
+from tensorflow import keras
+import tensorflow as tf
+
+import numpy as np
+
+
+
+class ulayer(Layer):
+    """A single trainable rotation in the plane spanned by dimensions dex1 and dex2."""
+    def __init__(self,siz,dex1,dex2, **kwargs):
+        self.siz = siz
+        self.dex1 = dex1
+        self.dex2 = dex2
+        super(ulayer, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        # Create a trainable weight variable for this layer: the rotation angle.
+        self.kernel = self.add_weight(name='kernel',
+                                      shape=(1,),
+                                      initializer=keras.initializers.RandomUniform(-0.5, 0.5),
+                                      trainable=True)
+        super(ulayer, self).build(input_shape)  # Be sure to call this at the end
+
+    def numpify(self):
+        #export the rotation matrix mat; the layer computes x -> x.dot(mat.T),
+        #so main.py can chain layers by multiplying these matrices from the left
+        mat=np.eye(self.siz)
+        val=self.weights[0].numpy()[0]
+        sin,cos=np.sin(val),np.cos(val)
+        mat[self.dex1,self.dex2]=sin
+        mat[self.dex2,self.dex1]=-sin
+        mat[self.dex1,self.dex1]=cos
+        mat[self.dex2,self.dex2]=cos
+        return mat
+
+    def call(self, x):
+        kernel=self.kernel
+        sin=K.sin(kernel)
+        cos=K.cos(kernel)
+        tan=sin/cos #diverges at +-pi/2, but the angles are initialized in (-0.5,0.5)
+        rows=[tf.expand_dims(x[:,i],1) for i in range(self.siz)]
+        #instead of ((1,a),(-a,1)) the layer should apply the proper rotation
+        #((1,a),(-a,1))/sqrt(1+a**2); with a=tan(kernel) and an overall factor
+        #of cos(kernel) this becomes ((cos,sin),(-sin,cos))
+        z1=rows[self.dex2]*tan
+        z2=rows[self.dex1]*tan
+        rows[self.dex1]+=z1
+        rows[self.dex2]-=z2
+        rows[self.dex1]*=cos
+        rows[self.dex2]*=cos
+        rows=K.concatenate(rows,axis=1)
+        return rows
+
+    def compute_output_shape(self, input_shape):
+        return input_shape
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..5816112
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+numpy
+tensorflow
+keras
+matplotlib
diff --git a/show.py b/show.py
new file mode 100644
index 0000000..bc756dc
--- /dev/null
+++ b/show.py
@@ -0,0 +1,23 @@
+import matplotlib.pyplot as plt
+
+
+#data2.py is not part of this patch; data.py provides the same toy features
+from data import data
+
+from loss import numpyloss2d
+
+x,y,z=data()
+
+def plotone(x,y,show=True):
+    #plt.title(str(numpyloss2d(x,y)))#still buggy, see loss.py
+    plt.plot(x,y,"o",alpha=0.75)
+    if show:plt.show()
+
+#the three raw feature pairings...
+plotone(x,y)
+plotone(x,z)
+plotone(y,z)
+#...and the designated hidden feature for reference
+print("desired")
+plotone(x,y-z)
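+
+#a hedged addition: also print the loss for each pairing; numpyloss2d reuses the
+#tensorflow code with numpy substituted (see loss.py) and can return nan when the
+#running variance goes slightly negative through floating point cancellation
+for name,(a,b) in [("x,y",(x,y)),("x,z",(x,z)),("y,z",(y,z)),("x,y-z",(x,y-z))]:
+    print(name,numpyloss2d(a,b))
+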