initial push

Simon Klüttermann 2022-01-29 13:04:08 +01:00
commit 50638fbae0
79 changed files with 469 additions and 0 deletions

9
README Normal file
@@ -0,0 +1,9 @@
Instead of trying to find a single model that is perfect at finding anomalies, ensembles combine multiple (possibly weak) models into one.
To do this, we need an algorithm to combine the predictions of the different models. One way (that I commonly use) is to simply average them in some way (score=sqrt(score_1**2+score_2**2)). This only works well if you have a huge number of mostly uncorrelated models.
If you have only a few models, or correlated ones, you can introduce bias this way. Assume we have three models: an isolation forest (iforest), an SVM and a kNN algorithm. The iforest has low correlation to the other models (it finds different things anomalous than the SVM and the kNN do), but the SVM and the kNN find basically the same anomalies. If we just average all models, the SVM and the kNN have a much bigger influence on the result than the iforest, and there is no real reason why this should be the case.
To solve this, you can add models to the ensemble depending on the correlations between them. But instead of relying on the correlations that exist between the models themselves, this repository uses a special kind of neural network to find uncorrelated parts of the model predictions.
n2ulayer.py and mu.py define this special kind of neural network. loss.py defines the correlation measure we want to minimize, for use in tensorflow.
onemodel.py generates a quick (and fairly random) anomaly detection model for the data defined in data.py (just a 2d gaussian). 20 models are generated and their predictions (sorted from most normal (green) to most anomalous (red)) are drawn in the numbered images in imgs/.
If you use all 20 models and simply average them, this results in imgs/recombine.png. Notice how the green points are much more centered.
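As a minimal sketch of this quadratic-mean averaging (illustration only, not a file in this commit; the helper name average_scores is mine, main.py's combine() does the same thing after standardizing each score):

import numpy as np

def average_scores(scores):
    # scores: array of shape (n_models, n_samples); combine via sqrt(mean(score_i**2))
    scores = np.asarray(scores, dtype=float)
    return np.sqrt(np.mean(np.square(scores), axis=0))

# toy example: two nearly identical scores (SVM/kNN-like) dominate the disagreeing one (iforest-like),
# which is exactly the bias described above
s1 = np.array([0.1, 0.2, 0.9])
s2 = np.array([0.1, 0.3, 0.8])
s3 = np.array([0.9, 0.1, 0.1])
print(average_scores([s1, s2, s3]))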

BIN  before.png Normal file

54
choosenext.py Normal file
@@ -0,0 +1,54 @@
import numpy as np
from tensorflow import keras

from mu import *
from n2ulayer import ulayer
from loss import loss


def choosenext(given, possble):
    """given is a list of scores. possble is a list of lists of scores.
    Find the combination of the columns of possble that has the lowest correlation to given."""
    opt = len(possble)
    np.random.shuffle(possble)
    possble = np.transpose(possble)
    given = np.expand_dims(given, axis=1)
    #print("given", given.shape)
    #print("possble", possble.shape)
    #print(loss(given, possble, K=np))
    #exit()
    inp = keras.layers.Input(shape=possble.shape[1:])
    q = inp
    #q = ulayer(opt, 0, 1)(q)
    q = partr(q, 1, opt, ulayer)
    model = keras.models.Model(inputs=inp, outputs=q)
    model.compile(loss=loss, optimizer=keras.optimizers.Adam(learning_rate=0.001))
    model.summary()
    model.fit(possble, given,
              batch_size=32,
              epochs=100,
              verbose=1,
              validation_split=0.0,  # this cannot overfit
              shuffle=True,
              callbacks=[keras.callbacks.EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)])
    return model.predict(possble)


if __name__ == "__main__":
    f = np.load("merged.npz")
    x = f["ps"]
    given = x[0]
    possble = x[1:5]
    choosenext(given, possble)

20
data.py Normal file
@@ -0,0 +1,20 @@
import numpy as np


def data(n=1000):
    """
    Generate 2d gaussian data. Only a few points, so every model sees slightly different data;
    a big dataset is then used for evaluation. Basically subsampling instead of feature bagging.
    """
    return np.random.normal(1.0, 0.5, (n, 2))


if __name__ == '__main__':
    x = data()
    from plt import plt
    plt.plot(x[:, 0], x[:, 1], '.')
    plt.show()
    #print(x,y,z)

BIN  imgs/0.png Normal file
BIN  imgs/1.png Normal file
BIN  imgs/10.png Normal file
BIN  imgs/11.png Normal file
BIN  imgs/12.png Normal file
BIN  imgs/13.png Normal file
BIN  imgs/14.png Normal file
BIN  imgs/15.png Normal file
BIN  imgs/16.png Normal file
BIN  imgs/17.png Normal file
BIN  imgs/18.png Normal file
BIN  imgs/19.png Normal file
BIN  imgs/2.png Normal file
BIN  imgs/3.png Normal file
BIN  imgs/4.png Normal file
BIN  imgs/5.png Normal file
BIN  imgs/6.png Normal file
BIN  imgs/7.png Normal file
BIN  imgs/8.png Normal file
BIN  imgs/9.png Normal file
BIN  imgs/recombine.png Normal file

23
loss.py Normal file
@@ -0,0 +1,23 @@
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K


def loss(a, b, K=K):
    """The correlation between a and the first column of b should be zero.
    Correlations are hard to optimize directly, so use the covariance instead
    and rely on the layers preserving the metric (scale) of the scores."""
    if len(b.shape) > 1:
        b = b[:, 0]
    if len(a.shape) > 1:
        a = a[:, 0]
    return K.abs(K.mean((a - K.mean(a)) * (b - K.mean(b))))


if __name__ == '__main__':
    import numpy as np
    x = np.random.uniform(-1, 1, size=(1000, 2))
    # the K=np hook lets the same loss run on plain numpy arrays
    print(loss(x[:, 0], x[:, 1], K=np))
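For intuition (my illustration, not part of this commit): the loss is the absolute covariance between the target scores and the first output column. Since corr(a,b) = cov(a,b) / (std(a)*std(b)), and the rotation layers in n2ulayer.py keep the scale of the scores fixed, driving this covariance to zero also drives the correlation to zero.

import numpy as np

rng = np.random.default_rng(0)
a = rng.normal(size=1000)
b = 0.7 * a + 0.3 * rng.normal(size=1000)        # correlated with a

cov = np.mean((a - a.mean()) * (b - b.mean()))   # what loss() computes (up to the abs)
corr = cov / (a.std() * b.std())                 # Pearson correlation
print(cov, corr, np.corrcoef(a, b)[0, 1])        # corr agrees with np.corrcoef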

63
main.py Normal file
@@ -0,0 +1,63 @@
import numpy as np
from choosenext import choosenext
from plt import plt


def draw(p, dat):
    """Colour the points in dat from most normal (green) to most anomalous (red) according to the scores p."""
    mp = np.mean(p)
    d = np.square(p - mp)
    sx = [(xx, dd) for xx, dd in zip(dat, d)]
    sx.sort(key=lambda x: x[1])
    sx = [zw[0] for zw in sx]
    sx = np.array(sx)
    col1 = [1.0, 0.0, 0.0]
    col2 = [0.0, 1.0, 0.0]
    col1, col2 = np.array(col1), np.array(col2)
    ln = len(sx)
    cols = [col1 * (i / ln) + col2 * (1 - i / ln) for i in range(ln)]
    plt.scatter(sx[:, 0], sx[:, 1], c=cols)


def combine(a, b):
    """Standardize both scores and combine them via the quadratic mean."""
    a = 1 + (a - np.mean(a)) / np.std(a)
    b = 1 + (b - np.mean(b)) / np.std(b)
    toc = np.concatenate((np.expand_dims(a, axis=1), np.expand_dims(b, axis=1)), axis=1)
    toc = np.sqrt(np.mean(toc ** 2, axis=1))
    return toc


if __name__ == "__main__":
    f = np.load("merged.npz")
    dat = f["x"]
    x = f["ps"]
    np.random.shuffle(x)
    given = x[0]
    possble = x[1:5]
    ac = choosenext(given, possble)
    nextbest = ac[:, 0]
    remainder = ac[:, 1:]
    for row in np.transpose(ac):
        print(np.corrcoef(given, row)[0, 1])
    # as you can see, the correlation is lowest for the first output column,
    # so let's combine that one with the given scores
    updated = combine(given, nextbest)
    draw(given, dat)
    plt.savefig("before.png")
    plt.show()
    draw(nextbest, dat)
    plt.savefig("suggestion.png")
    plt.show()
    draw(updated, dat)
    plt.savefig("updated.png")
    plt.show()

BIN  merged.npz Normal file

36
mu.py Normal file
@@ -0,0 +1,36 @@
import numpy as np


def determu(q, dim, ulayer):
    """Apply a ulayer for every index pair (i, j): a full, deterministic rotation network."""
    for i in range(dim):
        for j in range(i + 1, dim):
            q = ulayer(dim, i, j)(q)
    return q


def determr(q, dim, ulayer):
    """Like determu, but apply the index pairs in random order."""
    dex = []
    for i in range(dim):
        for j in range(i + 1, dim):
            dex.append([i, j])
    np.random.shuffle(dex)
    for i, j in dex:
        q = ulayer(dim, i, j)(q)
    return q


def partu(q, pdim, dim, ulayer):
    """Only rotate the first pdim coordinates against the rest."""
    for i in range(pdim):
        for j in range(i + 1, dim):
            q = ulayer(dim, i, j)(q)
    return q


def partr(q, pdim, dim, ulayer):
    # this is often just an approximation (of a full rotation), but it only needs
    # about pdim*dim layers instead of dim*(dim-1)/2
    dex = []
    for i in range(pdim):
        for j in range(i + 1, dim):
            dex.append([i, j])
    np.random.shuffle(dex)
    for i, j in dex:
        q = ulayer(dim, i, j)(q)
    return q


def cutdown(q, pdim):
    """Keep only the first pdim coordinates."""
    return q[:, :pdim]
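A small usage sketch (mine, mirroring what choosenext.py does): partr(q, 1, dim, ulayer) only mixes the first coordinate with the others, so for dim score columns it stacks dim-1 rotation layers instead of the dim*(dim-1)/2 layers a full rotation (determu/determr) would use.

from tensorflow import keras
from n2ulayer import ulayer
from mu import partr

dim = 4                                  # e.g. four model score columns
inp = keras.layers.Input(shape=(dim,))
q = partr(inp, 1, dim, ulayer)           # pairs (0,1), (0,2), (0,3) in random order
model = keras.models.Model(inputs=inp, outputs=q)
model.summary()                          # three ulayer layers plus the input layer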

6
multimodel.py Normal file
@@ -0,0 +1,6 @@
import os

# yes, I know, not the best way to do this...
for i in range(20):
    os.system(f"python3 onemodel.py {i}")

72
n2ulayer.py Normal file
@@ -0,0 +1,72 @@
# use sin/cos to get better gradients (than nulayer)
# might have better gradients? (seems that way, but not sure yet)
# should rename it, but who cares
# now also able to export the given matrix
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K
from tensorflow import keras
import tensorflow as tf
import numpy as np


class ulayer(Layer):
    """A single trainable 2d rotation acting on coordinates dex1 and dex2 of a siz-dimensional input."""

    def __init__(self, siz, dex1, dex2, **kwargs):
        self.siz = siz
        self.dex1 = dex1
        self.dex2 = dex2
        super(ulayer, self).__init__(**kwargs)

    def build(self, input_shape):
        # Create a trainable weight (the rotation angle) for this layer.
        self.kernel = self.add_weight(name='kernel',
                                      shape=(1,),
                                      initializer=keras.initializers.RandomUniform(-0.5, 0.5),
                                      trainable=True)
        super(ulayer, self).build(input_shape)  # Be sure to call this at the end

    def numpify(self):
        # export the rotation as a plain numpy matrix
        mat = np.eye(self.siz)
        val = self.weights[0].numpy()[0]
        sin, cos = np.sin(val), np.cos(val)
        mat[self.dex1, self.dex2] = sin
        mat[self.dex2, self.dex1] = -sin
        mat[self.dex1, self.dex1] = cos
        mat[self.dex2, self.dex2] = cos
        return mat

    def call(self, x):
        kernel = self.kernel
        sin = K.sin(kernel)
        cos = K.cos(kernel)
        tan = sin / cos  # note: this diverges for cos -> 0
        rows = [tf.expand_dims(x[:, i], 1) for i in range(self.siz)]
        # instead of ((1,a),(-a,1)), I want this to be
        # ((1,a),(-a,1))/sqrt(1+a**2)
        # with trigonometry I get the same result by setting a=tan(kernel)
        # and multiplying both rows by cos(kernel) afterwards (so 1 -> cos(kernel))
        z1 = rows[self.dex2] * tan
        z2 = rows[self.dex1] * tan
        rows[self.dex1] += z1
        rows[self.dex2] -= z2
        rows[self.dex1] *= cos
        rows[self.dex2] *= cos
        rows = K.concatenate(rows, axis=1)
        return rows
        # old (unreachable) variant that built the matrix explicitly:
        # mat = tf.eye(self.siz)
        # tf.assign(mat[self.dex1, self.dex2], self.kernel)
        # mat[self.dex2, self.dex1] = -self.kernel
        # return K.dot(x, mat)

    def compute_output_shape(self, input_shape):
        return input_shape
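A quick sanity check (my sketch, not a file in this commit): each ulayer is just a 2d rotation, so its exported matrix is orthogonal and the layer preserves the length of every input row; this is the "metric keeping" property the covariance loss in loss.py relies on.

import numpy as np
import tensorflow as tf
from n2ulayer import ulayer

layer = ulayer(3, 0, 2)
x = tf.constant(np.random.normal(size=(5, 3)), dtype=tf.float32)
y = layer(x)                                             # builds the layer and applies the rotation

mat = layer.numpify()
print(np.allclose(mat @ mat.T, np.eye(3), atol=1e-6))    # orthogonal matrix
print(np.allclose(np.linalg.norm(x.numpy(), axis=1),
                  np.linalg.norm(y.numpy(), axis=1), atol=1e-5))  # row norms unchanged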

BIN  old/imgs/0.png Normal file
BIN  old/imgs/1.png Normal file
BIN  old/imgs/2.png Normal file
BIN  old/imgs/3.png Normal file
BIN  old/imgs/4.png Normal file
BIN  old/imgs/5.png Normal file
BIN  old/imgs/6.png Normal file
BIN  old/imgs/7.png Normal file
BIN  old/imgs/8.png Normal file
BIN  old/runs/0.npz Normal file
BIN  old/runs/1.npz Normal file
BIN  old/runs/2.npz Normal file
BIN  old/runs/3.npz Normal file
BIN  old/runs/4.npz Normal file
BIN  old/runs/5.npz Normal file
BIN  old/runs/6.npz Normal file
BIN  old/runs/7.npz Normal file
BIN  old/runs/8.npz Normal file

105
onemodel.py Normal file
@@ -0,0 +1,105 @@
import tensorflow as tf
from tensorflow import keras
import numpy as np
import os
import sys

from data import data

os.makedirs('./runs', exist_ok=True)
os.makedirs('./imgs', exist_ok=True)

dex = 0
if len(sys.argv) > 1:
    dex = int(sys.argv[1])

seed = np.random.randint(100000)
x = data(1000)              # training data: each model sees its own random sample
np.random.seed(12)
X = data(10000)             # fixed evaluation data, shared between all models
np.random.seed(seed)

inp = keras.layers.Input(shape=x.shape[1:])
q = inp
q = keras.layers.Dense(5, activation='relu')(q)
q = keras.layers.Dense(5, activation='relu')(q)
q = keras.layers.Dense(1, activation='linear')(q)
model = keras.models.Model(inputs=inp, outputs=q)
model.compile(optimizer='adam', loss='mse')
model.fit(x, np.ones(len(x)),
          epochs=500,
          batch_size=25,
          validation_split=0.2,
          verbose=1,
          shuffle=True,
          callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)])

# Evaluation phase
x = X
p = model.predict(x)
mp = np.mean(p)
d = (p - mp) ** 2
d = np.sqrt(np.mean(d, axis=-1))
np.savez_compressed(f"runs/{dex}", d=d, x=x, p=p, mp=mp)

sx = [(xx, dd) for xx, dd in zip(x, d)]
sx.sort(key=lambda x: x[1])
print(sx[0], sx[-1])
sx = [xx for xx, dd in sx]
sx = np.array(sx)

from plt import plt
col1 = [1.0, 0.0, 0.0]
col2 = [0.0, 1.0, 0.0]
col1, col2 = np.array(col1), np.array(col2)
ln = len(sx)
cols = [col1 * (i / ln) + col2 * (1 - i / ln) for i in range(ln)]
plt.scatter(sx[:, 0], sx[:, 1], c=cols)
plt.savefig(f"imgs/{dex}.png")
# plt.plot(sx[:,0],sx[:,1],'.')
plt.show()

77
recombine.py Normal file
@@ -0,0 +1,77 @@
import tensorflow as tf
from tensorflow import keras
import numpy as np
import os
import sys

from data import data

# load the predictions of every run and merge them into one file
fns = [f"runs/{zw}" for zw in os.listdir("runs")]
fs = [np.load(fn) for fn in fns if os.path.isfile(fn)]
x = fs[0]["x"]
ds = [f["d"] for f in fs]
ps = [f["p"][:, 0] for f in fs]
ds = np.array(ds)
ps = np.array(ps)

# simple ensemble: quadratic mean over all models
d = np.sqrt(np.mean(np.square(ds), axis=0))
np.savez_compressed("merged", x=x, ds=ds, ps=ps, d=d)
print(np.corrcoef(ps))

sx = [(xx, dd) for xx, dd in zip(x, d)]
sx.sort(key=lambda x: x[1])
sx = [xx for xx, dd in sx]
sx = np.array(sx)

from plt import plt
col1 = [1.0, 0.0, 0.0]
col2 = [0.0, 1.0, 0.0]
col1, col2 = np.array(col1), np.array(col2)
ln = len(sx)
cols = [col1 * (i / ln) + col2 * (1 - i / ln) for i in range(ln)]
plt.scatter(sx[:, 0], sx[:, 1], c=cols)
plt.savefig("imgs/recombine.png")
# plt.plot(sx[:,0],sx[:,1],'.')
plt.show()

4
requirements.txt Normal file
@@ -0,0 +1,4 @@
numpy
tensorflow
keras
matplotlib

BIN  runs/0.npz Normal file
BIN  runs/1.npz Normal file
BIN  runs/10.npz Normal file
BIN  runs/11.npz Normal file
BIN  runs/12.npz Normal file
BIN  runs/13.npz Normal file
BIN  runs/14.npz Normal file
BIN  runs/15.npz Normal file
BIN  runs/16.npz Normal file
BIN  runs/17.npz Normal file
BIN  runs/18.npz Normal file
BIN  runs/19.npz Normal file
BIN  runs/2.npz Normal file
BIN  runs/3.npz Normal file
BIN  runs/4.npz Normal file
BIN  runs/5.npz Normal file
BIN  runs/6.npz Normal file
BIN  runs/7.npz Normal file
BIN  runs/8.npz Normal file
BIN  runs/9.npz Normal file

BIN  suggestion.png Normal file
BIN  updated.png Normal file