initial push

Simon Klüttermann 2021-10-02 18:51:26 +02:00
commit 36f11cf818
10 changed files with 181586 additions and 0 deletions

23
README Normal file

@@ -0,0 +1,23 @@
This git repository contains a comparison of two algorithms for anomaly detection on a (not completely randomly chosen) dataset "thyroid" from here: http://odds.cs.stonybrook.edu/
Alongside a classical autoencoder (ae.py) you will find the algorithm in question (traf.py), which is explained further down.
As is usual in anomaly detection, both algorithms are given a set of normal data points and try to learn some structure from them. Afterwards they are evaluated on some known anomalies (and some known normal points) by calculating a score known as the AUC. All that matters here is that a higher AUC is a better AUC (and maybe that an AUC of about 0.5 corresponds to randomly guessing whether an event is normal or anomalous), but you can find more information here: https://en.wikipedia.org/wiki/Receiver_operating_characteristic
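For reference, both scripts compute this score roughly like the following sketch (mirroring the cauc function in ae.py and traf.py; scores are "higher = more anomalous"):

import numpy as np
from sklearn.metrics import roc_curve, auc

def cauc(n, a):  # n: scores of the normal points, a: scores of the anomalies
    p = np.concatenate((n, a))
    l = np.concatenate((np.zeros(len(n)), np.ones(len(a))))
    fpr, tpr, _ = roc_curve(l, p)
    return auc(fpr, tpr)

# anomalies scoring higher than normals -> AUC clearly above 0.5
print(cauc(np.random.normal(0, 1, 100), np.random.normal(2, 1, 100)))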
This score is printed after running each script. It is subject to statistical fluctuations, but the best score I could achieve with the autoencoder was about 0.7, while my proposed algorithm always reached an AUC above 0.9. And even though this improvement probably does not hold for every dataset, the algorithm still has its uses, as it has far fewer parameters and hyperparameters while being much easier to interpret (every parameter is contained in a single 6x6 matrix (since every data point is 6-dimensional), instead of in a complicated neural network).
To explain the algorithms:
Autoencoders are common enough that somebody else has already explained them better than I ever could. Here is a random article that I just googled: https://towardsdatascience.com/anomaly-detection-with-autoencoder-b4cdce4866a6
Assuming you understand them, my (still unnamed) algorithm can be seen as a simplification of them.
Instead of multiple layers, it uses just one. This removes the encoder part of the autoencoder and creates the property that makes this algorithm most interesting to me: finding a matrix that maps x to x should be trivial (x=1*x), and a trivial matrix would not be able to differentiate between normal and anomalous events at all. This is basically what happens if you use tensorflow to optimize this matrix: even though the AUC score is not random (0.5) here either, it is still much lower and represents a much less decisive anomaly detection.
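To see why the trivial solution is useless, consider this standalone sketch (not part of the repository code):

import numpy as np

dim = 6
M = np.eye(dim)  # the trivial solution: M maps every x to x
normal = np.random.normal(0, 1, (100, dim))
anomalies = np.random.normal(5, 1, (10, dim))
# the reconstruction error is exactly zero for every point, normal or not,
# so the resulting scores carry no information and the AUC is 0.5
print(np.abs(normal @ M - normal).max())        # 0.0
print(np.abs(anomalies @ M - anomalies).max())  # 0.0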
The interesting part comes when you use another optimizer, namely evolutionary optimisation. I've been interested in evolutionary optimisation for a while, and when I noticed that this problem should be quite well suited to it (since it is only one relatively small matrix), I used the opportunity to try out a new optimizer that I had been thinking about (this is what most of the other .py files in this repository implement). Suddenly this anomaly detection algorithm becomes good (good in the sense that it is able to differentiate well between normal and anomalous events; it is much slower (I simply stop traf.py after 10 minutes of optimisation) and scales terribly to higher dimensions (without thinking much about it, I would guess with n**4)). A minimal sketch of the idea follows below.
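For intuition, here is a minimal sketch of evolutionary optimisation of such a matrix (plain mutation plus selection on random data; the actual optimizer in traf.py uses a 2d ising-model grid instead, see ising2d.py):

import numpy as np

def loss(M, data):  # mean |x*(M-1)|, the same loss that traf.py minimizes
    return np.mean(np.abs(data @ (M - 1)))

data = np.random.normal(0, 1, (200, 6))
M = np.random.normal(0, 1, (6, 6))
for step in range(10000):
    child = M.copy()
    i, j = np.random.randint(0, 6, 2)
    child[i, j] += np.random.normal(0, 0.1)  # mutate a single entry
    if loss(child, data) < loss(M, data):    # keep whichever matrix is better
        M = child
print(loss(M, data))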
And I think that is super interesting: this algorithm should not work at all, but for some reason it is actually quite good.
The task for a thesis would be to try to understand why this works, and then maybe to improve its speed, its scaling and/or its quality.
If you have any questions, please feel free to write an email to Simon.Kluettermann@cs.tu-dortmund.de

68
acc2d.py Normal file

@@ -0,0 +1,68 @@
"""A couple of functions to tread an 1d array as a 2d one. Used by ising2d"""
def dex(i1,i2,m1,m2):#flat index of cell (i1,i2) in an m1 x m2 grid
return i1*m2+i2
def nextto8(i1,i2,m1,m2):#the 8 surrounding cells (Moore neighbourhood), periodic boundaries
p1=(i1+1)%m1
p2=(i2+1)%m2
s1=(i1-1+m1)%m1
s2=(i2-1+m2)%m2
ret=[]
for d1 in [s1,i1,p1]:
for d2 in [s2,i2,p2]:
if d1==i1 and d2==i2:continue
ret.append(dex(d1,d2,m1,m2))
return ret
nextto=nextto8
def nextto9(i1,i2,m1,m2):#like nextto8, but includes the cell itself
p1=(i1+1)%m1
p2=(i2+1)%m2
s1=(i1-1+m1)%m1
s2=(i2-1+m2)%m2
ret=[]
for d1 in [s1,i1,p1]:
for d2 in [s2,i2,p2]:
ret.append(dex(d1,d2,m1,m2))
return ret
def nextto4(i1,i2,m1,m2):#the 4 axis-aligned neighbours (von Neumann neighbourhood)
return [
dex((i1+1)%m1,i2,m1,m2),
dex((i1-1+m1)%m1,i2,m1,m2),
dex(i1,(i2+1)%m2,m1,m2),
dex(i1,(i2-1+m2)%m2,m1,m2)
]
def nextto5(i1,i2,m1,m2):#like nextto4, but includes the cell itself
return [
dex(i1,i2,m1,m2),
dex((i1+1)%m1,i2,m1,m2),
dex((i1-1+m1)%m1,i2,m1,m2),
dex(i1,(i2+1)%m2,m1,m2),
dex(i1,(i2-1+m2)%m2,m1,m2)
]
def range2d(m1,m2):#iterate over all cells, yielding (flat index,i1,i2)
for i1 in range(m1):
for i2 in range(m2):
yield dex(i1,i2,m1,m2),i1,i2
def gt(q,i1,i2,m1,m2):#get the value at cell (i1,i2)
assert len(q)==m1*m2
return q[dex(i1,i2,m1,m2)]
def st(q,val,i1,i2,m1,m2):#set the value at cell (i1,i2)
assert len(q)==m1*m2
q[dex(i1,i2,m1,m2)]=val
return q
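A quick usage sketch (assuming the file is importable as acc2d):

from acc2d import dex, nextto4, gt, st

# a 3x4 grid stored as a flat list of length 12
q = [0]*12
q = st(q, 7, 1, 2, 3, 4)    # set cell (1,2) to 7
print(gt(q, 1, 2, 3, 4))    # 7
print(dex(1, 2, 3, 4))      # 6 == 1*4+2
print(nextto4(1, 2, 3, 4))  # flat indices of the 4 neighbours of (1,2)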

116
ae.py Normal file

@@ -0,0 +1,116 @@
# TensorFlow and tf.keras
#import tensorflow as tf
import keras
# Helper libraries
import numpy as np
import matplotlib.pyplot as plt
import math
import time
from data import loadfile
from sklearn.metrics import roc_curve, auc
latent=5
def cauc(n,a):
p=np.concatenate((n,a),axis=0)
l=np.concatenate((np.zeros(len(n)),np.ones(len(a))),axis=0)
fpr, tpr, threshold = roc_curve(l, p)
auc_score=auc(fpr,tpr)
return auc_score
def train(x,y,pth,typ,retry=0):
x=x.astype("float32")
y=y.astype("float32")
#x-=np.mean(x,axis=0)#no correcting here
#x/=(np.std(x,axis=0)+0.00001)
train=x[np.where(y[:,0]==0)]
test_a=x[np.where(y[:,0]==1)]
test_n=train[:len(test_a)]
train=train[len(test_a):]
#global dd
#dd=train
#exit()
dim=x.shape[1]
#denses=[0.8,0.6,0.5,0.6,0.8,]
#denses=[int(math.ceil(zw*dim)) for zw in denses]
    #hidden layer sizes interpolate geometrically between dim and latent (dim=6, latent=5 gives d1=d2=5)
    d1=(latent*dim*dim)**(1/3)
    d2=(latent*latent*dim)**(1/3)
    d1=int(d1)
    d2=int(d2)
denses=[d1,d2,latent,d2,d1]
#print(train.shape)
model = keras.Sequential([
keras.layers.Dense(dim,activation='relu',input_shape=train.shape[1:]),
*[keras.layers.Dense(dense, activation='relu') for dense in denses],
keras.layers.Dense(dim,activation="linear")
])
model.compile(optimizer='adam',
loss="mse",
metrics=[])
model.summary()
model.fit(train,
train,
epochs=25,
validation_split=0.2,
callbacks=[
#keras.callbacks.CSVLogger(pth+"history.csv"),
#keras.callbacks.ModelCheckpoint(pth+"epoch{epoch:02d}.tf",
# save_weights_only=False)
])
#model.save(pth+"whole.tf")
    #model.save_weights(pth+"weights.tf")
lss = model.evaluate(train,train, verbose=2)
#if lss>500 and retry<3:
# print("retrying")
# return train_func(x,y,pth,typ,retry=retry+1)
p=model.predict(train)
pn=model.predict(test_n)
pa=model.predict(test_a)
    mp=np.mean(p,axis=0)#mean reconstruction of the training data; the anomaly score below is the squared deviation from it
dn=np.mean((pn-mp)**2,axis=1)
da=np.mean((pa-mp)**2,axis=1)
auc=cauc(dn,da)
print(f"auc={auc}")
if __name__ == '__main__':
x,y=loadfile()
train(x,y,".","")

53
data.py Normal file

@@ -0,0 +1,53 @@
import os
import scipy.io as sio
import numpy as np
import h5py
def remending(q):
return q.replace(".mat","").replace(".npz","")
def listfiles():
for zw in os.listdir("adata"):
fn="adata/"+zw
yield remending(zw),fn
def loadfile73(fn):
f = h5py.File(fn,'r')
x = f.get('X')
x = np.array(x) # For converting to a NumPy array
y = f.get('y')
y = np.array(y) # For converting to a NumPy array
x=np.transpose(x)
y=np.transpose(y)
return x,y
def loadfile(fn="thyroid.mat"):
if ".npz" in fn:
f=np.load(fn)
return f["x"],f["y"]
try:
mat=sio.loadmat(fn)
return mat["X"],mat["y"]
    except Exception:#probably a matlab v7.3 file, which sio.loadmat cannot read
return loadfile73(fn)
def loadfiles():
for f,fn in listfiles():
yield f,*loadfile(fn)
def filterfiles():
for f,x,y in loadfiles():
cou=np.sum(y)
if cou<10:continue
yield f,x,y
if __name__ == '__main__':
if False:
for f,x,y in filterfiles():
#print(f,x.shape[1],y.shape[0],np.mean(y))
les=[len(set(x[:,i]))/len(x) for i in range(x.shape[1])]
lestr=f"{np.mean(les)}+-{np.std(les)}"
print(f'"{f}":{int(x.shape[1]/2)+1},{lestr}')
for f,x,y in filterfiles():
if "wbc" in f:break

90
ising2d.py Normal file

@@ -0,0 +1,90 @@
from xevo.evo import evo
import numpy as np
import time
from acc2d import st,gt,dex,nextto,range2d
from acc2d import nextto5,nextto4,nextto9,nextto8
from numpy import mean,argmax
class ising2devo(evo):
"""2d ising optimizer with periodic boundary conditions
"""
def __init__(s,m1=4,m2=5,temp=1.0,garant=0.0,mergews=0.0,mean_func="mean",close="o"):
"""m1, m2: sizes of the fields. Have to match the population size. Standart values multiply to standart values for quickmut
temp: Temperature of the ising model, higher temperature->can also switch to worse result, not scale independent
garant: difference above this->will be updated definitely. Depends on temperature
mergews: if above zero: can add together elemts instead of replacing them (not implemented atm)
mean_func: average function in the ising loss (average(neighbourhood)-mine), understands mean,max,min or callable
close: type of neighbourhood, understands o (8 around) c (checkerboard, 4 around), 9 (o+ self), + (4+self) or callable
"""
s.initial()
s.m1=m1
s.m2=m2
s.temp=temp
s.garant=garant
s.mergews=mergews
if mean_func=="mean":mean_func=mean
if mean_func=="max":mean_func=max
if mean_func=="min":mean_func=min
if close=="o":close=nextto8
if close=="c":close=nextto4
if close=="9":close=nextto9
if close=="+":close=nextto5
s.mean_func=mean_func
s.close=close
def generation(s)->None:
#s.q.sort(key=lambda x:x.ortho()[s.dex])
nq=[zw for zw in s.q]
lsq=len(s.q)
sm=s.q[0].shallmaximize()
for i,i1,i2 in range2d(s.m1,s.m2):
acs=s.q[i].strength()
ic=s.close(i1,i2,s.m1,s.m2)
acc=[s.q[ii].strength() for ii in ic]
            if not sm:#negate the strengths so the code below can always assume maximization
acs*=-1
acc=[zw*-1 for zw in acc]
delta=(s.mean_func(acc)-acs)#the higher, the more probable switch
if np.random.random()<np.exp((delta-s.garant)/s.temp):
dex=argmax(acc)
if acc[dex]<acs or ic[dex]==i:
nq[i]=s.q[i].copy()
continue
if s.mergews>0.0 and np.random.random()<s.mergews:
nq[i]=s.q[ic[dex]]+s.q[i]
else:
nq[i]=s.q[ic[dex]].mutate()
s.q=nq
def _copy(s):
        return ising2devo(m1=s.m1,m2=s.m2,temp=s.temp,garant=s.garant,mergews=s.mergews,mean_func=s.mean_func,close=s.close)
def to_array(s,odex=0):
ret=np.zeros((s.m1,s.m2))
for i1 in range(s.m1):
for i2 in range(s.m2):
ret[i1,i2]=s.q[dex(i1,i2,s.m1,s.m2)].ortho()[odex]
return ret
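A standalone sketch of the update rule in generation(), for a single cell, assuming minimization (shallmaximize()==False); the merge branch and the acc[dex]<acs guard are omitted:

import numpy as np

strength_mine = 0.8                   # loss of the current cell
strength_nbrs = [0.5, 0.9, 0.7, 1.2]  # losses of its neighbours (close="c")
temp, garant = 10.0, 0.0
acs = -strength_mine                  # negate so that larger = better
acc = [-z for z in strength_nbrs]
delta = np.mean(acc) - acs            # >0: the neighbourhood is better on average
if np.random.random() < np.exp((delta - garant) / temp):
    best = int(np.argmax(acc))        # the cell becomes a mutation of its best neighbour
    print("replace cell with a mutation of neighbour", best)
else:
    print("keep the current cell")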

181019
log.csv Normal file

File diff suppressed because it is too large

30
onceasecond.py Normal file

@@ -0,0 +1,30 @@
import time
def onceasecond(func):
    """Decorator: the wrapped function is executed at most once per second, all other calls are silently dropped."""
    lastt=0
    def fun(*args,**kwargs):
        t=time.time()
        if t-fun.lastt>1:#more than a second since the last executed call
            fun.lastt=t
            func(*args,**kwargs)
    fun.lastt=lastt#store the timestamp on the wrapper itself
    return fun
if __name__=="__main__":
@onceasecond
def printi(i):
print(i)
i=0
while True:
i+=1
printi(i)

9
requirements.txt Normal file

@@ -0,0 +1,9 @@
keras
tensorflow
numpy
matplotlib
scikit-learn
h5py
scipy
xevo

BIN
thyroid.mat Normal file

Binary file not shown.

178
traf.py Normal file

@@ -0,0 +1,178 @@
# TensorFlow and tf.keras
#import tensorflow as tf#dont need it here;)
# Helper libraries
import numpy as np
import matplotlib.pyplot as plt
import math
from xevo import eobj, quickmut,semiquickmut
from ising2d import ising2devo
import os
from onceasecond import onceasecond
from time import time
from data import loadfile
from sklearn.metrics import roc_curve,auc
def cauc(n,a):
p=np.concatenate((n,a),axis=0)
l=np.concatenate((np.zeros(len(n)),np.ones(len(a))),axis=0)
fpr, tpr, threshold = roc_curve(l, p)
auc_score=auc(fpr,tpr)
return auc_score
class matrixject(eobj):
def norm(s):
return np.mean(s.q**2)*len(s.q)
def __init__(s,mat=None,dim=2):
if mat is None:
s.q=np.random.normal(0,1,(dim,dim))
s.q/=s.norm()+0.00001
else:
s.q=mat
s.initial()
def __add__(a,b):
return a.__class__((a.q+b.q)/2)
def randomize(s):
s=s.copy()
s.q=np.random.normal(0,1,s.q.shape)
s.q/=s.norm()+0.00001
return s
def mutate(s):
s=s.copy()
i1,i2=np.random.randint(0,len(s.q),2)
val=np.random.normal(0,1)
exp=np.random.random()*5-3
s.q[i1,i2]+=val*exp
return s
def predict_data(s,data):
mat=(s.q-1)
pred=np.dot(data,mat)
return pred
def eval_data(s,data):
return np.mean(np.abs(s.predict_data(data)),axis=-1)
    def calcstrength(s):
        #the matrix should solve mat*x=x, i.e. (mat-1)*x=0
        mat=(s.q-1)#only needed for the commented-out regularisation below
        loss=np.mean(s.eval_data(data))#"data" is a module-level global set in train()
        #loss+=1/(1+np.mean(np.abs(mat)))#punish the trivial solution
        return loss
def shallmaximize(s):
return False
def _copy(s):
return s.__class__(s.q)
def __repr__(s):
return str(s.q)
class ising2dlog(ising2devo):
def __init__(s,*args,**kwargs):
ising2devo.__init__(s,*args,**kwargs)
s.starttime=time()
@onceasecond
def stepsave(s):
return #disabled here
np.savez_compressed(logfile.replace("log.csv",f"step{s.i}"),mats=[zw.q for zw in s.q],stres=[zw.strength() for zw in s.q],i=s.i)
def logeneration(s,show=True):
mx,mn,std=ising2devo.logeneration(s=s,show=show)
global logfile
with open(logfile,"a") as f:
f.write(f"{mx},{mn},{std},")
f.write(",".join([str(qq.strength()) for qq in s.q]))
f.write("\n")
s.stepsave()
t=time()
        if t-s.starttime>600:#stop the run after 10 minutes of optimisation (see README)
            mx=-1.0
return mx,mn,std
def train(x,y,pth,typ):
x=x.astype("float32")
y=y.astype("float32")
x/=np.mean(np.abs(x))
global logfile
logfile=pth+"/log.csv"
#x-=np.mean(x,axis=0)#no correcting here
#x/=(np.std(x,axis=0)+0.00001)
train=x[np.where(y[:,0]==0)]
test_a=x[np.where(y[:,0]==1)]
test_n=train[:len(test_a)]
train=train[len(test_a):]
#global dd
#dd=train
#exit()
global data
data=train
obj=matrixject(dim=int(data.shape[1]))
boardsize=10
    opt=ising2dlog(m1=boardsize,m2=boardsize,#dimensions of the board, resulting in a population of 100
        temp=10.0,#semi-arbitrary hyperparameter
        mergews=0.3,#merging is safe here since (1+1)/2=1: merging two equal matrices changes nothing
        close="c")#classical ising neighbourhood (4 neighbours)
solv=semiquickmut(obj,goal=0.0,maxsteps=500000,population=boardsize*boardsize,opt=opt)
stres=[zw.strength() for zw in solv]
matrices=[zw.q for zw in solv]
sol=solv[np.argmin(stres)]
alle=[zw.eval_data(train) for zw in solv]
allen=[zw.eval_data(test_n) for zw in solv]
allea=[zw.eval_data(test_a) for zw in solv]
lss = sol.calcstrength()
#p=sol.predict_data(train)
#pn=sol.predict_data(test_n)
#pa=sol.predict_data(test_a)
e=sol.eval_data(train)
en=sol.eval_data(test_n)
ea=sol.eval_data(test_a)
mn=np.mean(e)
dn=(en-mn)**2
da=(ea-mn)**2
auc=cauc(dn,da)
print(f"auc={auc}")
if __name__ == '__main__':
x,y=loadfile()
train(x,y,".","")