initial push

Simon Klüttermann 2021-10-02 18:51:26 +02:00
commit 36f11cf818
10 changed files with 181586 additions and 0 deletions

23
README Normal file

@@ -0,0 +1,23 @@
This git repository contains a comparison of two algorithms for anomaly detection on a (not completely randomly chosen) dataset "thyroid" from here: http://odds.cs.stonybrook.edu/
Alongside a classical autoencoder (ae.py) you will find the algorithm in question (traf.py), which is explained further down.
As is usual in anomaly detection, both algorithms are given a set of normal data points and try to learn some structure from them. Afterwards they are evaluated on some known anomalies (and some known normal points) by calculating a score known as the AUC. All that matters here is that a higher AUC is a better AUC (and maybe that an AUC of about 0.5 corresponds to randomly guessing whether an event is normal or anomalous), but you can find more information here: https://en.wikipedia.org/wiki/Receiver_operating_characteristic
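For reference, both scripts compute this score roughly like the following sketch (mirroring the cauc function in ae.py and traf.py; scores are "higher = more anomalous"):

import numpy as np
from sklearn.metrics import roc_curve, auc

def cauc(n, a):  # n: scores of the normal points, a: scores of the anomalies
    p = np.concatenate((n, a))
    l = np.concatenate((np.zeros(len(n)), np.ones(len(a))))
    fpr, tpr, _ = roc_curve(l, p)
    return auc(fpr, tpr)

# anomalies scoring higher than normals -> AUC clearly above 0.5
print(cauc(np.random.normal(0, 1, 100), np.random.normal(2, 1, 100)))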
This score is printed after running each script. It is subject to statistical fluctuations, but the best score I could achieve with the autoencoder was about 0.7, while my proposed algorithm always reached an AUC above 0.9. And even though this improvement probably does not hold for every dataset, the algorithm still has its uses, as it has far fewer parameters and hyperparameters while being much easier to interpret (every parameter is contained in a single 6x6 matrix (since every data point is 6-dimensional), instead of in a complicated neural network).
To explain the algorithms:
Autoencoders are common enough that somebody else has already explained them better than I ever could. Here is a random article that I just googled: https://towardsdatascience.com/anomaly-detection-with-autoencoder-b4cdce4866a6
Assuming you understand them, my (still unnamed) algorithm can be seen as a simplification of them.
Instead of multiple layers, it uses just one. This removes the encoder part of the autoencoder and creates the property that makes this algorithm most interesting to me: finding a matrix that maps x to x should be trivial (x=1*x), and a trivial matrix would not be able to differentiate between normal and anomalous events at all. This is basically what happens if you use tensorflow to optimize this matrix: even though the AUC score is not random (0.5) here either, it is still much lower and represents a much less decisive anomaly detection.
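To see why the trivial solution is useless, consider this standalone sketch (not part of the repository code):

import numpy as np

dim = 6
M = np.eye(dim)  # the trivial solution: M maps every x to x
normal = np.random.normal(0, 1, (100, dim))
anomalies = np.random.normal(5, 1, (10, dim))
# the reconstruction error is exactly zero for every point, normal or not,
# so the resulting scores carry no information and the AUC is 0.5
print(np.abs(normal @ M - normal).max())        # 0.0
print(np.abs(anomalies @ M - anomalies).max())  # 0.0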
The interesting part comes when you use another optimizer, namely evolutionary optimisation. I've been interested in evolutionary optimisation for a while, and when I noticed that this problem should be quite well suited to it (since it is only one relatively small matrix), I used the opportunity to try out a new optimizer that I had been thinking about (this is what most of the other .py files in this repository implement). Suddenly this anomaly detection algorithm becomes good (good in the sense that it is able to differentiate well between normal and anomalous events; it is much slower (I simply stop traf.py after 10 minutes of optimisation) and scales terribly to higher dimensions (without thinking much about it, I would guess with n**4)). A minimal sketch of the idea follows below.
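For intuition, here is a minimal sketch of evolutionary optimisation of such a matrix (plain mutation plus selection on random data; the actual optimizer in traf.py uses a 2d ising-model grid instead, see ising2d.py):

import numpy as np

def loss(M, data):  # mean |x*(M-1)|, the same loss that traf.py minimizes
    return np.mean(np.abs(data @ (M - 1)))

data = np.random.normal(0, 1, (200, 6))
M = np.random.normal(0, 1, (6, 6))
for step in range(10000):
    child = M.copy()
    i, j = np.random.randint(0, 6, 2)
    child[i, j] += np.random.normal(0, 0.1)  # mutate a single entry
    if loss(child, data) < loss(M, data):    # keep whichever matrix is better
        M = child
print(loss(M, data))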
And I think that is super interesting: this algorithm should not work at all, but for some reason it is actually quite good.
The task for a thesis would be to try to understand why this works, and then maybe to improve its speed, its scaling and/or its quality.
If you have any questions, please feel free to write an email to Simon.Kluettermann@cs.tu-dortmund.de

68
acc2d.py Normal file

@@ -0,0 +1,68 @@
"""A couple of functions to tread an 1d array as a 2d one. Used by ising2d"""
def dex(i1,i2,m1,m2):#flat index of cell (i1,i2) in an m1 x m2 grid
return i1*m2+i2
def nextto8(i1,i2,m1,m2):#the 8 surrounding cells (Moore neighbourhood), periodic boundaries
p1=(i1+1)%m1
p2=(i2+1)%m2
s1=(i1-1+m1)%m1
s2=(i2-1+m2)%m2
ret=[]
for d1 in [s1,i1,p1]:
for d2 in [s2,i2,p2]:
if d1==i1 and d2==i2:continue
ret.append(dex(d1,d2,m1,m2))
return ret
nextto=nextto8
def nextto9(i1,i2,m1,m2):#like nextto8, but includes the cell itself
p1=(i1+1)%m1
p2=(i2+1)%m2
s1=(i1-1+m1)%m1
s2=(i2-1+m2)%m2
ret=[]
for d1 in [s1,i1,p1]:
for d2 in [s2,i2,p2]:
ret.append(dex(d1,d2,m1,m2))
return ret
def nextto4(i1,i2,m1,m2):#the 4 axis-aligned neighbours (von Neumann neighbourhood)
return [
dex((i1+1)%m1,i2,m1,m2),
dex((i1-1+m1)%m1,i2,m1,m2),
dex(i1,(i2+1)%m2,m1,m2),
dex(i1,(i2-1+m2)%m2,m1,m2)
]
def nextto5(i1,i2,m1,m2):#like nextto4, but includes the cell itself
return [
dex(i1,i2,m1,m2),
dex((i1+1)%m1,i2,m1,m2),
dex((i1-1+m1)%m1,i2,m1,m2),
dex(i1,(i2+1)%m2,m1,m2),
dex(i1,(i2-1+m2)%m2,m1,m2)
]
def range2d(m1,m2):#iterate over all cells, yielding (flat index,i1,i2)
for i1 in range(m1):
for i2 in range(m2):
yield dex(i1,i2,m1,m2),i1,i2
def gt(q,i1,i2,m1,m2):#get the value at cell (i1,i2)
assert len(q)==m1*m2
return q[dex(i1,i2,m1,m2)]
def st(q,val,i1,i2,m1,m2):#set the value at cell (i1,i2)
assert len(q)==m1*m2
q[dex(i1,i2,m1,m2)]=val
return q
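A quick usage sketch (assuming the file is importable as acc2d):

from acc2d import dex, nextto4, gt, st

# a 3x4 grid stored as a flat list of length 12
q = [0]*12
q = st(q, 7, 1, 2, 3, 4)    # set cell (1,2) to 7
print(gt(q, 1, 2, 3, 4))    # 7
print(dex(1, 2, 3, 4))      # 6 == 1*4+2
print(nextto4(1, 2, 3, 4))  # flat indices of the 4 neighbours of (1,2)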

116
ae.py Normal file

@@ -0,0 +1,116 @@
# TensorFlow and tf.keras
#import tensorflow as tf
import keras
# Helper libraries
import numpy as np
import matplotlib.pyplot as plt
import math
import time
from data import loadfile
from sklearn.metrics import roc_curve, auc
latent=5
def cauc(n,a):
p=np.concatenate((n,a),axis=0)
l=np.concatenate((np.zeros(len(n)),np.ones(len(a))),axis=0)
fpr, tpr, threshold = roc_curve(l, p)
auc_score=auc(fpr,tpr)
return auc_score
def train(x,y,pth,typ,retry=0):
x=x.astype("float32")
y=y.astype("float32")
#x-=np.mean(x,axis=0)#no correcting here
#x/=(np.std(x,axis=0)+0.00001)
train=x[np.where(y[:,0]==0)]
test_a=x[np.where(y[:,0]==1)]
test_n=train[:len(test_a)]
train=train[len(test_a):]
#global dd
#dd=train
#exit()
dim=x.shape[1]
#denses=[0.8,0.6,0.5,0.6,0.8,]
#denses=[int(math.ceil(zw*dim)) for zw in denses]
    #hidden layer sizes interpolate geometrically between dim and latent (dim=6, latent=5 gives d1=d2=5)
    d1=(latent*dim*dim)**(1/3)
    d2=(latent*latent*dim)**(1/3)
    d1=int(d1)
    d2=int(d2)
denses=[d1,d2,latent,d2,d1]
#print(train.shape)
model = keras.Sequential([
keras.layers.Dense(dim,activation='relu',input_shape=train.shape[1:]),
*[keras.layers.Dense(dense, activation='relu') for dense in denses],
keras.layers.Dense(dim,activation="linear")
])
model.compile(optimizer='adam',
loss="mse",
metrics=[])
model.summary()
model.fit(train,
train,
epochs=25,
validation_split=0.2,
callbacks=[
#keras.callbacks.CSVLogger(pth+"history.csv"),
#keras.callbacks.ModelCheckpoint(pth+"epoch{epoch:02d}.tf",
# save_weights_only=False)
])
#model.save(pth+"whole.tf")
    #model.save_weights(pth+"weights.tf")
lss = model.evaluate(train,train, verbose=2)
#if lss>500 and retry<3:
# print("retrying")
# return train_func(x,y,pth,typ,retry=retry+1)
p=model.predict(train)
pn=model.predict(test_n)
pa=model.predict(test_a)
    mp=np.mean(p,axis=0)#mean reconstruction of the training data; the anomaly score below is the squared deviation from it
dn=np.mean((pn-mp)**2,axis=1)
da=np.mean((pa-mp)**2,axis=1)
auc=cauc(dn,da)
print(f"auc={auc}")
if __name__ == '__main__':
x,y=loadfile()
train(x,y,".","")

53
data.py Normal file

@@ -0,0 +1,53 @@
import os
import scipy.io as sio
import numpy as np
import h5py
def remending(q):
return q.replace(".mat","").replace(".npz","")
def listfiles():
for zw in os.listdir("adata"):
fn="adata/"+zw
yield remending(zw),fn
def loadfile73(fn):
f = h5py.File(fn,'r')
x = f.get('X')
x = np.array(x) # For converting to a NumPy array
y = f.get('y')
y = np.array(y) # For converting to a NumPy array
x=np.transpose(x)
y=np.transpose(y)
return x,y
def loadfile(fn="thyroid.mat"):
if ".npz" in fn:
f=np.load(fn)
return f["x"],f["y"]
try:
mat=sio.loadmat(fn)
return mat["X"],mat["y"]
    except Exception:#probably a matlab v7.3 file, which sio.loadmat cannot read
return loadfile73(fn)
def loadfiles():
for f,fn in listfiles():
yield f,*loadfile(fn)
def filterfiles():
for f,x,y in loadfiles():
cou=np.sum(y)
if cou<10:continue
yield f,x,y
if __name__ == '__main__':
if False:
for f,x,y in filterfiles():
#print(f,x.shape[1],y.shape[0],np.mean(y))
les=[len(set(x[:,i]))/len(x) for i in range(x.shape[1])]
lestr=f"{np.mean(les)}+-{np.std(les)}"
print(f'"{f}":{int(x.shape[1]/2)+1},{lestr}')
for f,x,y in filterfiles():
if "wbc" in f:break

90
ising2d.py Normal file

@@ -0,0 +1,90 @@
from xevo.evo import evo
import numpy as np
import time
from acc2d import st,gt,dex,nextto,range2d
from acc2d import nextto5,nextto4,nextto9,nextto8
from numpy import mean,argmax
class ising2devo(evo):
"""2d ising optimizer with periodic boundary conditions
"""
def __init__(s,m1=4,m2=5,temp=1.0,garant=0.0,mergews=0.0,mean_func="mean",close="o"):
"""m1, m2: sizes of the fields. Have to match the population size. Standart values multiply to standart values for quickmut
temp: Temperature of the ising model, higher temperature->can also switch to worse result, not scale independent
garant: difference above this->will be updated definitely. Depends on temperature
mergews: if above zero: can add together elemts instead of replacing them (not implemented atm)
mean_func: average function in the ising loss (average(neighbourhood)-mine), understands mean,max,min or callable
close: type of neighbourhood, understands o (8 around) c (checkerboard, 4 around), 9 (o+ self), + (4+self) or callable
"""
s.initial()
s.m1=m1
s.m2=m2
s.temp=temp
s.garant=garant
s.mergews=mergews
if mean_func=="mean":mean_func=mean
if mean_func=="max":mean_func=max
if mean_func=="min":mean_func=min
if close=="o":close=nextto8
if close=="c":close=nextto4
if close=="9":close=nextto9
if close=="+":close=nextto5
s.mean_func=mean_func
s.close=close
def generation(s)->None:
#s.q.sort(key=lambda x:x.ortho()[s.dex])
nq=[zw for zw in s.q]
lsq=len(s.q)
sm=s.q[0].shallmaximize()
for i,i1,i2 in range2d(s.m1,s.m2):
acs=s.q[i].strength()
ic=s.close(i1,i2,s.m1,s.m2)
acc=[s.q[ii].strength() for ii in ic]
            if not sm:#negate the strengths so the code below can always assume maximization
acs*=-1
acc=[zw*-1 for zw in acc]
delta=(s.mean_func(acc)-acs)#the higher, the more probable switch
if np.random.random()<np.exp((delta-s.garant)/s.temp):
dex=argmax(acc)
if acc[dex]<acs or ic[dex]==i:
nq[i]=s.q[i].copy()
continue
if s.mergews>0.0 and np.random.random()<s.mergews:
nq[i]=s.q[ic[dex]]+s.q[i]
else:
nq[i]=s.q[ic[dex]].mutate()
s.q=nq
def _copy(s):
        return ising2devo(m1=s.m1,m2=s.m2,temp=s.temp,garant=s.garant,mergews=s.mergews,mean_func=s.mean_func,close=s.close)
def to_array(s,odex=0):
ret=np.zeros((s.m1,s.m2))
for i1 in range(s.m1):
for i2 in range(s.m2):
ret[i1,i2]=s.q[dex(i1,i2,s.m1,s.m2)].ortho()[odex]
return ret
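A standalone sketch of the update rule in generation(), for a single cell, assuming minimization (shallmaximize()==False); the merge branch and the acc[dex]<acs guard are omitted:

import numpy as np

strength_mine = 0.8                   # loss of the current cell
strength_nbrs = [0.5, 0.9, 0.7, 1.2]  # losses of its neighbours (close="c")
temp, garant = 10.0, 0.0
acs = -strength_mine                  # negate so that larger = better
acc = [-z for z in strength_nbrs]
delta = np.mean(acc) - acs            # >0: the neighbourhood is better on average
if np.random.random() < np.exp((delta - garant) / temp):
    best = int(np.argmax(acc))        # the cell becomes a mutation of its best neighbour
    print("replace cell with a mutation of neighbour", best)
else:
    print("keep the current cell")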

181019
log.csv Normal file

File diff suppressed because it is too large

30
onceasecond.py Normal file

@@ -0,0 +1,30 @@
import time
def onceasecond(func):
    """Decorator: the wrapped function is executed at most once per second, all other calls are silently dropped."""
    lastt=0
    def fun(*args,**kwargs):
        t=time.time()
        if t-fun.lastt>1:#more than a second since the last executed call
            fun.lastt=t
            func(*args,**kwargs)
    fun.lastt=lastt#store the timestamp on the wrapper itself
    return fun
if __name__=="__main__":
@onceasecond
def printi(i):
print(i)
i=0
while True:
i+=1
printi(i)

9
requirements.txt Normal file

@@ -0,0 +1,9 @@
keras
tensorflow
numpy
matplotlib
scikit-learn
h5py
scipy
xevo

BIN
thyroid.mat Normal file

Binary file not shown.

178
traf.py Normal file

@@ -0,0 +1,178 @@
# TensorFlow and tf.keras
#import tensorflow as tf#dont need it here;)
# Helper libraries
import numpy as np
import matplotlib.pyplot as plt
import math
from xevo import eobj, quickmut,semiquickmut
from ising2d import ising2devo
import os
from onceasecond import onceasecond
from time import time
from data import loadfile
from sklearn.metrics import roc_curve,auc
def cauc(n,a):
p=np.concatenate((n,a),axis=0)
l=np.concatenate((np.zeros(len(n)),np.ones(len(a))),axis=0)
fpr, tpr, threshold = roc_curve(l, p)
auc_score=auc(fpr,tpr)
return auc_score
class matrixject(eobj):
def norm(s):
return np.mean(s.q**2)*len(s.q)
def __init__(s,mat=None,dim=2):
if mat is None:
s.q=np.random.normal(0,1,(dim,dim))
s.q/=s.norm()+0.00001
else:
s.q=mat
s.initial()
def __add__(a,b):
return a.__class__((a.q+b.q)/2)
def randomize(s):
s=s.copy()
s.q=np.random.normal(0,1,s.q.shape)
s.q/=s.norm()+0.00001
return s
def mutate(s):
s=s.copy()
i1,i2=np.random.randint(0,len(s.q),2)
val=np.random.normal(0,1)
exp=np.random.random()*5-3
s.q[i1,i2]+=val*exp
return s
def predict_data(s,data):
mat=(s.q-1)
pred=np.dot(data,mat)
return pred
def eval_data(s,data):
return np.mean(np.abs(s.predict_data(data)),axis=-1)
    def calcstrength(s):
        #the matrix should solve mat*x=x, i.e. (mat-1)*x=0
        mat=(s.q-1)#only needed for the commented-out regularisation below
        loss=np.mean(s.eval_data(data))#"data" is a module-level global set in train()
        #loss+=1/(1+np.mean(np.abs(mat)))#punish the trivial solution
        return loss
def shallmaximize(s):
return False
def _copy(s):
return s.__class__(s.q)
def __repr__(s):
return str(s.q)
class ising2dlog(ising2devo):
def __init__(s,*args,**kwargs):
ising2devo.__init__(s,*args,**kwargs)
s.starttime=time()
@onceasecond
def stepsave(s):
return #disabled here
np.savez_compressed(logfile.replace("log.csv",f"step{s.i}"),mats=[zw.q for zw in s.q],stres=[zw.strength() for zw in s.q],i=s.i)
def logeneration(s,show=True):
mx,mn,std=ising2devo.logeneration(s=s,show=show)
global logfile
with open(logfile,"a") as f:
f.write(f"{mx},{mn},{std},")
f.write(",".join([str(qq.strength()) for qq in s.q]))
f.write("\n")
s.stepsave()
t=time()
        if t-s.starttime>600:#stop the run after 10 minutes of optimisation (see README)
            mx=-1.0
return mx,mn,std
def train(x,y,pth,typ):
x=x.astype("float32")
y=y.astype("float32")
x/=np.mean(np.abs(x))
global logfile
logfile=pth+"/log.csv"
#x-=np.mean(x,axis=0)#no correcting here
#x/=(np.std(x,axis=0)+0.00001)
train=x[np.where(y[:,0]==0)]
test_a=x[np.where(y[:,0]==1)]
test_n=train[:len(test_a)]
train=train[len(test_a):]
#global dd
#dd=train
#exit()
global data
data=train
obj=matrixject(dim=int(data.shape[1]))
boardsize=10
    opt=ising2dlog(m1=boardsize,m2=boardsize,#dimensions of the board, resulting in a population of 100
        temp=10.0,#semi-arbitrary hyperparameter
        mergews=0.3,#merging is safe here since (1+1)/2=1: merging two equal matrices changes nothing
        close="c")#classical ising neighbourhood (4 neighbours)
solv=semiquickmut(obj,goal=0.0,maxsteps=500000,population=boardsize*boardsize,opt=opt)
stres=[zw.strength() for zw in solv]
matrices=[zw.q for zw in solv]
sol=solv[np.argmin(stres)]
alle=[zw.eval_data(train) for zw in solv]
allen=[zw.eval_data(test_n) for zw in solv]
allea=[zw.eval_data(test_a) for zw in solv]
lss = sol.calcstrength()
#p=sol.predict_data(train)
#pn=sol.predict_data(test_n)
#pa=sol.predict_data(test_a)
e=sol.eval_data(train)
en=sol.eval_data(test_n)
ea=sol.eval_data(test_a)
mn=np.mean(e)
dn=(en-mn)**2
da=(ea-mn)**2
auc=cauc(dn,da)
print(f"auc={auc}")
if __name__ == '__main__':
x,y=loadfile()
train(x,y,".","")