2021-09-19 14:38:23 +02:00
|
|
|
|
|
|
|
from tqdm import tqdm
|
|
|
|
from sortedcontainers import SortedList as slist
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
|
|
|
|
from sklearn.metrics import roc_curve,auc
|
|
|
|
import keras
|
|
|
|
|
|
|
|
|
|
|
|
def ngrams(q, n=3):
    """Return every contiguous window of length ``n`` in ``q``, left to right.

    q: a sliceable sequence with a length (e.g. a string)
    n: window length

    Returns a list of slices of ``q``; the list is empty when ``q`` is
    shorter than ``n``.
    """
    # The last valid start index is len(q) - n, so the range bound needs +1;
    # without it the final window is lost and a sequence of exactly n items
    # yields nothing.
    return [q[i:i + n] for i in range(len(q) - n + 1)]
|
|
|
|
|
|
|
|
def multigramm(q, n=3):
    """Collect the distinct n-grams that occur in any element of ``q``.

    q: iterable of sequences (e.g. strings) to scan
    n: n-gram length, forwarded to ngrams()

    Returns a list holding each distinct n-gram once. The list comes straight
    from a set, so its order is not defined.
    """
    ret = set()
    print("listing ngrams")
    for zw in tqdm(q):
        # Grow the set in place; rebuilding it with `ret | ...` would copy
        # everything collected so far on every iteration.
        ret.update(ngrams(zw, n=n))
    return list(ret)
|
|
|
|
|
|
|
|
def ngramtrafo(q, gram, n=3):
    """Encode each element of ``q`` as a 0/1 presence vector over ``gram``.

    q: iterable of sequences (e.g. strings) to encode
    gram: collection of known n-grams; it is sorted internally, so column i
        of the result corresponds to the i-th n-gram in sorted order
    n: n-gram length, forwarded to ngrams()

    Returns np.array built from one row per element of ``q`` with
    ``len(gram)`` entries each: 1 where that n-gram occurs in the element,
    0 otherwise. n-grams that are not in ``gram`` are ignored.
    """
    ret = []
    gram = slist(gram)#speed up. Trust me, you want this
    width = len(gram)
    print("converting to vector")
    for qq in tqdm(q):
        ac = [0] * width
        for gg in ngrams(qq, n=n):
            try:
                dex = gram.index(gg)
            except ValueError:
                # n-gram is not part of the known vocabulary
                continue
            # Every index returned by a successful lookup is valid, including
            # 0; guarding with `dex > 0` would leave the first column unused.
            ac[dex] = 1
        ret.append(ac)
    return np.array(ret)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class stroo():
    """Pairs a trained tf model with the n-gram list its inputs are built from."""

    def __init__(s, model, grams, n, m):
        """Keep everything that scoring needs.

        model: tf object
        grams: list of ngrams
        n: len of each ngram
        m: mean of trained data for tf model
        """
        s.model, s.grams = model, grams
        s.n, s.m = n, m

    def predict(s, data):
        """Return one normality score per element of ``data``.

        The score is the squared difference between the model output and the
        stored mean ``m``, averaged down to a single value per element.
        """
        encoded = ngramtrafo(data, s.grams, n=s.n)
        deviation = s.model.predict(encoded) - s.m
        score = deviation ** 2
        # Average away axis 1 repeatedly until only the leading axis is left.
        while score.ndim > 1:
            score = np.mean(score, axis=1)
        return score

    def eval(s, x, y):
        """Score ``x`` and return the area under the ROC curve against ``y``.

        x: list of strings
        y: label per string, 0 for normal and 1 for abnormal
        """
        scores = s.predict(x)
        false_pos, true_pos, _thresholds = roc_curve(y, scores)
        return auc(false_pos, true_pos)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def train_model(data, n=3, max_attempts=10):
    """Train a small dense network on (normal) strings and wrap it in a stroo.

    data: list of (normal) strings
    n: n-gram length used to build the input vectors
    max_attempts: number of freshly initialised networks to train before
        giving up

    Returns a stroo holding the trained model, the n-gram list, ``n`` and the
    per-output mean of the model's predictions on the training data.

    Raises RuntimeError when every attempt ends with a near-zero mean output.
    """
    # Named `vocab` / `out_dim` so that neither the module-level ngrams()
    # function nor the conventional `os` module name is shadowed.
    vocab = multigramm(data, n=n)
    features = ngramtrafo(data, vocab, n=n)
    out_dim = 3
    # The network is trained to output all ones for every training sample.
    targets = np.ones((len(features), out_dim), dtype="float")

    for _ in range(max_attempts):
        #tensorflow stuff (not at all optimised)
        inp = keras.Input(features.shape[1:])
        q = inp
        q = keras.layers.Dense(10, activation="relu", use_bias=False)(q)
        q = keras.layers.Dense(7, activation="relu", use_bias=False)(q)
        q = keras.layers.Dense(out_dim, activation="relu", use_bias=False)(q)

        model = keras.models.Model(inp, q)
        model.compile("adam", "mse")
        model.fit(features, targets,
                  batch_size=100,
                  epochs=50,
                  validation_split=0.1)

        pm = np.mean(model.predict(features), axis=0)
        # A mean output of ~0 means the all-ones target was not learned
        # (presumably the bias-free ReLU stack went dead); such a model is
        # discarded and a new one is trained from a fresh initialisation.
        if not np.mean(pm) ** 2 < 0.0001:
            return stroo(model, vocab, n=n, m=pm)

    # Bounded on purpose: some inputs (e.g. feature rows that are all zero)
    # can never produce a non-zero output, and retrying forever would hang.
    raise RuntimeError(
        "train_model: mean model output stayed near zero after "
        + str(max_attempts) + " training attempts"
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|