# stroo/stroo.py
#
# 104 lines
# 2.2 KiB
# Python

from tqdm import tqdm
from sortedcontainers import SortedList as slist
import numpy as np
from sklearn.metrics import roc_curve,auc
import keras
def ngrams(q, n=3):
    """Return every contiguous length-``n`` window of ``q``, in order.

    Args:
        q: Sliceable sequence with a length (typically a string).
        n: Window length.

    Returns:
        A list of the ``len(q) - n + 1`` slices ``q[i:i + n]``; empty when
        ``q`` is shorter than ``n``.
    """
    # ``+ 1`` so the window that ends on the last element is included; the
    # earlier bound ``len(q) - n`` silently dropped it.
    return [q[i:i + n] for i in range(len(q) - n + 1)]
def multigramm(q, n=3):
    """Collect the distinct n-grams that occur in any item of ``q``.

    Prints a status line and shows a tqdm progress bar while iterating.

    Args:
        q: Iterable of sequences (typically strings) to scan.
        n: Length of each n-gram, forwarded to ``ngrams``.

    Returns:
        A list holding each distinct n-gram once. The order is whatever the
        underlying set yields and should not be relied upon.
    """
    ret = set()
    print("listing ngrams")
    for zw in tqdm(q):
        # Grow the set in place; ``ret = ret | set(...)`` rebuilt the whole
        # accumulated set on every item.
        ret.update(ngrams(zw, n=n))
    return list(ret)
def ngramtrafo(q, gram, n=3):
    """Encode each item of ``q`` as a 0/1 presence vector over ``gram``.

    Prints a status line and shows a tqdm progress bar while iterating.

    Args:
        q: Iterable of sequences (typically strings) to encode.
        gram: Vocabulary of n-grams. It is sorted internally, so column ``i``
            of the result corresponds to the ``i``-th n-gram in sorted order,
            not to the order in which ``gram`` was passed.
        n: Length of each n-gram, forwarded to ``ngrams``.

    Returns:
        A 2-D numpy array with one row per item of ``q`` and ``len(gram)``
        columns; an entry is 1 when that n-gram occurs in the item, else 0.
        For empty ``q`` the shape is ``(0, len(gram))``.
    """
    # A sorted container gives binary-search lookups instead of a linear scan
    # through the vocabulary for every n-gram.
    gram = slist(gram)
    width = len(gram)
    print("converting to vector")
    ret = []
    for qq in tqdm(q):
        ac = [0] * width
        for gg in ngrams(qq, n=n):
            try:
                dex = gram.index(gg)
            except ValueError:
                # n-gram is not part of the vocabulary: it has no column.
                continue
            # ``index`` only returns valid positions, so position 0 must be
            # set as well (the former ``dex > 0`` guard skipped it).
            ac[dex] = 1
        ret.append(ac)
    if not ret:
        # Keep the result two-dimensional even when there is nothing to encode.
        return np.zeros((0, width), dtype=int)
    return np.array(ret)
class stroo:
    """Pair a fitted model with the n-gram vocabulary used to featurise its input."""

    def __init__(self, model, grams, n, m):
        """Remember everything needed to score new strings.

        model: tf object; only its ``predict`` method is used here
        grams: list of ngrams that define the feature columns
        n: len of each ngram
        m: mean of the model output on the training data
        """
        self.model = model
        self.grams = grams
        self.n = n
        self.m = m

    def predict(self, data):
        """Score ``data`` by how far the model output lies from ``m``.

        The strings are turned into n-gram vectors, passed through the model,
        and the squared difference between each output and ``m`` is returned.
        """
        features = ngramtrafo(data, self.grams, n=self.n)
        raw = self.model.predict(features)
        return (raw - self.m) ** 2

    def eval(self, x, y):
        """Return the ROC AUC of ``predict(x)`` against the labels ``y``.

        x: list of strings
        y: matching labels, 0 for normal and 1 for abnormal
        """
        scores = self.predict(x)
        false_pos, true_pos, _ = roc_curve(y, scores)
        return auc(false_pos, true_pos)
def train_model(data, n=3, *, epochs=50, batch_size=100, validation_split=0.1):
    """Train a one-off network on a list of (normal) strings.

    The strings are converted to binary n-gram vectors and a small bias-free
    dense network is fitted to output 1.0 for every training sample.

    Args:
        data: List of (normal) strings to train on.
        n: Length of the n-grams used as features.
        epochs: Number of training epochs passed to ``model.fit``.
        batch_size: Batch size passed to ``model.fit``.
        validation_split: Fraction of the data held out for validation,
            passed to ``model.fit``.

    Returns:
        A ``stroo`` wrapping the fitted model, the n-gram vocabulary, ``n``
        and the mean model output over the training data.
    """
    # Named ``vocab`` rather than ``ngrams`` so the module-level ``ngrams``
    # function is not shadowed inside this function.
    vocab = multigramm(data, n=n)
    features = ngramtrafo(data, vocab, n=n)

    # tensorflow stuff (not at all optimised)
    inp = keras.Input(features.shape[1:])
    q = inp
    # NOTE(review): all layers are bias-free, presumably so the constant
    # target cannot be reproduced by a bias term alone - confirm with author.
    q = keras.layers.Dense(10, activation="relu", use_bias=False)(q)
    q = keras.layers.Dense(4, activation="relu", use_bias=False)(q)
    q = keras.layers.Dense(1, activation="relu", use_bias=False)(q)
    model = keras.models.Model(inp, q)
    model.compile("adam", "mse")

    # Every training sample gets the same target value, 1.0.
    targets = np.ones(len(features), dtype="float")
    model.fit(features, targets,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=validation_split)

    # Mean output on the training data; ``stroo.predict`` measures the
    # squared deviation from this value.
    pm = np.mean(model.predict(features))
    return stroo(model, vocab, n=n, m=pm)