"""N-gram featurisation and a small Keras one-class scorer for strings."""

from tqdm import tqdm
# Kept so that code importing ``slist`` from this module keeps working;
# the lookup in ``ngramtrafo`` now uses a plain dict instead.
from sortedcontainers import SortedList as slist  # noqa: F401
import numpy as np
from sklearn.metrics import roc_curve, auc
import keras


def ngrams(q, n: int = 3) -> list:
    """Return every contiguous slice of length ``n`` of the sequence ``q``.

    The slices are returned in order of their start position. A sequence
    shorter than ``n`` yields an empty list.
    """
    # ``+ 1`` so the final window (starting at ``len(q) - n``) is included.
    return [q[i:i + n] for i in range(len(q) - n + 1)]


def multigramm(q, n: int = 3) -> list:
    """Collect the distinct n-grams occurring in any sequence of ``q``.

    Returns the distinct n-grams as a sorted list, so the result is
    deterministic between runs.
    """
    found = set()
    print("listing ngrams")
    for seq in tqdm(q):
        # In-place update avoids rebuilding the whole set on every iteration.
        found.update(ngrams(seq, n=n))
    return sorted(found)


def ngramtrafo(q, gram, n: int = 3) -> np.ndarray:
    """Encode each sequence of ``q`` as a binary n-gram presence vector.

    Columns follow the sorted order of ``gram``; entry ``[i, j]`` is 1 when
    the j-th sorted n-gram occurs in the i-th sequence and 0 otherwise.
    N-grams that are not part of ``gram`` are ignored.

    Returns an array of shape ``(number of sequences, len(gram))``.
    """
    ordered = sorted(gram)
    width = len(ordered)
    # Map each n-gram to its first position in the sorted vocabulary; a dict
    # gives constant-time lookups and needs no exception for unknown n-grams.
    column = {}
    for idx, known in enumerate(ordered):
        column.setdefault(known, idx)

    rows = []
    print("converting to vector")
    for seq in tqdm(q):
        row = [0] * width
        for g in ngrams(seq, n=n):
            idx = column.get(g)
            # ``is not None`` (not truthiness): column 0 is a valid index.
            if idx is not None:
                row[idx] = 1
        rows.append(row)

    if not rows:
        # Keep the result two-dimensional even when there is nothing to encode.
        return np.zeros((0, width), dtype=int)
    return np.array(rows)


class stroo():
    """Bundle a trained model with the n-gram vocabulary it was trained on."""

    def __init__(self, model, grams, n, m):
        """Store the parts needed to score new strings.

        model: object with a ``predict`` method (the trained network)
        grams: list of n-grams defining the feature columns
        n: length of each n-gram
        m: mean model output on the training data
        """
        self.model = model
        self.grams = grams
        self.n = n
        self.m = m

    def predict(self, data):
        """Return the squared deviation of the model output from ``m``.

        ``data`` is converted to n-gram presence vectors first; larger
        values mean the output lies further from the training mean.
        """
        features = ngramtrafo(data, self.grams, n=self.n)
        output = self.model.predict(features)
        return (output - self.m) ** 2

    def eval(self, x, y):
        """Return the ROC AUC of the scores for ``x`` against labels ``y``.

        x: list of strings
        y: labels, 0 for normal and 1 for abnormal
        """
        # Flatten so the scores are one-dimensional for roc_curve.
        scores = np.ravel(self.predict(x))
        fpr, tpr, _ = roc_curve(y, scores)
        return auc(fpr, tpr)


def train_model(data, n: int = 3):
    """Train a one-class network on a list of (normal) strings.

    The strings are encoded as n-gram presence vectors and a small dense
    network is fitted to output 1 for every training sample. Returns a
    ``stroo`` wrapping the model, the n-gram vocabulary, ``n`` and the mean
    model output on the training data.
    """
    # Named ``grams`` so the module-level function ``ngrams`` is not shadowed.
    grams = multigramm(data, n=n)
    features = ngramtrafo(data, grams, n=n)

    # Network definition (not tuned).
    inp = keras.Input(features.shape[1:])
    hidden = keras.layers.Dense(10, activation="relu", use_bias=False)(inp)
    hidden = keras.layers.Dense(4, activation="relu", use_bias=False)(hidden)
    out = keras.layers.Dense(1, activation="relu", use_bias=False)(hidden)

    model = keras.models.Model(inp, out)
    model.compile("adam", "mse")

    model.fit(
        features,
        np.ones(len(features), dtype="float"),
        batch_size=100,
        epochs=50,
        validation_split=0.1,
    )

    mean_output = np.mean(model.predict(features))

    return stroo(model, grams, n=n, m=mean_output)