commit c4eb649d28ef05a9a586832f6ce0205777dd7c1c
Author: Simon Klüttermann
Date:   Sun Sep 19 14:38:23 2021 +0200

    initial commit

diff --git a/__pycache__/stroo.cpython-39.pyc b/__pycache__/stroo.cpython-39.pyc
new file mode 100644
index 0000000..8d9d41b
Binary files /dev/null and b/__pycache__/stroo.cpython-39.pyc differ
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..8d35c0d
--- /dev/null
+++ b/main.py
@@ -0,0 +1,12 @@
+"""Train a stroo model on useragents.npz and print its ROC AUC on the test split."""
+
+import numpy as np
+
+from stroo import train_model
+
+
+f=np.load("useragents.npz")
+
+model=train_model(f["train"],n=2)
+
+print(model.eval(f["testx"],f["testy"]))
diff --git a/stroo.py b/stroo.py
new file mode 100644
index 0000000..835bca0
--- /dev/null
+++ b/stroo.py
@@ -0,0 +1,118 @@
+"""N-gram presence features plus a small keras model for scoring strings."""
+
+from tqdm import tqdm
+from sortedcontainers import SortedList as slist
+
+import numpy as np
+
+from sklearn.metrics import roc_curve,auc
+import keras
+
+
+def ngrams(q,n=3):
+    """Return every contiguous length-n slice of q, ordered by position.
+
+    A q shorter than n yields an empty list.
+    """
+    # +1 so that the last window, ending at the final element, is included
+    return [q[i:i+n] for i in range(len(q)-n+1)]
+
+
+def multigramm(q,n=3):
+    """Return a list of the distinct n-grams found in any element of q.
+
+    The order of the returned list is unspecified (it is built from a set).
+    """
+    ret=set()
+    print("listing ngrams")
+    for zw in tqdm(q):
+        # update in place instead of rebuilding the whole set per element
+        ret.update(ngrams(zw,n=n))
+    return list(ret)
+
+
+def ngramtrafo(q,gram,n=3):
+    """Encode each element of q as a 0/1 presence vector over gram.
+
+    gram is put into a sorted list first; column k stands for its k-th entry.
+    n-grams of an element that are not in gram are ignored.
+    Returns a numpy array with one row per element of q.
+    """
+    gram=slist(gram)#speed up. Trust me, you want this
+    print("converting to vector")
+    ret=[]
+    for qq in tqdm(q):
+        ac=[0]*len(gram)
+        for gg in ngrams(qq,n=n):
+            try:
+                dex=gram.index(gg)
+            except ValueError:
+                # this n-gram is not part of the vocabulary
+                continue
+            # index 0 is a valid column too, so every hit is recorded
+            ac[dex]=1
+        ret.append(ac)
+    return np.array(ret)
+
+
+class stroo():
+    """simple merge class for tf model and ngrams"""
+
+    def __init__(self,model,grams,n,m):
+        """Store the pieces needed for scoring.
+
+        model: tf object whose predict method is called on the feature vectors
+        grams: list of ngrams
+        n: len of each ngram
+        m: mean of the model output on the training data
+        """
+        self.model=model
+        self.grams=grams
+        self.n=n
+        self.m=m
+
+    def predict(self,data):
+        """Return the squared deviation of the model output from m for data."""
+        ng=ngramtrafo(data,self.grams,n=self.n)
+        p=self.model.predict(ng)
+        return (p-self.m)**2
+
+    def eval(self,x,y):
+        """Return the ROC AUC of predict(x) against the labels y.
+
+        x is a list of strings that are normal (y=0) or abnormal (y=1).
+        """
+        p=self.predict(x)
+        fpr,tpr,_=roc_curve(y,p)
+        return auc(fpr,tpr)
+
+
+def train_model(data,n=3):
+    """Train a oneoff network on a list of (normal) strings using n-grams.
+
+    Returns a stroo holding the fitted model, the n-gram list, n and the
+    mean model output on the training data.
+    """
+    # named grams so that the module-level ngrams() is not shadowed
+    grams=multigramm(data,n=n)
+    data=ngramtrafo(data,grams,n=n)
+
+    #tensorflow stuff (not at all optimised)
+    inp=keras.Input(data.shape[1:])
+    q=inp
+    q=keras.layers.Dense(10,activation="relu",use_bias=False)(q)
+    q=keras.layers.Dense(4,activation="relu",use_bias=False)(q)
+    q=keras.layers.Dense(1,activation="relu",use_bias=False)(q)
+
+    model=keras.models.Model(inp,q)
+
+    model.compile("adam","mse")
+    # every training row is fitted towards the constant target 1
+    model.fit(data,np.ones(len(data),dtype="float"),
+              batch_size=100,
+              epochs=50,
+              validation_split=0.1)
+
+    pm=np.mean(model.predict(data))
+
+    return stroo(model,grams,n=n,m=pm)
diff --git a/useragents.npz b/useragents.npz
new file mode 100644
index 0000000..65470c9
Binary files /dev/null and b/useragents.npz differ