% File-viewer metadata (not part of the document): 1123 lines, 30 KiB, TeX
\UseRawInputEncoding
|
|
%\documentclass[hyperref={pdfpagelabels=false}]{beamer}
|
|
\documentclass[hyperref={pdfpagelabels=false},aspectratio=169]{beamer}
|
|
% Die Hyperref Option hyperref={pdfpagelabels=false} verhindert die Warnung:
|
|
% Package hyperref Warning: Option `pdfpagelabels' is turned off
|
|
% (hyperref) because \thepage is undefined.
|
|
% Hyperref stopped early
|
|
%
|
|
|
|
\usepackage{lmodern}
|
|
% Das Paket lmodern erspart die folgenden Warnungen:
|
|
% LaTeX Font Warning: Font shape `OT1/cmss/m/n' in size <4> not available
|
|
% (Font) size <5> substituted on input line 22.
|
|
% LaTeX Font Warning: Size substitutions with differences
|
|
% (Font) up to 1.0pt have occurred.
|
|
%
|
|
|
|
% Wenn \titel{\ldots} \author{\ldots} erst nach \begin{document} kommen,
|
|
% kommt folgende Warnung:
|
|
% Package hyperref Warning: Option `pdfauthor' has already been used,
|
|
% (hyperref) ...
|
|
% Daher steht es hier vor \begin{document}
|
|
|
|
\title[yano]{pip install yano}
|
|
\author{Simon Kluettermann}
|
|
\date{\today}
|
|
|
|
|
|
\institute{LS9, TU Dortmund}
|
|
|
|
|
|
% Dadurch wird verhindert, dass die Navigationsleiste angezeigt wird.
|
|
\setbeamertemplate{navigation symbols}{}
|
|
|
|
% zusaetzlich ist das usepackage{beamerthemeshadow} eingebunden
|
|
\usepackage{beamerthemeshadow}
|
|
|
|
\hypersetup{pdfstartview={Fit}} % fits the presentation to the window when first displayed
|
|
|
|
\usepackage{appendixnumberbeamer}
|
|
\usepackage{listings}
|
|
|
|
|
|
\usetheme{CambridgeUS}
|
|
\usepackage{ngerman}
|
|
\usecolortheme{dolphin}
|
|
|
|
|
|
% \beamersetuncovermixins{\opaqueness<1>{25}}{\opaqueness<2$\Rightarrow${15}}
|
|
% sorgt dafuer das die Elemente die erst noch (zukuenftig) kommen
|
|
% nur schwach angedeutet erscheinen
|
|
%\beamersetuncovermixins{\opaqueness<1>{25}}{\opaqueness<2$\Rightarrow${15}}%here disabled
|
|
% klappt auch bei Tabellen, wenn teTeX verwendet wird\ldots
|
|
\renewcommand{\figurename}{}
|
|
|
|
% Custom footline made of three colour boxes placed side by side in one \hbox:
%   left   (0.4    \paperwidth): short title
%   middle (0.25   \paperwidth): name of the current section
%   right  (0.3499 \paperwidth): short date and "frame / total frames"
% Nothing but the boxes may appear inside the \hbox, otherwise it is typeset
% into the footline as well.
\setbeamertemplate{footline}
{
  \leavevmode%
  \hbox{%
  % Left box: short title, drawn with the "author in head/foot" colours.
  \begin{beamercolorbox}[wd=.4\paperwidth,ht=2.25ex,dp=1ex,center]{author in head/foot}%
    \usebeamerfont{author in head/foot}\insertshorttitle
  \end{beamercolorbox}%
  % Middle box: current section, drawn with the "title in head/foot" colours.
  \begin{beamercolorbox}[wd=.25\paperwidth,ht=2.25ex,dp=1ex,center]{title in head/foot}%
    \usebeamerfont{title in head/foot}\insertsection
  \end{beamercolorbox}%
  % Right box: date followed by the frame counter.
  % NOTE(review): the counter links to a target named `toc`, but no such
  % target is defined in this file -- confirm it exists or the link is dead.
  \begin{beamercolorbox}[wd=.3499\paperwidth,ht=2.25ex,dp=1ex,right]{date in head/foot}%
    \usebeamerfont{date in head/foot}\insertshortdate{}\hspace*{2em}
    \hyperlink{toc}{\insertframenumber{} / \inserttotalframenumber\hspace*{2ex}}
  \end{beamercolorbox}}%
  \vskip0pt%
}
|
|
|
|
\usepackage[absolute,overlay]{textpos}
|
|
\usepackage{graphicx}
|
|
|
|
% \source{<text>}: prints "Source: <text>" inside a 9cm wide textpos block at
% the absolute page position (0.1cm,8.9cm).
% The line ends inside the definition are protected with % so that no
% spurious spaces end up in the box.
% NOTE(review): the beamer colour/font `framesource` is not set anywhere in
% this file -- presumably it needs \setbeamercolor / \setbeamerfont before the
% first use; confirm.
% NOTE(review): with aspectratio=169 the vertical offset of 8.9cm may lie very
% close to the lower page edge -- check the placement before using the macro.
\newcommand{\source}[1]{%
  \begin{textblock*}{9cm}(0.1cm,8.9cm)%
    \begin{beamercolorbox}[ht=0.5cm,left]{framesource}%
      \usebeamerfont{framesource}\usebeamercolor[fg!66]{framesource} Source: {#1}%
    \end{beamercolorbox}%
  \end{textblock*}%
}
|
|
|
|
|
|
\begin{document}
|
|
|
|
|
|
|
|
%from file ../yano//data/000.txt
|
|
\begin{frame}[label=]
|
|
\frametitle{}
|
|
\begin{titlepage}
|
|
|
|
\centering
|
|
{\huge\bfseries \par}
|
|
\vspace{2cm}
|
|
{\LARGE\itshape Simon Kluettermann\par}
|
|
\vspace{1.5cm}
|
|
{\scshape\Large Master Thesis in Physics\par}
|
|
\vspace{0.2cm}
|
|
{\Large submitted to the \par}
|
|
\vspace{0.2cm}
|
|
{\scshape\Large Faculty of Mathematics Computer Science and Natural Sciences \par}
|
|
\vspace{0.2cm}
|
|
{\Large \par}
|
|
\vspace{0.2cm}
|
|
{\scshape\Large RWTH Aachen University}
|
|
\vspace{1cm}
|
|
|
|
\vfill
|
|
{\scshape\Large Department of Physics\par}
|
|
\vspace{0.2cm}
|
|
{\scshape\Large Institute for Theoretical Particle Physics and Cosmology\par}
|
|
\vspace{0.2cm}
|
|
{ \Large\par}
|
|
\vspace{0.2cm}
|
|
{\Large First Referee: Prof. Dr. Michael Kraemer \par}
|
|
{\Large Second Referee: Prof. Dr. Felix Kahlhoefer}
|
|
|
|
\vfill
|
|
|
|
% Bottom of the page
|
|
{\large November 2020 \par}
|
|
\end{titlepage}
|
|
\pagenumbering{roman}
|
|
\thispagestyle{empty}
|
|
\null
|
|
\newpage
|
|
\setcounter{page}{1}
|
|
\pagenumbering{arabic}
|
|
\end{frame}
|
|
|
|
|
|
|
|
%from file ../yano//data/001Problem.txt
|
|
\begin{frame}[label=Problem]
|
|
\frametitle{Problem}
|
|
\begin{itemize}
|
|
|
|
\item Paper with Benedikt
|
|
|
|
\item require multiple very specific datasets
|
|
|
|
\begin{itemize}
|
|
|
|
\item many but not too many features
|
|
|
|
\item at least some samples (for the NN)
|
|
|
|
\item Only numerical attributes best
|
|
|
|
\item specific quality
|
|
|
|
\item unrelated datasets
|
|
|
|
|
|
\end{itemize}
|
|
\item Requires you to search for many datasets and filter them
|
|
|
|
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
|
|
%from file ../yano//data/002Students.txt
|
|
\begin{frame}[label=Students]
|
|
\frametitle{Students}
|
|
\begin{itemize}
|
|
|
|
\item Not clear what you can use
|
|
|
|
\item Many different formats
|
|
|
|
\item train/test splits
|
|
|
|
\item So for Students I just do this work and send them archives directly
|
|
|
|
\item $\Rightarrow$Not a good solution
|
|
|
|
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
|
|
%from file ../yano//data/003yano.txt
|
|
\begin{frame}[label=yano]
|
|
\frametitle{yano}
|
|
\begin{itemize}
|
|
|
|
\item So I have been packaging all my scripts
|
|
|
|
\item I had surprisingly much fun doing this
|
|
|
|
\begin{itemize}
|
|
|
|
\item More than just standard functions
|
|
|
|
\item A couple of weird decisions
|
|
|
|
\item And this will likely grow further
|
|
|
|
|
|
\end{itemize}
|
|
\item $\Rightarrow$So I would like to discuss some parts with you and maybe you even have more features you might want
|
|
|
|
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
|
|
%from file ../yano//data/004yano.txt
|
|
\begin{frame}[label=yano]
|
|
\frametitle{yano}
|
|
\begin{columns}[c] % align columns
|
|
\begin{column}{0.48\textwidth}%.48
|
|
\begin{itemize}
|
|
|
|
\item Simply install it over pip
|
|
|
|
\item Contains 187 real-World Datasets
|
|
|
|
\item $\Rightarrow$biggest library of datasets explicitly for anomaly detection
|
|
|
|
\item not yet happy with this
|
|
|
|
\item especially only mostly contains numerical and nominal attributes
|
|
|
|
\item $\Rightarrow$few categorical and no time-series attributes
|
|
|
|
|
|
\end{itemize}
|
|
\end{column}%
|
|
\hfill%
|
|
\begin{column}{0.48\textwidth}%.48
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=0.9\textwidth]{../prep/04yano/a.png}
|
|
\label{fig:prep04yanoapng}
|
|
\end{figure}
|
|
|
|
|
|
\end{column}%
|
|
\hfill%
|
|
\end{columns}
|
|
|
|
\end{frame}
|
|
|
|
|
|
%from file ../yano//data/005selector.txt
|
|
\newpage
|
|
\section{Basics}\label{sec:Basics}
|
|
%{{{for_Basics}}}
|
|
|
|
% Slide: building a dataset selector from yano symbols.
% The condition spans several lines, so it is wrapped in one pair of
% parentheses; without them the listing is not valid Python.
\begin{frame}[label=selector,containsverbatim]
\frametitle{selector}
\begin{lstlisting}[language=Python]
import yano
from yano.symbols import *
condition=((number_of_features>5) &
           (number_of_features<100) &
           (number_of_samples>100) &
           (number_of_samples<10000) &
           (number_of_samples>2*number_of_features) &
           ~index)
print(len(condition), "Datasets found")
\end{lstlisting}

$\Rightarrow$33 Datasets found
\end{frame}
|
|
|
|
|
|
|
|
%from file ../yano//data/006selectors.txt
|
|
\begin{frame}[label=selectors]
|
|
\frametitle{selectors}
|
|
\begin{itemize}
|
|
|
|
\item Lots of symbols like this
|
|
|
|
\begin{itemize}
|
|
|
|
\item name
|
|
|
|
\item number\_of\_features
|
|
|
|
\item number\_of\_samples
|
|
|
|
\item index (correlated datasets)
|
|
|
|
|
|
\end{itemize}
|
|
\item Feature types
|
|
|
|
\begin{itemize}
|
|
|
|
\item numeric
|
|
|
|
\item nominal
|
|
|
|
\item categorical
|
|
|
|
\item (textual)
|
|
|
|
|
|
\end{itemize}
|
|
\item Count based
|
|
|
|
\begin{itemize}
|
|
|
|
\item number\_anomalies
|
|
|
|
\item number\_normals
|
|
|
|
\item fraction\_anomalies
|
|
|
|
|
|
\end{itemize}
|
|
\item Specific ones
|
|
|
|
\begin{itemize}
|
|
|
|
\item image\_based
|
|
|
|
\item (linearly\_seperable)
|
|
|
|
|
|
\end{itemize}
|
|
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
|
|
%from file ../yano//data/007iterating.txt
|
|
% Slide: iterating over the selected datasets.
% The loop prints the loop variable `dataset` (one entry per dataset, as in
% the list below), not the whole `condition` object.
\begin{frame}[label=iterating,containsverbatim]
\frametitle{iterating}
\begin{lstlisting}[language=Python]
for dataset in condition:
    print(dataset)
\end{lstlisting}

\begin{itemize}

\item \[annthyroid\]

\item \[breastw\]

\item \[cardio\]

\item \[...\]

\item \[Housing\_low\]

\end{itemize}
\end{frame}
|
|
|
|
|
|
%from file ../yano//data/008iterating.txt
|
|
% Slide: reading the data of every selected dataset.
% The loop body is indented so that the listing is valid Python.
\begin{frame}[label=iterating,containsverbatim]
\frametitle{iterating}
\begin{lstlisting}[language=Python]
for dataset in condition:
    x=dataset.getx()
    y=dataset.gety()
\end{lstlisting}

\end{frame}
|
|
|
|
|
|
%from file ../yano//data/009pipeline.txt
|
|
% Slide: the pipeline iterator with its modifiers.
% Continuation lines are aligned under the first argument and the loop body
% is indented so that the listing is valid Python.
\begin{frame}[label=pipeline,containsverbatim]
\frametitle{pipeline}
\begin{lstlisting}[language=Python]
from yano.iter import *
for dataset, x,tx,ty in pipeline(condition,
                                 split,
                                 shuffle,
                                 normalize("minmax")):
    ...
\end{lstlisting}

\end{frame}
|
|
|
|
|
|
%from file ../yano//data/010pipeline.txt
|
|
\begin{frame}[label=pipeline]
|
|
\frametitle{pipeline}
|
|
\begin{itemize}
|
|
|
|
\item Again there are a couple modifiers possible
|
|
|
|
\begin{itemize}
|
|
|
|
\item nonconst$\Rightarrow$remove constant features
|
|
|
|
\item shuffle
|
|
|
|
\item normalize('zscore'/'minmax')
|
|
|
|
\item cut(10)$\Rightarrow$at most 10 datasets
|
|
|
|
\item split$\Rightarrow$train test split, all anomalies in test set
|
|
|
|
\item crossval(5)$\Rightarrow$similar to split, but do multiple times (crossvalidation)
|
|
|
|
|
|
\end{itemize}
|
|
\item modifiers interact with each other
|
|
|
|
\item For example: normalize('minmax'), split
|
|
|
|
\item $\Rightarrow$train set always below 1, but no guarantees for the test set
|
|
|
|
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
|
|
%from file ../yano//data/011CrossValidation.txt
|
|
\begin{frame}[label=CrossValidation]
|
|
\frametitle{CrossValidation}
|
|
\begin{itemize}
|
|
|
|
\item Learned from DMC: Crossvalidation is important
|
|
|
|
\item Rarely found in Anomaly Detection, why?
|
|
|
|
\item A bit more complicated (not all samples are equal), but no reason why not
|
|
|
|
\item $\Rightarrow$So I implemented it into yano
|
|
|
|
\begin{itemize}
|
|
|
|
\item folding only on normal data
|
|
|
|
\item How to handle anomalies?
|
|
|
|
\item If not folding them, cross-validation less useful
|
|
|
|
\item if folding them, often rare anomalies even more rare
|
|
|
|
\item $\Rightarrow$test set always 50\% anomalous
|
|
|
|
\item $\Rightarrow$Also improves simple evaluation metrics (accuracy)
|
|
|
|
|
|
\end{itemize}
|
|
\item Do you know a reason why Cross Validation is not common in AD?
|
|
|
|
\item Are there Problems with the way I fold my Anomalies?
|
|
|
|
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
|
|
%from file ../yano//data/012Logging.txt
|
|
% Slide: running models through the Logger with cross-validation.
% Only `l.run_cross(...)` belongs to the loop body; `l.to_latex()` is called
% once after the loop. The indentation makes this explicit.
\begin{frame}[label=Logging,containsverbatim]
\frametitle{Logging}
\begin{lstlisting}[language=Python]
from yano.logging import Logger
from pyod.models.iforest import IForest
from extended_iforest import train_extended_ifor
l=Logger({"IFor":IForest(n_estimators=100),
          "eIFor":train_extended_ifor})
for dataset, folds in pipeline(condition,
                               crossval(5),
                               normalize("minmax"),
                               shuffle):
    l.run_cross(dataset, folds)
latex=l.to_latex()
\end{lstlisting}

\end{frame}
|
|
|
|
|
|
%from file ../yano//data/013Seeding.txt
|
|
\begin{frame}[label=Seeding]
|
|
\frametitle{Seeding}
|
|
\begin{itemize}
|
|
|
|
\item If you don't do anything, everything is seeded.
|
|
|
|
\item Makes rerunning a Model until the performance is good quite obvious.
|
|
|
|
\item But as every Run is seeded itself, this might induce bias.
|
|
|
|
\item Do you think this is worth it?
|
|
|
|
\item Are there any Problems with this?
|
|
|
|
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
|
|
%from file ../yano//data/014.txt
|
|
\begin{frame}[label=]
|
|
\frametitle{}
|
|
\begin{tabular}{lll}
|
|
\hline
|
|
Dataset & eIFor & IFor \\
|
|
\hline
|
|
$pc3$ & $\textbf{0.7231} \pm 0.0153$ & $\textbf{0.7223} \pm 0.0178$ \\
|
|
$pima$ & $\textbf{0.7405} \pm 0.0110$ & $\textbf{0.7347} \pm 0.0126$ \\
|
|
$Diabetes\_present$ & $\textbf{0.7414} \pm 0.0195$ & $\textbf{0.7344} \pm 0.0242$ \\
|
|
$waveform-5000$ & $\textbf{0.7687} \pm 0.0123$ & $\textbf{0.7592} \pm 0.0206$ \\
|
|
$vowels$ & $\textbf{0.7843} \pm 0.0298$ & $\textbf{0.7753} \pm 0.0334$ \\
|
|
$Vowel\_0$ & $\textbf{0.8425} \pm 0.0698$ & $0.7193 \pm 0.0817$ \\
|
|
$Abalone\_1\_8$ & $\textbf{0.8525} \pm 0.0263$ & $0.8452 \pm 0.0257$ \\
|
|
$annthyroid$ & $0.8399 \pm 0.0135$ & $\textbf{0.9087} \pm 0.0090$ \\
|
|
$Vehicle\_van$ & $\textbf{0.8792} \pm 0.0265$ & $\textbf{0.8697} \pm 0.0383$ \\
|
|
$ionosphere$ & $\textbf{0.9320} \pm 0.0069$ & $0.9086 \pm 0.0142$ \\
|
|
$breastw$ & $\textbf{0.9948} \pm 0.0031$ & $\textbf{0.9952} \pm 0.0033$ \\
|
|
$segment$ & $\textbf{1.0}$ & $\textbf{0.9993} \pm 0.0015$ \\
|
|
$$ & $$ & $$ \\
|
|
$Average$ & $\textbf{0.8005}$ & $\textbf{0.7957}$ \\
|
|
\hline
|
|
\end{tabular}
|
|
\end{frame}
|
|
|
|
|
|
%from file ../yano//data/015statistics.txt
|
|
\begin{frame}[label=statistics]
|
|
\frametitle{statistics}
|
|
\begin{itemize}
|
|
|
|
\item Friedman test to see if there is a difference between models
|
|
|
|
\item Nemenyi test to see which models are equal, mark those equal to the maximum
|
|
|
|
\item For 2 models, Friedman not defined $\Rightarrow$ use Wilcoxon test
|
|
|
|
\item Does this match your expectation from the table?
|
|
|
|
\item Two models are `equal' if their probability of being from the same distribution is $p_{b} \leq p$. What value should the threshold $p_{b}$ have (here $p_{b} = 0.1$)?
|
|
|
|
\item Do I need to correct for p-hacking ($n$ experiments, so increase the difficulty for each), or is that clear from the table?
|
|
|
|
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
|
|
%from file ../yano//data/016Extended Isolation Forests.txt
|
|
\newpage
|
|
\section{Experiments 1}\label{sec:Experiments 1}
|
|
%{{{for_Experiments 1}}}
|
|
|
|
\begin{frame}[label=Extended Isolation Forests]
|
|
\frametitle{Extended Isolation Forests}
|
|
\begin{columns}[c] % align columns
|
|
\begin{column}{0.48\textwidth}%.48
|
|
\begin{itemize}
|
|
|
|
\item Isolation Forests are one algorithm for AD
|
|
|
|
\item Tries to isolate abnormal (rare) points instead of modelling normal ones
|
|
|
|
\item Creative approach$\Rightarrow$fairly successful (3000 Citations)
|
|
|
|
\item Many follow up papers
|
|
|
|
\item Extended Isolation Forest (Hariri et~al.\ 2018, 140 Citations)
|
|
|
|
\item Remove bias from the Isolation Forests
|
|
|
|
\item Also claim to improve their anomaly detection quality
|
|
|
|
|
|
\end{itemize}
|
|
\end{column}%
|
|
\hfill%
|
|
|
|
\begin{column}{0.48\textwidth}%.48
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=0.9\textwidth]{../imgs/ifor}
|
|
\label{fig:ifor}
|
|
\end{figure}
|
|
|
|
|
|
\end{column}%
|
|
\hfill%
|
|
\end{columns}
|
|
|
|
\end{frame}
|
|
|
|
|
|
\begin{frame}[label=Extended Isolation Forests]
|
|
\frametitle{Extended Isolation Forests}
|
|
\begin{columns}[c] % align columns
|
|
\begin{column}{0.48\textwidth}%.48
|
|
\begin{itemize}
|
|
|
|
\item Isolation Forests are one algorithm for AD
|
|
|
|
\item Tries to isolate abnormal (rare) points instead of modelling normal ones
|
|
|
|
\item Creative approach$\Rightarrow$fairly successful (3000 Citations)
|
|
|
|
\item Many follow up papers
|
|
|
|
\item Extended Isolation Forest (Hariri et~al.\ 2018, 140 Citations)
|
|
|
|
\item Remove bias from the Isolation Forests
|
|
|
|
\item Also claim to improve their anomaly detection quality
|
|
|
|
|
|
\end{itemize}
|
|
\end{column}%
|
|
\hfill%
|
|
|
|
\begin{column}{0.48\textwidth}%.48
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=0.9\textwidth]{../imgs/eifor}
|
|
\label{fig:eifor}
|
|
\end{figure}
|
|
|
|
|
|
\end{column}%
|
|
\hfill%
|
|
\end{columns}
|
|
|
|
\end{frame}
|
|
|
|
|
|
\begin{frame}[label=Extended Isolation Forests]
|
|
\frametitle{Extended Isolation Forests}
|
|
\begin{columns}[c] % align columns
|
|
\begin{column}{0.48\textwidth}%.48
|
|
\begin{itemize}
|
|
|
|
\item Isolation Forests are one algorithm for AD
|
|
|
|
\item Tries to isolate abnormal (rare) points instead of modelling normal ones
|
|
|
|
\item Creative approach$\Rightarrow$fairly successful (3000 Citations)
|
|
|
|
\item Many follow up papers
|
|
|
|
\item Extended Isolation Forest (Hariri et~al.\ 2018, 140 Citations)
|
|
|
|
\item Remove bias from the Isolation Forests
|
|
|
|
\item Also claim to improve their anomaly detection quality
|
|
|
|
|
|
\end{itemize}
|
|
\end{column}%
|
|
\hfill%
|
|
|
|
\begin{column}{0.48\textwidth}%.48
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=0.9\textwidth]{../imgs/qual}
|
|
\label{fig:qual}
|
|
\end{figure}
|
|
|
|
|
|
\end{column}%
|
|
\hfill%
|
|
\end{columns}
|
|
|
|
\end{frame}
|
|
|
|
|
|
|
|
%from file ../yano//data/017.txt
|
|
\begin{frame}[label=]
|
|
\frametitle{}
|
|
\begin{tabular}{lll}
|
|
\hline
|
|
Dataset & eIFor & IFor \\
|
|
\hline
|
|
$Delft\_pump\_5x3\_noisy$ & $\textbf{0.3893} \pm 0.0345$ & $\textbf{0.4272} \pm 0.0680$ \\
|
|
$vertebral$ & $\textbf{0.4260} \pm 0.0111$ & $\textbf{0.4554} \pm 0.0416$ \\
|
|
$Liver\_1$ & $0.5367 \pm 0.0508$ & $\textbf{0.5474} \pm 0.0541$ \\
|
|
$Sonar\_mines$ & $\textbf{0.6882} \pm 0.1264$ & $0.6189 \pm 0.1301$ \\
|
|
$letter$ & $\textbf{0.6756} \pm 0.0119$ & $0.6471 \pm 0.0111$ \\
|
|
$Glass\_building\_float$ & $\textbf{0.6480} \pm 0.1012$ & $\textbf{0.6755} \pm 0.1117$ \\
|
|
$pc3$ & $\textbf{0.7231} \pm 0.0153$ & $\textbf{0.7223} \pm 0.0178$ \\
|
|
$pima$ & $\textbf{0.7405} \pm 0.0110$ & $\textbf{0.7347} \pm 0.0126$ \\
|
|
$Diabetes\_present$ & $\textbf{0.7414} \pm 0.0195$ & $\textbf{0.7344} \pm 0.0242$ \\
|
|
$waveform-5000$ & $\textbf{0.7687} \pm 0.0123$ & $\textbf{0.7592} \pm 0.0206$ \\
|
|
$steel-plates-fault$ & $\textbf{0.7735} \pm 0.0351$ & $\textbf{0.7682} \pm 0.0402$ \\
|
|
$vowels$ & $\textbf{0.7843} \pm 0.0298$ & $\textbf{0.7753} \pm 0.0334$ \\
|
|
\hline
|
|
\end{tabular}
|
|
\end{frame}
|
|
|
|
|
|
%from file ../yano//data/018.txt
|
|
\begin{frame}[label=]
|
|
\frametitle{}
|
|
\begin{tabular}{lll}
|
|
\hline
|
|
Dataset & eIFor & IFor \\
|
|
\hline
|
|
$Vowel\_0$ & $\textbf{0.8425} \pm 0.0698$ & $0.7193 \pm 0.0817$ \\
|
|
$Housing\_low$ & $\textbf{0.7807} \pm 0.0333$ & $\textbf{0.7862} \pm 0.0336$ \\
|
|
$ozone-level-8hr$ & $\textbf{0.7904} \pm 0.0207$ & $\textbf{0.7768} \pm 0.0118$ \\
|
|
$Spectf\_0$ & $\textbf{0.8155} \pm 0.0255$ & $0.7535 \pm 0.0239$ \\
|
|
$HeartC$ & $0.7795 \pm 0.0258$ & $\textbf{0.8079} \pm 0.0255$ \\
|
|
$satellite$ & $\textbf{0.8125} \pm 0.0170$ & $\textbf{0.8103} \pm 0.0061$ \\
|
|
$optdigits$ & $\textbf{0.8099} \pm 0.0310$ & $\textbf{0.8142} \pm 0.0267$ \\
|
|
$spambase$ & $\textbf{0.8085} \pm 0.0110$ & $\textbf{0.8202} \pm 0.0042$ \\
|
|
$Abalone\_1\_8$ & $\textbf{0.8525} \pm 0.0263$ & $0.8452 \pm 0.0257$ \\
|
|
$qsar-biodeg$ & $\textbf{0.8584} \pm 0.0119$ & $\textbf{0.8628} \pm 0.0135$ \\
|
|
$annthyroid$ & $0.8399 \pm 0.0135$ & $\textbf{0.9087} \pm 0.0090$ \\
|
|
$Vehicle\_van$ & $\textbf{0.8792} \pm 0.0265$ & $\textbf{0.8697} \pm 0.0383$ \\
|
|
\hline
|
|
\end{tabular}
|
|
\end{frame}
|
|
|
|
|
|
%from file ../yano//data/019.txt
|
|
\begin{frame}[label=]
|
|
\frametitle{}
|
|
\begin{tabular}{lll}
|
|
\hline
|
|
Dataset & eIFor & IFor \\
|
|
\hline
|
|
$ionosphere$ & $\textbf{0.9320} \pm 0.0069$ & $0.9086 \pm 0.0142$ \\
|
|
$page-blocks$ & $0.9189 \pm 0.0061$ & $\textbf{0.9299} \pm 0.0016$ \\
|
|
$Ecoli$ & $\textbf{0.9418} \pm 0.0292$ & $0.9192 \pm 0.0332$ \\
|
|
$cardio$ & $\textbf{0.9564} \pm 0.0043$ & $\textbf{0.9535} \pm 0.0036$ \\
|
|
$wbc$ & $\textbf{0.9611} \pm 0.0121$ & $\textbf{0.9607} \pm 0.0107$ \\
|
|
$pendigits$ & $\textbf{0.9641} \pm 0.0097$ & $\textbf{0.9652} \pm 0.0076$ \\
|
|
$thyroid$ & $0.9818 \pm 0.0024$ & $\textbf{0.9871} \pm 0.0025$ \\
|
|
$breastw$ & $\textbf{0.9948} \pm 0.0031$ & $\textbf{0.9952} \pm 0.0033$ \\
|
|
$segment$ & $\textbf{1.0}$ & $\textbf{0.9993} \pm 0.0015$ \\
|
|
$$ & $$ & $$ \\
|
|
$Average$ & $\textbf{0.8005} \pm 0.1458$ & $\textbf{0.7957} \pm 0.1431$ \\
|
|
\hline
|
|
\end{tabular}
|
|
\end{frame}
|
|
|
|
|
|
%from file ../yano//data/020highdim.txt
|
|
\newpage
|
|
\section{Experiments 2}\label{sec:Experiments 2}
|
|
%{{{for_Experiments 2}}}
|
|
|
|
\begin{frame}[label=highdim]
|
|
\frametitle{highdim}
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=0.9\textwidth]{../prep/19highdim/a.png}
|
|
\label{fig:prep19highdimapng}
|
|
\end{figure}
|
|
|
|
|
|
\end{frame}
|
|
|
|
|
|
|
|
%from file ../yano//data/021New Condition.txt
|
|
% Slide: selector for the high-dimensional experiment.
% The condition spans several lines, so it is wrapped in one pair of
% parentheses; without them the listing is not valid Python.
\begin{frame}[label=New Condition,containsverbatim]
\frametitle{New Condition}
\begin{lstlisting}[language=Python]
condition=((number_of_samples>200) &
           (number_of_samples<10000) &
           (number_of_features>50) &
           (number_of_features<500) &
           ~index)
print(len(condition),"Datasets found")
\end{lstlisting}

$\Rightarrow$13 Datasets found
\end{frame}
|
|
|
|
|
|
%from file ../yano//data/022New Models.txt
|
|
% Slide: the three models compared in the high-dimensional experiment.
% The class is used under the name it is imported as (`IForest`).
\begin{frame}[label=New Models,containsverbatim]
\frametitle{New Models}
\begin{lstlisting}[language=Python]
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
l=Logger({"IFor":IForest(n_estimators=100),
          "Lof":LOF(),
          "Knn": KNN()}, addfeat=True)
\end{lstlisting}

\end{frame}
|
|
|
|
|
|
%from file ../yano//data/023.txt
|
|
\begin{frame}[label=]
|
|
\frametitle{}
|
|
\begin{tabular}{llll}
|
|
\hline
|
|
Dataset & Knn & Lof & IFor \\
|
|
\hline
|
|
$Delft\_pump\_5x3\_noisy(64)$ & $0.3800 \pm 0.0475$ & $0.3462 \pm 0.0327$ & $\textbf{0.4272} \pm 0.0680$ \\
|
|
$hill-valley(100)$ & $0.4744 \pm 0.0269$ & $\textbf{0.5060} \pm 0.0327$ & $0.4720 \pm 0.0288$ \\
|
|
$speech(400)$ & $0.4903 \pm 0.0103$ & $\textbf{0.5104} \pm 0.0115$ & $0.4872 \pm 0.0184$ \\
|
|
$Sonar\_mines(60)$ & $\textbf{0.7284} \pm 0.0939$ & $0.6769 \pm 0.0933$ & $0.6189 \pm 0.1301$ \\
|
|
$ozone-level-8hr(72)$ & $\textbf{0.8051} \pm 0.0288$ & $0.7738 \pm 0.0292$ & $\textbf{0.7768} \pm 0.0118$ \\
|
|
$spambase(57)$ & $0.8038 \pm 0.0125$ & $0.7712 \pm 0.0055$ & $\textbf{0.8202} \pm 0.0042$ \\
|
|
$arrhythmia(274)$ & $\textbf{0.8137} \pm 0.0185$ & $0.8042 \pm 0.0186$ & $\textbf{0.8086} \pm 0.0099$ \\
|
|
$mnist(100)$ & $0.9345 \pm 0.0039$ & $\textbf{0.9548} \pm 0.0037$ & $0.8732 \pm 0.0069$ \\
|
|
$Concordia3\_32(256)$ & $0.9246 \pm 0.0107$ & $\textbf{0.9486} \pm 0.0099$ & $\textbf{0.9322} \pm 0.0178$ \\
|
|
$optdigits(64)$ & $0.9966 \pm 0.0012$ & $\textbf{0.9975} \pm 0.0012$ & $0.8142 \pm 0.0267$ \\
|
|
$gas-drift(128)$ & $\textbf{0.9790} \pm 0.0018$ & $0.9585 \pm 0.0055$ & $0.8764 \pm 0.0166$ \\
|
|
$Delft\_pump\_AR(160)$ & $\textbf{0.9965}$ & $\textbf{0.9953} \pm 0.0019$ & $0.9665 \pm 0.0096$ \\
|
|
$musk(166)$ & $\textbf{1.0}$ & $\textbf{1.0}$ & $0.9808 \pm 0.0117$ \\
|
|
$$ & $$ & $$ & $$ \\
|
|
$Average$ & $\textbf{0.7944}$ & $\textbf{0.7879}$ & $0.7580$ \\
|
|
\hline
|
|
\end{tabular}
|
|
\end{frame}
|
|
|
|
|
|
%from file ../yano//data/024.txt
|
|
\begin{frame}[label=,containsverbatim]
|
|
\frametitle{}
|
|
\begin{itemize}
|
|
|
|
\item Hypothesis: Isolation Forests are better when there are numerical and nominal attributes
|
|
|
|
\item Easy to test
|
|
|
|
|
|
\end{itemize}
|
|
\begin{lstlisting}[language=Python]
|
|
condition=condition & (numeric & nominal)
|
|
\end{lstlisting}
|
|
|
|
\end{frame}
|
|
|
|
|
|
%from file ../yano//data/025.txt
|
|
\begin{frame}[label=]
|
|
\frametitle{}
|
|
\begin{tabular}{llll}
|
|
\hline
|
|
Dataset & Knn & IFor & Lof \\
|
|
\hline
|
|
$ozone-level-8hr(72)$ & $\textbf{0.8051} \pm 0.0288$ & $\textbf{0.7768} \pm 0.0118$ & $0.7738 \pm 0.0292$ \\
|
|
$spambase(57)$ & $0.8038 \pm 0.0125$ & $\textbf{0.8202} \pm 0.0042$ & $0.7712 \pm 0.0055$ \\
|
|
$arrhythmia(274)$ & $\textbf{0.8137} \pm 0.0185$ & $\textbf{0.8086} \pm 0.0099$ & $0.8042 \pm 0.0186$ \\
|
|
$musk(166)$ & $\textbf{1.0}$ & $0.9808 \pm 0.0117$ & $\textbf{1.0}$ \\
|
|
$$ & $$ & $$ & $$ \\
|
|
$Average$ & $\textbf{0.8556}$ & $\textbf{0.8466}$ & $\textbf{0.8373}$ \\
|
|
\hline
|
|
\end{tabular}
|
|
\begin{itemize}
|
|
|
|
\item Only 4 datasets, so not clear at all
|
|
|
|
\item $\Rightarrow$More datasets
|
|
|
|
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
|
|
%from file ../yano//data/026Unsupervised Optimization.txt
|
|
\newpage
|
|
\section{Experiments 3}\label{sec:Experiments 3}
|
|
%{{{for_Experiments 3}}}
|
|
|
|
\begin{frame}[label=Unsupervised Optimization]
|
|
\frametitle{Unsupervised Optimization}
|
|
\begin{itemize}
|
|
|
|
\item There are analyses that are only possible with many datasets
|
|
|
|
\item Here: unsupervised optimization
|
|
|
|
\item Given multiple AD models, find which is best:
|
|
|
|
\item Use AUC score? Requires Anomalies$\Rightarrow$Overfitting
|
|
|
|
\item Can you find an unsupervised Method?
|
|
|
|
\item In general very complicated, so here only focus on very small differences in the model.
|
|
|
|
\item So each model is an autoencoder, trained on the same dataset, where the difference is only in the initialisation
|
|
|
|
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
|
|
|
|
%from file ../yano//data/027Loss Optimization.txt
|
|
\begin{frame}[label=Loss Optimization]
|
|
\frametitle{Loss Optimization}
|
|
\begin{columns}[c] % align columns
|
|
\begin{column}{0.48\textwidth}%.48
|
|
\begin{itemize}
|
|
|
|
\item First guess Loss of the Model on the training Data
|
|
|
|
\item How to evaluate this?
|
|
|
|
\item Train many models, look at the average AUC score.
|
|
|
|
\item For the alternative, take groups of 20 models, and look at the AUC score of the best model.
|
|
|
|
\item Is there a meaningful difference between results? Give result as z\_score ($\frac{m_{1} - m_{2}}{\sqrt{s_{1}^{2} + s_{2}^{2}}}$)
|
|
|
|
\item This difference depends a lot on the dataset
|
|
|
|
\item $\Rightarrow$even $30 \leq z$ does not mean much
|
|
|
|
|
|
\end{itemize}
|
|
\end{column}%
|
|
\hfill%
|
|
\begin{column}{0.48\textwidth}%.48
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=0.9\textwidth]{../imgs/histone_page-blocks}
|
|
\label{fig:histone_page-blocks}
|
|
\end{figure}
|
|
|
|
|
|
\end{column}%
|
|
\hfill%
|
|
\end{columns}
|
|
|
|
\end{frame}
|
|
|
|
|
|
\begin{frame}[label=Loss Optimization]
|
|
\frametitle{Loss Optimization}
|
|
\begin{columns}[c] % align columns
|
|
\begin{column}{0.48\textwidth}%.48
|
|
\begin{itemize}
|
|
|
|
\item First guess Loss of the Model on the training Data
|
|
|
|
\item How to evaluate this?
|
|
|
|
\item Train many models, look at the average AUC score.
|
|
|
|
\item For the alternative, take groups of 20 models, and look at the AUC score of the best model.
|
|
|
|
\item Is there a meaningful difference between results? Give result as z\_score ($\frac{m_{1} - m_{2}}{\sqrt{s_{1}^{2} + s_{2}^{2}}}$)
|
|
|
|
\item This difference depends a lot on the dataset
|
|
|
|
\item $\Rightarrow$even $30 \leq z$ does not mean much
|
|
|
|
|
|
\end{itemize}
|
|
\end{column}%
|
|
\hfill%
|
|
\begin{column}{0.48\textwidth}%.48
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=0.9\textwidth]{../imgs/histone_pima}
|
|
\label{fig:histone_pima}
|
|
\end{figure}
|
|
|
|
|
|
\end{column}%
|
|
\hfill%
|
|
\end{columns}
|
|
|
|
\end{frame}
|
|
|
|
|
|
|
|
%from file ../yano//data/028loss.txt
|
|
\begin{frame}[label=loss]
|
|
\frametitle{loss}
|
|
\begin{columns}[c] % align columns
|
|
\begin{column}{0.48\textwidth}%.48
|
|
\begin{itemize}
|
|
|
|
\item Pick the Model with the lowest l2-loss
|
|
|
|
|
|
\end{itemize}
|
|
\end{column}%
|
|
\hfill%
|
|
\begin{column}{0.48\textwidth}%.48
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=0.9\textwidth]{../prep/27loss/z_loss.pdf}
|
|
\label{fig:prep27lossz_losspdf}
|
|
\end{figure}
|
|
|
|
|
|
\end{column}%
|
|
\hfill%
|
|
\end{columns}
|
|
|
|
\end{frame}
|
|
|
|
|
|
%from file ../yano//data/029Robustness.txt
|
|
\begin{frame}[label=Robustness]
|
|
\frametitle{Robustness}
|
|
\begin{columns}[c] % align columns
|
|
\begin{column}{0.48\textwidth}%.48
|
|
\begin{itemize}
|
|
|
|
\item Pick points with 1\% width difference in input space around each point.
|
|
|
|
\item for each point, find the maximum difference in output space.
|
|
|
|
\item average this difference
|
|
|
|
|
|
\end{itemize}
|
|
\end{column}%
|
|
\hfill%
|
|
\begin{column}{0.48\textwidth}%.48
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=0.9\textwidth]{../prep/28Robustness/z_robu.pdf}
|
|
\label{fig:prep28Robustnessz_robupdf}
|
|
\end{figure}
|
|
|
|
|
|
\end{column}%
|
|
\hfill%
|
|
\end{columns}
|
|
|
|
\end{frame}
|
|
|
|
|
|
%from file ../yano//data/030Distance Correlation.txt
|
|
\begin{frame}[label=Distance Correlation]
|
|
\frametitle{Distance Correlation}
|
|
\begin{columns}[c] % align columns
|
|
\begin{column}{0.48\textwidth}%.48
|
|
\begin{itemize}
|
|
|
|
\item Pick random points in the input space.
|
|
|
|
\item measure the distance in input and output space
|
|
|
|
\item a low correlation is a good model
|
|
|
|
|
|
\end{itemize}
|
|
\end{column}%
|
|
\hfill%
|
|
\begin{column}{0.48\textwidth}%.48
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=0.9\textwidth]{../prep/29Distance_Correlation/z_dist.pdf}
|
|
\label{fig:prep29Distance_Correlationz_distpdf}
|
|
\end{figure}
|
|
|
|
|
|
\end{column}%
|
|
\hfill%
|
|
\end{columns}
|
|
|
|
\end{frame}
|
|
|
|
|
|
%from file ../yano//data/031Other.txt
|
|
\newpage
|
|
\section{Conclusion}\label{sec:Conclusion}
|
|
%{{{for_Conclusion}}}
|
|
|
|
\begin{frame}[label=Other]
|
|
\frametitle{Other}
|
|
\begin{itemize}
|
|
|
|
\item Things I still want to add:
|
|
|
|
\begin{itemize}
|
|
|
|
\item Ensemble Methods
|
|
|
|
\item Visualisation options
|
|
|
|
\item Alternative Evaluations
|
|
|
|
\item Hyperparameter optimisation (with crossvalidation)
|
|
|
|
\item Parallelisation
|
|
|
|
\item Contamination
|
|
|
|
\item Documentation
|
|
|
|
|
|
\end{itemize}
|
|
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
|
|
|
|
%from file ../yano//data/032Feedback.txt
|
|
\begin{frame}[label=Feedback]
|
|
\frametitle{Feedback}
|
|
\begin{itemize}
|
|
|
|
\item What do you think about this?
|
|
|
|
\item Is there something I should also add?
|
|
|
|
\item What would you need for you to actually use this?
|
|
|
|
|
|
\end{itemize}
|
|
\end{frame}
|
|
|
|
|
|
%from folder ../yano//data/Forests.txt
|
|
|
|
|
|
%from folder ../yano//data/Isolation
|
|
|
|
|
|
%from folder ../yano//data/Optimization.txt
|
|
|
|
|
|
|
|
\end{document}
|