From 62614ce1016c86e3f00f35b56399292ceabd486b Mon Sep 17 00:00:00 2001
From: Dongjia Zhang <tokazerkje@outlook.com>
Date: Thu, 27 Jan 2022 03:23:04 +0900
Subject: [PATCH] LLVM AutoTokens (#470)

* posix dict2file llvm pass

* new PM

* working

* clean up

* fmt

* fix

* silence clippy

* bring the println back

* early return

* rename

* weak symbols

* linux only

* fuzzbench change

* only linux

* linux only

* cfg

* cfg

* fix

* fix

* fix

* why

* fix

* bug fix

* rename

* rename

* macros & rename

* add_from_autotokens

* fix fuzzbench

* std -> core

* builder pattern?

* clippy

* wrong cfg

* cfgstd

* fuzzbench fmt

* no unsafe

* update fuzzbench_text

* use TokenSection

Co-authored-by: Andrea Fioraldi <andreafioraldi@gmail.com>
---
 fuzzers/fuzzbench/src/bin/libafl_cc.rs      |   4 +
 fuzzers/fuzzbench/src/lib.rs                |  21 +-
 fuzzers/fuzzbench_text/src/bin/libafl_cc.rs |   4 +
 fuzzers/fuzzbench_text/src/lib.rs           |  37 +-
 fuzzers/libfuzzer_libpng_ctx/Makefile       |   6 +-
 libafl/src/mutators/token_mutations.rs      | 100 ++-
 libafl_cc/build.rs                          |  11 +
 libafl_cc/src/autotokens-pass.cc            | 737 ++++++++++++++++++++
 libafl_cc/src/clang.rs                      |   5 +
 libafl_targets/src/common.h                 |   9 +
 libafl_targets/src/coverage.c               |  14 +
 libafl_targets/src/coverage.rs              |  15 +
 12 files changed, 947 insertions(+), 16 deletions(-)
 create mode 100644 libafl_cc/src/autotokens-pass.cc
diff --git a/fuzzers/fuzzbench/src/bin/libafl_cc.rs b/fuzzers/fuzzbench/src/bin/libafl_cc.rs
index 8c9e37f638..68a84b0333 100644
--- a/fuzzers/fuzzbench/src/bin/libafl_cc.rs
+++ b/fuzzers/fuzzbench/src/bin/libafl_cc.rs
@@ -16,6 +16,10 @@ pub fn main() {
         dir.pop();
 
         let mut cc = ClangWrapper::new();
+
+        #[cfg(target_os = "linux")]
+        cc.add_pass(LLVMPasses::AutoTokens);
+
         if let Some(code) = cc
             .cpp(is_cpp)
             // silence the compiler wrapper output, needed for some configure scripts.
diff --git a/fuzzers/fuzzbench/src/lib.rs b/fuzzers/fuzzbench/src/lib.rs
index 7af42e89c1..ad9de01327 100644
--- a/fuzzers/fuzzbench/src/lib.rs
+++ b/fuzzers/fuzzbench/src/lib.rs
@@ -38,7 +38,7 @@ use libafl::{
     monitors::SimpleMonitor,
     mutators::{
         scheduled::havoc_mutations, token_mutations::I2SRandReplace, tokens_mutations,
-        StdMOptMutator, StdScheduledMutator, Tokens,
+        StdMOptMutator, StdScheduledMutator, TokenSection, Tokens,
     },
     observers::{HitcountsMapObserver, StdMapObserver, TimeObserver},
     stages::{
@@ -54,6 +54,9 @@ use libafl_targets::{
     MAX_EDGES_NUM,
 };
 
+#[cfg(target_os = "linux")]
+use libafl_targets::token_section;
+
 /// The fuzzer main (as `no_mangle` C function)
 #[no_mangle]
 pub fn libafl_main() {
@@ -352,9 +355,19 @@ fn fuzz(
     let mut stages = tuple_list!(calibration, tracing, i2s, power);
 
     // Read tokens
-    if let Some(tokenfile) = tokenfile {
-        if state.metadata().get::<Tokens>().is_none() {
-            state.add_metadata(Tokens::from_tokens_file(tokenfile)?);
+    if state.metadata().get::<Tokens>().is_none() {
+        let mut toks = Tokens::default();
+        if let Some(tokenfile) = tokenfile {
+            toks = toks.parse_tokens_file(vec![tokenfile])?;
+        }
+        #[cfg(target_os = "linux")]
+        {
+            let token_section = TokenSection::new(token_section());
+            toks = toks.parse_autotokens(token_section)?;
+        }
+
+        if !toks.tokens().is_empty() {
+            state.add_metadata(toks);
         }
     }
 
diff --git a/fuzzers/fuzzbench_text/src/bin/libafl_cc.rs b/fuzzers/fuzzbench_text/src/bin/libafl_cc.rs
index 8c9e37f638..68a84b0333 100644
--- a/fuzzers/fuzzbench_text/src/bin/libafl_cc.rs
+++ b/fuzzers/fuzzbench_text/src/bin/libafl_cc.rs
@@ -16,6 +16,10 @@ pub fn main() {
         dir.pop();
 
         let mut cc = ClangWrapper::new();
+
+        #[cfg(target_os = "linux")]
+        cc.add_pass(LLVMPasses::AutoTokens);
+
         if let Some(code) = cc
             .cpp(is_cpp)
             // silence the compiler wrapper output, needed for some configure scripts.
diff --git a/fuzzers/fuzzbench_text/src/lib.rs b/fuzzers/fuzzbench_text/src/lib.rs
index 99fef86fe4..4be2e33846 100644
--- a/fuzzers/fuzzbench_text/src/lib.rs
+++ b/fuzzers/fuzzbench_text/src/lib.rs
@@ -44,7 +44,7 @@ use libafl::{
         },
         scheduled::havoc_mutations,
         token_mutations::I2SRandReplace,
-        tokens_mutations, StdMOptMutator, StdScheduledMutator, Tokens,
+        tokens_mutations, StdMOptMutator, StdScheduledMutator, TokenSection, Tokens,
     },
     observers::{HitcountsMapObserver, StdMapObserver, TimeObserver},
     stages::{
@@ -60,6 +60,9 @@ use libafl_targets::{
     MAX_EDGES_NUM,
 };
 
+#[cfg(target_os = "linux")]
+use libafl_targets::token_section;
+
 /// The fuzzer main (as `no_mangle` C function)
 #[no_mangle]
 pub fn libafl_main() {
@@ -413,9 +416,19 @@ fn fuzz_binary(
     let mut stages = tuple_list!(calibration, tracing, i2s, power);
 
     // Read tokens
-    if let Some(tokenfile) = tokenfile {
-        if state.metadata().get::<Tokens>().is_none() {
-            state.add_metadata(Tokens::from_tokens_file(tokenfile)?);
+    if state.metadata().get::<Tokens>().is_none() {
+        let mut toks = Tokens::default();
+        if let Some(tokenfile) = tokenfile {
+            toks = toks.parse_tokens_file(vec![tokenfile])?;
+        }
+        #[cfg(target_os = "linux")]
+        {
+            let token_section = TokenSection::new(token_section());
+            toks = toks.parse_autotokens(token_section)?;
+        }
+
+        if !toks.tokens().is_empty() {
+            state.add_metadata(toks);
         }
     }
 
@@ -623,9 +636,19 @@ fn fuzz_text(
     let mut stages = tuple_list!(generalization, calibration, tracing, i2s, power, grimoire);
 
     // Read tokens
-    if let Some(tokenfile) = tokenfile {
-        if state.metadata().get::<Tokens>().is_none() {
-            state.add_metadata(Tokens::from_tokens_file(tokenfile)?);
+    if state.metadata().get::<Tokens>().is_none() {
+        let mut toks = Tokens::default();
+        if let Some(tokenfile) = tokenfile {
+            toks = toks.parse_tokens_file(vec![tokenfile])?;
+        }
+        #[cfg(target_os = "linux")]
+        {
+            let token_section = TokenSection::new(token_section());
+            toks = toks.parse_autotokens(token_section)?;
+        }
+
+        if !toks.tokens().is_empty() {
+            state.add_metadata(toks);
         }
     }
 
diff --git a/fuzzers/libfuzzer_libpng_ctx/Makefile b/fuzzers/libfuzzer_libpng_ctx/Makefile
index ca0a514241..e7ed9d10c5 100644
--- a/fuzzers/libfuzzer_libpng_ctx/Makefile
+++ b/fuzzers/libfuzzer_libpng_ctx/Makefile
@@ -40,11 +40,11 @@ clean:
 	$(MAKE) -C libpng-1.6.37 clean
 
 run: all
-	./$(FUZZER_NAME) --cores 0 &
+	./$(FUZZER_NAME) --cores 0 --input ./corpus &
 
 short_test: all
 	rm -rf libafl_unix_shmem_server || true
-	timeout 10s ./$(FUZZER_NAME) --cores 0 &
+	timeout 10s ./$(FUZZER_NAME) --cores 0 --input ./corpus &
 
 test: all
-	timeout 60s ./$(FUZZER_NAME) --cores 0 &
+	timeout 60s ./$(FUZZER_NAME) --cores 0 --input ./corpus &
diff --git a/libafl/src/mutators/token_mutations.rs b/libafl/src/mutators/token_mutations.rs
index e7f3e389cf..ba3a4d16c3 100644
--- a/libafl/src/mutators/token_mutations.rs
+++ b/libafl/src/mutators/token_mutations.rs
@@ -22,8 +22,27 @@ use crate::{
     Error,
 };
 
+/// A raw `start..stop` byte range delimiting an autotokens section in memory
+#[derive(Debug, Clone, Copy)]
+pub struct TokenSection {
+    /// Address of the first byte of the section
+    start: *const u8,
+    /// Address one past the last byte of the section
+    stop: *const u8,
+}
+
+impl TokenSection {
+    /// Wraps a `(start, stop)` pointer pair; the pointers are stored, not dereferenced
+    #[must_use]
+    pub fn new(section: (*const u8, *const u8)) -> Self {
+        let (start, stop) = section;
+        Self { start, stop }
+    }
+}
+
 /// A state metadata holding a list of tokens
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Default, Serialize, Deserialize)]
+#[allow(clippy::unsafe_derive_deserialize)]
 pub struct Tokens {
     token_vec: Vec<Vec<u8>>,
 }
@@ -38,6 +57,83 @@ impl Tokens {
         Self { token_vec }
     }
 
+    #[must_use]
+    /// Builder-style setter: replaces the stored token list with `vec` (it does not append)
+    pub fn parse_vec(mut self, vec: Vec<Vec<u8>>) -> Self {
+        self.token_vec = vec;
+        self
+    }
+
+    /// Build tokens from files
+    #[cfg(feature = "std")]
+    pub fn parse_tokens_file<P>(mut self, files: Vec<P>) -> Result<Self, Error>
+    where
+        P: AsRef<Path>,
+    {
+        for file in files {
+            self.add_tokens_from_file(file)?;
+        }
+        Ok(self)
+    }
+
+    /// Builder-style: adds every token found in the `autotoken` section to `self`.
+    /// The section bounds are dereferenced, so they must delimit readable memory.
+    pub fn parse_autotokens(mut self, autotoken: TokenSection) -> Result<Self, Error> {
+        // SAFETY: sound only while `autotoken` spans a valid byte range (caller's duty).
+        let _count = unsafe { self.add_from_autotokens(autotoken) }?;
+        Ok(self)
+    }
+
+    /// Reads tokens from an autotokens section (a run of `[len: u8][len bytes]` records),
+    /// returning the count of entries read. Zero-length records are skipped.
+    ///
+    /// # Safety
+    /// `autotoken` must delimit a single readable byte range (or be empty).
+    pub unsafe fn add_from_autotokens(&mut self, autotoken: TokenSection) -> Result<usize, Error> {
+        // TODO: Autodict for OSX and windows
+        if !cfg!(target_os = "linux") {
+            return Ok(0);
+        }
+        let (token_start, token_stop) = (autotoken.start, autotoken.stop);
+        // An empty or absent section (equal or null bounds) holds nothing; never deref it.
+        if token_start == token_stop || token_start.is_null() || token_stop.is_null() {
+            return Ok(0);
+        }
+        // Reversed bounds are reported as an error instead of panicking in `unwrap`.
+        let section_size: usize = token_stop.offset_from(token_start).try_into().map_err(|_| {
+            Error::IllegalArgument("Autotokens section stops before it starts".into())
+        })?;
+        let slice = core::slice::from_raw_parts(token_start, section_size);
+        let mut entries = 0;
+        let mut head = 0;
+        while head < section_size {
+            let size = slice[head] as usize;
+            head += 1;
+            if size == 0 {
+                continue;
+            }
+            // A length byte reaching past the section is corrupt data: error out, don't panic.
+            let token = slice.get(head..head + size).ok_or_else(|| {
+                Error::IllegalArgument("Truncated token in autotokens section".into())
+            })?;
+            self.add_token(&token.to_vec());
+            #[cfg(feature = "std")]
+            println!("Token size: {} content: {:x?}", size, token);
+            head += size;
+            entries += 1;
+        }
+        Ok(entries)
+    }
+
+    /// Creates a fresh [`Tokens`] holding only what the `autotoken` section contains.
+    /// The section bounds are dereferenced, so they must delimit readable memory.
+    pub fn from_autotokens(autotoken: TokenSection) -> Result<Self, Error> {
+        let mut tokens = Self::new(Vec::new());
+        // SAFETY: sound only while `autotoken` spans a valid byte range (caller's duty).
+        let _count = unsafe { tokens.add_from_autotokens(autotoken) }?;
+        Ok(tokens)
+    }
+
     /// Creates a new instance from a file
     #[cfg(feature = "std")]
     pub fn from_tokens_file<P>(file: P) -> Result<Self, Error>
@@ -62,7 +158,7 @@ impl Tokens {
 
     /// Reads a tokens file, returning the count of new entries read
     #[cfg(feature = "std")]
-    pub fn add_tokens_from_file<P>(&mut self, file: P) -> Result<u32, Error>
+    pub fn add_tokens_from_file<P>(&mut self, file: P) -> Result<usize, Error>
     where
         P: AsRef<Path>,
     {
diff --git a/libafl_cc/build.rs b/libafl_cc/build.rs
index dd52f6452f..cecc1e5a10 100644
--- a/libafl_cc/build.rs
+++ b/libafl_cc/build.rs
@@ -142,6 +142,7 @@ fn main() {
 
         println!("cargo:rerun-if-changed=src/cmplog-routines-pass.cc");
         println!("cargo:rerun-if-changed=src/afl-coverage-pass.cc");
+        println!("cargo:rerun-if-changed=src/autotokens-pass.cc");
 
         let _ = Command::new(llvm_bindir.join("clang++"))
             .args(&cxxflags)
@@ -162,6 +163,16 @@ fn main() {
             .arg(out_dir.join(format!("afl-coverage-pass.{}", dll_extension())))
             .status()
             .expect("Failed to compile afl-coverage-pass.cc");
+
+        let _ = Command::new(llvm_bindir.join("clang++"))
+            .args(&cxxflags)
+            .args(&custom_flags)
+            .arg(src_dir.join("autotokens-pass.cc"))
+            .args(&ldflags)
+            .args(&["-fPIC", "-shared", "-o"])
+            .arg(out_dir.join(format!("autotokens-pass.{}", dll_extension())))
+            .status()
+            .expect("Failed to compile autotokens-pass.cc");
     } else {
         write!(
             &mut clang_constants_file,
diff --git a/libafl_cc/src/autotokens-pass.cc b/libafl_cc/src/autotokens-pass.cc
new file mode 100644
index 0000000000..ab17fad60b
--- /dev/null
+++ b/libafl_cc/src/autotokens-pass.cc
@@ -0,0 +1,737 @@
+/*
+   american fuzzy lop++ - LLVM LTO instrumentation pass
+   ----------------------------------------------------
+
+   Written by Marc Heuse <mh@mh-sec.de>
+
+   Copyright 2019-2020 AFLplusplus Project. All rights reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at:
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   This library is plugged into LLVM when invoking clang through afl-clang-lto.
+
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <ctype.h>
+
+#include <list>
+#include <string>
+#include <fstream>
+#include <set>
+
+#include "llvm/Config/llvm-config.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/IRBuilder.h"
+
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/Constants.h"
+
+#ifndef O_DSYNC
+  #define O_DSYNC O_SYNC
+#endif
+
+// The max length of a token
+#define MAX_AUTO_EXTRA 32
+
+#define USE_AUTO_EXTRAS 4096
+#define MAX_AUTO_EXTRAS (USE_AUTO_EXTRAS * 8)
+
+#include <iostream>
+
+#define FATAL(x...)                                                      \
+  do {                                                                   \
+                                                                         \
+    fprintf(stderr, "FATAL: " x);                                        \
+    exit(1);                                                             \
+                                                                         \
+  } while (0)
+
+using namespace llvm;
+
+namespace {
+
+/* Returns true when F's name marks it as a function we never instrument or analyze */
+/* Note: this ignore check is also called in isInInstrumentList() */
+bool isIgnoreFunction(const llvm::Function *F) {
+
+  // Starting from "LLVMFuzzer" these are functions used in libfuzzer based
+  // fuzzing campaign installations, e.g. oss-fuzz
+
+  static constexpr const char *ignoreList[] = {
+
+      "asan.",
+      "llvm.",
+      "sancov.",
+      "__ubsan",
+      "ign.",
+      "__afl",
+      "_fini",
+      "__libc_",
+      "__asan",
+      "__msan",
+      "__cmplog",
+      "__sancov",
+      "__san",
+      "__cxx_",
+      "__decide_deferred",
+      "_GLOBAL",
+      "_ZZN6__asan",
+      "_ZZN6__lsan",
+      "msan.",
+      "LLVMFuzzerM",
+      "LLVMFuzzerC",
+      "LLVMFuzzerI",
+      "maybe_duplicate_stderr",
+      "discard_output",
+      "close_stdout",
+      "dup_and_close_stderr",
+      "maybe_close_fd_mask",
+      "ExecuteFilesOnyByOne"
+
+  };
+  // First pass: the function name starts with one of the ignoreList prefixes
+  for (auto const &ignoreListFunc : ignoreList) {
+
+    if (F->getName().startswith(ignoreListFunc)) { return true; }
+
+  }
+
+  static constexpr const char *ignoreSubstringList[] = {
+
+      "__asan",       "__msan",     "__ubsan", "__lsan",
+      "__san",        "__sanitize", "__cxx",   "_GLOBAL__",
+      "DebugCounter", "DwarfDebug", "DebugLoc"
+
+  };
+  // Second pass: the function name contains one of the substrings anywhere
+  for (auto const &ignoreListFunc : ignoreSubstringList) {
+
+    // hexcoder: F->getName().contains() not available in llvm 3.8.0
+    if (StringRef::npos != F->getName().find(ignoreListFunc)) { return true; }
+
+  }
+
+  return false;
+
+}
+
+class AutoTokensPass : public ModulePass {
+  // Module pass that gathers compare operands of a module into a token dictionary.
+ public:
+  static char ID;
+
+  AutoTokensPass() : ModulePass(ID) {
+
+
+  }
+
+  bool runOnModule(Module &M) override;
+
+  protected:
+
+  private:
+    std::vector<std::string>  dictionary;  // tokens gathered when no dict file is used
+
+};
+
+}  // namespace
+
+char AutoTokensPass::ID = 0;
+
+
+void dict2file(int fd, uint8_t *mem, uint32_t len) {
+  uint32_t  i, j, binary = 0;
+  char line[MAX_AUTO_EXTRA * 8], tmp[8];
+  // Appends mem[0..len) to fd as one double-quoted line, \xNN-escaping other bytes.
+  strcpy(line, "\"");
+  j = 1;
+  for (i = 0; i < len; i++) {
+
+    if (isprint(mem[i]) && mem[i] != '\\' && mem[i] != '"') {
+      // printable and neither a backslash nor a quote: copied verbatim
+      line[j++] = mem[i];
+
+    } else {
+      // escaped, unless it is the final NUL of a so-far textual token (len not 4/8)
+      if (i + 1 != len || mem[i] != 0 || binary || len == 4 || len == 8) {
+
+        line[j] = 0;
+        sprintf(tmp, "\\x%02x", (uint8_t)mem[i]);
+        strcat(line, tmp);
+        j = strlen(line);
+
+      }
+
+      binary = 1;
+
+    }
+
+  }
+  // close the quote, then write and sync the line; a failed write is fatal
+  line[j] = 0;
+  strcat(line, "\"\n");
+  if (write(fd, line, strlen(line)) <= 0)
+    FATAL("Could not write to dictionary file");
+  fsync(fd);
+
+}
+
+bool AutoTokensPass::runOnModule(Module &M) {
+  // Harvests compare constants and compared strings of M into a token dictionary.
+  DenseMap<Value *, std::string *> valueMap;
+  char *                           ptr;
+  int                              fd = -1, found = 0;
+  bool use_file = true;
+
+  /* Show a banner */
+  setvbuf(stdout, NULL, _IONBF, 0);
+
+  ptr = getenv("AFL_LLVM_DICT2FILE");
+
+  if (!ptr || *ptr != '/') {
+    fprintf(stderr, "AFL_LLVM_DICT2FILE is not set to an absolute path: %s\n", ptr ? ptr : "(null)");
+    fprintf(stderr, "Writing tokens into libafl_token section\n");
+
+    use_file = false;
+  }
+
+  if(use_file) {
+    if ((fd = open(ptr, O_WRONLY | O_APPEND | O_CREAT | O_DSYNC, 0644)) < 0)
+      FATAL("Could not open/create %s.", ptr);
+  }
+
+
+  /* Instrument all the things! */
+
+  for (auto &F : M) {
+
+    if (isIgnoreFunction(&F)) continue;
+
+    /*  Some implementation notes.
+     *
+     *  We try to handle 3 cases:
+     *  - memcmp("foo", arg, 3) <- literal string
+     *  - static char globalvar[] = "foo";
+     *    memcmp(globalvar, arg, 3) <- global variable
+     *  - char localvar[] = "foo";
+     *    memcmp(localvar, arg, 3) <- local variable
+     *
+     *  The local variable case is the hardest. We can only detect that
+     *  case if there is no reassignment or change in the variable.
+     *  And it might not work across llvm version.
+     *  What we do is hooking the initializer function for local variables
+     *  (llvm.memcpy.p0i8.p0i8.i64) and note the string and the assigned
+     *  variable. And if that variable is then used in a compare function
+     *  we use that noted string.
+     *  This seems not to work for tokens that have a size <= 4 :-(
+     *
+     *  - if the compared length is smaller than the string length we
+     *    save the full string. This is likely better for fuzzing but
+     *    might be wrong in a few cases depending on optimizers
+     *
+     *  - not using StringRef because there is a bug in the llvm 11
+     *    checkout I am using which sometimes points to wrong strings
+     *
+     *  Over and out. Took me a full day. damn. mh/vh
+     */
+
+    for (auto &BB : F) {
+
+      for (auto &IN : BB) {
+
+        CallInst *callInst = nullptr;
+        CmpInst * cmpInst = nullptr;
+
+        if ((cmpInst = dyn_cast<CmpInst>(&IN))) {
+
+          Value *      op = cmpInst->getOperand(1);
+          ConstantInt *ilen = dyn_cast<ConstantInt>(op);
+
+          /* We skip > 64 bit integers. why? first because their value is
+             difficult to obtain, and second because clang does not support
+             literals > 64 bit (as of llvm 12) */
+
+          if (ilen && ilen->uge(0xffffffffffffffff) == false) {
+
+            uint64_t val2 = 0, val = ilen->getZExtValue();
+            uint32_t len = 0;
+            if (val > 0x10000 && val < 0xffffffff) len = 4;
+            if (val > 0x100000001 && val < 0xffffffffffffffff) len = 8;
+
+            if (len) {
+
+              auto c = cmpInst->getPredicate();
+
+              switch (c) {
+
+                case CmpInst::FCMP_OGT:  // fall through
+                case CmpInst::FCMP_OLE:  // fall through
+                case CmpInst::ICMP_SLE:  // fall through
+                case CmpInst::ICMP_SGT:
+
+                  // signed comparison and it is a negative constant
+                  if ((len == 4 && (val & 0x80000000)) ||
+                      (len == 8 && (val & 0x8000000000000000))) {
+
+                    if ((val & 0xffff) != 1) val2 = val - 1;
+                    break;
+
+                  }
+
+                  // fall through
+
+                case CmpInst::FCMP_UGT:  // fall through
+                case CmpInst::FCMP_ULE:  // fall through
+                case CmpInst::ICMP_UGT:  // fall through
+                case CmpInst::ICMP_ULE:
+                  if ((val & 0xffff) != 0xfffe) val2 = val + 1;
+                  break;
+
+                case CmpInst::FCMP_OLT:  // fall through
+                case CmpInst::FCMP_OGE:  // fall through
+                case CmpInst::ICMP_SLT:  // fall through
+                case CmpInst::ICMP_SGE:
+
+                  // signed comparison and it is a negative constant
+                  if ((len == 4 && (val & 0x80000000)) ||
+                      (len == 8 && (val & 0x8000000000000000))) {
+
+                    if ((val & 0xffff) != 1) val2 = val - 1;
+                    break;
+
+                  }
+
+                  // fall through
+
+                case CmpInst::FCMP_ULT:  // fall through
+                case CmpInst::FCMP_UGE:  // fall through
+                case CmpInst::ICMP_ULT:  // fall through
+                case CmpInst::ICMP_UGE:
+                  if ((val & 0xffff) != 1) val2 = val - 1;
+                  break;
+
+                default:
+                  val2 = 0;
+
+              }
+
+              if(use_file) {
+                dict2file(fd, (uint8_t *)&val, len);
+              }
+              else{
+                dictionary.push_back(std::string((char *)&val, len));
+              }
+
+              found++;
+              if (val2) {
+
+                if(use_file) {
+                  dict2file(fd, (uint8_t *)&val2, len);
+                }
+                else{
+                  dictionary.push_back(std::string((char *)&val2, len));
+                }
+                found++;
+
+              }
+
+            }
+
+          }
+
+        }
+
+        if ((callInst = dyn_cast<CallInst>(&IN))) {
+
+          bool   isStrcmp = true;
+          bool   isMemcmp = true;
+          bool   isStrncmp = true;
+          bool   isStrcasecmp = true;
+          bool   isStrncasecmp = true;
+          bool   isIntMemcpy = true;
+          bool   isStdString = true;
+          bool   addedNull = false;
+          size_t optLen = 0;
+
+          Function *Callee = callInst->getCalledFunction();
+          if (!Callee) continue;
+          if (callInst->getCallingConv() != llvm::CallingConv::C) continue;
+          std::string FuncName = Callee->getName().str();
+          isStrcmp &= !FuncName.compare("strcmp");
+          isMemcmp &=
+              (!FuncName.compare("memcmp") || !FuncName.compare("bcmp"));
+          isStrncmp &= !FuncName.compare("strncmp");
+          isStrcasecmp &= !FuncName.compare("strcasecmp");
+          isStrncasecmp &= !FuncName.compare("strncasecmp");
+          isIntMemcpy &= !FuncName.compare("llvm.memcpy.p0i8.p0i8.i64");
+          isStdString &= ((FuncName.find("basic_string") != std::string::npos &&
+                           FuncName.find("compare") != std::string::npos) ||
+                          (FuncName.find("basic_string") != std::string::npos &&
+                           FuncName.find("find") != std::string::npos));
+
+          if (!isStrcmp && !isMemcmp && !isStrncmp && !isStrcasecmp &&
+              !isStrncasecmp && !isIntMemcpy && !isStdString)
+            continue;
+
+          /* Verify the strcmp/memcmp/strncmp/strcasecmp/strncasecmp function
+           * prototype */
+          FunctionType *FT = Callee->getFunctionType();
+
+          isStrcmp &=
+              FT->getNumParams() == 2 && FT->getReturnType()->isIntegerTy(32) &&
+              FT->getParamType(0) == FT->getParamType(1) &&
+              FT->getParamType(0) == IntegerType::getInt8PtrTy(M.getContext());
+          isStrcasecmp &=
+              FT->getNumParams() == 2 && FT->getReturnType()->isIntegerTy(32) &&
+              FT->getParamType(0) == FT->getParamType(1) &&
+              FT->getParamType(0) == IntegerType::getInt8PtrTy(M.getContext());
+          isMemcmp &= FT->getNumParams() == 3 &&
+                      FT->getReturnType()->isIntegerTy(32) &&
+                      FT->getParamType(0)->isPointerTy() &&
+                      FT->getParamType(1)->isPointerTy() &&
+                      FT->getParamType(2)->isIntegerTy();
+          isStrncmp &= FT->getNumParams() == 3 &&
+                       FT->getReturnType()->isIntegerTy(32) &&
+                       FT->getParamType(0) == FT->getParamType(1) &&
+                       FT->getParamType(0) ==
+                           IntegerType::getInt8PtrTy(M.getContext()) &&
+                       FT->getParamType(2)->isIntegerTy();
+          isStrncasecmp &= FT->getNumParams() == 3 &&
+                           FT->getReturnType()->isIntegerTy(32) &&
+                           FT->getParamType(0) == FT->getParamType(1) &&
+                           FT->getParamType(0) ==
+                               IntegerType::getInt8PtrTy(M.getContext()) &&
+                           FT->getParamType(2)->isIntegerTy();
+          isStdString &= FT->getNumParams() >= 2 &&
+                         FT->getParamType(0)->isPointerTy() &&
+                         FT->getParamType(1)->isPointerTy();
+
+          if (!isStrcmp && !isMemcmp && !isStrncmp && !isStrcasecmp &&
+              !isStrncasecmp && !isIntMemcpy && !isStdString)
+            continue;
+
+          /* is a str{n,}{case,}cmp/memcmp, check if we have
+           * str{case,}cmp(x, "const") or str{case,}cmp("const", x)
+           * strn{case,}cmp(x, "const", ..) or strn{case,}cmp("const", x, ..)
+           * memcmp(x, "const", ..) or memcmp("const", x, ..) */
+          Value *Str1P = callInst->getArgOperand(0),
+                *Str2P = callInst->getArgOperand(1);
+          std::string Str1, Str2;
+          StringRef   TmpStr;
+          bool        HasStr1;
+          getConstantStringInfo(Str1P, TmpStr);
+
+          if (TmpStr.empty()) {
+
+            HasStr1 = false;
+
+          } else {
+
+            HasStr1 = true;
+            Str1 = TmpStr.str();
+
+          }
+
+          bool HasStr2;
+          getConstantStringInfo(Str2P, TmpStr);
+          if (TmpStr.empty()) {
+
+            HasStr2 = false;
+
+          } else {
+
+            HasStr2 = true;
+            Str2 = TmpStr.str();
+
+          }
+
+          // we handle the 2nd parameter first because of llvm memcpy
+          if (!HasStr2) {
+
+            auto *Ptr = dyn_cast<ConstantExpr>(Str2P);
+            if (Ptr && Ptr->isGEPWithNoNotionalOverIndexing()) {
+
+              if (auto *Var = dyn_cast<GlobalVariable>(Ptr->getOperand(0))) {
+
+                if (Var->hasInitializer()) {
+
+                  if (auto *Array =
+                          dyn_cast<ConstantDataArray>(Var->getInitializer())) {
+
+                    HasStr2 = true;
+                    Str2 = Array->getRawDataValues().str();
+
+                  }
+
+                }
+
+              }
+
+            }
+
+          }
+
+          // for the internal memcpy routine we only care for the second
+          // parameter and are not reporting anything.
+          if (isIntMemcpy == true) {
+
+            if (HasStr2 == true) {
+
+              Value *      op2 = callInst->getArgOperand(2);
+              ConstantInt *ilen = dyn_cast<ConstantInt>(op2);
+              if (ilen) {
+
+                uint64_t literalLength = Str2.length();
+                uint64_t optLength = ilen->getZExtValue();
+                if (literalLength + 1 == optLength) {
+
+                  Str2.append("\0", 1);  // add null byte
+
+                }
+
+                if (optLength > Str2.length()) { optLength = Str2.length(); }
+
+              }
+
+              valueMap[Str1P] = new std::string(Str2);
+              continue;
+
+            }
+
+            continue;
+
+          }
+
+          // Neither a literal nor a global variable?
+          // maybe it is a local variable that we saved
+          if (!HasStr2) {
+
+            std::string *strng = valueMap[Str2P];
+            if (strng && !strng->empty()) {
+
+              Str2 = *strng;
+              HasStr2 = true;
+
+            }
+
+          }
+
+          if (!HasStr1) {
+
+            auto Ptr = dyn_cast<ConstantExpr>(Str1P);
+
+            if (Ptr && Ptr->isGEPWithNoNotionalOverIndexing()) {
+
+              if (auto *Var = dyn_cast<GlobalVariable>(Ptr->getOperand(0))) {
+
+                if (Var->hasInitializer()) {
+
+                  if (auto *Array =
+                          dyn_cast<ConstantDataArray>(Var->getInitializer())) {
+
+                    HasStr1 = true;
+                    Str1 = Array->getRawDataValues().str();
+
+                  }
+
+                }
+
+              }
+
+            }
+
+          }
+
+          // Neither a literal nor a global variable?
+          // maybe it is a local variable that we saved
+          if (!HasStr1) {
+
+            std::string *strng = valueMap[Str1P];
+            if (strng && !strng->empty()) {
+
+              Str1 = *strng;
+              HasStr1 = true;
+
+            }
+
+          }
+
+          /* handle cases of one string is const, one string is variable */
+          if (!(HasStr1 ^ HasStr2)) continue;
+
+          std::string thestring;
+
+          if (HasStr1)
+            thestring = Str1;
+          else
+            thestring = Str2;
+
+          optLen = thestring.length();
+
+          if (optLen < 2 || (optLen == 2 && !thestring[1])) { continue; }
+
+          if (isMemcmp || isStrncmp || isStrncasecmp) {
+
+            Value *      op2 = callInst->getArgOperand(2);
+            ConstantInt *ilen = dyn_cast<ConstantInt>(op2);
+
+            if (ilen) {
+
+              uint64_t literalLength = optLen;
+              optLen = ilen->getZExtValue();
+              if (optLen > thestring.length()) { optLen = thestring.length(); }
+              if (optLen < 2) { continue; }
+              if (literalLength + 1 == optLen) {  // add null byte
+                thestring.append("\0", 1);
+                addedNull = true;
+
+              }
+
+            }
+
+          }
+
+          // add null byte if this is a string compare function and a null
+          // was not already added
+          if (!isMemcmp) {
+
+            if (addedNull == false && thestring[optLen - 1] != '\0') {
+
+              thestring.append("\0", 1);  // add null byte
+              optLen++;
+
+            }
+
+            if (!isStdString) {
+
+              // ensure we do not have garbage
+              size_t offset = thestring.find('\0', 0);
+              if (offset + 1 < optLen) optLen = offset + 1;
+              thestring = thestring.substr(0, optLen);
+
+            }
+
+          }
+
+          // we take the longer string, even if the compare was to a
+          // shorter part. Note that depending on the optimizer of the
+          // compiler this can be wrong, but it is more likely that this
+          // is helping the fuzzer
+          if (optLen != thestring.length()) optLen = thestring.length();
+          if (optLen > MAX_AUTO_EXTRA) optLen = MAX_AUTO_EXTRA;
+          if (optLen < 3)  // too short? skip
+            continue;
+
+          ptr = (char *)thestring.c_str();
+
+          if(use_file){
+            dict2file(fd, (uint8_t *)ptr, optLen);
+          }
+          else{
+            dictionary.push_back(thestring.substr(0, optLen));
+          }
+          found++;
+
+        }
+
+      }
+
+    }
+
+  }
+
+  if(use_file){
+    close(fd);
+    return true;
+  }
+  // No dictionary file: embed the collected tokens into the module instead.
+  LLVMContext &Ctx = M.getContext();
+
+  if (dictionary.size()) {
+
+    size_t memlen = 0, count = 0, offset = 0;
+
+    // sort and unique the dictionary
+    std::sort(dictionary.begin(), dictionary.end());
+    auto last = std::unique(dictionary.begin(), dictionary.end());
+    dictionary.erase(last, dictionary.end());
+
+    for (auto const &token : dictionary) {
+
+      memlen += token.length();
+      count++;
+
+    }
+    if (count) {
+
+      auto ptrhld = std::unique_ptr<char[]>(new char[memlen + count]);
+      // Serialize as consecutive [1-byte length][token bytes] records
+      count = 0;
+
+      for (auto const &token : dictionary) {
+
+        if (offset + token.length() < 0xfffff0 && count < MAX_AUTO_EXTRAS) {
+
+          // This length is guaranteed to be <= MAX_AUTO_EXTRA
+          ptrhld.get()[offset++] = (uint8_t)token.length();
+          memcpy(ptrhld.get() + offset, token.c_str(), token.length());
+          offset += token.length();
+          count++;
+        }
+      }
+
+      // Type
+      ArrayType* arrayTy = ArrayType::get(IntegerType::get(Ctx, 8), offset);
+
+      // The actual dict
+      GlobalVariable *dict = new GlobalVariable(M, arrayTy, true, GlobalVariable::ExternalLinkage, ConstantDataArray::get(Ctx, *(new ArrayRef<char>(ptrhld.get(), offset))), "libafl_dictionary_" + M.getName());
+      dict->setSection("libafl_token");
+    }
+  }
+
+  return true;
+}
+
+
+static void registerAutoTokensPass(const PassManagerBuilder &,
+                                     legacy::PassManagerBase &PM) {
+  // Extension-point callback: adds a fresh AutoTokensPass to the legacy pass manager
+  PM.add(new AutoTokensPass());
+
+}
+
+static RegisterPass<AutoTokensPass> X("autotokens",
+                                        "autotokens instrumentation pass",
+                                        false, false);
+
+static RegisterStandardPasses RegisterAutoTokensPass(
+    PassManagerBuilder::EP_OptimizerLast, registerAutoTokensPass);
+
+static RegisterStandardPasses RegisterAutoTokensPass0(
+    PassManagerBuilder::EP_EnabledOnOptLevel0, registerAutoTokensPass);
diff --git a/libafl_cc/src/clang.rs b/libafl_cc/src/clang.rs
index b6d77dbdf8..5f926f831b 100644
--- a/libafl_cc/src/clang.rs
+++ b/libafl_cc/src/clang.rs
@@ -31,6 +31,8 @@ pub enum LLVMPasses {
     CmpLogRtn,
     /// The AFL coverage pass
     AFLCoverage,
+    /// The AutoTokens pass
+    AutoTokens,
 }
 
 impl LLVMPasses {
@@ -42,6 +44,9 @@ impl LLVMPasses {
                 .join(format!("cmplog-routines-pass.{}", dll_extension())),
             LLVMPasses::AFLCoverage => PathBuf::from(env!("OUT_DIR"))
                 .join(format!("afl-coverage-pass.{}", dll_extension())),
+            LLVMPasses::AutoTokens => {
+                PathBuf::from(env!("OUT_DIR")).join(format!("autotokens-pass.{}", dll_extension()))
+            }
         }
     }
 }
diff --git a/libafl_targets/src/common.h b/libafl_targets/src/common.h
index 97489b4ce9..46e8b5ef57 100644
--- a/libafl_targets/src/common.h
+++ b/libafl_targets/src/common.h
@@ -133,6 +133,10 @@
   #define EXT_FUNC_IMPL(NAME, RETURN_TYPE, FUNC_SIG, WARN) \
   __attribute__((weak, visibility("default"))) RETURN_TYPE NAME FUNC_SIG
 
+  // Weakly defined globals: expands to `TYPE NAME` marked weak, default visibility
+  #define EXT_VAR(NAME, TYPE) \
+  TYPE __attribute__((weak, visibility("default"))) NAME
+
 #else
 
 #define EXT_FUNC_IMPL(NAME, RETURN_TYPE, FUNC_SIG, WARN) \
@@ -141,6 +145,11 @@
 // Declare these symbols as weak to allow them to be optionally defined.
 #define EXT_FUNC(NAME, RETURN_TYPE, FUNC_SIG, WARN)                            \
   __attribute__((weak, visibility("default"))) RETURN_TYPE NAME FUNC_SIG
+
+// Weakly defined globals: expands to `TYPE NAME` marked weak, default visibility
+#define EXT_VAR(NAME, TYPE) \
+  TYPE __attribute__((weak, visibility("default"))) NAME
+
 #endif
 
 #define CHECK_WEAK_FN(Name) (Name != NULL)
diff --git a/libafl_targets/src/coverage.c b/libafl_targets/src/coverage.c
index cf28f51e51..cf9dbd6616 100644
--- a/libafl_targets/src/coverage.c
+++ b/libafl_targets/src/coverage.c
@@ -9,8 +9,22 @@ typedef uint32_t prev_loc_t;
 #define CTX_MAX_K 32U
 
 extern uint8_t __afl_area_ptr_local[EDGES_MAP_SIZE];
+
 uint8_t* __afl_area_ptr = __afl_area_ptr_local;
 
+
+// Weak symbols; the LLVM passes override them if we really use them
+#ifdef __linux__
+extern EXT_VAR(__start_libafl_token, uint8_t);
+extern EXT_VAR(__stop_libafl_token, uint8_t);
+
+// Expose the start and stop of the libafl_token section as C symbols
+uint8_t* __token_start = &__start_libafl_token;
+uint8_t* __token_stop = &__stop_libafl_token;
+#endif
+
+
+
 //#if defined(__ANDROID__) || defined(__HAIKU__)
 MAYBE_THREAD_LOCAL prev_loc_t __afl_prev_loc[NGRAM_SIZE_MAX];
 MAYBE_THREAD_LOCAL prev_loc_t __afl_prev_caller[CTX_MAX_K];
diff --git a/libafl_targets/src/coverage.rs b/libafl_targets/src/coverage.rs
index feb2fb5b91..58e3eb99d2 100644
--- a/libafl_targets/src/coverage.rs
+++ b/libafl_targets/src/coverage.rs
@@ -13,9 +13,24 @@ pub static mut MAX_EDGES_NUM: usize = 0;
 extern "C" {
     /// The area pointer points to the edges map.
     pub static mut __afl_area_ptr: *mut u8;
+
+    /// Pointer to the start of the `libafl_token` section (Linux only)
+    #[cfg(target_os = "linux")]
+    pub static __token_start: *const u8;
+
+    /// Pointer to the end of the `libafl_token` section (Linux only)
+    #[cfg(target_os = "linux")]
+    pub static __token_stop: *const u8;
 }
 pub use __afl_area_ptr as EDGES_MAP_PTR;
 
+/// Returns the `(start, stop)` pointers of the `libafl_token` section (Linux only).
+#[cfg(target_os = "linux")]
+#[must_use]
+pub fn token_section() -> (*const u8, *const u8) {
+    unsafe { (__token_start, __token_stop) }
+}
+
 /// The size of the map for edges.
 #[no_mangle]
 pub static mut __afl_map_size: usize = EDGES_MAP_SIZE;